Add IIS W3C log support to the Piwik log importer
(import_logs.py, saved as import_IIS_logs.py).

What the patch does:
  * adds four regex formats (iis6_w3c_all, iis6_w3c_default,
    iis7_w3c_default, iis75_w3c_default) and registers them in FORMATS;
  * adds IIS_DATE_FORMAT; for IIS formats the date is parsed with it and
    tz is set to 0;
  * adds --check-iis-log-format, which skips the first four lines of the
    log and debug-logs the fourth one (minus its first 9 characters);
  * treats a missing 'length' group (IndexError) as length 0;
  * only takes the IIS date branch when format_name is set: format_name
    stays None when config.format_regexp is already populated and
    --log-format-name was not given, and "'iis' in None" would raise a
    TypeError that the surrounding "except ValueError" does not catch.

NOTE(review): this copy was rebuilt from a whitespace-collapsed paste.
Indentation, blank context lines and the regex group names
(?P<date>, ?P<path>, ?P<ip>, ...) are reconstructed; hunk line counts
match the hunk headers.  Confirm the nesting of
"config.format_regexp = None" in the header-skipping block before applying.

--- public_html/misc/log-analytics/import_logs_orig.py	2012-04-06 15:04:12.000000000 +0100
+++ public_html/misc/log-analytics/import_IIS_logs.py	2012-05-13 16:49:09.841982130 +0100
@@ -6,6 +6,7 @@
 # @link http://piwik.org
 # @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
 # @version $Id: import_logs.py 6170 2012-04-06 14:04:12Z Cyril $
+# @version $Id: import_IIS_logs.py 006 2012-05-13 16:40:00 TIOUK $
 #
 # For more info see: http://piwik.org/log-analytics/
 
@@ -62,21 +63,92 @@
     '"(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
 )
 
+#IIS 6 All Log Options
+#Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
+#(\d+[-\d+]+ [\d+:]+) (W3SVC\d+) (/\S*) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (\S+) (\S+) (\d+) (\S+) ([\d*.]+) ([H|F]\S+) (\S+) (\S+) (\S+) (\S+) (\d+) (\S+) (\S+) (\S+) (\d+) (\d+)
+
+_IIS6_W3C_All = (
+    '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+    'W3SVC\d+ \S+ [\d*.]+ [G|P|O|H|D|T|C]\S+ '
+    '(?P<path>/\S*) '
+    '\S+ \d+ \S+ '
+    '(?P<ip>[\d*.]*) '
+    '[H|F]\S+ '
+    '(?P<user_agent>\S+) '
+    '\S+ '
+    '(?P<referrer>\S+) '
+    '\S+ '
+    '(?P<status>\d+) '
+    '\S+ \S+ '
+    '(?P<length>\S+) '
+    '\d+ \d+'
+)
+
+#IIS 6 Default Log Options
+#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus
+#(\d+[-\d+]+ [\d+:]+) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]+) (\S+) (\d+) (\d+)
+
+_IIS6_W3C_Default = (
+    '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+    '[\d*.]+ [G|P|O|H|D|T|C]\S+ '
+    '(?P<path>/\S*) '
+    '\S+ \d+ \S+ '
+    '(?P<ip>[\d*.]*) '
+    '(?P<user_agent>\S+) '
+    '(?P<status>\d+) '
+    '\d+'
+)
+
+#IIS 7 Default Log Options
+#Fields: date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
+#(\d+[-\d+]+ [\d+:]+) (\S+) ([\d+.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]*) (\S+) (\d+) (\d+) (\d+) (\d+) (\d+) (\d+)
+
+_IIS7_W3C_Default = (
+    '(?P<date>\d+[-\d+]+ [\d+:]+) '
+    '\S+ [\d+.]+ [G|P|O|H|D|T|C]\S+ '
+    '(?P<path>/\S*) '
+    '\S+ \d+ \S+ '
+    '(?P<ip>[\d*.]*) '
+    '(?P<user_agent>\S+) '
+    '(?P<status>\d+) '
+    '\d+ \d+ '
+    '(?P<length>\d+) '
+    '\d+ \d+'
+)
+
+#IIS 7.5 Default Log Options
+#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status time-taken
+#(\d+[-\d+]+ [\d+:]+) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]+) (\S+) (\d+) (\d+) (\d+) (\d+)
+
+_IIS75_W3C_Default = (
+    '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+    '[\d*.]+ [G|P|O|H|D|T|C]\S+ '
+    '(?P<path>/\S*) '
+    '\S+ \d+ \S+ '
+    '(?P<ip>[\d*.]*) '
+    '(?P<user_agent>\S+) '
+    '(?P<status>\d+) '
+    '\d+ \d+ \d+'
+)
+
 FORMATS = {
     'common': _COMMON_LOG_FORMAT,
     'common_vhost': '(?P<host>[\w\-\.]*)(?::\d+)? ' + _COMMON_LOG_FORMAT,
     'ncsa_extended': _NCSA_EXTENDED_LOG_FORMAT,
     'common_complete': _COMMON_COMPLETE_LOG_FORMAT,
+    'iis6_w3c_default': _IIS6_W3C_Default,
+    'iis6_w3c_all': _IIS6_W3C_All,
+    'iis7_w3c_default': _IIS7_W3C_Default,
+    'iis75_w3c_default': _IIS75_W3C_Default,
 }
 
 DATE_FORMAT = '%d/%b/%Y:%H:%M:%S'
-
+IIS_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
 
 STATIC_EXTENSIONS = (
     'gif jpg jpeg png bmp ico svg ttf eot woff class swf css js xml robots.txt'
 ).split()
 
-
 DOWNLOAD_EXTENSIONS = (
     '7z aac arc arj asf asx avi bin csv deb dmg doc exe flv gz gzip hqx '
     'jar mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp '
@@ -84,7 +156,6 @@
     'bz2 tbz tgz torrent txt wav wma wmv wpd xls xml z zip'
 ).split()
 
-
 # A good source is: http://phpbb-bots.blogspot.com/
 EXCLUDED_USER_AGENTS = (
     'adsbot-google',
@@ -263,14 +334,22 @@
             help="The query string delimiter (default: %default)"
         )
         option_parser.add_option(
-            '--log-format-name', dest='log_format_name', default=None,
-            help=("Access log format to detect (supported are: common, common_vhost, ncsa_extended, common_complete). "
-                  "When not specified, the log format will be autodetected by trying all supported log formats."
+            '--log-format-name', dest='log_format_name', default=False,
+            help=("Access log format to detect (supported are: common, common_vhost, ncsa_extended, common_complete, "
+                  "iis6_w3c_default, iis6_w3c_all, iis7_w3c_default or iis75_w3c_default). When not specified, the log format will be "
+                  "autodetected by trying all supported log formats. Performance will be enhanced by using this option. "
+                  "Do not use with --check-iis-log-format"
+
         ))
         option_parser.add_option(
+            '--check-iis-log-format', dest='check_iis_log_format', action='store_true', default=None,
+            help="Access to detect IIS log options [Skips first 4 lines of IIS logs, use --debug to see log options line] "
+                 "Do not use with --log-format-name or --log-format-regex"
+        )
+        option_parser.add_option(
             '--log-format-regex', dest='log_format_regex', default=None,
             help="Access log regular expression. For an example of a supported Regex, see the source code of this file. "
-                 "Overrides --log-format-name"
+                 "Overrides --log-format-name. Do not use with --check-iis-log-format"
         )
         option_parser.add_option(
             '--skip', dest='skip', default=0, type='int',
@@ -1031,7 +1110,12 @@
                 return False
         return True
 
-
+    def detect_IISformat(line):
+        """
+        Placeholder, maybe return regexp matching the format line (4) in logfile, or None if not found.
+        """
+        logging.debug('Detecting the IIS log format...')
+    
     @staticmethod
     def detect_format(line):
         """
@@ -1073,7 +1157,15 @@
         if config.options.show_progress:
             print 'Parsing log %s...' % filename
 
+        iis_lines = 0
+        format_name = None
         for lineno, line in enumerate(file):
+            if config.options.check_iis_log_format and iis_lines <4:
+                iis_lines +=1
+                if iis_lines == 4:
+                    logging.debug(line[9:])
+                    config.format_regexp = None
+                continue
             # Guess the format if needed.
             if not config.format_regexp:
                 logging.debug('Guessing the log format...')
@@ -1081,7 +1173,9 @@
                 if not format_name:
                     return fatal_error(
                         'Cannot guess the logs format. Please give one using '
-                        'either the --log-format-name or --log-format-regex option'
+                        'either the --log-format-name or --log-format-regex option. \n'
+                        'For IIS, use --log-format-name=iis6_w3c_default iis6_w3c_all \n'
+                        'iis7_w3c_default or iis75_w3c_default. Do not use with --check-iis-log-format'
                     )
                 format = FORMATS[format_name]
                 config.format = format
@@ -1089,6 +1183,9 @@
                 # Make sure the format is compatible with the resolver.
                 resolver.check_format(format)
 
+            if config.options.log_format_name:
+                format_name = config.options.log_format_name
+
             stats.count_lines_parsed.increment()
             if stats.count_lines_parsed.value <= config.options.skip:
                 continue
@@ -1108,7 +1205,7 @@
                 is_error=False,
                 is_redirect=False,
             )
-
+            
             # Strip query string
             if config.options.strip_query_string:
                 hit.path = hit.full_path.split(config.options.query_string_delimiter, 1)[0]
@@ -1118,14 +1215,17 @@
             # Parse date _with_ timezone to get an UTC timestamp.
             date_string = match.group('date')
             try:
-                tz = float(date_string[-5:])
-                hit.date = datetime.datetime.strptime(date_string[:-6], '%d/%b/%Y:%H:%M:%S')
+                if format_name and 'iis' in format_name:
+                    tz = 0
+                    hit.date = datetime.datetime.strptime(date_string, IIS_DATE_FORMAT)
+                else:
+                    tz = float(date_string[-5:])
+                    hit.date = datetime.datetime.strptime(date_string[:-6], DATE_FORMAT)
             except ValueError:
                 # Date format is incorrect, the line is probably badly formatted.
                 invalid_line(line)
                 continue
             hit.date -= datetime.timedelta(hours=tz/100)
-
             try:
                 hit.referrer = match.group('referrer')
             except IndexError:
@@ -1144,6 +1244,9 @@
             except ValueError:
                 # Not all lines have a length (e.g. 304 redirects)
                 hit.length = 0
+            except IndexError:
+                # IIS6 & 7.5 Default has no length
+                hit.length = 0
             try:
                 hit.host = match.group('host')
             except IndexError:
@@ -1157,7 +1260,6 @@
 
 
 
-
 def main():
     """
     Start the importing process.