--- public_html/misc/log-analytics/import_logs_orig.py 2012-04-06 15:04:12.000000000 +0100
+++ public_html/misc/log-analytics/import_IIS_logs.py 2012-05-13 16:49:09.841982130 +0100
@@ -6,6 +6,7 @@
# @link http://piwik.org
# @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
# @version $Id: import_logs.py 6170 2012-04-06 14:04:12Z Cyril $
+# @version $Id: import_IIS_logs.py 006 2012-05-13 16:40:00 TIOUK $
#
# For more info see: http://piwik.org/log-analytics/
@@ -62,21 +63,92 @@
'"(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
)
+#IIS 6 All Log Options
+#Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
+#(\d+[-\d+]+ [\d+:]+) (W3SVC\d+) (/\S*) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (\S+) (\S+) (\d+) (\S+) ([\d*.]+) ([H|F]\S+) (\S+) (\S+) (\S+) (\S+) (\d+) (\S+) (\S+) (\S+) (\d+) (\d+)
+
+_IIS6_W3C_All = (
+ '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+ 'W3SVC\d+ \S+ [\d*.]+ [G|P|O|H|D|T|C]\S+ '
+ '(?P<path>/\S*) '
+ '\S+ \d+ \S+ '
+ '(?P<ip>[\d*.]*) '
+ '[H|F]\S+ '
+ '(?P<user_agent>\S+) '
+ '\S+ '
+ '(?P<referrer>\S+) '
+ '\S+ '
+ '(?P<status>\d+) '
+ '\S+ \S+ '
+ '(?P<length>\S+) '
+ '\d+ \d+'
+)
+
+#IIS 6 Default Log Options
+#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus
+#(\d+[-\d+]+ [\d+:]+) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]+) (\S+) (\d+) (\d+)
+
+_IIS6_W3C_Default = (
+ '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+ '[\d*.]+ [G|P|O|H|D|T|C]\S+ '
+ '(?P<path>/\S*) '
+ '\S+ \d+ \S+ '
+ '(?P<ip>[\d*.]*) '
+ '(?P<user_agent>\S+) '
+ '(?P<status>\d+) '
+ '\d+'
+)
+
+#IIS 7 Default Log Options
+#Fields: date time s-sitename s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
+#(\d+[-\d+]+ [\d+:]+) (\S+) ([\d+.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]*) (\S+) (\d+) (\d+) (\d+) (\d+) (\d+) (\d+)
+
+_IIS7_W3C_Default = (
+ '(?P<date>\d+[-\d+]+ [\d+:]+) '
+ '\S+ [\d+.]+ [G|P|O|H|D|T|C]\S+ '
+ '(?P<path>/\S*) '
+ '\S+ \d+ \S+ '
+ '(?P<ip>[\d*.]*) '
+ '(?P<user_agent>\S+) '
+ '(?P<status>\d+) '
+ '\d+ \d+ '
+ '(?P<length>\d+) '
+ '\d+ \d+'
+)
+
+#IIS 7.5 Default Log Options
+#Fields: date time s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) sc-status sc-substatus sc-win32-status time-taken
+#(\d+[-\d+]+ [\d+:]+) ([\d*.]+) ([G|P|O|H|D|T|C]\S+) (/\S*) (\S+) (\d+) (\S+) ([\d*.]+) (\S+) (\d+) (\d+) (\d+) (\d+)
+
+_IIS75_W3C_Default = (
+ '(?P<date>^\d+[-\d+]+ [\d+:]+) '
+ '[\d*.]+ [G|P|O|H|D|T|C]\S+ '
+ '(?P<path>/\S*) '
+ '\S+ \d+ \S+ '
+ '(?P<ip>[\d*.]*) '
+ '(?P<user_agent>\S+) '
+ '(?P<status>\d+) '
+ '\d+ \d+ \d+'
+)
+
FORMATS = {
'common': _COMMON_LOG_FORMAT,
'common_vhost': '(?P<host>[\w\-\.]*)(?::\d+)? ' + _COMMON_LOG_FORMAT,
'ncsa_extended': _NCSA_EXTENDED_LOG_FORMAT,
'common_complete': _COMMON_COMPLETE_LOG_FORMAT,
+ 'iis6_w3c_default': _IIS6_W3C_Default,
+ 'iis6_w3c_all': _IIS6_W3C_All,
+ 'iis7_w3c_default': _IIS7_W3C_Default,
+ 'iis75_w3c_default': _IIS75_W3C_Default,
}
DATE_FORMAT = '%d/%b/%Y:%H:%M:%S'
-
+IIS_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
STATIC_EXTENSIONS = (
'gif jpg jpeg png bmp ico svg ttf eot woff class swf css js xml robots.txt'
).split()
-
DOWNLOAD_EXTENSIONS = (
'7z aac arc arj asf asx avi bin csv deb dmg doc exe flv gz gzip hqx '
'jar mpg mp2 mp3 mp4 mpeg mov movie msi msp odb odf odg odp '
@@ -84,7 +156,6 @@
'bz2 tbz tgz torrent txt wav wma wmv wpd xls xml z zip'
).split()
-
# A good source is: http://phpbb-bots.blogspot.com/
EXCLUDED_USER_AGENTS = (
'adsbot-google',
@@ -263,14 +334,22 @@
help="The query string delimiter (default: %default)"
)
option_parser.add_option(
- '--log-format-name', dest='log_format_name', default=None,
- help=("Access log format to detect (supported are: common, common_vhost, ncsa_extended, common_complete). "
- "When not specified, the log format will be autodetected by trying all supported log formats."
+ '--log-format-name', dest='log_format_name', default=False,
+ help=("Access log format to detect (supported are: common, common_vhost, ncsa_extended, common_complete, "
+ "iis6_w3c_default, iis6_w3c_all, iis7_w3c_default or iis75_w3c_default). When not specified, the log format will be "
+ "autodetected by trying all supported log formats. Performance will be enhanced by using this option. "
+ "Do not use with --check-iis-log-format"
+
))
option_parser.add_option(
+ '--check-iis-log-format', dest='check_iis_log_format', action='store_true', default=None,
+ help="Access to detect IIS log options [Skips first 4 lines of IIS logs, use --debug to see log options line] "
+ "Do not use with --log-format-name or --log-format-regex"
+ )
+ option_parser.add_option(
'--log-format-regex', dest='log_format_regex', default=None,
help="Access log regular expression. For an example of a supported Regex, see the source code of this file. "
- "Overrides --log-format-name"
+ "Overrides --log-format-name. Do not use with --check-iis-log-format"
)
option_parser.add_option(
'--skip', dest='skip', default=0, type='int',
@@ -1031,7 +1110,12 @@
return False
return True
-
+ def detect_IISformat(line):
+ """
+ Placeholder for IIS log-format detection: currently only logs a debug message and implicitly returns None. Intended (not yet implemented): return the regexp matching the format line (4) of the log file, or None if not found. NOTE(review): declared without `self` or @staticmethod - confirm how it is meant to be called.
+ """
+ logging.debug('Detecting the IIS log format...')
+
@staticmethod
def detect_format(line):
"""
@@ -1073,7 +1157,15 @@
if config.options.show_progress:
print 'Parsing log %s...' % filename
+ iis_lines = 0
+ format_name = None
for lineno, line in enumerate(file):
+ if config.options.check_iis_log_format and iis_lines <4:
+ iis_lines +=1
+ if iis_lines == 4:
+ logging.debug(line[9:])
+ config.format_regexp = None
+ continue
# Guess the format if needed.
if not config.format_regexp:
logging.debug('Guessing the log format...')
@@ -1081,7 +1173,9 @@
if not format_name:
return fatal_error(
'Cannot guess the logs format. Please give one using '
- 'either the --log-format-name or --log-format-regex option'
+ 'either the --log-format-name or --log-format-regex option. \n'
+ 'For IIS, use --log-format-name=iis6_w3c_default iis6_w3c_all \n'
+ 'iis7_w3c_default or iis75_w3c_default. Do not use with --check-iis-log-format'
)
format = FORMATS[format_name]
config.format = format
@@ -1089,6 +1183,9 @@
# Make sure the format is compatible with the resolver.
resolver.check_format(format)
+ if config.options.log_format_name:
+ format_name = config.options.log_format_name
+
stats.count_lines_parsed.increment()
if stats.count_lines_parsed.value <= config.options.skip:
continue
@@ -1108,7 +1205,7 @@
is_error=False,
is_redirect=False,
)
-
+
# Strip query string
if config.options.strip_query_string:
hit.path = hit.full_path.split(config.options.query_string_delimiter, 1)[0]
@@ -1118,14 +1215,17 @@
# Parse date _with_ timezone to get an UTC timestamp.
date_string = match.group('date')
try:
- tz = float(date_string[-5:])
- hit.date = datetime.datetime.strptime(date_string[:-6], '%d/%b/%Y:%H:%M:%S')
+ if 'iis' in format_name:
+ tz = 0
+ hit.date = datetime.datetime.strptime(date_string, IIS_DATE_FORMAT)
+ else:
+ tz = float(date_string[-5:])
+ hit.date = datetime.datetime.strptime(date_string[:-6], DATE_FORMAT)
except ValueError:
# Date format is incorrect, the line is probably badly formatted.
invalid_line(line)
continue
hit.date -= datetime.timedelta(hours=tz/100)
-
try:
hit.referrer = match.group('referrer')
except IndexError:
@@ -1144,6 +1244,9 @@
except ValueError:
# Not all lines have a length (e.g. 304 redirects)
hit.length = 0
+ except IndexError:
+ # IIS6 & 7.5 Default has no length
+ hit.length = 0
try:
hit.host = match.group('host')
except IndexError:
@@ -1157,7 +1260,6 @@
-
def main():
"""
Start the importing process.