While working with some IIS logs, I decided to start practicing my Python, so I put together some handy Python functions for working with IIS log files. On a 2.5 GHz machine with 3 GB of RAM running Windows XP, these functions take about 3 seconds to process a 180 MB text file. The code could be optimized further if you are dealing with larger files.
#!/usr/bin/env python
# An IIS log file can contain several different sets of log properties. Every time you change
# the columns being logged in IIS, it writes a new header row listing the new set of columns.
import re
import os
# First line IIS 6.0 writes at the top of each log section; files are split
# on this string to separate sections that may declare different #Fields.
MainLogDelimiter = "#Software: Microsoft Internet Information Services 6.0"
# Sample log whose column layout changes partway through the file.
TestFile = "C:\\Dan\\IIS-Log-Import\\Logs\\not-the-same.txt"
# Large (~180 MB) log used for timing runs.
BigTestFile = "C:\\Dan\\IIS-Log-Import\\Logs\\ex090914\\ex090914.log"
# Root directory that the example loop below scans for *.txt logs.
LogsDir = "C:\\Dan\\IIS-Log-Import\\Logs"
def SearchForFile( rootpath, searchfor, includepath = 0 ):
    """Recursively search rootpath for filenames matching a regular expression.

    rootpath    -- directory to start walking from.
    searchfor   -- regex matched against each bare filename,
                   e.g. r"\.txt$" to find *.txt files.
    includepath -- optional; when truthy each result is the full path,
                   when 0 (default) only the bare filename is returned.

    Returns a list of matching filenames (or full paths) suitable for
    iterating over.

    TODO: the glob module could replace this for simple patterns.
    """
    # Compile once instead of re-matching the pattern string per file.
    pattern = re.compile( searchfor )
    names = []
    for root, dirs, files in os.walk( rootpath ):
        for name in files:
            if pattern.search( name ):
                # os.path.join is portable; the original hard-coded "\\",
                # which only works on Windows.
                names.append( os.path.join( root, name ) if includepath else name )
    return names
def isSameLogProperties( FILE, delimiter = None ):
    """Return True if every log section in FILE declares the same #Fields columns.

    IIS starts a new section (with its own #Fields header) whenever the set
    of logged columns changes, so a file whose sections disagree mixes
    column layouts.

    FILE      -- an open, readable file object; it is rewound before reading.
    delimiter -- optional section separator; defaults to MainLogDelimiter.

    Returns True for files with zero or one section, or when all sections
    share an identical #Fields line; False on any mismatch or on a section
    that has no #Fields header at all (the original crashed on that case).
    """
    if delimiter is None:
        delimiter = MainLogDelimiter
    FILE.seek( 0, 0 )
    # Whatever precedes the first delimiter is not a section; drop it.
    sections = FILE.read().split( delimiter )[1:]
    fields_re = re.compile( r"^#Fields:\s([\w\-()\s]+)$", re.IGNORECASE | re.MULTILINE )
    first = None
    for section in sections:
        match = fields_re.search( section )
        if match is None:
            # No #Fields header: cannot confirm this section matches.
            return False
        if first is None:
            first = match.group(1)
        elif match.group(1) != first:
            return False
    return True
def getFirstColumn( FILE, delimiter = None ):
    """Return the column names from the first log section of FILE only.

    Ignores any later #Fields rows that IIS writes when the set of logged
    columns changes mid-file.

    FILE      -- an open, readable file object; it is rewound before reading.
    delimiter -- optional section separator; defaults to MainLogDelimiter.

    Returns a single-element list containing the space-separated column
    names (stripped), e.g. ["date time c-ip"].
    """
    if delimiter is None:
        delimiter = MainLogDelimiter
    FILE.seek( 0, 0 )
    # [1] is the first real section; [0] is whatever precedes the delimiter.
    first_section = FILE.read().split( delimiter )[1]
    match = re.search( r"^#Fields:\s([\w\-()\s]+)$", first_section,
                       re.IGNORECASE | re.MULTILINE )
    return [ match.group(1).strip() ]
def getAllColumns( FILE, delimiter = None ):
    """Return the #Fields column names from every section of FILE.

    FILE      -- an open, readable file object; it is rewound before reading.
    delimiter -- optional section separator; defaults to MainLogDelimiter.

    Returns a list with one stripped, space-separated column string per
    log section, in file order.
    """
    if delimiter is None:
        delimiter = MainLogDelimiter
    FILE.seek( 0, 0 )
    fields_re = re.compile( r"^#Fields:\s([\w\-()\s]+)$", re.IGNORECASE | re.MULTILINE )
    # Element 0 of the split precedes the first delimiter and is skipped.
    return [ fields_re.search( section ).group(1).strip()
             for section in FILE.read().split( delimiter )[1:] ]
# EXAMPLE:
# Loop through all the IIS log files in the directory and report whether
# each one keeps the same #Fields columns throughout.
# NOTE(review): in the original, the `for` line was commented out but its
# body was not, so `file` was an undefined name and the script crashed.
if __name__ == "__main__":
    for path in SearchForFile( LogsDir, r"\.txt$", 1 ):
        # `with` guarantees the handle is closed even if parsing fails.
        with open( path, "r" ) as log_file:
            if isSameLogProperties( log_file ):
                print( path, "the same" )
            else:
                print( path, "not the same" )