Skip to content

Instantly share code, notes, and snippets.

@hamletbatista
Created July 24, 2019 22:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hamletbatista/b18d7129ea7ea4581ef43aa9cddf20ce to your computer and use it in GitHub Desktop.
Save hamletbatista/b18d7129ea7ea4581ef43aa9cddf20ce to your computer and use it in GitHub Desktop.
from urllib.parse import urlparse
from datetime import timezone
# Convert the raw access log to CSV, keeping only requests whose IP passes
# Googlebot verification.
#
# Relies on names defined outside this block: `p` (the pattern each log line
# is matched against; it must yield exactly five groups: ip, date, url,
# status code, user agent), `validatedips` (a mapping used as an
# ip -> verification-result cache) and `verify_googlebot` (presumably returns
# a truthy value for genuine Googlebot IPs -- confirm against its definition).
csvfile = "practicalecommerce.com-ssl_log-Jul-2019.csv"
logfile = "practicalecommerce.com-ssl_log-Jul-2019.log"

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" emit an extra blank line after every row.
# The output handle gets its own name so `csvfile` keeps holding the path.
with open(logfile) as f, open(csvfile, "w", newline="") as out:
    # create output CSV file
    csvwriter = csv.writer(out)
    # save headings
    csvwriter.writerow(["ip", "date", "url", "path", "file_extension", "query", "status_code", "ua"])

    for logline in f:
        m = re.match(p, logline)
        if not m:
            # line does not look like a request entry; skip it
            continue

        ip, date, url, status_code, ua = m.groups()

        # verify each IP only once and cache the result
        if ip not in validatedips:
            validatedips[ip] = verify_googlebot(ip)

        # exclude fake Googlebot requests
        if not validatedips[ip]:
            # print("Fake Googlebot: {ip}".format(ip=ip))
            continue

        # parse the log timestamp; the format string must match the log exactly
        dt = datetime.strptime(date, '%d/%b/%Y:%H:%M:%S %z')
        # convert to UTC. astimezone() shifts the clock by the parsed offset;
        # replace(tzinfo=...) would only relabel the local time as UTC and
        # produce wrong timestamps for any non-zero offset.
        dt = dt.astimezone(timezone.utc)

        # break url into path and query string
        parsed_url = urlparse(url)

        # Take the extension from the last path segment only, so a dot in a
        # directory name (e.g. "/v1.2/page") is not mistaken for an extension.
        last_segment = parsed_url.path.rsplit("/", 1)[-1]
        file_extension = ""
        if "." in last_segment:
            file_extension = last_segment.rsplit(".", 1)[-1]

        csvwriter.writerow([ip, dt.isoformat(), url, parsed_url.path, file_extension, parsed_url.query, status_code, ua])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment