Created
March 4, 2013 18:16
-
-
Save myano/5084229 to your computer and use it in GitHub Desktop.
This is the script that powers reqs.py on lmddgtfy.net. The script itself runs as a cron job every 20 minutes; the reqs.py page at http://lmddgtfy.net/reqs.py prints its output to the screen.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf8 -*- | |
from subprocess import Popen, PIPE | |
import datetime | |
import re | |
# Paths read and written by main().
_ACCESS_LOG = "/var/www/lmddgtfy.net/logs/access.log"
_REPORT_FILE = "/var/www/lmddgtfy.net/logs/reqs.txt"

# The report covers every day from this date up to and including today.
_FIRST_DAY = datetime.datetime(2012, 1, 12)

# This day's row is highlighted and annotated in the report.
_CLOUDFLARE_DAY = "21/Dec/2012"

# All patterns are byte patterns: grep's output is bytes, and byte patterns
# keep \s, \S and \d ASCII-only on both Python 2 and Python 3.
_BOT_RE = re.compile(br'"(\-|\S+)"\s".*(crawl|spider|bot).*"', re.I)
_OK_STATUS_RE = re.compile(br'(200|206)\s(\d+|\-)')
_LOCAL_RESOURCE_RE = re.compile(br'/(js|images|css)/', re.I)
_FAVICON_RE = re.compile(br'favicon', re.I)
# The dots are escaped on purpose: the unescaped form (.py|.png|.js) matched
# *any* character before "py"/"png"/"js", so e.g. "/?q=happy " was excluded.
_SCRIPT_RE = re.compile(
    br'(GET|HEAD|PUT|POST) /(reqs|logs|top10|\S+)(\.py|\.png|\.js)\s', re.I)

_QUERY_MARKERS = (b'"GET /?q=', b'"HEAD /?q=', b'"POST /?q=')
_ROBOTS_MARKERS = (b'"GET /robots.txt', b'"HEAD /robots.txt')

_PAGE_HEAD = """\
<!DOCTYPE html>
<head>
<title>Let Me DuckDuckGo That For You Logs</title>
<meta charset="utf-8">
<meta name="robots" content="noindex">
</head>
<body>
"""

_DAILY_TABLE_HEAD = """<tr>
<td>Date</td>
<td>New Formula</td>
<td># of RAW reqs</td>
<td>Difference</td>
<td>/?q='s</td>
<td>robots</td>
<td>comments</td>
</tr>\n"""


def _grep_day(day):
    """Return the access-log lines containing *day* as a list of byte strings.

    A single ``grep`` is run and waited for via ``communicate()``, so the
    child process is reaped and its pipe closed before returning. grep's exit
    status is not inspected; if grep prints nothing the result is an empty
    list.
    """
    proc = Popen(["grep", day, _ACCESS_LOG], stdout=PIPE)
    raw = proc.communicate()[0]
    return [line for line in raw.split(b"\n") if line]


def _is_counted_request(line):
    """Return True if *line* passes every filter of the "New Formula" count.

    A line is counted when it is not a bot/spider/crawler hit, has a 200 or
    206 status, and is not a request for a local resource directory, the
    favicon, or a .py/.png/.js file.
    """
    if _BOT_RE.search(line):
        return False
    if not _OK_STATUS_RE.search(line):
        return False
    if _LOCAL_RESOURCE_RE.search(line) or _FAVICON_RE.search(line):
        return False
    return not _SCRIPT_RE.search(line)


def _is_query(line):
    """Return True if *line* is a GET/HEAD/POST request for ``/?q=``."""
    return any(marker in line for marker in _QUERY_MARKERS)


def _is_robot(line):
    """Return True if *line* fetches robots.txt or matches the bot pattern."""
    if any(marker in line for marker in _ROBOTS_MARKERS):
        return True
    return _BOT_RE.search(line) is not None


def _cell(value):
    """Format *value* as one HTML table cell followed by a newline."""
    return "<td>%s</td>\n" % (value,)


def _day_row(day, counted, raw, queries, robots):
    """Return the HTML table row for one day of statistics."""
    if day == _CLOUDFLARE_DAY:
        label = "<b>" + day + "</b>"
        comment = "<td> <-- stopped using Cloudflare</td>"
    else:
        label = day
        comment = "<td></td>"
    return "".join([
        "\n<tr>\n",
        _cell(label),
        _cell(counted),
        _cell(raw),
        _cell(raw - counted),
        _cell(queries),
        _cell(robots),
        comment,
        "</tr>\n",
    ])


def main():
    """Build the per-day request statistics page and write it to disk.

    For every day from ``_FIRST_DAY`` through today (newest first) the access
    log is grepped once for that day's date string, and the matching lines
    are classified in Python: raw hits, hits surviving the "New Formula"
    filters, ``/?q=`` requests and robot hits. The result is an HTML page
    with one row per day plus a totals / daily-average table.

    The page is assembled in memory and written in one go, so the report
    file is only truncated once the new report is complete, and the file
    handle is closed even if the write fails.
    """
    max_days = (datetime.datetime.now() - _FIRST_DAY).days + 1
    today = datetime.date.today()
    days = [
        (today - datetime.timedelta(days=offset)).strftime("%d/%b/%Y")
        for offset in range(max_days)
    ]

    parts = [
        _PAGE_HEAD,
        "Maximum number of days: %s<br />\n<br />\n" % (max_days,),
        "<table width=\"1000\" border=\"1\">\n",
        _DAILY_TABLE_HEAD,
    ]

    total_new = 0
    total_raw = 0
    total_qs = 0
    for day in days:
        lines = _grep_day(day)
        raw = len(lines)
        counted = sum(1 for line in lines if _is_counted_request(line))
        queries = sum(1 for line in lines if _is_query(line))
        robots = sum(1 for line in lines if _is_robot(line))

        parts.append(_day_row(day, counted, raw, queries, robots))
        total_new += counted
        total_raw += raw
        total_qs += queries

    span = float(max_days)
    parts.extend([
        "</table>\n<br /><br />\n",
        "\n<table border=\"1\">\n",
        "<tr>\n",
        "<td>Desc</td>\n",
        "<td>NEW Formula</td>\n",
        "<td>RAW</td>\n",
        "<td>/?q's</td>",
        "</tr>\n\n",
        "<tr>\n",
        "<td>Totals</td>\n",
        _cell(total_new),
        _cell(total_raw),
        _cell(total_qs),
        "</tr>\n\n",
        "<tr>\n",
        "<td>Daily Average</td>\n",
        _cell(str(total_new / span)),
        _cell(str(total_raw / span)),
        _cell(str(total_qs / span)),
        "</tr>\n</table>\n",
        "\n<br />\n",
        # Timestamp is taken after the scan, i.e. when the report is finished.
        "Updated: %s" % (datetime.datetime.now().isoformat(),),
        "\n</body>",
        "\n</html>",
    ])

    with open(_REPORT_FILE, "w") as report:
        report.write("".join(parts))
# Run the report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment