Skip to content

Instantly share code, notes, and snippets.

@myano
Created March 4, 2013 18:16
Show Gist options
  • Save myano/5084229 to your computer and use it in GitHub Desktop.
Save myano/5084229 to your computer and use it in GitHub Desktop.
This is the script that powers reqs.py on lmddgtfy.net. The script itself runs as a cron job every 20 minutes; the reqs.py at http://lmddgtfy.net/reqs.py prints this script's output to the screen.
#!/usr/bin/env python
# -*- coding: utf8 -*-
from subprocess import Popen, PIPE
import datetime
import re
# Default locations of the web server log that is scanned and of the report
# that is generated from it.
_ACCESS_LOG = "/var/www/lmddgtfy.net/logs/access.log"
_REPORT_PATH = "/var/www/lmddgtfy.net/logs/reqs.txt"

# Oldest day included in the report.
_FIRST_DAY = datetime.date(2012, 1, 12)

# Day (in access-log date format) that is highlighted in the per-day table.
_CLOUDFLARE_CUTOVER = "21/Dec/2012"

# grep invocations applied, in order, to one day's raw log lines to obtain the
# "New Formula" count.  The patterns are kept exactly as they were so that the
# reported numbers stay comparable with earlier runs.
_FILTER_STAGES = (
    # Filter out bots/spiders/crawlers (two adjacent quoted fields, the second
    # one mentioning crawl/spider/bot -- presumably referrer and user agent).
    ["-iv", "-P", r'"(\-|\S+)"\s".*(crawl|spider|bot).*"'],
    # Keep only responses with a 200 or 206 status code.
    ["-i", "-P", r"(200|206)\s(\d+|\-)"],
    # Exclude calls to local resources.
    ["-iv", "-P", "/(js|images|css)/"],
    # Exclude the favicon lookup.
    ["-iv", "favicon"],
    # Exclude requests for .py/.png/.js paths.
    # NOTE(review): the dots are unescaped, so they match any character, and
    # `\S+` makes the named alternatives redundant; left untouched on purpose.
    ["-iv", "-P", r"(GET|HEAD|PUT|POST) /(reqs|logs|top10|\S+)(.py|.png|.js)\s"],
)

# Same bot pattern as the first filter stage, used for the "robots" column.
# Log lines are handled as bytes so the script behaves the same on Python 2
# and Python 3 and never has to guess the log's encoding.
_RE_BOTS = re.compile(br'(?i)"(\-|\S+)"\s".*(crawl|spider|bot).*"')
_QUERY_MARKERS = (b'"GET /?q=', b'"HEAD /?q=', b'"POST /?q=')
_ROBOTS_MARKERS = (b'"GET /robots.txt', b'"HEAD /robots.txt')

_HTML_HEAD = """\
<!DOCTYPE html>
<head>
<title>Let Me DuckDuckGo That For You Logs</title>
<meta charset="utf-8">
<meta name="robots" content="noindex">
</head>
<body>
"""

_DAY_TABLE_HEAD = """<tr>
<td>Date</td>
<td>New Formula</td>
<td># of RAW reqs</td>
<td>Difference</td>
<td>/?q='s</td>
<td>robots</td>
<td>comments</td>
</tr>\n"""


def _grep(args, data=None):
    """Run ``grep`` with *args* and return everything it printed, as bytes.

    *data*, when given, is fed to grep on stdin.  ``communicate()`` waits for
    the child, so no zombie processes pile up over the hundreds of calls made
    per report.  The exit status is deliberately ignored: grep exits with 1
    when nothing matched, which is a normal outcome here.
    """
    stdin = PIPE if data is not None else None
    proc = Popen(["grep"] + list(args), stdin=stdin, stdout=PIPE)
    out, _ = proc.communicate(data)
    return out


def _nonempty_lines(blob):
    """Split grep output into lines, dropping the empty trailing piece."""
    return [line for line in blob.split(b"\n") if line]


def _day_stats(day, access_log):
    """Return ``(filtered, raw, qs, robots)`` request counts for one day.

    *day* is a date string in access-log format (``%d/%b/%Y``).  ``raw`` is the
    number of log lines mentioning that date, ``filtered`` is how many of them
    survive every stage in ``_FILTER_STAGES``, ``qs`` counts ``/?q=`` requests
    and ``robots`` counts robots.txt requests plus lines matching the bot
    pattern.
    """
    # The log is scanned once per day; the result is reused for both the raw
    # counters and the filter pipeline.
    raw = _grep([day, access_log])

    filtered = raw
    for stage in _FILTER_STAGES:
        if not filtered:
            # Nothing left to filter; skip spawning further processes.
            break
        filtered = _grep(stage, filtered)

    raw_lines = _nonempty_lines(raw)
    qs = sum(
        1 for line in raw_lines
        if any(marker in line for marker in _QUERY_MARKERS)
    )
    robots = sum(
        1 for line in raw_lines
        if any(marker in line for marker in _ROBOTS_MARKERS)
        or _RE_BOTS.search(line) is not None
    )
    return len(_nonempty_lines(filtered)), len(raw_lines), qs, robots


def main(access_log=_ACCESS_LOG, report_path=_REPORT_PATH):
    """Generate the HTML request-statistics report from the access log.

    For every day from today back to 12 Jan 2012 the access log is searched
    for that day's entries and one table row is produced; a second table holds
    the totals and the daily averages.

    Args:
        access_log: Path of the web server access log to analyse.
        report_path: Path of the file the HTML report is written to.

    Returns:
        None.  The report is written to *report_path*.
    """
    # Sample the date once so the day count and the list of days cannot
    # disagree when the run straddles midnight.
    today = datetime.date.today()
    max_days = (today - _FIRST_DAY).days + 1
    days = [
        (today - datetime.timedelta(days=back)).strftime("%d/%b/%Y")
        for back in range(max_days)
    ]

    total_new = 0
    total_raw = 0
    total_qs = 0

    # The report is assembled in memory and written in one go at the end, so
    # the previous report stays readable while the (slow) log scan runs.
    parts = [
        _HTML_HEAD,
        "Maximum number of days: %s<br />\n<br />\n" % (str(max_days)),
        "<table width=\"1000\" border=\"1\">\n",
        _DAY_TABLE_HEAD,
    ]

    for day in days:
        length, raw_count, qs, robots = _day_stats(day, access_log)
        highlighted = day == _CLOUDFLARE_CUTOVER

        parts.append("\n<tr>\n")
        parts.append("<td>%s</td>\n" % ("<b>" + day + "</b>" if highlighted else day))
        parts.append("<td>%s</td>\n" % (length))
        parts.append("<td>%s</td>\n" % (str(raw_count)))
        parts.append("<td>%s</td>\n" % (str(raw_count - length)))
        parts.append("<td>%s</td>\n" % (str(qs)))
        parts.append("<td>%s</td>\n" % (str(robots)))
        if highlighted:
            parts.append("<td> <-- stopped using Cloudflare</td>")
        else:
            parts.append("<td></td>")
        parts.append("</tr>\n")

        total_new += length
        total_raw += raw_count
        total_qs += qs

    parts.extend([
        "</table>\n<br /><br />\n",
        "\n<table border=\"1\">\n",
        "<tr>\n",
        "<td>Desc</td>\n",
        "<td>NEW Formula</td>\n",
        "<td>RAW</td>\n",
        "<td>/?q's</td>",
        "</tr>\n\n",
        "<tr>\n",
        "<td>Totals</td>\n",
        "<td>%s</td>\n" % (total_new),
        "<td>%s</td>\n" % (total_raw),
        "<td>%s</td>\n" % (total_qs),
        "</tr>\n\n",
        "<tr>\n",
        "<td>Daily Average</td>\n",
        "<td>%s</td>\n" % (str(total_new / float(max_days))),
        "<td>%s</td>\n" % (str(total_raw / float(max_days))),
        "<td>%s</td>\n" % (str(total_qs / float(max_days))),
        "</tr>\n</table>\n",
        "\n<br />\n",
        "Updated: %s" % (datetime.datetime.now().isoformat()),
        "\n</body>",
        "\n</html>",
    ])

    # `with` guarantees the handle is closed even if the write fails.
    with open(report_path, "w") as f:
        f.write("".join(parts))
if __name__ == "__main__":
    # Build the report only when executed as a script, not when imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment