import multiprocessing
from StringIO import StringIO
import gzip
import csv
from random import shuffle
import numpy as np
import json
from time import sleep
import pymongo
import datetime
from bs4 import BeautifulSoup as BS
import re
import urllib2
import itertools
#ignore 2007
Years=['2008','2009','2010','2011','2012','2013','2014']
Months=['01','02','03','04','05','06','07','08','09','10','11','12']
combos=list(itertools.product(Years,Months))
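# each (Year, Month) pair above corresponds to one monthly directory of
# pagecounts dumps on dumps.wikimedia.org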
def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
def tryconv(x):
    # return the first whitespace-separated field (the project code), or None
    try:
        return x.split()[0]
    except (AttributeError, IndexError):
        return None
def removeNonAscii(s):
    # drop non-ASCII characters so the term is safe to store
    return "".join(filter(lambda x: ord(x) < 128, s))
def dictafy(x):
    result = {}
    result['year'] = x[0]
    result['month'] = x[1]
    result['day'] = x[2]
    result['term'] = removeNonAscii(x[3])
    result['hits'] = x[4]
    result['hour'] = x[5]
    return result
def helper(x):
    # year, month and day are module-level globals set in main() before each
    # worker pool is created, so fork-based workers inherit them
    if tryconv(x) == 'en':
        return dictafy([year, month, day] + x.split()[1:])
    return None
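# For illustration only: assuming a raw pagecounts line of the form
# "project page_title view_count bytes" (e.g. a made-up line
# "en Main_Page 42 123456"), and with year='2008', month='01', day='01'
# already set by main(), helper() would return
#   {'year': '2008', 'month': '01', 'day': '01',
#    'term': 'Main_Page', 'hits': '42', 'hour': '123456'}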
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # create or retrieve the wiki database
    db = client.wiki
    return db
db = get_db()
log('mongo started')
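# module-level connection: every processed dump file is bulk-inserted into
# the 'wiki' collection of the 'wiki' database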
def main():
    # helper() reads these module-level globals in the forked worker processes,
    # so declare them global here and set them before each pool is created
    global year, month, day
    for Year, Month in combos:
        page = "http://dumps.wikimedia.org/other/pagecounts-raw/" + Year + "/" + Year + "-" + Month + "/"
        log('starting on ' + Year + ' ' + Month)
        request = urllib2.Request(page)
        response = urllib2.urlopen(request).read()
        log('page loaded.')
        soup = BS(response)
        # pull the pagecounts-*.gz filenames out of the directory listing
        links = [re.findall(r'"pagecounts.*"', str(x))[0][1:-1]
                 for x in soup.findAll('li') if re.search(r'"pagecounts.*"', str(x))]
        for i in links:
            log("working on " + i)
            # filenames look like pagecounts-YYYYMMDD-HHMMSS.gz
            date = re.findall(r'-.*-', i)[0][1:-1]
            year = date[0:4]
            month = date[4:6]
            day = date[6:]
            request = urllib2.Request(page + i)
            request.add_header('Accept-encoding', 'gzip')
            response = urllib2.urlopen(request)
            log(i + ' loaded')
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            # decompress the whole file in memory, one pagecount record per line
            data = f.read().split('\n')
            log('starting multiprocessing')
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            WIKI = filter(None, pool.map(helper, data))
            # save some memory
            del(data)
            pool.close()
            pool.join()
            log('updating the db')
            # bulk insert into db
            if WIKI:
                db.wiki.insert(WIKI)
            del(WIKI)
            log('done updating')


if __name__ == "__main__":
    print('this is going to take a long time')
    main()
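# Illustrative only: once populated, the collection can be queried with pymongo, e.g.
#   db.wiki.find_one({'term': 'Main_Page', 'year': '2008'})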