# Forked from KayneWest/WikiMongoTrends.py
import multiprocessing
from StringIO import StringIO
import gzip
import csv
from random import shuffle
import numpy as np
import json
from time import sleep
import pymongo
import datetime
from bs4 import BeautifulSoup as BS
import re
import urllib2
import itertools
#ignore 2007
Years=['2008','2009','2010','2011','2012','2013','2014']
Months=['01','02','03','04','05','06','07','08','09','10','11','12']
combos=list(itertools.product(Years,Months))
def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
def tryconv(x):
    # first whitespace-separated field of a pagecount line (the project code)
    try:
        return x.split()[0]
    except:
        pass
def removeNonAscii(s):
    return "".join(filter(lambda x: ord(x) < 128, s))
def dictafy(x):
    # x is [year, month, day, term, hits, hour]
    result = {}
    result['year'] = x[0]
    result['month'] = x[1]
    result['day'] = x[2]
    result['term'] = removeNonAscii(x[3])
    result['hits'] = x[4]
    result['hour'] = x[5]
    return result
def helper(x):
    # keep only English-Wikipedia ('en') lines; year, month and day are
    # module-level globals set in main() before each worker pool is created
    if tryconv(x) == 'en':
        return dictafy([year] + [month] + [day] + x.split()[1:])
    else:
        pass
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # create or retrieve the wiki database
    db = client.wiki
    return db
db = get_db()
log('mongo started')
def main():
    # helper() reads year, month and day from module scope, so declare them
    # global here; each worker pool is created after they are set below
    global year, month, day
    for Year, Month in combos:
        page = "http://dumps.wikimedia.org/other/pagecounts-raw/" + Year + "/" + Year + "-" + Month + "/"
        log('starting on ' + Year + ' ' + Month)
        request = urllib2.Request(page)
        response = urllib2.urlopen(request).read()
        log('page loaded.')
        soup = BS(response)
        links = [re.findall(r'"pagecounts.*"', str(x))[0][1:-1] for x in soup.findAll('li') if re.search(r'"pagecounts.*"', str(x))]
        for i in links:
            log("working on " + i)
            # filenames look like pagecounts-YYYYMMDD-HHMMSS.gz
            year = re.findall(r'-.*-', i)[0][1:-1][0:4]
            month = re.findall(r'-.*-', i)[0][1:-1][4:6]
            day = re.findall(r'-.*-', i)[0][1:-1][6:]
            request = urllib2.Request(page + i)
            request.add_header('Accept-encoding', 'gzip')
            response = urllib2.urlopen(request)
            log(i + ' loaded')
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            # the whole dump file is decompressed in memory here
            data = f.read().split('\n')
            log('starting multiprocessing')
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            WIKI = filter(None, pool.map(helper, data))
            # save some memory
            del(data)
            pool.close()
            log('updating the db')
            # bulk insert into db
            db.wiki.insert(WIKI)
            del(WIKI)
            log('done updating')
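
# Illustrative only, not part of the original script: a minimal sketch of how
# the stored documents might be queried once main() has run. The default term
# 'Main_Page' and the int() cast on the string-valued 'hits' field are
# assumptions for the example.
def example_monthly_hits(db, term='Main_Page', yr='2014', mo='01'):
    # sum the per-file hit counts recorded for one term in one month
    total = 0
    for doc in db.wiki.find({'term': term, 'year': yr, 'month': mo}):
        total += int(doc['hits'])
    return total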
if __name__ == "__main__":
    print 'this is going to take a long time'
    main()