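# Crawl Wikimedia's hourly pagecounts-raw dumps (2008-2014), keep the
# English-Wikipedia lines, and bulk-load them into a local MongoDB.
# Python 2 script; assumes BeautifulSoup 4 and pymongo are installed and a
# mongod is listening on localhost:27017.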
import multiprocessing
import gzip
import re
import datetime
import itertools
import urllib2
from StringIO import StringIO
from bs4 import BeautifulSoup as BS
# skip 2007: the pagecounts-raw dumps only partially cover that year
Years = ['2008', '2009', '2010', '2011', '2012', '2013', '2014']
Months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
combos = list(itertools.product(Years, Months))
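# combos enumerates every (year, month) directory listing to crawl:
# ('2008', '01'), ('2008', '02'), ..., ('2014', '12') -- 84 pages in all.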
def log(msg):
    # timestamped progress messages
    print("{} {}".format(str(datetime.datetime.now()), msg))
def tryconv(x):
    # first whitespace-separated field of a dump line (the project code),
    # or None for blank or malformed lines
    try:
        return x.split()[0]
    except IndexError:
        return None
def removeNonAscii(s):
    # strip non-ASCII bytes out of the page title
    return "".join(filter(lambda x: ord(x) < 128, s))
def dictafy(x):
    # x = [year, month, day, hour, term, hits, bytes]; each dump line is
    # "project page_title view_count bytes_transferred", so the last field
    # is bytes served, not an hour -- the hour comes from the file name
    result = {}
    result['year'] = x[0]
    result['month'] = x[1]
    result['day'] = x[2]
    result['hour'] = x[3]
    result['term'] = removeNonAscii(x[4])
    result['hits'] = int(x[5])
    result['bytes'] = int(x[6])
    return result
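# For example (illustrative values), the line "en Main_Page 242332 4737756101"
# from pagecounts-20080101-000000.gz becomes
#   {'year': '2008', 'month': '01', 'day': '01', 'hour': '00',
#    'term': 'Main_Page', 'hits': 242332, 'bytes': 4737756101}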
def helper(x):
    # keep English-Wikipedia lines only; year/month/day/hour are module
    # globals set in main() before the worker pool forks
    if tryconv(x) == 'en':
        parts = x.split()
        if len(parts) == 4:
            return dictafy([year, month, day, hour] + parts[1:])
    return None
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # create or retrieve the wiki database
    db = client.wiki
    return db

db = get_db()
log('mongo started')
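# Optional (an assumption about how the data gets queried later, not part of
# the original gist): an index on term plus the date fields speeds up
# per-article lookups once the collection grows.
# db.wiki.create_index([('term', 1), ('year', 1), ('month', 1), ('day', 1)])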
def main():
    global year, month, day, hour  # shared with helper() via fork
    for Year, Month in combos:
        page = "http://dumps.wikimedia.org/other/pagecounts-raw/" + Year + "/" + Year + "-" + Month + "/"
        log('starting on ' + Year + ' ' + Month)
        request = urllib2.Request(page)
        response = urllib2.urlopen(request).read()
        log('page loaded.')
        soup = BS(response, 'html.parser')
        # pull the pagecounts-YYYYMMDD-HHMMSS.gz file names out of the listing
        links = [re.findall(r'"pagecounts.*"', str(x))[0][1:-1]
                 for x in soup.findAll('li')
                 if re.search(r'"pagecounts.*"', str(x))]
        for i in links:
            log("working on " + i)
            # file names look like pagecounts-20080101-000000.gz:
            # the middle chunk is YYYYMMDD, the last is HHMMSS
            stamp = i.split('-')
            year, month, day = stamp[1][0:4], stamp[1][4:6], stamp[1][6:8]
            hour = stamp[2][0:2]
            request = urllib2.Request(page + i)
            request.add_header('Accept-encoding', 'gzip')
            response = urllib2.urlopen(request)
            log(i + ' loaded')
            # the server sends gzip; decompress entirely in memory so no
            # temporary file is needed
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read().split('\n')
            log('starting multiprocessing')
            # parse the hourly file in parallel; workers inherit the
            # year/month/day/hour globals through fork (Unix only)
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            WIKI = filter(None, pool.map(helper, data))
            # save some memory
            del data
            pool.close()
            pool.join()
            log('updating the db')
            # bulk insert into the db; skip empty batches, which would
            # make pymongo raise
            if WIKI:
                db.wiki.insert(WIKI)
            del WIKI
            log('done updating')
if __name__ == "__main__":
    print 'this is going to take a long time'
    main()
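# Assumed usage once the load finishes (field names per dictafy above):
#   db.wiki.find({'term': 'Main_Page', 'year': '2008', 'month': '01'})
# returns one document per hourly file in which the page was viewed.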