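# Crawl Wikimedia's hourly pagecounts-raw dumps (2008-2014), keep the
# English-Wikipedia lines, and bulk-load them into a local MongoDB.
# Python 2 script; assumes BeautifulSoup 4 and pymongo are installed and a
# mongod is listening on localhost:27017.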
import multiprocessing
import gzip
import re
import datetime
import itertools
import urllib2
from StringIO import StringIO
from bs4 import BeautifulSoup as BS
# skip 2007: the pagecounts-raw dumps only partially cover that year
Years = ['2008', '2009', '2010', '2011', '2012', '2013', '2014']
Months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
combos = list(itertools.product(Years, Months))
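# combos enumerates every (year, month) directory listing to crawl:
# ('2008', '01'), ('2008', '02'), ..., ('2014', '12') -- 84 pages in all.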
def log(msg):
    # timestamped progress messages
    print("{} {}".format(str(datetime.datetime.now()), msg))
def tryconv(x):
    # first whitespace-separated field of a dump line (the project code),
    # or None for blank or malformed lines
    try:
        return x.split()[0]
    except IndexError:
        return None
def removeNonAscii(s):
    # strip non-ASCII bytes out of the page title
    return "".join(filter(lambda x: ord(x) < 128, s))
def dictafy(x):
    # x = [year, month, day, hour, term, hits, bytes]; each dump line is
    # "project page_title view_count bytes_transferred", so the last field
    # is bytes served, not an hour -- the hour comes from the file name
    result = {}
    result['year'] = x[0]
    result['month'] = x[1]
    result['day'] = x[2]
    result['hour'] = x[3]
    result['term'] = removeNonAscii(x[4])
    result['hits'] = int(x[5])
    result['bytes'] = int(x[6])
    return result
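# For example (illustrative values), the line "en Main_Page 242332 4737756101"
# from pagecounts-20080101-000000.gz becomes
#   {'year': '2008', 'month': '01', 'day': '01', 'hour': '00',
#    'term': 'Main_Page', 'hits': 242332, 'bytes': 4737756101}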
def helper(x):
    # keep English-Wikipedia lines only; year/month/day/hour are module
    # globals set in main() before the worker pool forks
    if tryconv(x) == 'en':
        parts = x.split()
        if len(parts) == 4:
            return dictafy([year, month, day, hour] + parts[1:])
    return None
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # create or retrieve the wiki database
    db = client.wiki
    return db

db = get_db()
log('mongo started')
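# Optional (an assumption about how the data gets queried later, not part of
# the original gist): an index on term plus the date fields speeds up
# per-article lookups once the collection grows.
# db.wiki.create_index([('term', 1), ('year', 1), ('month', 1), ('day', 1)])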
def main():
    global year, month, day, hour  # shared with helper() via fork
    for Year, Month in combos:
        page = "http://dumps.wikimedia.org/other/pagecounts-raw/" + Year + "/" + Year + "-" + Month + "/"
        log('starting on ' + Year + ' ' + Month)
        request = urllib2.Request(page)
        response = urllib2.urlopen(request).read()
        log('page loaded.')
        soup = BS(response, 'html.parser')
        # pull the pagecounts-YYYYMMDD-HHMMSS.gz file names out of the listing
        links = [re.findall(r'"pagecounts.*"', str(x))[0][1:-1]
                 for x in soup.findAll('li')
                 if re.search(r'"pagecounts.*"', str(x))]
        for i in links:
            log("working on " + i)
            # file names look like pagecounts-20080101-000000.gz:
            # the middle chunk is YYYYMMDD, the last is HHMMSS
            stamp = i.split('-')
            year, month, day = stamp[1][0:4], stamp[1][4:6], stamp[1][6:8]
            hour = stamp[2][0:2]
            request = urllib2.Request(page + i)
            request.add_header('Accept-encoding', 'gzip')
            response = urllib2.urlopen(request)
            log(i + ' loaded')
            # the server sends gzip; decompress entirely in memory so no
            # temporary file is needed
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read().split('\n')
            log('starting multiprocessing')
            # parse the hourly file in parallel; workers inherit the
            # year/month/day/hour globals through fork (Unix only)
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            WIKI = filter(None, pool.map(helper, data))
            # save some memory
            del data
            pool.close()
            pool.join()
            log('updating the db')
            # bulk insert into the db; skip empty batches, which would
            # make pymongo raise
            if WIKI:
                db.wiki.insert(WIKI)
            del WIKI
            log('done updating')
if __name__ == "__main__":
    print 'this is going to take a long time'
    main()
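# Assumed usage once the load finishes (field names per dictafy above):
#   db.wiki.find({'term': 'Main_Page', 'year': '2008', 'month': '01'})
# returns one document per hourly file in which the page was viewed.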