Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Created August 31, 2013 19:34
Show Gist options
  • Save abelsonlive/6400144 to your computer and use it in GitHub Desktop.
Save abelsonlive/6400144 to your computer and use it in GitHub Desktop.
Get Wikipedia pageviews.
from thready import threaded
import requests
import gzip
from StringIO import StringIO
import re
from datetime import datetime
def url_to_date(url):
    """Return the timestamp embedded in a dump filename as 'YYYY-MM-DD HH:MM:SS'.

    Assumes the URL's final path component looks like
    'pagecounts-YYYYMMDD-HHMMSS.<ext>' — TODO confirm against the dump list.
    """
    filename = url.split("/")[-1]
    stem = filename.split(".")[0]
    date_part, time_part = stem.split("-")[1:3]
    parsed = datetime.strptime(date_part + time_part, "%Y%m%d%H%M%S")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")
def url_to_fp(url):
    """Derive a local output filename ('<YYYYMMDDHHMMSS>.tsv') from a dump URL."""
    stem = url.split("/")[-1].split(".")[0]
    timestamp = "".join(stem.split("-")[1:3])
    return timestamp + ".tsv"
def decode_gzip(gzipped_string):
    """Decompress a gzip-compressed string in memory and return its contents."""
    buf = StringIO(gzipped_string)
    unzipped = gzip.GzipFile(fileobj=buf)
    return unzipped.read()
def get_data(url):
    """Read one gzipped pagecount dump, keep the English-Wikipedia rows,
    and write them to a per-dump .tsv file.

    Each kept row is '<title>\t<count>\t<timestamp>'; the output filename
    comes from url_to_fp(url).
    """
    dt = url_to_date(url)
    # NOTE(review): despite the name, this reads a *local* file path; the
    # requests-based fetch was deliberately disabled:
    #   raw_file = requests.get(url).text
    # Open in binary mode — the payload is gzip data, not text — and use
    # `with` so the handle is always closed (the original leaked it).
    with open(url, "rb") as f:
        raw_file = f.read()
    tsv = decode_gzip(raw_file)
    data = []
    for line in tsv.split("\n"):
        # Pagecount rows start with a project code; "en " selects English Wikipedia.
        if re.match("en ", line):
            # Drop undecodable bytes rather than crash on malformed input.
            line = unicode(line, errors="ignore")
            # Fields are space-separated: project, title, count, bytes.
            fields = line.split(" ")
            row = "\t".join([fields[1], fields[2], dt])
            data.append(row)
    # Parenthesized form works under both Python 2 and 3.
    print("writing %d rows to file" % len(data))
    # `with` guarantees the output is flushed and closed even on error
    # (the original never closed it, risking truncated files).
    with open(url_to_fp(url), "w") as out:
        out.write("\n".join(data))
if __name__ == '__main__':
    # One dump path/URL per line. Skip blank lines — the original split on
    # "\n" and so handed get_data an empty string for the trailing newline.
    # `with` closes the handle (the original leaked it).
    with open('election-dumps.txt') as f:
        urls = [line.strip() for line in f if line.strip()]
    # Fan the downloads/parses out across worker threads (I/O-bound work).
    threaded(urls, get_data, num_threads=100, max_queue=1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment