Created
August 31, 2013 19:34
-
-
Save abelsonlive/6400144 to your computer and use it in GitHub Desktop.
Get Wikipedia pageviews
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import io
import re
from datetime import datetime
from StringIO import StringIO

import requests
from thready import threaded
def url_to_date(url):
    """Return the timestamp embedded in *url* as "YYYY-MM-DD HH:MM:SS".

    The last "/"-separated component is cut at its first "." and split on
    "-"; the second and third pieces are concatenated and parsed with the
    format %Y%m%d%H%M%S (e.g. a name shaped like prefix-YYYYMMDD-HHMMSS.ext).
    """
    filename = url.rsplit("/", 1)[-1]
    stem = filename.partition(".")[0]
    stamp_pieces = stem.split("-")[1:3]
    parsed = datetime.strptime("".join(stamp_pieces), "%Y%m%d%H%M%S")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")
def url_to_fp(url):
    """Return the output file name for *url*: its timestamp digits + ".tsv".

    The digits are the second and third "-"-separated pieces of the last
    path component, taken before that component's first ".".
    """
    filename = url.rsplit("/", 1)[-1]
    stem = filename.partition(".")[0]
    stamp_pieces = stem.split("-")[1:3]
    return "".join(stamp_pieces) + ".tsv"
def decode_gzip(gzipped_string):
    """Decompress a complete gzip stream held in memory.

    Args:
        gzipped_string: The gzip-compressed content, as a byte string.

    Returns:
        The decompressed payload, as a byte string.

    Raises:
        IOError: If the content is not valid gzip data (on Python 3 this is
            OSError, of which IOError is an alias).
    """
    # io.BytesIO is the in-memory byte buffer on both Python 2.6+ and 3;
    # the `with` block closes the GzipFile even when read() raises.
    with gzip.GzipFile(fileobj=io.BytesIO(gzipped_string)) as gzipper:
        return gzipper.read()
def get_data(url):
    """Extract the "en " rows of one gzipped dump into a TSV file.

    *url* is opened as a local file path; nothing is downloaded. Its name
    supplies both the timestamp column (via url_to_date) and the output
    file name (via url_to_fp). Each decompressed line that starts with
    "en " contributes one row made of its second field, its third field and
    the timestamp, joined by tabs. Rows are separated by newlines, with no
    trailing newline.

    Args:
        url: Path of the gzip file to process; the name must carry the
            timestamp that url_to_date parses.

    Raises:
        ValueError: If no timestamp can be parsed from *url*.
        IOError: If the input cannot be read or is not valid gzip data.
    """
    dt = url_to_date(url)

    # NOTE: to fetch over HTTP instead of reading from disk, use
    # requests.get(url).content -- .text would decode the gzip bytes as
    # text and corrupt them.
    # Gzip data is binary, so read it in "rb" mode; `with` closes the handle.
    with open(url, "rb") as source:
        raw_file = source.read()
    tsv = decode_gzip(raw_file)

    data = []
    for line in tsv.split(b"\n"):
        if not line.startswith(b"en "):
            continue
        # Non-ASCII bytes are dropped, matching the original
        # unicode(line, errors="ignore") behaviour.
        fields = line.decode("ascii", "ignore").split(" ")
        if len(fields) < 3:
            # Malformed line (no third field): skip it rather than let one
            # bad record abort the whole file with an IndexError.
            continue
        data.append("\t".join([fields[1], fields[2], dt]))

    print("writing %d rows to file" % len(data))
    # `with` guarantees the rows are flushed and the file is closed.
    with open(url_to_fp(url), "w") as out:
        out.write("\n".join(data))
if __name__ == '__main__':
    # election-dumps.txt lists one dump path per line. Blank lines (such as
    # the empty entry a trailing newline produces) are dropped, because
    # get_data cannot parse a timestamp out of an empty name.
    with open('election-dumps.txt') as url_file:
        urls = [line.strip() for line in url_file if line.strip()]
    # Hand every path to get_data through thready's worker pool.
    threaded(urls, get_data, num_threads=100, max_queue=1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment