Skip to content

Instantly share code, notes, and snippets.

@Rhomboid
Last active December 11, 2015 03:09
Show Gist options
  • Save Rhomboid/4535711 to your computer and use it in GitHub Desktop.
Save Rhomboid/4535711 to your computer and use it in GitHub Desktop.
MetaFilter yearly word frequency corpus combiner
import re
import csv
import sys
import glob
import collections
# Calendar year that corresponds to year index 0 in the combined table.
SMALLEST_YEAR = 1999


def year_from_filename(filename):
    """Return the zero-based year index encoded in a corpus file name.

    The name is expected to look like ``<prefix>--<YYYY>...`` where the
    prefix consists of word characters.  Leading directory components
    (separated by forward or back slashes) are ignored, so full paths work
    as well as bare file names.

    Returns ``YYYY - SMALLEST_YEAR`` as an int, or ``None`` when the name
    does not follow the expected pattern.
    """
    # Only the final path component carries the year; matching the whole
    # path would fail because the prefix pattern cannot span a separator.
    basename = re.split(r'[\\/]', filename)[-1]
    m = re.match(r'\w+--(\d{4})', basename)
    if m is None:
        return None
    return int(m.group(1)) - SMALLEST_YEAR
def read_file(filename, data):
    """Merge one yearly frequency table into ``data``.

    ``filename`` must encode its year in the form understood by
    ``year_from_filename``.  The first four lines of the file are skipped
    as headers; every following non-blank line is parsed as three
    tab-separated fields ``count``, ``ppm``, ``word``.

    ``data`` maps each word to a flat list with two slots per year.  This
    call overwrites the pair at ``[2 * year_index, 2 * year_index + 1]``
    with ``(float(ppm), int(count))``.

    Raises ``ValueError`` if no usable year can be derived from
    ``filename`` or if a data row cannot be parsed as three fields.
    """
    year_index = year_from_filename(filename)
    if year_index is None or year_index < 0:
        # A None index would fail later with an opaque TypeError, and a
        # negative one would make the slice assignment below insert extra
        # elements instead of overwriting the year's two slots.
        raise ValueError('cannot determine year column for %r' % (filename,))
    dest_col = slice(year_index * 2, year_index * 2 + 2)
    # On Python 3 the csv module needs a text-mode file opened with
    # newline=''; a binary file makes csv.reader reject the bytes it gets.
    # NOTE(review): input is assumed to be UTF-8 -- confirm against the corpus.
    with open(filename, newline='', encoding='utf-8') as infile:
        # Skip the four header lines; tolerate files shorter than that.
        for _ in range(4):
            next(infile, None)
        for row in csv.reader(infile, delimiter='\t'):
            if not row:
                # csv.reader yields [] for blank lines; they carry no data.
                continue
            count, ppm, word = row
            data[word][dest_col] = float(ppm), int(count)
def process_files(filelist, outfilename):
    """Combine per-year frequency tables into one tab-separated file.

    Every file in ``filelist`` is read with ``read_file`` into a table that
    holds, for each word, a ``PPM`` and a ``COUNT`` column per year starting
    at ``SMALLEST_YEAR``; years in which a word does not occur keep the
    default value 0.  The result is written to ``outfilename`` with a header
    row, ordered by the last column (the ``COUNT`` of the latest year)
    descending, ties broken by word in descending order.

    Progress messages are emitted through ``progress``.

    Raises ``ValueError`` if ``filelist`` is empty or contains a name from
    which no usable year can be derived.
    """
    if not filelist:
        raise ValueError('no input files to process')
    year_indexes = [year_from_filename(fn) for fn in filelist]
    unusable = [fn for fn, idx in zip(filelist, year_indexes)
                if idx is None or idx < 0]
    if unusable:
        raise ValueError(
            'cannot determine year column for: ' + ', '.join(unusable))

    num_years = max(year_indexes) + 1
    data = collections.defaultdict(lambda: [0] * (num_years * 2))

    progress('Reading: ')
    for filename in filelist:
        read_file(filename, data)
        progress('.')
    progress('\n')

    # One dot per this many words written (about num_years dots in total).
    # Clamp to 1 so a vocabulary smaller than num_years cannot make the
    # modulo below divide by zero.
    progress_amount = max(1, len(data) // num_years)

    # On Python 3 csv.writer needs a text-mode file opened with newline='';
    # the writer emits its own line terminators.
    with open(outfilename, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['WORD'] + ['{0}_{1}'.format(label, SMALLEST_YEAR + y)
                                    for y in range(num_years)
                                    for label in ('PPM', 'COUNT')])

        progress('Sorting: .')
        word_order = sorted(data, key=lambda word: (data[word][-1], word),
                            reverse=True)

        progress('\nWriting: ')
        for count, word in enumerate(word_order, 1):
            writer.writerow([word] + data[word])
            if count % progress_amount == 0:
                progress('.')
        progress('\n')
def progress(what):
    """Write ``what`` to standard error and flush it immediately.

    Progress output consists mostly of single dots without a trailing
    newline; flushing explicitly keeps them from sitting in a buffer until
    the next newline is written.
    """
    sys.stderr.write(what)
    sys.stderr.flush()
# Script entry point: gather the yearly tables from the current directory
# (both naming schemes) and combine them into one tab-separated file.
_input_files = glob.glob('freqtable--*.txt')
_input_files.extend(glob.glob('allsites--*.txt'))
process_files(_input_files, 'yearly-combined.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment