Created
December 18, 2010 00:52
-
-
Save bmander/745984 to your computer and use it in GitHub Desktop.
Create markov chains using the Google ntuple dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Build Markov chains from the Google Books 2-gram dataset.

Get a bunch of 2-gram files. Then split the 2-grams into separate
columns with spaceize(). Then, in the sqlite3 shell, set up a database like:

    create table digram (word1 text, word2 text, year integer, match_count integer, page_count integer, volume_count integer);
    .separator "\t"
    .import spaceized.csv digram

After that you're set up to use chain().
"""
import csv | |
import sqlite3 | |
from random import randint | |
import sys | |
def spaceize(filename, out_filename="spaceized.csv"):
    """Rewrite a 2-gram file so that every field is tab-separated.

    Every space in a line is replaced by a tab.  A line is written to the
    output only if it then contains exactly five tabs, i.e. six fields --
    the number of columns in the ``digram`` table described in the module
    docstring.  All other lines are silently dropped.

    Args:
        filename: Path of the raw 2-gram file to read.
        out_filename: Path of the file to write.  Defaults to
            "spaceized.csv" in the current working directory.
    """
    # The with-blocks guarantee both files are closed, even when reading
    # or writing fails part-way through.
    with open(filename) as fp:
        with open(out_filename, "w") as fpout:
            for i, line in enumerate(fp):
                # Progress report every 10,000 input lines.  The
                # parenthesised single-argument form prints the same text
                # on Python 2 and Python 3.
                if i % 10000 == 0:
                    print("\r%d" % i)
                spaceline = line.replace(" ", "\t")
                if spaceline.count("\t") != 5:
                    continue
                fpout.write(spaceline)
class Spinner:
    """Weighted random chooser, like a game spinner with unequal slices.

    Slices are registered with add(); spin() then returns one slice name,
    where a slice's chance of being picked is its size divided by the sum
    of all sizes.  Sizes are expected to be integers, because the draw is
    made with randint().
    """

    def __init__(self):
        # [name, size] pairs, in the order they were added.
        self.slices = []
        # Running total of the sizes of all slices.
        self.size = 0

    def add(self, name, size):
        """Register a slice called *name* with weight *size*."""
        self.slices.append([name, size])
        self.size += size

    def spin(self):
        """Return the name of a randomly chosen slice.

        Returns:
            A slice name, or None when there is nothing to choose from:
            either no slice has been added, or the total weight is not
            positive (randint(1, 0) would otherwise raise ValueError).
        """
        if not self.slices or self.size <= 0:
            return None
        # Draw a position in 1..total and walk the slices until the
        # cumulative weight reaches it.
        spinner_value = randint(1, self.size)
        running_total = 0
        for name, size in self.slices:
            running_total += size
            if running_total >= spinner_value:
                return name
        return None
def predict(c, word, year, year2=None):
    """Pick a random word that follows *word*, weighted by match count.

    Looks up every row of the ``digram`` table whose ``word1`` equals
    *word* and whose ``year`` lies between *year2* and *year* (inclusive),
    then draws one ``word2`` with probability proportional to its
    ``match_count``.

    Args:
        c: Database cursor used to run the query and iterate its rows.
        word: The word whose successor is wanted.
        year: Upper bound of the year range.
        year2: Lower bound of the year range.  Defaults to ten years
            before *year*.

    Returns:
        The chosen successor word, or None when no row matches.
    """
    if year2 is None:
        year2 = year - 10
    # Bound parameters instead of string formatting: a word containing a
    # quote character can no longer break (or inject into) the statement,
    # and a word that happens to match a column name is no longer at risk
    # of being read by SQLite as a double-quoted identifier.
    c.execute(
        "select word2, match_count from digram "
        "where word1=? and year BETWEEN ? AND ?",
        (word, year2, year),
    )
    spinner = Spinner()
    for word2, match_count in c:
        spinner.add(word2, match_count)
    return spinner.spin()
def chain(term, year, database="ngram.db"):
    """Print a random chain of words starting at *term*.

    Each word is written to standard output followed by a space, then
    replaced by a successor chosen by predict().  The chain ends when
    predict() returns None, so it can run for a long time if every word
    keeps having a successor.

    Args:
        term: The first word of the chain.
        year: Year passed to predict(); anything int() accepts.
        database: Path of the SQLite database to read.
    """
    # Convert once, before touching the database, so a bad year fails fast.
    year = int(year)
    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()
        try:
            while term is not None:
                # sys.stdout.write works on both Python 2 and Python 3,
                # unlike the "print term," statement; flushing after each
                # word makes it appear immediately.
                sys.stdout.write("%s " % term)
                sys.stdout.flush()
                term = predict(c, term, year)
            sys.stdout.write("\n")
        finally:
            c.close()
    finally:
        # Close the connection too, not just the cursor.
        conn.close()
if __name__ == '__main__':
    # Command-line entry point: <script> TERM YEAR
    from sys import argv

    print(argv)
    # Without both arguments, show a usage line instead of an IndexError
    # traceback.
    if len(argv) < 3:
        sys.stderr.write("usage: %s TERM YEAR\n" % argv[0])
        sys.exit(2)
    chain(argv[1], argv[2])
    # To prepare a raw 2-gram file for import into the database, run e.g.:
    # spaceize("googlebooks-eng-fiction-all-2gram-20090715-1.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment