Skip to content

Instantly share code, notes, and snippets.

@bmander
Created December 18, 2010 00:52
Show Gist options
  • Save bmander/745984 to your computer and use it in GitHub Desktop.
Save bmander/745984 to your computer and use it in GitHub Desktop.
Create Markov chains using the Google n-gram dataset
"""Get a bunch of 2gram files. Then split the 2grams with spaceize(). Then set up a database like
create table digram (word1 text, word2 text, year integer, match_count integer, page_count integer, volume_count integer);
.separator "\t"
.import spaceized.csv digram
Save the database as ngram.db, the default file that chain() opens, and then you're set up to use chain()."""
import csv
import sqlite3
from random import randint
import sys
def spaceize(filename, out_filename="spaceized.csv"):
    """Rewrite a 2-gram file so that both words get their own tab column.

    Every space in each input line is replaced with a tab, which splits the
    leading "word1 word2" pair into two fields. Only lines that end up with
    exactly five tabs (six fields) are written; every other line is silently
    dropped, so the output always has a uniform six-column layout.

    Args:
        filename: Path of the input file to read.
        out_filename: Path of the file to write. Defaults to
            "spaceized.csv" in the current working directory.
    """
    # Context managers close both files even when reading or writing fails.
    with open(filename) as fp:
        with open(out_filename, "w") as fpout:
            for i, line in enumerate(fp):
                # Progress indicator: echo the line index every 10000 lines.
                if i % 10000 == 0:
                    print("\r%d" % i)
                spaceline = line.replace(" ", "\t")
                # Skip lines that would not map onto exactly six columns.
                if spaceline.count("\t") != 5:
                    continue
                fpout.write(spaceline)
class Spinner:
    """Weighted random chooser, like a prize wheel with unequal slices.

    Attributes:
        slices: list of [name, size] pairs, in the order they were added.
        size: running total of the sizes of all slices added so far.

    Sizes are expected to be non-negative integers, because the total is
    passed to random.randint().
    """

    def __init__(self):
        self.slices = []
        self.size = 0

    def add(self, name, size):
        """Add a slice called *name* whose weight is *size*."""
        self.slices.append([name, size])
        self.size += size

    def spin(self):
        """Return a slice name chosen with probability size / total size.

        Returns None when there is nothing to choose from: either no slice
        has been added, or the combined size of all slices is less than 1.
        """
        # randint(1, 0) raises ValueError, so a wheel whose slices all have
        # size 0 has to be treated like an empty wheel.
        if not self.slices or self.size < 1:
            return None
        spinner_value = randint(1, self.size)
        running_total = 0
        # Walk the cumulative sizes until the drawn value falls in a slice.
        for name, size in self.slices:
            running_total += size
            if running_total >= spinner_value:
                return name
        return None
def predict(c, word, year, year2=None):
    """Pick a random successor of *word*, weighted by its match counts.

    Args:
        c: Database cursor used to query the ``digram`` table.
        word: First word of the digrams to look up.
        year: Last year (inclusive) of the window to sample from.
        year2: First year (inclusive) of the window; defaults to year - 10.

    Returns:
        A word2 value drawn with probability proportional to match_count,
        or None when no matching digram row exists in the window.
    """
    if year2 is None:
        year2 = year - 10
    # Bind the values as parameters instead of formatting them into the SQL:
    # a word containing a double quote used to break the statement, and a
    # double-quoted word such as "year" could be read by SQLite as a column
    # name rather than as a string.
    c.execute(
        "select word2, match_count from digram "
        "where word1=? and year BETWEEN ? AND ?",
        (word, int(year2), int(year)),
    )
    spinner = Spinner()
    # A word2 that occurs in several years contributes one slice per row,
    # so its weights simply add up.
    for word2, match_count in c:
        spinner.add(word2, match_count)
    return spinner.spin()
def chain(term, year, database="ngram.db"):
    """Print a chain of words that starts at *term*.

    Each word is written to standard output as soon as it is chosen, with
    single spaces between words, and its successor is drawn with predict().
    The chain ends, followed by a newline, when predict() returns None;
    until then the loop keeps running.

    Args:
        term: The word to start from.
        year: Year handed to predict(); anything int() accepts, so a
            command-line string works.
        database: Path of the SQLite database file to open.
    """
    # Convert once, before opening the database, rather than on every step.
    year = int(year)
    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()
        try:
            separator = ""
            while term is not None:
                sys.stdout.write("%s%s" % (separator, term))
                # Flush so each word shows up immediately.
                sys.stdout.flush()
                separator = " "
                term = predict(c, term, year)
            # Terminate the line only if at least one word was printed.
            if separator:
                sys.stdout.write("\n")
        finally:
            c.close()
    finally:
        # Release the connection even on errors or Ctrl-C; previously only
        # the cursor was closed, and only on normal termination.
        conn.close()
if __name__ == '__main__':
    # Command-line entry point: <script> TERM YEAR
    from sys import argv
    print(argv)
    # Fail with a usage message instead of an IndexError traceback when
    # the start term or the year is missing.
    if len(argv) < 3:
        sys.stderr.write("usage: %s TERM YEAR\n" % argv[0])
        sys.exit(2)
    chain(argv[1], argv[2])
    # One-off preprocessing step; uncomment to build spaceized.csv:
    #spaceize("googlebooks-eng-fiction-all-2gram-20090715-1.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment