Skip to content

Instantly share code, notes, and snippets.

@nikolajbaer
Created April 9, 2016 23:45
Show Gist options
  • Save nikolajbaer/8ed567371851b9f4dd66bc277976a38c to your computer and use it in GitHub Desktop.
Save nikolajbaer/8ed567371851b9f4dd66bc277976a38c to your computer and use it in GitHub Desktop.
import re
from operator import itemgetter
from collections import Counter
def clean_data(txt):
    """Split raw text into normalized, lowercased sentences.

    The text is lowercased, the trailing period is removed from the
    honorifics "mr.", "mrs." and "ms." (so they do not end a sentence),
    and the result is cut into sentences at each ``.``, ``!`` or ``?``.
    Newlines and carriage returns become spaces, and every character
    other than ``a``-``z`` and space is then removed.

    Args:
        txt: The raw text to process.

    Returns:
        A list of sentence strings containing only lowercase letters and
        spaces. Leading whitespace that followed the previous terminator
        is kept. Any trailing text that is not followed by ``.``, ``!``
        or ``?`` is discarded.
    """
    txt = txt.lower()
    # Only strip the period when the honorific is a whole word; a plain
    # str.replace('ms.', 'ms') would also hit "items." or "forms." and
    # silently merge two sentences into one.
    txt = re.sub(r"\b(mrs|mr|ms)\.", r"\1", txt)
    sentences = re.findall(r"[^!\.\?]+[!\.\?]", txt)
    sentences = [s.replace('\n', ' ').replace('\r', ' ') for s in sentences]
    return [re.sub(r'[^a-z ]', '', s) for s in sentences]
def build_db(rows):
    """Count, for every word, which words directly follow it.

    Args:
        rows: An iterable of strings; each one is split on whitespace
            into words.

    Returns:
        A dict mapping each word seen to a dict of
        ``{following_word: occurrence_count}``. A word that only ever
        appears last in a row maps to an empty dict.
    """
    table = {}
    for sentence in rows:
        tokens = sentence.split()
        # Walk adjacent (word, successor) pairs within the sentence.
        for current, following in zip(tokens, tokens[1:]):
            successors = table.setdefault(current, {})
            successors[following] = successors.get(following, 0) + 1
        # The final word has no successor but must still be a known key.
        if tokens:
            table.setdefault(tokens[-1], {})
    return table
if __name__ == "__main__":
    # Interactive loop: read a word from stdin and print up to three of the
    # words that most often followed it in "test.txt".
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    # Use a context manager so the corpus file is closed after reading.
    with open("test.txt") as corpus:
        rows = clean_data(corpus.read())
    db = build_db(rows)

    while True:
        print("Word:")
        try:
            word = read_line().lower()
        except EOFError:
            # End of input (e.g. Ctrl-D or a closed pipe): stop prompting.
            break
        if word not in db:
            print("(word not found)")
            # Skip the lookup below; db[word] would raise KeyError here.
            continue
        c = Counter(db[word])
        print(' '.join(follower for follower, _ in c.most_common(3)))
@nikolajbaer
Copy link
Author

A foolish little hack to probabilistically predict the next word based on the word you put in. It came about as a result of a conversation at the pythonSD Saturday meetup.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment