@shohei
Created April 12, 2012 10:38
n-gram: word-frequency counting scripts (unigram, 2-gram, 3-gram) plus the eparser Parser class they use
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# unigram version: count word frequencies in every file under ./result/
import re
from eparser import Parser
from collections import defaultdict
import glob

p = Parser()
files = glob.glob('./result/*')
print files  # debug: list the input files
for i in xrange(len(files)):
    # input files are named ./result/0.txt, ./result/1.txt, ...
    text = open('./result/' + str(i) + '.txt').readlines()
    text = text[2:]                      # skip the two header lines
    text = "\n".join(text)
    text = re.sub('\n', ' ', text)       # flatten newlines to spaces
    text = re.sub('"', ' ', text)
    text = re.sub(r'\.', '', text)       # drop periods
    text = re.sub(',', ' ', text)
    text = p.clean(text)
    splts = p.tokenise(text, stem=True)  # tokenize and stem
    splts = p.removeStopwords(splts)
    splts = [splt.lower() for splt in splts]
    d = defaultdict(int)
    for splt in splts:
        d[splt] += 1                     # count each token
    # sort by frequency, most common first
    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
    output = open('./output/' + str(i) + 'output.txt', 'w')
    for k, v in x:
        output.write('%s\t%s\n' % (str(k), v))
    output.close()
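A more idiomatic alternative to the defaultdict-plus-sorted pattern above is collections.Counter, available from Python 2.7. A minimal sketch, not part of the original gist:

from collections import Counter

# Counter counts hashable items; most_common() returns (item, count)
# pairs sorted by descending frequency, replacing the manual sort above
counts = Counter(['to', 'be', 'or', 'not', 'to', 'be'])
for token, freq in counts.most_common():
    print '%s\t%s' % (token, freq)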
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 2-gram version: count adjacent word pairs in each file under ./result/
import re
from eparser import Parser
from collections import defaultdict

p = Parser()
for i in xrange(200):                    # assumes input files 0.txt .. 199.txt
    text = open('./result/' + str(i) + '.txt').readlines()
    text = text[2:]                      # skip the two header lines
    text = "\n".join(text)
    text = re.sub('\n', ' ', text)
    text = re.sub('"', ' ', text)
    text = re.sub(r'\.', '', text)
    text = re.sub(',', ' ', text)
    text = p.clean(text)
    splts = p.tokenise(text, stem=True)
    splts = p.removeStopwords(splts)
    splts = [splt.lower() for splt in splts]
    # pair every token with its successor; the index is j so it does
    # not shadow the file index i used in the output filename below
    nsplts = []
    for j in xrange(len(splts) - 1):
        nsplts += [(splts[j], splts[j + 1])]
    splts = nsplts
    d = defaultdict(int)
    for splt in splts:
        d[splt] += 1                     # count each bigram
    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
    output = open('./output/2-gram/' + str(i) + 'output.txt', 'w')
    for k, v in x:
        output.write('%s\t%s\n' % ("\t".join(k), v))
    output.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 3-gram version: count adjacent word triples in a single file
import re
from collections import defaultdict

text = open('obama.txt').read()
text = re.sub('\n', ' ', text)
text = re.sub('"', ' ', text)
splts = text.split()
splts = [splt.lower() for splt in splts]
# slide a three-token window over the text
nsplts = []
for i in xrange(len(splts) - 2):
    nsplts += [(splts[i], splts[i + 1], splts[i + 2])]
splts = nsplts
d = defaultdict(int)
for splt in splts:
    d[splt] += 1                         # count each trigram
x = sorted(d.items(), key=lambda a: a[1], reverse=True)
# print the 50 most frequent trigrams
for k, v in x[:50]:
    print '%s\t%s' % (str(k), v)
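The three scripts differ only in the width of the token window. A generic helper could fold them into one function; the sketch below is not part of the original gist (count_ngrams is a hypothetical name) and keeps the gist's Python 2 style. Note that with n=1 it yields 1-tuples rather than the bare strings of the unigram script.

from collections import defaultdict

def count_ngrams(tokens, n):
    """Return (ngram, count) pairs, most frequent first."""
    d = defaultdict(int)
    for i in xrange(len(tokens) - n + 1):
        d[tuple(tokens[i:i + n])] += 1   # window of n adjacent tokens
    return sorted(d.items(), key=lambda a: a[1], reverse=True)

# usage: the loops above become count_ngrams(splts, 1),
# count_ngrams(splts, 2), and count_ngrams(splts, 3)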
#!/usr/bin/python
# -*- encoding: utf-8 -*-
# eparser.py: tokenizer, Porter stemmer wrapper, and stopword filter
from porterStemmer import PorterStemmer
import glob
import re
import os

class Parser:
    """A processor for removing the commoner morphological and
    inflexional endings from words in English."""

    def __init__(self):
        self.stemmer = PorterStemmer()
        # matches HTML entities (&...;) and ASCII punctuation ranges
        self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
        self.stopwords = []
        for file in glob.glob(os.path.dirname(__file__) + '/stopwords/*/*.txt'):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append('the')

    def clean(self, string):
        """Remove any nasty grammar tokens from string."""
        string = self.p.sub(' ', string)
        string = string.lower()
        return string

    def removeStopwords(self, list):
        """Remove common words which have no search value."""
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string, stem=False):
        """Break string up into tokens and optionally stem the words."""
        string = self.clean(string)
        words = string.split()
        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        return words

    def tokenize(self, string, stem=False):
        """American-spelling alias for tokenise."""
        return self.tokenise(string, stem=stem)
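For reference, a minimal usage sketch of this Parser (not part of the original gist); it assumes eparser.py sits next to the stopwords/ directory the constructor globs:

from eparser import Parser

p = Parser()
# clean() lowercases and strips punctuation, then the tokens are stemmed
tokens = p.tokenise('The quick brown foxes were jumping.', stem=True)
tokens = p.removeStopwords(tokens)
print tokens   # stemmed, lowercased tokens with stopwords removed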