shohei/1-gram.py

## 1-gram.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
from eparser import Parser
from collections import defaultdict
import glob


# Shared parser: strips punctuation, tokenises, stems and drops stopwords.
p = Parser()

# The glob result is only used to count the inputs; the documents themselves
# are opened by index as ./result/0.txt, ./result/1.txt, ...
files = glob.glob('./result/*')
print(files)

for i in range(len(files)):
    # The first two lines of every document are skipped.
    with open('./result/' + str(i) + '.txt') as source:
        text = source.readlines()[2:]

    # Flatten the document to one line and drop markup/punctuation that
    # should not end up inside a token.  (The escapes below were stored as
    # a yen sign instead of a backslash, which made them match nothing
    # useful; they are restored to real escapes here.)
    text = "\n".join(text)
    text = re.sub('\n', ' ', text)
    text = re.sub('&quot;', ' ', text)
    text = re.sub(r'\.', '', text)
    text = re.sub(',', ' ', text)
    text = p.clean(text)

    splts = p.tokenise(text, stem=True)
    splts = p.removeStopwords(splts)

    # Count how often each (lower-cased) token occurs in this document.
    d = defaultdict(int)
    for splt in splts:
        d[splt.lower()] += 1

    # Most frequent tokens first; one "token<TAB>count" line per token.
    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
    with open('./output/' + str(i) + 'output.txt', 'w') as output:
        for k, v in x:
            output.write('%s\t%s\n' % (str(k), v))


## 2-gram.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#2-gram version

import re
from eparser import Parser
from collections import defaultdict

# Shared parser: strips punctuation, tokenises, stems and drops stopwords.
p = Parser()

# Document indices 0..199 are hard-coded: ./result/0.txt ... ./result/199.txt.
for i in range(200):
    # The first two lines of every document are skipped.
    with open('./result/' + str(i) + '.txt') as source:
        text = source.readlines()[2:]

    # Flatten the document to one line and drop markup/punctuation.  (The
    # escapes below were stored as a yen sign instead of a backslash; they
    # are restored to real escapes here.)
    text = "\n".join(text)
    text = re.sub('\n', ' ', text)
    text = re.sub('&quot;', ' ', text)
    text = re.sub(r'\.', '', text)
    text = re.sub(',', ' ', text)
    text = p.clean(text)

    splts = p.tokenise(text, stem=True)
    splts = p.removeStopwords(splts)
    splts = [splt.lower() for splt in splts]

    # Pair every token with its successor.  Built with zip rather than an
    # index loop: the old inner loop reused the name `i` and clobbered the
    # document index, so results were written to the wrong output file.
    splts = list(zip(splts, splts[1:]))

    # Count how often each bigram occurs in this document.
    d = defaultdict(int)
    for splt in splts:
        d[splt] += 1

    # Most frequent bigrams first; one "word1<TAB>word2<TAB>count" line each.
    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
    with open('./output/2-gram/' + str(i) + 'output.txt', 'w') as output:
        for k, v in x:
            output.write('%s\t%s\n' % ("\t".join(k), v))

## 3-gram.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#3-gram version

import re
from collections import defaultdict

#text = open('./result/1.txt').read()
# Read the whole input document; the handle is closed as soon as it is read.
with open('obama.txt') as source:
    text = source.read()

# Newlines and HTML quote entities become plain spaces.  (The newline escape
# was stored as a yen sign instead of a backslash; it is restored here.)
text = re.sub('\n', ' ', text)
text = re.sub('&quot;', ' ', text)

splts = [splt.lower() for splt in text.split()]

# Every run of three consecutive tokens forms one trigram.
nsplts = list(zip(splts, splts[1:], splts[2:]))
print(nsplts)

splts = nsplts

# Count how often each trigram occurs.
d = defaultdict(int)
for splt in splts:
    d[splt] += 1

# Print the 50 most frequent trigrams as "trigram<TAB>count<TAB>".
x = sorted(d.items(), key=lambda a: a[1], reverse=True)
for k, v in x[:50]:
    print('%s\t%s\t' % (str(k), v))


## eparser.py
#! /usr/bin/python
# -*- encoding: utf-8 -*-

from porterStemmer import PorterStemmer
import glob
import re
import os

class Parser:
    """Normalise English text: strip punctuation, tokenise, stem, drop stopwords."""

    # Class-level defaults.  Instances get their own stemmer and their own
    # stopword list in __init__, so this shared list is never mutated.
    # The stemmer removes the commoner morphological and inflexional endings
    # from English words.
    stemmer = None
    stopwords = []

    def __init__(self,):
        """Create the stemmer, compile the cleaning regex and load stopwords.

        Stopwords are read from every ``stopwords/*/*.txt`` file located next
        to this module (one word per line); ``'the'`` is always appended.
        """
        self.stemmer = PorterStemmer()
        # HTML entities (&...;) and every ASCII punctuation/digit character:
        # '!'..'@', '['..'`' and '{'..'~'.
        self.p = re.compile(r"&.{1,5}?;|[!-@\[-`{-~]")

        # abspath guards against an empty dirname (module imported from the
        # current directory), which would otherwise turn the pattern into
        # the absolute path '/stopwords/*/*.txt'.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        pattern = os.path.join(base_dir, 'stopwords', '*', '*.txt')

        # Start from a copy of the class-level list so that creating several
        # parsers does not keep appending to one shared list.
        stopwords = list(self.stopwords)
        for path in glob.glob(pattern):
            with open(path) as handle:
                stopwords.extend(line.strip() for line in handle)
        stopwords.append('the')
        self.stopwords = stopwords

    def clean(self, string):
        """Replace entities and punctuation with spaces and lower-case the result."""
        string = self.p.sub(' ', string)
        string = string.lower()
        return string

    def removeStopwords(self, list):
        """Return the words of ``list`` that are not stopwords, order preserved."""
        # Build the lookup set once instead of scanning the list per word.
        stop = set(self.stopwords)
        return [word for word in list if word not in stop]

    def tokenise(self, string, stem=False):
        """Clean ``string`` and split it on whitespace.

        When ``stem`` is true every token is passed through the stemmer.
        """
        string = self.clean(string)
        words = string.split()

        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        else:
            return words

    def tokenize(self, string, stem=False):
        """American-spelling alias of :meth:`tokenise`; returns its result."""
        return self.tokenise(string, stem=stem)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import re
	from eparser import Parser
	from collections import defaultdict
	import glob


	# Shared parser: strips punctuation, tokenises, stems and drops stopwords.
	p = Parser()

	# The glob result is only used to count the inputs; the documents are
	# opened by index as ./result/0.txt, ./result/1.txt, ...
	files = glob.glob('./result/*')
	print(files)

	for i in range(len(files)):
	    # The first two lines of every document are skipped.
	    with open('./result/' + str(i) + '.txt') as source:
	        text = source.readlines()[2:]

	    # Flatten the document to one line and drop markup/punctuation.
	    # The escapes were stored as a yen sign instead of a backslash and
	    # the quote entity had been decoded to a bare '"'; both are restored.
	    text = "\n".join(text)
	    text = re.sub('\n', ' ', text)
	    text = re.sub('&quot;', ' ', text)
	    text = re.sub(r'\.', '', text)
	    text = re.sub(',', ' ', text)
	    text = p.clean(text)

	    splts = p.tokenise(text, stem=True)
	    splts = p.removeStopwords(splts)

	    # Count how often each (lower-cased) token occurs in this document.
	    d = defaultdict(int)
	    for splt in splts:
	        d[splt.lower()] += 1

	    # Most frequent tokens first; one "token<TAB>count" line per token.
	    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
	    with open('./output/' + str(i) + 'output.txt', 'w') as output:
	        for k, v in x:
	            output.write('%s\t%s\n' % (str(k), v))
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#2-gram version

	import re
	from eparser import Parser
	from collections import defaultdict

	# Shared parser: strips punctuation, tokenises, stems and drops stopwords.
	p = Parser()

	# Document indices 0..199 are hard-coded: ./result/0.txt ... ./result/199.txt.
	for i in range(200):
	    # The first two lines of every document are skipped.
	    with open('./result/' + str(i) + '.txt') as source:
	        text = source.readlines()[2:]

	    # Flatten the document to one line and drop markup/punctuation.
	    # The escapes were stored as a yen sign instead of a backslash and
	    # the quote entity had been decoded to a bare '"'; both are restored.
	    text = "\n".join(text)
	    text = re.sub('\n', ' ', text)
	    text = re.sub('&quot;', ' ', text)
	    text = re.sub(r'\.', '', text)
	    text = re.sub(',', ' ', text)
	    text = p.clean(text)

	    splts = p.tokenise(text, stem=True)
	    splts = p.removeStopwords(splts)
	    splts = [splt.lower() for splt in splts]

	    # Pair every token with its successor.  zip avoids an index loop
	    # that would reuse `i` and clobber the document index used for the
	    # output file name below.
	    splts = list(zip(splts, splts[1:]))

	    # Count how often each bigram occurs in this document.
	    d = defaultdict(int)
	    for splt in splts:
	        d[splt] += 1

	    # Most frequent bigrams first; "word1<TAB>word2<TAB>count" per line.
	    x = sorted(d.items(), key=lambda a: a[1], reverse=True)
	    with open('./output/2-gram/' + str(i) + 'output.txt', 'w') as output:
	        for k, v in x:
	            output.write('%s\t%s\n' % ("\t".join(k), v))
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	#3-gram version

	import re
	from collections import defaultdict

	#text = open('./result/1.txt').read()
	# Read the whole input document; the handle is closed once it is read.
	with open('obama.txt') as source:
	    text = source.read()

	# Newlines and HTML quote entities become plain spaces.  The newline
	# escape was stored as a yen sign instead of a backslash and the quote
	# entity had been decoded to a bare '"'; both are restored.
	text = re.sub('\n', ' ', text)
	text = re.sub('&quot;', ' ', text)

	splts = [splt.lower() for splt in text.split()]

	# Every run of three consecutive tokens forms one trigram.
	nsplts = list(zip(splts, splts[1:], splts[2:]))
	print(nsplts)

	splts = nsplts

	# Count how often each trigram occurs.
	d = defaultdict(int)
	for splt in splts:
	    d[splt] += 1

	# Print the 50 most frequent trigrams as "trigram<TAB>count<TAB>".
	x = sorted(d.items(), key=lambda a: a[1], reverse=True)
	for k, v in x[:50]:
	    print('%s\t%s\t' % (str(k), v))
	#! /usr/bin/python
	# -*- encoding: utf-8 -*-

	from porterStemmer import PorterStemmer
	import glob
	import re
	import os

	class Parser:
	    """Normalise English text: strip punctuation, tokenise, stem, drop stopwords."""

	    # Class-level defaults.  Instances get their own stemmer and their
	    # own stopword list in __init__, so this shared list is never mutated.
	    # The stemmer removes the commoner morphological and inflexional
	    # endings from English words.
	    stemmer = None
	    stopwords = []

	    def __init__(self,):
	        """Create the stemmer, compile the cleaning regex and load stopwords.

	        Stopwords are read from every ``stopwords/*/*.txt`` file located
	        next to this module (one word per line); ``'the'`` is always
	        appended.
	        """
	        self.stemmer = PorterStemmer()
	        # HTML entities (&...;) OR any ASCII punctuation/digit character:
	        # '!'..'@', '['..'`' and '{'..'~'.  The alternation bar must not
	        # be escaped, otherwise the character class is never reached.
	        self.p = re.compile(r"&.{1,5}?;|[!-@\[-`{-~]")

	        # abspath guards against an empty dirname (module imported from
	        # the current directory), which would otherwise make the glob
	        # pattern an absolute path under '/'.
	        base_dir = os.path.dirname(os.path.abspath(__file__))
	        pattern = os.path.join(base_dir, 'stopwords', '*', '*.txt')

	        # Start from a copy of the class-level list so that creating
	        # several parsers does not keep appending to one shared list.
	        stopwords = list(self.stopwords)
	        for path in glob.glob(pattern):
	            with open(path) as handle:
	                stopwords.extend(line.strip() for line in handle)
	        stopwords.append('the')
	        self.stopwords = stopwords

	    def clean(self, string):
	        """Replace entities and punctuation with spaces and lower-case the result."""
	        string = self.p.sub(' ', string)
	        string = string.lower()
	        return string

	    def removeStopwords(self, list):
	        """Return the words of ``list`` that are not stopwords, order preserved."""
	        # Build the lookup set once instead of scanning the list per word.
	        stop = set(self.stopwords)
	        return [word for word in list if word not in stop]

	    def tokenise(self, string, stem=False):
	        """Clean ``string`` and split it on whitespace.

	        When ``stem`` is true every token is passed through the stemmer.
	        """
	        string = self.clean(string)
	        words = string.split()

	        if stem:
	            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
	        else:
	            return words

	    def tokenize(self, string, stem=False):
	        """American-spelling alias of :meth:`tokenise`; returns its result."""
	        return self.tokenise(string, stem=stem)