This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Look up words interactively in a pipe-separated inflection table.

Every row of ``inflection.table.txt`` is split on ``|``.  Field 0 is the
lookup key, field 1 is the sub-key stored under it (the part of speech, going
by the original variable name), and fields 3 and 6 are kept as
``conjugation`` and ``base``.
"""
import re
import sys
import marshal

TABLE_PATH = "inflection.table.txt"
OUT_PATH = "031out.txt"
PROMPT = "Type in a word (just enter to quit):"
MIN_FIELDS = 7  # the highest field index read below is 6


def load_inflections(lines):
    """Build the lookup table from an iterable of table rows.

    Args:
        lines: iterable of strings, one ``|``-separated row each.

    Returns:
        ``{field0: {field1: {"conjugation": field3, "base": field6}}}``.
        A later row with the same (field0, field1) pair replaces the
        earlier one.  Fields are stored verbatim; nothing is stripped.
    """
    table = {}
    for line in lines:
        fields = line.split("|")
        if len(fields) < MIN_FIELDS:
            # Blank or truncated row: indexing it would raise IndexError.
            continue
        entry = dict(conjugation=fields[3], base=fields[6])
        table.setdefault(fields[0], {})[fields[1]] = entry
    return table


def lookup_loop(table, stdin=None, stdout=None):
    """Prompt for words until an empty line or end of input is read.

    Args:
        table: mapping produced by :func:`load_inflections`.
        stdin: object with ``readline()``; defaults to ``sys.stdin``.
        stdout: object with ``write()``/``flush()``; defaults to
            ``sys.stdout``.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    while True:
        stdout.write(PROMPT)
        stdout.flush()
        typed = stdin.readline()
        # readline() returns "" at end of input; without this check a closed
        # or redirected stdin would make the loop spin forever.
        if typed in ("", "\n"):
            break
        word = typed.strip("\n")
        if word in table:
            message = str(table[word])
        else:
            message = "Not found:" + word
        # The surrounding spaces reproduce the separators that the original
        # trailing-comma print statements emitted.
        stdout.write(" " + message + " \n")


def main():
    """Load the table from disk and start the interactive loop."""
    with open(TABLE_PATH, "r") as table_file:
        table = load_inflections(table_file)
    # The original script opened (and so truncated) this file without ever
    # writing to it; keep that side effect but release the handle at once.
    open(OUT_PATH, "w").close()
    lookup_loop(table)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Build the inflection table, marshal it to disk, then look words up.

Every row of ``inflection.table.txt`` is split on ``|``.  Field 0 is the
lookup key, field 1 is the sub-key stored under it (the part of speech, going
by the original variable name), and fields 3 and 6 are kept as
``conjugation`` and ``base``.  The finished table is written to
``032out.txt`` with :mod:`marshal`.
"""
import re
import sys
import marshal

TABLE_PATH = "inflection.table.txt"
DUMP_PATH = "032out.txt"
PROMPT = "Type in a word (just enter to quit):"
MIN_FIELDS = 7  # the highest field index read below is 6


def load_inflections(lines):
    """Build the lookup table from an iterable of table rows.

    Args:
        lines: iterable of strings, one ``|``-separated row each.

    Returns:
        ``{field0: {field1: {"conjugation": field3, "base": field6}}}``.
        A later row with the same (field0, field1) pair replaces the
        earlier one.  Fields are stored verbatim; nothing is stripped.
    """
    table = {}
    for line in lines:
        fields = line.split("|")
        if len(fields) < MIN_FIELDS:
            # Blank or truncated row: indexing it would raise IndexError.
            continue
        entry = dict(conjugation=fields[3], base=fields[6])
        table.setdefault(fields[0], {})[fields[1]] = entry
    return table


def format_entry(word, senses):
    """Render one table entry as indented, human-readable text.

    Args:
        word: the looked-up key.
        senses: ``{sub_key: {attribute: value}}`` stored under ``word``.

    Returns:
        A string ending in a newline: the word, then one ``[sub_key]``
        header per sense, then one aligned ``attribute : value`` row per
        attribute.
    """
    parts = [word + " :\n"]
    for sense, attributes in senses.items():
        parts.append("\t[" + sense + "]:\n")
        for attribute, value in attributes.items():
            parts.append(
                "\t  " + attribute.ljust(11) + " : " + str(value) + "\n")
    return "".join(parts)


def lookup_loop(table, stdin=None, stdout=None):
    """Prompt for words until an empty line or end of input is read.

    Args:
        table: mapping produced by :func:`load_inflections`.
        stdin: object with ``readline()``; defaults to ``sys.stdin``.
        stdout: object with ``write()``/``flush()``; defaults to
            ``sys.stdout``.
    """
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    while True:
        stdout.write(PROMPT)
        stdout.flush()
        typed = stdin.readline()
        # readline() returns "" at end of input; without this check a closed
        # or redirected stdin would make the loop spin forever.
        if typed in ("", "\n"):
            break
        word = typed.strip("\n")
        if word in table:
            stdout.write(" " + format_entry(word, table[word]) + "\n")
        else:
            stdout.write(" Not found:" + word + " \n")


def main():
    """Load the table, persist it with marshal, then run the lookup loop."""
    with open(TABLE_PATH, "r") as table_file:
        table = load_inflections(table_file)
    # marshal produces binary data, so the file must be opened in binary
    # mode, and it is closed before the (possibly long) interactive session
    # so the later scripts always see a complete dump.
    with open(DUMP_PATH, "wb") as dump_file:
        marshal.dump(table, dump_file)
    lookup_loop(table)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the inflection-table entry of every token found in the table.

Reads tab-separated token rows from ``030out.txt`` and the marshalled table
written by the previous script from ``032out.txt``.  The second column of
each row is the lookup key.
"""
import re
import sys
import marshal

TOKENS_PATH = "030out.txt"
OUT_PATH = "033out.txt"
DUMP_PATH = "032out.txt"

# Three tab-separated columns; the last one may itself contain tabs.
TOKEN_LINE = re.compile("^(.*?)\t(.*?)\t(.*?)$")


def second_field(line):
    """Return the second tab-separated column of ``line``.

    Returns ``None`` when the line has fewer than three columns (for example
    a blank line), where the original code raised AttributeError.
    """
    match = TOKEN_LINE.match(line.strip("\n"))
    return match.group(2) if match else None


def format_entry(word, senses):
    """Render one table entry as indented, human-readable text.

    Args:
        word: the looked-up key.
        senses: ``{sub_key: {attribute: value}}`` stored under ``word``.

    Returns:
        A string ending in a newline: the word, then one ``[sub_key]``
        header per sense, then one aligned ``attribute : value`` row per
        attribute.
    """
    parts = [word + " :\n"]
    for sense, attributes in senses.items():
        parts.append("\t[" + sense + "]:\n")
        for attribute, value in attributes.items():
            parts.append(
                "\t  " + attribute.ljust(11) + " : " + str(value) + "\n")
    return "".join(parts)


def report(lines, table, stdout=None):
    """Write the formatted entry of every token in ``lines`` found in ``table``.

    Args:
        lines: iterable of token rows.
        table: mapping of word to its senses.
        stdout: object with ``write()``; defaults to ``sys.stdout``.
    """
    stdout = sys.stdout if stdout is None else stdout
    for line in lines:
        word = second_field(line)
        if word is None or word not in table:
            continue
        # NOTE(review): the source lost its indentation; the blank separator
        # line is assumed to follow each printed entry only - confirm.
        stdout.write(format_entry(word, table[word]) + "\n")


def main():
    """Load the marshalled table and report on every token row."""
    with open(TOKENS_PATH, "r") as tokens:
        # The original script opened (and so truncated) this file without
        # ever writing to it; keep that side effect but release the handle.
        open(OUT_PATH, "w").close()
        # marshal data is binary, so read it in binary mode.  marshal is not
        # safe against malicious input; only load files this pipeline wrote.
        with open(DUMP_PATH, "rb") as dump_file:
            table = marshal.load(dump_file)
        report(tokens, table)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- encoding: utf-8 -*-
"""Write every token that is missing from the inflection table.

Reads tab-separated token rows from ``030out.txt`` (the second column is the
word), checks each word against the marshalled table in ``032out.txt`` and
writes the words that are not in it to ``034out.txt``, one per line.
"""
import re
import sys
import marshal

TOKENS_PATH = "030out.txt"
OUT_PATH = "034out.txt"
DUMP_PATH = "032out.txt"

# Three tab-separated columns; the last one may itself contain tabs.
TOKEN_LINE = re.compile("^(.*?)\t(.*?)\t(.*?)$")


def second_field(line):
    """Return the second tab-separated column of ``line``.

    Returns ``None`` when the line has fewer than three columns (for example
    a blank line), where the original code raised AttributeError.
    """
    match = TOKEN_LINE.match(line.strip("\n"))
    return match.group(2) if match else None


def unknown_words(lines, known):
    """Yield, in input order, each word of ``lines`` that is not in ``known``.

    Args:
        lines: iterable of token rows.
        known: container supporting ``in`` (the loaded table).

    Repeated words are yielded once per occurrence; rows that cannot be
    parsed are skipped.
    """
    for line in lines:
        word = second_field(line)
        if word is not None and word not in known:
            yield word


def main():
    """Run the check over the token file and write the result file."""
    # The files are opened in the same order as before; ``with`` guarantees
    # the output is flushed and every handle is closed, even on error.
    with open(TOKENS_PATH, "r") as tokens, \
            open(OUT_PATH, "w") as out_file, \
            open(DUMP_PATH, "rb") as dump_file:
        # marshal data is binary, so read it in binary mode.  marshal is not
        # safe against malicious input; only load files this pipeline wrote.
        known = marshal.load(dump_file)
        out_file.writelines(
            word + "\n" for word in unknown_words(tokens, known))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- encoding:utf-8 -*-
"""Print the table words that occur at least three times in the token file.

Reads tab-separated token rows from ``030out.txt`` (the second column is the
word) and the marshalled table from ``032out.txt``.  Only words present in
the table are counted.
"""
import re
import sys
import marshal
from collections import Counter

TOKENS_PATH = "030out.txt"
DUMP_PATH = "032out.txt"
MIN_COUNT = 3

# Three tab-separated columns; the last one may itself contain tabs.
TOKEN_LINE = re.compile("^(.*?)\t(.*?)\t(.*?)$")


def second_field(line):
    """Return the second tab-separated column of ``line``.

    Returns ``None`` when the line has fewer than three columns (for example
    a blank line), where the original code raised AttributeError.
    """
    match = TOKEN_LINE.match(line.strip("\n"))
    return match.group(2) if match else None


def count_known_words(lines, known):
    """Count how often each word that is present in ``known`` occurs.

    Args:
        lines: iterable of token rows.
        known: container supporting ``in`` (the loaded table).

    Returns:
        A ``Counter`` mapping word to its number of occurrences.  Words
        missing from ``known`` and unparsable rows are not counted.
    """
    words = (second_field(line) for line in lines)
    return Counter(
        word for word in words if word is not None and word in known)


def frequent_words(counts, minimum=MIN_COUNT):
    """Return ``(word, count)`` pairs whose count is at least ``minimum``."""
    return [(word, count) for word, count in counts.items()
            if count >= minimum]


def main():
    """Count the token file against the table and print frequent words."""
    with open(TOKENS_PATH, "r") as tokens:
        # marshal data is binary, so read it in binary mode.  marshal is not
        # safe against malicious input; only load files this pipeline wrote.
        with open(DUMP_PATH, "rb") as dump_file:
            known = marshal.load(dump_file)
        counts = count_known_words(tokens, known)
    for word, count in frequent_words(counts):
        sys.stdout.write(word + ": " + str(count) + "\n")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- encoding: utf-8 -*-
"""Write every pair of consecutive words (bigram) of the token file.

Reads tab-separated token rows from ``030out.txt`` (the second column is the
word) and writes ``previous<TAB>current`` lines to ``036out.txt``.
"""
import re

TOKENS_PATH = "030out.txt"
OUT_PATH = "036out.txt"

# Three tab-separated columns; the last one may itself contain tabs.
TOKEN_LINE = re.compile("^(.*?)\t(.*?)\t(.*?)$")


def second_field(line):
    """Return the second tab-separated column of ``line``.

    Returns ``None`` when the line has fewer than three columns (for example
    a blank line), where the original code raised AttributeError.
    """
    match = TOKEN_LINE.match(line.strip("\n"))
    return match.group(2) if match else None


def bigrams(lines):
    """Yield ``(previous_word, word)`` for each parsable row of ``lines``.

    The first pair has an empty string as its previous word, exactly as the
    original script emitted it.  Rows that cannot be parsed are skipped and
    do not reset the previous word.
    """
    previous = ""
    for line in lines:
        word = second_field(line)
        if word is None:
            continue
        yield previous, word
        previous = word


def main():
    """Convert the token file into the bigram file."""
    # ``with`` flushes the output and closes both files even on error; the
    # original never closed either handle.
    with open(TOKENS_PATH, "r") as tokens, open(OUT_PATH, "w") as out_file:
        out_file.writelines(
            previous + "\t" + word + "\n"
            for previous, word in bigrams(tokens))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- encoding: utf-8 -*-
"""Count identical lines of ``036out.txt`` and rank them by frequency.

Each output line of ``037out.txt`` is ``count<TAB>original line``, ordered
from the most to the least frequent line.
"""
import re
from collections import Counter
from operator import itemgetter

IN_PATH = "036out.txt"
OUT_PATH = "037out.txt"


def rank_by_frequency(lines):
    """Return ``(text, count)`` pairs sorted by descending count.

    Args:
        lines: iterable of strings; leading and trailing newline characters
            are stripped before counting.

    The sort is stable, so lines with equal counts keep the iteration order
    of the underlying counter.
    """
    counts = Counter(line.strip("\n") for line in lines)
    # Sorting approach, ref: http://d.hatena.ne.jp/ir_takt/20110808/1312830911
    return sorted(counts.items(), key=itemgetter(1), reverse=True)


def main():
    """Read the bigram file and write the frequency-ranked file."""
    # ``with`` closes both files even when reading or writing fails; the
    # original only closed them on the success path.
    with open(IN_PATH, "r") as in_file, open(OUT_PATH, "w") as out_file:
        out_file.writelines(
            str(count) + "\t" + text + "\n"
            for text, count in rank_by_frequency(in_file))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*-encoding:utf-8-*-
# (38) Read the output of (37) and compute the conditional probability
# P(z|w) of a word z following a word w.  The output format is
# "(conditional probability)\t(current word)\t(next word)".
"""Conditional probability of the next word given the current word.

    P(A|B) = P(A and B) / P(B)
    P(z|w) = P(z and w) / P(w)

(1) P(z and w) is taken from ``037out.txt``: the frequency of the pair
    divided by the sum of all pair frequencies.
(2) P(w) is taken from ``025out.txt``: the number of occurrences of w
    divided by the number of counted rows.
"""
import re
import sys
from collections import defaultdict

BIGRAM_PATH = "037out.txt"   # rows: frequency<TAB>word<TAB>word
UNIGRAM_PATH = "025out.txt"  # rows: original form<TAB>lower-cased form
OUT_PATH = "038out.txt"

# ``\d+`` (not ``\d*``) so a row with an empty count is rejected instead of
# reaching int("") and raising ValueError.
BIGRAM_COUNT_LINE = re.compile(r"^(\d+)\t(.*?\t.*?)$")
TWO_COLUMNS = re.compile("^(.*?)\t(.*?)$")


def read_bigram_counts(lines):
    """Parse ``frequency<TAB>w<TAB>z`` rows.

    Returns:
        ``(counts, total)`` where ``counts`` maps ``"w\\tz"`` to its
        frequency and ``total`` is the sum of the frequencies of all parsed
        rows.  Rows that do not match the expected layout are skipped.
    """
    counts = {}
    total = 0
    for line in lines:
        match = BIGRAM_COUNT_LINE.match(line.strip("\n"))
        if match is None:
            continue
        frequency = int(match.group(1))
        total += frequency
        counts[match.group(2)] = frequency
    return counts, total


def joint_probabilities(counts, total):
    """Return ``{pair: count / total}``, i.e. P(z and w) for every pair."""
    return {pair: count * 1.0 / total for pair, count in counts.items()}


def unigram_probabilities(lines):
    """Return ``{word: P(word)}`` from two-column rows.

    The word is everything after the first tab of a row.  The denominator is
    the number of rows that could be parsed; rows without a tab are skipped.
    """
    occurrences = defaultdict(int)
    row_count = 0
    for line in lines:
        match = TWO_COLUMNS.match(line.strip("\n"))
        if match is None:
            continue
        row_count += 1
        occurrences[match.group(2)] += 1
    return {word: count * 1.0 / row_count
            for word, count in occurrences.items()}


def conditional_probabilities(joint, unigram):
    """Return ``[(P(z|w), "w\\tz"), ...]`` for every pair in ``joint``.

    Pairs whose current word w has no (or zero) unigram probability are
    skipped: the bigram file starts with a pair whose w is the empty string,
    and looking that up used to raise KeyError.
    """
    rows = []
    for pair, p_joint in joint.items():
        current_word = pair.partition("\t")[0]
        p_current = unigram.get(current_word)
        if not p_current:
            continue
        rows.append((p_joint / p_current, pair))
    return rows


def main():
    """Compute all probabilities; echo them and write ``038out.txt``."""
    with open(OUT_PATH, "w") as out_file:
        with open(BIGRAM_PATH, "r") as bigram_file:
            counts, total = read_bigram_counts(bigram_file)
        joint = joint_probabilities(counts, total)
        for pair, probability in joint.items():
            sys.stdout.write("%s:%f\n" % (pair, probability))

        with open(UNIGRAM_PATH, "r") as unigram_file:
            unigram = unigram_probabilities(unigram_file)

        for probability, pair in conditional_probabilities(joint, unigram):
            row = str(probability) + '\t' + pair
            sys.stdout.write(row + "\n")
            out_file.write(row + '\n')


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
#-*- encoding:utf-8 -*- | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment