Skip to content

Instantly share code, notes, and snippets.

@HiroshiMatsumoto
Created December 30, 2012 17:18
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import marshal
def parse_inflection_table(lines):
    """Build the inflection lookup table from ``|``-separated records.

    Every usable record contributes
    ``table[field0][field1] = {"conjugation": field3, "base": field6}``.
    A later record with the same first two fields replaces the earlier one.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        dict mapping field 0 to a dict keyed by field 1.
    """
    table = {}
    for line in lines:
        # Drop the line terminator so it cannot leak into the last field.
        fields = line.rstrip("\r\n").split("|")
        if len(fields) < 7:
            # Blank or truncated record: indexing field 6 would raise
            # IndexError, so skip it instead of aborting the whole load.
            continue
        table.setdefault(fields[0], {})[fields[1]] = {
            "conjugation": fields[3],
            "base": fields[6],
        }
    return table


def run_lookup_prompt(table, instream=None, outstream=None):
    """Prompt for words and echo the raw table entry for each one.

    The loop stops on an empty line or at end of input.

    Args:
        table: dict as returned by ``parse_inflection_table``.
        instream: object with ``readline()``; defaults to ``sys.stdin``.
        outstream: object with ``write()``/``flush()``; defaults to
            ``sys.stdout``.
    """
    instream = sys.stdin if instream is None else instream
    outstream = sys.stdout if outstream is None else outstream
    while True:
        outstream.write("Type in a word (just enter to quit): ")
        outstream.flush()
        typed = instream.readline()
        word = typed.rstrip("\r\n")
        # readline() returns "" at end of input; treating that like a bare
        # Enter keeps the loop from spinning forever on a closed stdin.
        if word == "":
            break
        if word in table:
            outstream.write("%s \n" % (table[word],))
        else:
            outstream.write("Not found:%s \n" % word)
    # Terminate the last prompt line.
    outstream.write("\n")


def main_031():
    """Load inflection.table.txt and run the interactive lookup."""
    with open("inflection.table.txt", "r") as table_file:
        table = parse_inflection_table(table_file)
    # NOTE(review): 031out.txt is created/truncated but nothing is ever
    # written to it; the side effect is kept so existing workflows that
    # expect the file still find it.
    open("031out.txt", "w").close()
    run_lookup_prompt(table)


if __name__ == "__main__":
    main_031()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import marshal
def build_inflection_dict(lines):
    """Parse ``|``-separated inflection records into a nested dict.

    Every usable record contributes
    ``result[field0][field1] = {"conjugation": field3, "base": field6}``.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        dict mapping field 0 to a dict keyed by field 1.
    """
    result = {}
    for line in lines:
        # Strip the terminator so it cannot end up inside the last field.
        fields = line.rstrip("\r\n").split("|")
        if len(fields) < 7:
            # Blank or truncated record: skip rather than raise IndexError.
            continue
        result.setdefault(fields[0], {})[fields[1]] = {
            "conjugation": fields[3],
            "base": fields[6],
        }
    return result


def format_inflection_entry(word, entry):
    """Return the multi-line display text for one dictionary entry.

    Layout: ``word :`` on the first line, then one ``\\t[key]:`` line per
    key of ``entry`` followed by one line per attribute, the attribute name
    left-justified to 11 columns.
    """
    pieces = ["%s :\n" % word]
    for part in entry:
        pieces.append("\t[%s]:\n" % part)
        for spec in entry[part]:
            pieces.append("\t  %s : %s\n" % (spec.ljust(11), entry[part][spec]))
    return "".join(pieces)


def run_pretty_prompt(table, instream=None, outstream=None):
    """Prompt for words and print the formatted entry for each one.

    The loop stops on an empty line or at end of input.

    Args:
        table: dict as returned by ``build_inflection_dict``.
        instream: object with ``readline()``; defaults to ``sys.stdin``.
        outstream: object with ``write()``/``flush()``; defaults to
            ``sys.stdout``.
    """
    instream = sys.stdin if instream is None else instream
    outstream = sys.stdout if outstream is None else outstream
    while True:
        outstream.write("Type in a word (just enter to quit): ")
        outstream.flush()
        typed = instream.readline()
        word = typed.rstrip("\r\n")
        # "" covers both a bare Enter and end of input (readline() -> "");
        # without the EOF case the loop would never terminate.
        if word == "":
            break
        if word in table:
            outstream.write(format_inflection_entry(word, table[word]))
            outstream.write("\n")
        else:
            outstream.write("Not found:%s \n" % word)
    # Terminate the last prompt line.
    outstream.write("\n")


def main_032():
    """Load the table, dump it to 032out.txt with marshal, then prompt."""
    with open("inflection.table.txt", "r") as table_file:
        table = build_inflection_dict(table_file)
    # marshal produces binary data, so the file must be opened in binary
    # mode; closing it here guarantees the dump is on disk before the
    # interactive loop starts.
    with open("032out.txt", "wb") as dump_file:
        marshal.dump(table, dump_file)
    run_pretty_prompt(table)


if __name__ == "__main__":
    main_032()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import marshal
# Three tab-separated columns; the third may itself contain tabs.
_THREE_COLUMNS_033 = re.compile(r"^(.*?)\t(.*?)\t(.*?)$")


def second_column(line):
    """Return the second tab-separated column of ``line``.

    Returns None when the line does not have at least three columns.
    """
    found = _THREE_COLUMNS_033.match(line.strip("\n"))
    return None if found is None else found.group(2)


def render_entry(word, entry):
    """Return the display text for one dictionary entry.

    Layout: ``word :`` on the first line, then one ``\\t[key]:`` line per
    key of ``entry`` followed by one line per attribute, the attribute name
    left-justified to 11 columns.
    """
    pieces = ["%s :\n" % word]
    for part in entry:
        pieces.append("\t[%s]:\n" % part)
        for spec in entry[part]:
            pieces.append("\t  %s : %s\n" % (spec.ljust(11), entry[part][spec]))
    return "".join(pieces)


def print_known_entries(lines, table, outstream=None):
    """Write the dictionary entry of every input word found in ``table``.

    Args:
        lines: iterable of three-column, tab-separated text lines.
        table: nested dict keyed by word.
        outstream: object with ``write()``; defaults to ``sys.stdout``.
    """
    outstream = sys.stdout if outstream is None else outstream
    for line in lines:
        word = second_column(line)
        if word is None:
            # Malformed line: skip it instead of failing on a None match.
            continue
        if word in table:
            outstream.write(render_entry(word, table[word]))
        # NOTE(review): one blank line is emitted per well-formed input line,
        # whether or not the word was found. This matches the most likely
        # reading of the script's flattened layout - confirm.
        outstream.write("\n")


def main_033():
    """Look up every word of 030out.txt in the marshalled dictionary."""
    with open("030out.txt", "r") as token_file:
        # NOTE(review): 033out.txt is created/truncated but all output goes
        # to stdout; the side effect is preserved as-is.
        open("033out.txt", "w").close()
        # marshal data is binary, so read it in binary mode.
        # marshal.load is not safe on untrusted files; 032out.txt is expected
        # to be the dump produced locally by the companion script.
        with open("032out.txt", "rb") as dict_file:
            table = marshal.load(dict_file)
        print_known_entries(token_file, table)


if __name__ == "__main__":
    main_033()
#! /usr/bin/env python
#-*- encoding: utf-8 -*-
import re
import sys
import marshal
# Three tab-separated columns; the third may itself contain tabs.
_THREE_COLUMNS_034 = re.compile(r"^(.*?)\t(.*?)\t(.*?)$")


def words_missing_from_dict(lines, table):
    """Yield the second column of each line whose value is not a key of ``table``.

    Words are yielded in input order, once per occurrence. Lines without
    three tab-separated columns are skipped.

    Args:
        lines: iterable of tab-separated text lines.
        table: container supporting ``in`` (the marshalled dictionary).
    """
    for line in lines:
        found = _THREE_COLUMNS_034.match(line.strip("\n"))
        if found is None:
            # Malformed line: skip it instead of failing on a None match.
            continue
        word = found.group(2)
        if word not in table:
            yield word


def main_034():
    """Write every word of 030out.txt that is absent from the dictionary."""
    # marshal data is binary, so read it in binary mode.
    # marshal.load is not safe on untrusted files; 032out.txt is expected to
    # be the dump produced locally by the companion script.
    with open("032out.txt", "rb") as dict_file:
        table = marshal.load(dict_file)
    # The with-block guarantees the output is flushed and closed.
    with open("030out.txt", "r") as token_file, open("034out.txt", "w") as out_file:
        for word in words_missing_from_dict(token_file, table):
            out_file.write(word + "\n")


if __name__ == "__main__":
    main_034()
#! /usr/bin/env python
#-*- encoding:utf-8 -*-
import re
import sys
import marshal
# Three tab-separated columns; the third may itself contain tabs.
_THREE_COLUMNS_035 = re.compile(r"^(.*?)\t(.*?)\t(.*?)$")


def count_dictionary_words(lines, table):
    """Count occurrences of second-column words that are keys of ``table``.

    Args:
        lines: iterable of three-column, tab-separated text lines.
        table: container supporting ``in`` (the marshalled dictionary).

    Returns:
        dict mapping each dictionary word that occurs to its occurrence count.
    """
    counts = {}
    for line in lines:
        found = _THREE_COLUMNS_035.match(line.strip("\n"))
        if found is None:
            # Malformed line: skip it instead of failing on a None match.
            continue
        word = found.group(2)
        # Only words present in the dictionary are tallied.
        if word in table:
            counts[word] = counts.get(word, 0) + 1
    return counts


def format_frequent_words(counts, minimum=3):
    """Return one ``word: count`` line per word seen at least ``minimum`` times."""
    return ["%s: %s\n" % (word, counts[word]) for word in counts if counts[word] >= minimum]


def main_035():
    """Print dictionary words that occur three or more times in 030out.txt."""
    # marshal data is binary, so read it in binary mode.
    # marshal.load is not safe on untrusted files; 032out.txt is expected to
    # be the dump produced locally by the companion script.
    with open("032out.txt", "rb") as dict_file:
        table = marshal.load(dict_file)
    with open("030out.txt", "r") as token_file:
        counts = count_dictionary_words(token_file, table)
    for row in format_frequent_words(counts):
        sys.stdout.write(row)


if __name__ == "__main__":
    main_035()
#! /usr/bin/env python
# -*- encoding: utf-8 -*-
import re
# Three tab-separated columns; the third may itself contain tabs.
_THREE_COLUMNS_036 = re.compile(r"^(.*?)\t(.*?)\t(.*?)$")


def bigram_lines(lines):
    """Yield ``previous\\tcurrent\\n`` for consecutive second-column words.

    The first word is paired with an empty previous word. Lines without
    three tab-separated columns are skipped and do not break the chain.

    Args:
        lines: iterable of tab-separated text lines.
    """
    previous = ""
    for line in lines:
        found = _THREE_COLUMNS_036.match(line.strip("\n"))
        if found is None:
            # Malformed line: skip it instead of failing on a None match.
            continue
        current = found.group(2)
        yield previous + "\t" + current + "\n"
        previous = current


def main_036():
    """Write the word bigrams of 030out.txt to 036out.txt."""
    # The with-block guarantees the output is flushed and closed.
    with open("030out.txt", "r") as token_file, open("036out.txt", "w") as out_file:
        for row in bigram_lines(token_file):
            out_file.write(row)


if __name__ == "__main__":
    main_036()
#! /usr/bin/env python
#-*- encoding: utf-8 -*-
import re
def count_lines(lines):
    """Return a dict mapping each distinct line (newline removed) to its count."""
    counts = {}
    for line in lines:
        key = line.strip("\n")
        counts[key] = counts.get(key, 0) + 1
    return counts


def frequency_rows(counts):
    """Return ``count\\ttext\\n`` rows ordered by descending count.

    Rows with equal counts are ordered by text, so the output does not
    depend on dict iteration order.
    """
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
    return ["%d\t%s\n" % (count, text) for text, count in ranked]


def main_037():
    """Write the line frequencies of 036out.txt to 037out.txt, most frequent first."""
    # with-blocks close both files even if reading or writing fails.
    with open("036out.txt", "r") as in_file:
        counts = count_lines(in_file)
    with open("037out.txt", "w") as out_file:
        out_file.writelines(frequency_rows(counts))


if __name__ == "__main__":
    main_037()
#! /usr/bin/env python
#-*-encoding:utf-8-*-
#(38) (37)の出力を読み込み,ある単語wに続く単語zの条件付き確率P(z|w)を求めよ.ただし,出力形式は"(条件付き確率)\t(現在の単語)\t(次の単語)"とせよ.
u"""
条件確率
P(A|B) = P(A∩B)/P(B)
P(z|w) = P(z∩w)/P(w)
(Z&Wが起こる確率)/(Wが起こる確率)
(1): (Z&Wが起こる確率) = 037out.txtを用いて Z&Wの頻度/全頻度の和
(2): (Wが起こる確率) = 025out.txtを用いて Wの出現数/全単語の出現数の和
"""
import re, sys
from collections import defaultdict
# "<count>\t<word w>\t<word z>"; raw strings keep "\d" a regex escape.
_BIGRAM_ROW_038 = re.compile(r"^(\d+)\t(.*?\t.*?)$")
# "<first column>\t<rest of line>"
_TWO_COLUMNS_038 = re.compile(r"^(.*?)\t(.*?)$")


def read_bigram_counts(lines):
    """Parse ``count\\tw\\tz`` rows.

    Args:
        lines: iterable of text lines (037out.txt format).

    Returns:
        ``(counts, total)`` where ``counts`` maps ``"w\\tz"`` to its count
        and ``total`` is the sum of all counts read. Rows that do not match
        the format are skipped.
    """
    counts = {}
    total = 0
    for line in lines:
        row = _BIGRAM_ROW_038.match(line.strip("\n"))
        if row is None:
            # Blank or malformed row: skip instead of failing on None.
            continue
        frequency = int(row.group(1))
        total += frequency
        counts[row.group(2)] = frequency
    return counts, total


def joint_probabilities(counts, total):
    """Return P(w and z) = count / total for every bigram in ``counts``."""
    if not total:
        return {}
    return {bigram: float(freq) / total for bigram, freq in counts.items()}


def read_unigram_probabilities(lines):
    """Compute P(w) from two-column rows, counting the second column.

    Args:
        lines: iterable of text lines (025out.txt format). The header
            comment of the script describes the columns as the original
            word and its lower-cased form.

    Returns:
        dict mapping each second-column value to its share of all
        well-formed rows.
    """
    occurrences = {}
    rows_seen = 0
    for line in lines:
        row = _TWO_COLUMNS_038.match(line.strip("\n"))
        if row is None:
            continue
        rows_seen += 1
        word = row.group(2)
        occurrences[word] = occurrences.get(word, 0) + 1
    return {word: float(n) / rows_seen for word, n in occurrences.items()}


def conditional_probabilities(joint, unigram):
    """Yield ``(P(z|w), "w\\tz")`` with P(z|w) = P(w and z) / P(w).

    Bigrams whose first word has no probability in ``unigram`` are skipped:
    the bigram list always contains one pair with an empty first word (the
    start of the text), which would otherwise raise KeyError.
    """
    for bigram, p_joint in joint.items():
        first_word = bigram.split("\t", 1)[0]
        p_first = unigram.get(first_word)
        if not p_first:
            continue
        yield p_joint / p_first, bigram


def main_038():
    """Read 037out.txt and 025out.txt; write P(z|w) rows to 038out.txt.

    Output rows have the form ``probability\\tw\\tz`` and are echoed to
    stdout, after the joint probabilities have been printed as ``w\\tz:p``.
    """
    with open("038out.txt", "w") as out_file38:
        with open("037out.txt", "r") as in_file37:
            counts, total = read_bigram_counts(in_file37)
        joint = joint_probabilities(counts, total)
        for bigram in joint:
            print("%s:%f" % (bigram, joint[bigram]))
        with open("025out.txt", "r") as in_file25:
            unigram = read_unigram_probabilities(in_file25)
        for probability, bigram in conditional_probabilities(joint, unigram):
            row = str(probability) + "\t" + bigram
            print(row)
            out_file38.write(row + "\n")


if __name__ == "__main__":
    main_038()
#! /usr/bin/env python
#-*- encoding:utf-8 -*-
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment