This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- python -*-
# -*- encoding: utf-8 -*-
# Read English text from standard input and print one sentence per line,
# treating every period as a sentence boundary.
import sys


def split_on_periods(text):
    """Split *text* into sentences at every period.

    Args:
        text: the text to split.

    Returns:
        A list of sentences.  Each one has its surrounding whitespace removed
        and a terminating period appended.  Fragments that are empty or
        whitespace-only (for example the newline left over after the final
        period) are dropped instead of being emitted as a lone period.
    """
    sentences = []
    for fragment in text.split("."):
        # Remove the blank that follows the previous period (and any newline).
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment + ".")
    return sentences


def main():
    """Print the sentences of every line read from standard input."""
    # Iterate over all input lines, not only the first one.
    for input_line in sys.stdin:
        for sentence in split_on_periods(input_line):
            print(sentence)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- python -*-
# -*- encoding: utf-8 -*-
# Task: read English text from standard input, treat "period -> whitespace ->
# capital letter" as a sentence boundary, and write one sentence per line to
# standard output.
import sys
import re

# Whitespace that directly follows a period and directly precedes a capital
# letter.  The lookarounds keep the period and the capital letter inside the
# neighbouring sentences; only the whitespace itself is consumed by the split.
_SENTENCE_BOUNDARY = re.compile(r"(?<=\.)\s+(?=[A-Z])")


def split_sentences(text):
    """Split *text* into sentences at "period, whitespace, capital letter".

    Args:
        text: the text to split.

    Returns:
        A list of sentences with the terminating periods preserved.  A period
        followed by a lowercase word (as in "e.g. this") is not a boundary.
        Empty or whitespace-only input yields an empty list.
    """
    text = text.strip()
    if not text:
        return []
    return _SENTENCE_BOUNDARY.split(text)


def main():
    """Print the sentences of every line read from standard input."""
    for input_line in sys.stdin:
        for sentence in split_sentences(input_line):
            print(sentence)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
# -*- python -*-
# -*- encoding: utf-8 -*-
# Tokenize j98_1002.txt: print one token per line and write the same tokens to
# 024out.txt, with an empty line after the tokens of every input line.
import re

# Same pattern text as before, now a raw string so the regex escapes (\s, \w,
# \( ...) are no longer invalid string escape sequences.  A token is either a
# single bracket character or one character that is not a quote, period or
# newline followed by word characters, apostrophes or hyphens.  Whitespace and
# a comma before the token, and one whitespace/period/comma after it, are
# consumed but not captured.
_TOKEN_RE = re.compile(r"\s*,?([\(\)\<\>\{\}]|[^\".\n][\w'-]*)[\s\.,]?")


def tokenize(line):
    """Return the list of tokens found in *line*.

    Newline characters at either end of *line* are removed before matching.
    An empty line yields an empty list.
    """
    return _TOKEN_RE.findall(line.strip("\n"))


def main(in_path="j98_1002.txt", out_path="024out.txt"):
    """Tokenize *in_path*, echoing tokens to stdout and to *out_path*."""
    # The context managers close both files even if tokenizing raises.
    with open(in_path, "r") as source:
        with open(out_path, "w") as sink:
            for line in source:
                for token in tokenize(line):
                    print(token)
                    sink.write(token + "\n")
                # An empty line separates the tokens of consecutive input lines.
                print("")
                sink.write("\n")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Read tokens (one per line) from 024out.txt and emit one
# "token<TAB>lowercased token" line per token, both to standard output and to
# 025out.txt.
import re
import sys


def to_pair_line(word):
    """Build the output record for one input line.

    Args:
        word: one line of the token file, with or without its newline.

    Returns:
        The token, a tab, the lowercased token and a newline; or None when the
        line is empty or whitespace-only.  A bare newline counts as blank: the
        previous filter required at least one whitespace character before the
        newline, so empty separator lines slipped through as records made of
        a lone tab.
    """
    word = word.strip("\n")
    if not word.strip():
        return None
    return word + "\t" + word.lower() + "\n"


def main(in_path="024out.txt", out_path="025out.txt"):
    """Convert every non-blank line of *in_path* and write it to *out_path*."""
    # The context managers close both files even if a write fails.
    with open(in_path, "r") as source:
        with open(out_path, "w") as sink:
            for word in source:
                pair_line = to_pair_line(word)
                if pair_line is None:
                    continue
                # The record already ends with a newline, so write it verbatim.
                sys.stdout.write(pair_line)
                sink.write(pair_line)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Find stems that occur with both an "-ly" and a "-ness" suffix among the
# lowercased words (the text after the last tab) of the input file.
import re


def read_lowered_words(lines):
    """Collect the lowercased word of every "original<TAB>lowered" line.

    Args:
        lines: iterable of text lines, with or without trailing newlines.

    Returns:
        The set of strings found after the last tab of each line.  Lines that
        contain no tab are skipped rather than raising.
    """
    lowered = set()
    for line in lines:
        _, tab, lowered_word = line.rstrip("\n").rpartition("\t")
        if tab:
            lowered.add(lowered_word)
    return lowered


def ly_ness_stems(words):
    """Return the stems shared by an "-ly" word and a "-ness" word.

    Args:
        words: iterable of distinct words.

    Returns:
        A sorted list of every stem s such that both s + "ly" and s + "ness"
        occur in *words*.  Sorting makes the output order reproducible.
    """
    words = list(words)
    ly_stems = set(word[:-2] for word in words if word.endswith("ly"))
    ness_stems = set(word[:-4] for word in words if word.endswith("ness"))
    return sorted(ly_stems & ness_stems)


def main(in_path="test2.txt", out_path="026out.txt"):
    """Print the shared stems of *in_path* and save them to *out_path*."""
    # NOTE: "025out.txt" was kept as a commented-out alternative input.
    with open(in_path, "r") as source:
        words = read_lowered_words(source)
    # The output file used to be opened but never written; store the result
    # there as well as printing it.
    with open(out_path, "w") as sink:
        for shared_stem in ly_ness_stems(words):
            print(shared_stem)
            sink.write(shared_stem + "\n")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#-*- coding: utf-8 -*-
# Print the items of RankList("025out.txt") as "key:value" lines in ascending
# order of value, and save the same lines to 027out.txt.
# NOTE(review): RankList comes from the project module f010; this script only
# relies on its result having .items() with integer-formattable values.
from f010 import RankList

# The output file used to be opened but never written or closed; the context
# manager closes it and the lines are now stored there as well.
with open("027out.txt", "w") as outFile:
    RankedList = RankList("025out.txt")
    # ref: http://blog.livedoor.jp/yawamen/archives/51492355.html
    for key, value in sorted(RankedList.items(), key=lambda item: item[1]):
        entry = "%s:%d" % (key, value)
        print(entry)
        outFile.write(entry + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
def Ngram(Content, N):
    """Count the character N-grams in the lines of *Content*.

    Args:
        Content: iterable of strings, e.g. an open text file.  One trailing
            newline per line is ignored, so it never becomes part of an
            N-gram.
        N: length of each N-gram in characters.

    Returns:
        A dict mapping every N-character substring to the number of times it
        occurs.  The first occurrence counts as 1.
    """
    FreqNgramList = {}
    for line in Content:
        if line.endswith("\n"):
            line = line[:-1]
        # line[i:i + N] covers characters i .. i+N-1, so the last valid start
        # index is len(line) - N; shorter lines contribute nothing.
        for i in range(len(line) - N + 1):
            word = line[i:i + N]
            FreqNgramList[word] = FreqNgramList.get(word, 0) + 1
    return FreqNgramList
# Count the character bigrams of 025out.txt and print them as "bigram:count"
# lines, least frequent first.
N = 2
# The context manager closes the file even if Ngram raises.
with open("025out.txt", "r") as Content:
    FreqBigramList = Ngram(Content, N)
for key, value in sorted(FreqBigramList.items(), key=lambda x: x[1]):
    print(r"%s:%d" % (key, value))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- coding:utf-8 -*-
# Smoke test for the stemming package: print the Porter2 stem of one word.
from stemming.porter2 import stem

print(stem("factionally"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- coding:utf-8 -*-
# Read "word<TAB>lowered" lines from 025out.txt and write them to 030out.txt
# with the stem of the lowered word appended as a third tab-separated column.
from stemming.porter2 import stem
import re

# A line is kept only if it consists of exactly two runs of word characters
# separated by a single tab; every other line is silently skipped.
_PAIR_LINE = re.compile(r"^(\w*)\t(\w*)$")

# The context managers close both files even if stemming or writing raises.
with open("025out.txt", "r") as inFile:
    with open("030out.txt", "w") as outFile:
        for Line in inFile:
            Words = _PAIR_LINE.match(Line.strip("\n"))
            if Words:
                original, lowered = Words.groups()
                outFile.write(original + "\t" + lowered + "\t" + stem(lowered) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment