Skip to content

Instantly share code, notes, and snippets.

@HiroshiMatsumoto
Created December 30, 2012 16:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HiroshiMatsumoto/4413772 to your computer and use it in GitHub Desktop.
Save HiroshiMatsumoto/4413772 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# -*- mode: python; coding: utf-8 -*-
import sys


def split_on_periods(text):
    """Split *text* into sentences, treating every "." as a sentence end.

    Each piece between periods is stripped of surrounding whitespace
    (this also removes the trailing newline of an input line) and gets its
    period re-attached.  Pieces that are empty after stripping are dropped,
    so empty input, or the remainder after the final period, no longer
    yields a line consisting of just ".".

    Args:
        text: The raw input string.

    Returns:
        A list of sentences, each ending in ".".
    """
    sentences = []
    for fragment in text.split("."):
        # Remove the leading blank left over from the previous sentence end.
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment + ".")
    return sentences


if __name__ == "__main__":
    # As in the original script, only the first line of standard input
    # is consumed.
    for sentence in split_on_periods(sys.stdin.readline()):
        print(sentence)
#! /usr/bin/env python
# -*- mode: python; coding: utf-8 -*-
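# Task: read English text from standard input, treat "period, space, uppercase letter" as a sentence boundary, and write the text to standard output one sentence per line.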
import sys
import re

# A sentence boundary is whitespace that follows a period and precedes an
# uppercase letter.  The lookarounds keep the period with the sentence it
# ends and the capital with the sentence it starts.
SENTENCE_BOUNDARY = re.compile(r"(?<=\.)\s+(?=[A-Z])")


def split_sentences(text):
    """Split *text* wherever a period is followed by whitespace and a capital.

    The original script split on "." first and then tried to match
    "\\.\\s[A-Z]" at the start of each fragment; since no fragment could
    still begin with a period, every match was None.

    Args:
        text: The raw input string; surrounding whitespace is ignored.

    Returns:
        A list of sentences in input order, or an empty list when *text*
        contains nothing but whitespace.
    """
    stripped = text.strip()
    if not stripped:
        return []
    return SENTENCE_BOUNDARY.split(stripped)


if __name__ == "__main__":
    # As in the original script, only the first line of standard input
    # is consumed.
    for sentence in split_sentences(sys.stdin.readline()):
        print(sentence)
#! /usr/bin/env python
# -*- mode: python; coding: utf-8 -*-
import re
import sys

# One token is either a single bracket character or a run that starts with
# any character other than a double quote, period or newline and continues
# with word characters, apostrophes or hyphens.  Optional leading whitespace
# and comma, and one optional trailing whitespace/period/comma, are consumed
# but not captured.  Same pattern as before, now spelled as a raw string.
TOKEN_PATTERN = re.compile(
    r"\s*,?([\(\)\<\>\{\}]|[^\".\n][\w'-]*)[\s\.,]?"
)


def tokenize(line):
    """Return the list of tokens that TOKEN_PATTERN captures in *line*."""
    return TOKEN_PATTERN.findall(line)


def tokenize_file(in_path, out_path):
    """Tokenize *in_path* line by line, echoing the result to stdout.

    Every token is written on its own line, and each input line (including
    one that yields no tokens) is followed by one empty line.  The same
    text goes to standard output and to *out_path*.

    Args:
        in_path: Path of the text file to read.
        out_path: Path of the file to create or overwrite.
    """
    # "with" closes both files even if reading or writing fails midway.
    with open(in_path, "r") as source, open(out_path, "w") as sink:
        for line in source:
            tokens = tokenize(line.strip("\n"))
            chunk = "".join(token + "\n" for token in tokens) + "\n"
            sys.stdout.write(chunk)
            sink.write(chunk)


if __name__ == "__main__":
    # The original also carried "test2.txt" as a commented-out alternative
    # input file.
    tokenize_file("j98_1002.txt", "024out.txt")
#!/usr/bin/env python
import re
import sys

# Matches lines made up of whitespace only, including a bare "\n".  The
# previous pattern "^\s+\n$" needed at least one whitespace character
# before the newline, so truly empty lines slipped through and were
# written out as "\t\n" rows.
BLANK_LINE = re.compile(r"^\s*$")


def lowercase_pair(raw_line):
    """Build the output row for one input line.

    Args:
        raw_line: One line as read from the input file, usually ending
            in "\\n".

    Returns:
        "<word>\\t<lowercased word>\\n", or None when *raw_line* holds
        only whitespace.
    """
    if BLANK_LINE.match(raw_line):
        return None
    word = raw_line.strip("\n")
    return word + "\t" + word.lower() + "\n"


def write_lowercase_pairs(in_path, out_path):
    """Copy every non-blank line of *in_path* to *out_path* as a pair.

    Each row is also echoed to standard output.

    Args:
        in_path: Path of the file to read, one word per line.
        out_path: Path of the file to create or overwrite.
    """
    # "with" closes both files even if reading or writing fails midway.
    with open(in_path, "r") as source, open(out_path, "w") as sink:
        for raw_line in source:
            row = lowercase_pair(raw_line)
            if row is None:
                continue
            sys.stdout.write(row)
            sink.write(row)


if __name__ == "__main__":
    write_lowercase_pairs("024out.txt", "025out.txt")
#!/usr/bin/env python
import re

# "<original>\t<lowered>"; the greedy first group means the split happens
# at the last tab on the line.
PAIR_PATTERN = re.compile(r"^(.*)\t(.*)$")


def lowered_words(lines):
    """Collect the second (lowercased) column of tab-separated lines.

    Lines without a tab are skipped; the original script raised
    AttributeError on them because the failed match was used unchecked.

    Args:
        lines: An iterable of strings such as an open text file.

    Returns:
        A set of the distinct second-column values.
    """
    words = set()
    for line in lines:
        pair = PAIR_PATTERN.match(line)
        if pair is None:
            continue
        words.add(pair.group(2))
    return words


def stems_with_ly_and_ness(words):
    """Find stems that occur with both an "-ly" and a "-ness" ending.

    Args:
        words: An iterable of words.

    Returns:
        A sorted list of stems X such that both X + "ly" and X + "ness"
        are in *words*.  Sorting makes the output order reproducible.
    """
    unique_words = set(words)
    ly_stems = {word[:-2] for word in unique_words if word.endswith("ly")}
    ness_stems = {word[:-4] for word in unique_words if word.endswith("ness")}
    return sorted(ly_stems & ness_stems)


def report_ly_ness_stems(in_path, out_path):
    """Print the shared -ly/-ness stems of *in_path* and save them.

    The original opened the output file but never wrote to it; the stems
    are now written there, one per line, in addition to standard output.

    Args:
        in_path: Path of a file with "<original>\\t<lowered>" lines.
        out_path: Path of the file to create or overwrite.
    """
    with open(in_path, "r") as in_file:
        words = lowered_words(in_file)
    with open(out_path, "w") as out_file:
        for stem_text in stems_with_ly_and_ness(words):
            print(stem_text)
            out_file.write(stem_text + "\n")


if __name__ == "__main__":
    # The original read "test2.txt" and carried "025out.txt" as a
    # commented-out alternative input.
    report_ly_ness_stems("test2.txt", "026out.txt")
#!/usr/bin/env python
#-*- coding: utf-8 -*-
def format_ranking(frequencies):
    """Render a mapping of key -> count as "key:count" strings.

    Args:
        frequencies: A dict-like object whose values are integers.

    Returns:
        A list of "key:count" strings ordered by ascending count.
    """
    # ref: http://blog.livedoor.jp/yawamen/archives/51492355.html
    ordered = sorted(frequencies.items(), key=lambda item: item[1])
    return ["%s:%d" % (key, count) for key, count in ordered]


def main():
    """Print the ranking built from 025out.txt and save it to 027out.txt.

    The original opened 027out.txt but never wrote to it or closed it; the
    ranking is now written there as well as to standard output.
    """
    # Imported here so format_ranking stays importable without the
    # project-local f010 module.
    from f010 import RankList

    # NOTE(review): RankList is assumed to return a dict-like mapping of
    # item -> integer count, as the .items() / "%d" usage implies; confirm
    # against f010.
    ranking = RankList("025out.txt")
    with open("027out.txt", "w") as out_file:
        for entry in format_ranking(ranking):
            print(entry)
            out_file.write(entry + "\n")


if __name__ == "__main__":
    main()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
def Ngram(Content, N):
    """Count how often each N-character substring occurs in Content.

    Args:
        Content: An iterable of strings, for example an open text file
            (which yields one line at a time).
        N: Length of the substrings to count; expected to be positive.

    Returns:
        A dict mapping each N-gram to its number of occurrences.  N-grams
        never span two lines, and the newline that ends a line is not part
        of any N-gram.
    """
    FreqNgramList = {}
    for line in Content:
        # Drop the line terminator explicitly.  The original relied on an
        # off-by-one bound to skip it, which also lost the last N-gram of
        # any line that did not end in a newline.
        line = line.rstrip("\n")
        # line[i:i + N] is the substring from index i through i + N - 1,
        # so the last valid start index is len(line) - N.
        for i in range(len(line) - N + 1):
            word = line[i:i + N]
            # The first sighting counts as 1 (the original stored 0).
            FreqNgramList[word] = FreqNgramList.get(word, 0) + 1
    return FreqNgramList
# Count character bigrams in 025out.txt and print them as "bigram:count",
# least frequent first.
N = 2
# "with" closes the file even if Ngram raises while reading it.
with open("025out.txt", "r") as Content:
    FreqBigramList = Ngram(Content, N)
for key, value in sorted(FreqBigramList.items(), key=lambda x: x[1]):
    print("%s:%d" % (key, value))
#! /usr/bin/env python
#-*- coding:utf-8 -*-
from stemming.porter2 import stem

# Smoke test for the stemming package: print the stem of one sample word.
print(stem("factionally"))
#! /usr/bin/env python
#-*- coding:utf-8 -*-
import re

# "<original>\t<lowered>" where both columns consist of word characters
# only; rows containing anything else (apostrophes, hyphens, brackets)
# do not match and are skipped.
PAIR_PATTERN = re.compile(r"^(\w*)\t(\w*)$")


def stemmed_row(line, stemmer):
    """Append the stem of the second column to a two-column row.

    Args:
        line: One input line, with or without its trailing newline.
        stemmer: A callable mapping a word to its stem.

    Returns:
        "<original>\\t<lowered>\\t<stem>\\n", or None when *line* does not
        have the expected two-column shape.
    """
    pair = PAIR_PATTERN.match(line.strip("\n"))
    if pair is None:
        return None
    original, lowered = pair.groups()
    return "\t".join((original, lowered, stemmer(lowered))) + "\n"


def write_stemmed_rows(in_path, out_path):
    """Write every matching row of *in_path*, plus its stem, to *out_path*.

    Args:
        in_path: Path of a file with "<original>\\t<lowered>" lines.
        out_path: Path of the file to create or overwrite.
    """
    # Imported here so stemmed_row stays importable without the
    # third-party stemming package.
    from stemming.porter2 import stem

    # "with" closes both files even if the stemmer or a write fails.
    with open(in_path, "r") as in_file, open(out_path, "w") as out_file:
        for line in in_file:
            row = stemmed_row(line, stem)
            if row is not None:
                out_file.write(row)


if __name__ == "__main__":
    write_stemmed_rows("025out.txt", "030out.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment