Skip to content

Instantly share code, notes, and snippets.

@shunsukeaihara shunsukeaihara/crawl.py
Last active Apr 22, 2016

Embed
What would you like to do?
rongorongoのThomas Barthel's Transliteration Systemのデータをクローリングしてファイルに保存したり、文字の正規化や分解を行うスクリプト。詳細は以下 http://argmax.jp/index.php?ron
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import re
# Page template; %s is replaced by a single lowercase letter, 'a' through 'z'.
URL = "http://kohaumotu.org/rongorongo_org/translit/%s.html"

# Marker stripped from the start of each entry: one uppercase letter, one
# lowercase letter and two digits (presumably the tablet/line label, e.g.
# "Aa01" -- confirm against the site).
LINE_LABEL = re.compile(r'^[A-Z][a-z]\d\d')

# Fetch every per-letter page and print the text of each <li> element,
# one entry per output line, with the leading label removed.
for code in range(ord('a'), ord('z') + 1):
    response = urllib2.urlopen(URL % chr(code))
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    soup = BeautifulSoup(html)
    for li in soup.findAll('li'):
        # Encode explicitly: str() on a unicode value uses the ASCII codec
        # and raises UnicodeEncodeError on any non-ASCII character.
        line = li.text.encode('utf-8').strip()
        line = LINE_LABEL.sub('', line).strip()
        print(line)
# -*- coding: utf-8 -*-
import sys
import re
import optparse
# Module-level switches read by the normalisation functions below. All are
# off by default and are overwritten from the command line when the file is
# run as a script.

# Emit the components of linked/compound words separately instead of joined.
SPLIT_CHARACTOR = False
# Keep ':' (stack) compounds joined even when SPLIT_CHARACTOR is on.
FORCE_JOIN_STACK = False
# Keep ';' (merge) compounds joined even when SPLIT_CHARACTOR is on.
FORCE_JOIN_MERGE = False
# Keep only the first component of a ';' (merge) compound.
REMOVE_MERGE = False
# Drop the non-digit suffix that follows a glyph number.
REMOVE_SUFFIX = False
# Digits only, e.g. "7".
_NUMERIC_ONLY = re.compile(r'^(\d+)$')
# Digits followed by one or more non-digit characters, e.g. "7a".
_NUMERIC_AND_CHAR = re.compile(r'^(\d+)(\D+)$')


def single_word(word):
    """Normalise a single glyph code.

    The numeric part is zero-padded to at least three digits ("7" -> "007").
    If the number is followed by a non-digit suffix, the suffix is kept
    unless the module-level REMOVE_SUFFIX flag is set. Any word that matches
    neither shape is returned unchanged.

    Args:
        word: one glyph code, without '.', ':' or ';' separators.

    Returns:
        The normalised code as a string.
    """
    m = _NUMERIC_ONLY.match(word)
    if m:
        return "%03d" % int(m.group(1))

    m = _NUMERIC_AND_CHAR.match(word)
    if m:
        numeric = "%03d" % int(m.group(1))
        if REMOVE_SUFFIX:
            return numeric
        return numeric + m.group(2)

    return word
def stack_and_merge(word):
    """Normalise one '.'-free word, handling stack and merge compounds.

    ':' separates the parts of a vertical stack and ';' separates the parts
    of a partial merge. Each part is normalised with single_word().

    ':' is tested first, so a word containing both separators is split on
    ':' only and any ';' stays inside the part handed to single_word().

    Returns:
        A list of strings: the separate parts when SPLIT_CHARACTOR is set
        and the matching FORCE_JOIN_* flag is not, otherwise a one-element
        list holding the re-joined compound (or the single normalised word).
    """
    if ':' in word:
        # Vertical compound (stack).
        stacked = [single_word(part) for part in word.split(':')]
        keep_joined = FORCE_JOIN_STACK or not SPLIT_CHARACTOR
        return [":".join(stacked)] if keep_joined else stacked

    if ';' in word:
        # Partially merged compound (merge).
        merged = [single_word(part) for part in word.split(';')]
        if REMOVE_MERGE:
            # Keep only the first component of the merge.
            merged = merged[:1]
        keep_joined = FORCE_JOIN_MERGE or not SPLIT_CHARACTOR
        return [";".join(merged)] if keep_joined else merged

    return [single_word(word)]
def split_linking(word):
    """Split a '.'-linked word and normalise every piece.

    Each '.'-separated piece is passed to stack_and_merge(); the lists it
    returns are concatenated in order into one flat list.
    """
    return [
        glyph
        for piece in word.split('.')
        for glyph in stack_and_merge(piece)
    ]
def pattern(word):
    """Normalise one hyphen-delimited word and return a list of strings.

    A word without '.' goes straight to stack_and_merge(). A word whose
    parts are linked by '.' is processed by split_linking(); the pieces are
    returned separately when SPLIT_CHARACTOR is set, otherwise re-joined
    with '.' into a one-element list.
    """
    if '.' not in word:
        return stack_and_merge(word)

    pieces = split_linking(word)
    if SPLIT_CHARACTOR:
        return pieces
    return [".".join(pieces)]
def split_line(line):
    """Split a transliteration line into words on top-level hyphens.

    A '-' outside any parentheses ends the current word and is discarded;
    a '-' inside parentheses is kept as part of the word. Empty words
    (from leading, trailing or repeated hyphens) are dropped.

    An unmatched ')' is kept in the output but does not lower the nesting
    depth below zero, so it cannot disable splitting for the rest of the
    line.

    Args:
        line: the text of one line.

    Returns:
        A list of non-empty word strings in their original order.
    """
    words = []
    current = []  # characters of the word being built
    depth = 0     # current parenthesis nesting level
    for ch in line:
        if ch == "-" and depth == 0:
            if current:
                words.append("".join(current))
                current = []
            continue
        if ch == "(":
            depth += 1
        elif ch == ")" and depth > 0:
            depth -= 1
        current.append(ch)
    if current:
        words.append("".join(current))
    return words
if __name__ == "__main__":
    # Command-line entry point: read lines from stdin, normalise every
    # hyphen-delimited word and print one result per output line.
    parser = optparse.OptionParser()
    # (short flag, long flag, destination attribute, help text)
    flag_specs = [
        ('-s', '--splite', 'split_charactor', "SPLIT CHARACTOR"),
        ('-r', '--rsuffix', 'remove_suffix', "REMOVE SUFFIX"),
        ('-m', '--rmerge', 'remove_merge', "REMOVE MERGE"),
        ('-f', '--fjoinstack', 'force_join_stack',
         "FORCE JOIN STACK IF SET SPLITE CHARACTOR"),
        ('-j', '--fjoinmerge', 'force_join_merge',
         "FORCE JOIN MERGE IF SET SPLITE CHARACTOR"),
    ]
    for short_flag, long_flag, dest_name, help_text in flag_specs:
        parser.add_option(short_flag, long_flag, action="store_true",
                          dest=dest_name, help=help_text, default=False)
    opts, args = parser.parse_args()

    # Publish the parsed options through the module-level flags that the
    # normalisation functions read.
    SPLIT_CHARACTOR = opts.split_charactor
    REMOVE_MERGE = opts.remove_merge
    REMOVE_SUFFIX = opts.remove_suffix
    FORCE_JOIN_MERGE = opts.force_join_merge
    FORCE_JOIN_STACK = opts.force_join_stack

    for raw_line in sys.stdin:
        for token in split_line(raw_line.strip()):
            token = token.strip()
            if token == "":
                continue
            for glyph in pattern(token):
                print(glyph)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.