shunsukeaihara/crawl.py

## crawl.py
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib2
import re


URL = "http://kohaumotu.org/rongorongo_org/translit/%s.html"
# Prefix stripped from the start of every list item: one uppercase letter, one
# lowercase letter and two digits (e.g. "Aa01").  Presumably this is the line
# label of the transliteration -- TODO confirm against the site.
LINE_PREFIX = re.compile(r'^[A-Z][a-z]\d\d')

# Fetch the pages a.html .. z.html (97..122 are the ASCII codes of 'a'..'z')
# and print the text of every <li> element, one item per output line.
for i in range(97, 123):
    url = URL % chr(i)
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    s = BeautifulSoup(html)
    for li in s.findAll('li'):
        # BeautifulSoup hands the text back as unicode; encode it explicitly
        # because str() uses the ASCII codec and raises UnicodeEncodeError as
        # soon as a non-ASCII character shows up.
        line = li.text.encode('utf-8').strip()
        line = LINE_PREFIX.sub('', line).strip()
        # Parenthesised single argument prints identically under Python 2.
        print(line)

## split.py
# -*- coding: utf-8 -*-
import sys
import re
import optparse

# Module-level switches read by the functions below.  These are the defaults;
# they are overwritten from the command-line options when the file is run as
# a script.
# Drop the non-digit suffix that follows a numeric code (see single_word).
REMOVE_SUFFIX = False
# Keep only the first component of a ';'-joined (merged) compound.
REMOVE_MERGE = False
# Return the components of compounds as separate items instead of re-joining.
SPLIT_CHARACTOR = False
# With SPLIT_CHARACTOR set, still keep ':'-joined (stacked) compounds joined.
FORCE_JOIN_STACK = False
# With SPLIT_CHARACTOR set, still keep ';'-joined (merged) compounds joined.
FORCE_JOIN_MERGE = False

# Compiled once at import time instead of on every call.  Raw strings keep the
# regex escapes (\d, \D) from being interpreted as string escapes.
_NUMERIC_ONLY = re.compile(r'^(\d+)$')
_NUMERIC_WITH_SUFFIX = re.compile(r'^(\d+)(\D+)$')


def single_word(word):
    """Normalize a single code by zero-padding its number to three digits.

    * Digits only ("7") becomes "007".
    * Digits followed by one or more non-digit characters ("7a") becomes
      "007a", or just "007" when REMOVE_SUFFIX is set.
    * Anything else is returned unchanged.

    Numbers that already have more than three digits are not truncated.
    """
    m = _NUMERIC_ONLY.match(word)
    if m:
        return "%03d" % int(m.group(1))
    m = _NUMERIC_WITH_SUFFIX.match(word)
    if m:
        numeric = "%03d" % int(m.group(1))
        return numeric if REMOVE_SUFFIX else numeric + m.group(2)
    return word


def stack_and_merge(word):
    """Normalize one item that has already been split on '.'.

    A word containing ':' is treated as a stacked (vertically joined)
    compound; otherwise a word containing ';' is treated as a merged
    (partially joined) compound.  Every component is normalized with
    single_word().  The result is always a list: the separate components when
    SPLIT_CHARACTOR is set and the matching FORCE_JOIN_* switch is not,
    otherwise a single string re-joined with the original separator.  With
    REMOVE_MERGE set, only the first component of a merged compound is kept.
    """
    # NOTE(review): a word holding both ':' and ';' is split on ':' only; the
    # ';' parts then reach single_word() as-is -- confirm this is intended.
    if ':' in word:
        parts = [single_word(piece) for piece in word.split(':')]
        keep_joined = FORCE_JOIN_STACK or not SPLIT_CHARACTOR
        return [":".join(parts)] if keep_joined else parts
    if ';' in word:
        parts = [single_word(piece) for piece in word.split(';')]
        if REMOVE_MERGE:
            parts = parts[:1]
        keep_joined = FORCE_JOIN_MERGE or not SPLIT_CHARACTOR
        return [";".join(parts)] if keep_joined else parts
    return [single_word(word)]


def split_linking(word):
    """Split a '.'-linked word and normalize each linked item.

    Every piece between periods is passed to stack_and_merge(); the lists it
    returns are concatenated, in order, into one flat list.
    """
    return [item
            for linked in word.split('.')
            for item in stack_and_merge(linked)]

def pattern(word):
    """Normalize one word and return the result as a list of strings.

    A word without a period is handed straight to stack_and_merge().  A
    period-linked word goes through split_linking(); its items are returned
    separately when SPLIT_CHARACTOR is set, and re-joined with '.' into a
    single-element list otherwise.
    """
    if '.' not in word:
        return stack_and_merge(word)
    linked = split_linking(word)
    if not SPLIT_CHARACTOR:
        return [".".join(linked)]
    return linked

def split_line(line):
    """Split a line on hyphens that are not enclosed in parentheses.

    A running parenthesis depth is kept: '(' raises it and ')' lowers it.  A
    '-' seen at depth zero ends the current piece; at any other depth it is
    kept as part of the piece.  Empty pieces are dropped, and nothing is
    stripped from the pieces that are returned.
    """
    pieces = []
    current = []
    depth = 0
    for ch in line:
        if ch == "-" and depth == 0:
            # Top-level hyphen: close the piece collected so far.
            if current:
                pieces.append("".join(current))
            current = []
            continue
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        current.append(ch)
    if current:
        pieces.append("".join(current))
    return pieces

if __name__ == "__main__":
    # Script mode: read lines from stdin, split each one on top-level hyphens,
    # normalize every word and print one resulting item per output line.

    # (short flag, long flag, destination attribute, help text)
    switches = (
        ('-s', '--splite', 'split_charactor', "SPLIT CHARACTOR"),
        ('-r', '--rsuffix', 'remove_suffix', "REMOVE SUFFIX"),
        ('-m', '--rmerge', 'remove_merge', "REMOVE MERGE"),
        ('-f', '--fjoinstack', 'force_join_stack', "FORCE JOIN STACK IF SET SPLITE CHARACTOR"),
        ('-j', '--fjoinmerge', 'force_join_merge', "FORCE JOIN MERGE IF SET SPLITE CHARACTOR"),
    )
    p = optparse.OptionParser()
    for short_flag, long_flag, dest, text in switches:
        p.add_option(short_flag, long_flag, action="store_true", dest=dest, help=text, default=False)
    opts, args = p.parse_args()

    # Publish the parsed switches through the module-level flags that the
    # normalization functions read.
    SPLIT_CHARACTOR = opts.split_charactor
    REMOVE_MERGE = opts.remove_merge
    REMOVE_SUFFIX = opts.remove_suffix
    FORCE_JOIN_MERGE = opts.force_join_merge
    FORCE_JOIN_STACK = opts.force_join_stack

    for line in sys.stdin:
        for w in split_line(line.strip()):
            w = w.strip()
            if not w:
                continue
            for x in pattern(w):
                # Parenthesised single argument prints identically under Python 2.
                print(x)
	# -*- coding: utf-8 -*-
	from BeautifulSoup import BeautifulSoup
	import urllib2
	import re


	URL = "http://kohaumotu.org/rongorongo_org/translit/%s.html"
	# Prefix stripped from the start of every list item: one uppercase letter,
	# one lowercase letter and two digits (e.g. "Aa01").  Presumably this is the
	# line label of the transliteration -- TODO confirm against the site.
	LINE_PREFIX = re.compile(r'^[A-Z][a-z]\d\d')

	# Fetch the pages a.html .. z.html (97..122 are the ASCII codes of 'a'..'z')
	# and print the text of every <li> element, one item per output line.
	for i in range(97, 123):
	    url = URL % chr(i)
	    response = urllib2.urlopen(url)
	    try:
	        html = response.read()
	    finally:
	        # Release the connection even when the read fails.
	        response.close()
	    s = BeautifulSoup(html)
	    for li in s.findAll('li'):
	        # BeautifulSoup hands the text back as unicode; encode it explicitly
	        # because str() uses the ASCII codec and raises UnicodeEncodeError
	        # as soon as a non-ASCII character shows up.
	        line = li.text.encode('utf-8').strip()
	        line = LINE_PREFIX.sub('', line).strip()
	        # Parenthesised single argument prints identically under Python 2.
	        print(line)
	# -*- coding: utf-8 -*-
	import sys
	import re
	import optparse

	# Module-level switches for the normalization code below.  These are the
	# defaults; the script's option handling assigns the parsed command-line
	# values to the same names.
	# Drop the non-digit suffix that follows a numeric code.
	REMOVE_SUFFIX = False
	# Keep only the first component of a ';'-joined (merged) compound.
	REMOVE_MERGE = False
	# Return the components of compounds as separate items instead of re-joining.
	SPLIT_CHARACTOR = False
	# With SPLIT_CHARACTOR set, still keep ':'-joined (stacked) compounds joined.
	FORCE_JOIN_STACK = False
	# With SPLIT_CHARACTOR set, still keep ';'-joined (merged) compounds joined.
	FORCE_JOIN_MERGE = False

	# Compiled once at import time instead of on every call.  Raw strings keep
	# the regex escapes (\d, \D) from being interpreted as string escapes.
	_NUMERIC_ONLY = re.compile(r'^(\d+)$')
	_NUMERIC_WITH_SUFFIX = re.compile(r'^(\d+)(\D+)$')


	def single_word(word):
	    """Normalize a single code by zero-padding its number to three digits.

	    * Digits only ("7") becomes "007".
	    * Digits followed by one or more non-digit characters ("7a") becomes
	      "007a", or just "007" when REMOVE_SUFFIX is set.
	    * Anything else is returned unchanged.

	    Numbers that already have more than three digits are not truncated.
	    """
	    m = _NUMERIC_ONLY.match(word)
	    if m:
	        return "%03d" % int(m.group(1))
	    m = _NUMERIC_WITH_SUFFIX.match(word)
	    if m:
	        numeric = "%03d" % int(m.group(1))
	        return numeric if REMOVE_SUFFIX else numeric + m.group(2)
	    return word


	def stack_and_merge(word):
	    """Normalize one item that has already been split on '.'.

	    A word containing ':' is treated as a stacked (vertically joined)
	    compound; otherwise a word containing ';' is treated as a merged
	    (partially joined) compound.  Every component is normalized with
	    single_word().  The result is always a list: the separate components
	    when SPLIT_CHARACTOR is set and the matching FORCE_JOIN_* switch is not,
	    otherwise a single string re-joined with the original separator.  With
	    REMOVE_MERGE set, only the first component of a merged compound is kept.
	    """
	    # NOTE(review): a word holding both ':' and ';' is split on ':' only; the
	    # ';' parts then reach single_word() as-is -- confirm this is intended.
	    if ':' in word:
	        parts = [single_word(piece) for piece in word.split(':')]
	        keep_joined = FORCE_JOIN_STACK or not SPLIT_CHARACTOR
	        return [":".join(parts)] if keep_joined else parts
	    if ';' in word:
	        parts = [single_word(piece) for piece in word.split(';')]
	        if REMOVE_MERGE:
	            parts = parts[:1]
	        keep_joined = FORCE_JOIN_MERGE or not SPLIT_CHARACTOR
	        return [";".join(parts)] if keep_joined else parts
	    return [single_word(word)]


	def split_linking(word):
	    """Split a '.'-linked word and normalize each linked item.

	    Every piece between periods is passed to stack_and_merge(); the lists
	    it returns are concatenated, in order, into one flat list.
	    """
	    return [item
	            for linked in word.split('.')
	            for item in stack_and_merge(linked)]

	def pattern(word):
	    """Normalize one word and return the result as a list of strings.

	    A word without a period is handed straight to stack_and_merge().  A
	    period-linked word goes through split_linking(); its items are returned
	    separately when SPLIT_CHARACTOR is set, and re-joined with '.' into a
	    single-element list otherwise.
	    """
	    if '.' not in word:
	        return stack_and_merge(word)
	    linked = split_linking(word)
	    if not SPLIT_CHARACTOR:
	        return [".".join(linked)]
	    return linked

	def split_line(line):
	    """Split a line on hyphens that are not enclosed in parentheses.

	    A running parenthesis depth is kept: '(' raises it and ')' lowers it.
	    A '-' seen at depth zero ends the current piece; at any other depth it
	    is kept as part of the piece.  Empty pieces are dropped, and nothing is
	    stripped from the pieces that are returned.
	    """
	    pieces = []
	    current = []
	    depth = 0
	    for ch in line:
	        if ch == "-" and depth == 0:
	            # Top-level hyphen: close the piece collected so far.
	            if current:
	                pieces.append("".join(current))
	            current = []
	            continue
	        if ch == "(":
	            depth += 1
	        elif ch == ")":
	            depth -= 1
	        current.append(ch)
	    if current:
	        pieces.append("".join(current))
	    return pieces

	if __name__ == "__main__":
	    # Script mode: read lines from stdin, split each one on top-level
	    # hyphens, normalize every word and print one resulting item per line.

	    # (short flag, long flag, destination attribute, help text)
	    switches = (
	        ('-s', '--splite', 'split_charactor', "SPLIT CHARACTOR"),
	        ('-r', '--rsuffix', 'remove_suffix', "REMOVE SUFFIX"),
	        ('-m', '--rmerge', 'remove_merge', "REMOVE MERGE"),
	        ('-f', '--fjoinstack', 'force_join_stack', "FORCE JOIN STACK IF SET SPLITE CHARACTOR"),
	        ('-j', '--fjoinmerge', 'force_join_merge', "FORCE JOIN MERGE IF SET SPLITE CHARACTOR"),
	    )
	    p = optparse.OptionParser()
	    for short_flag, long_flag, dest, text in switches:
	        p.add_option(short_flag, long_flag, action="store_true", dest=dest, help=text, default=False)
	    opts, args = p.parse_args()

	    # Publish the parsed switches through the module-level flags that the
	    # normalization functions read.
	    SPLIT_CHARACTOR = opts.split_charactor
	    REMOVE_MERGE = opts.remove_merge
	    REMOVE_SUFFIX = opts.remove_suffix
	    FORCE_JOIN_MERGE = opts.force_join_merge
	    FORCE_JOIN_STACK = opts.force_join_stack

	    for line in sys.stdin:
	        for w in split_line(line.strip()):
	            w = w.strip()
	            if not w:
	                continue
	            for x in pattern(w):
	                # Parenthesised single argument prints identically under Python 2.
	                print(x)