Created
May 10, 2012 10:06
-
-
Save bluele/2652264 to your computer and use it in GitHub Desktop.
Generate Japanese sentences from a corpus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding:utf-8 -*- | |
__doc__ = u""" | |
指定したコーパスから文章を生成します | |
ファイルを読み込んでsplit /[、。]/で分割 | |
""" | |
__usage__ = u""" | |
USAGE: | |
$ python generate_sentance.py <source directory> <output directory> | |
""" | |
import sys | |
import os | |
import re | |
import random | |
import sgmllib | |
import urllib2 | |
try: | |
import cStringIO as StringIO | |
except: | |
import StringIO | |
noise_data = None | |
_pattern = dict() | |
def setup(): | |
global noise_data | |
# noise生成時に使用するデータリスト | |
noise_data = [unichr(i) for i in xrange(ord(u"あ"), ord(u"ん"))] | |
# 文書の構造解析の区切りパターン | |
_pattern["separator"] = re.compile(ur"[、。 ]", re.I) | |
_pattern['html_body'] = re.compile(ur"<body[^>]*>(.+)</body>", re.I) | |
_pattern.setdefault("html_tag", re.compile(ur"<.*?>", re.I | re.S)) | |
_pattern.setdefault("tospace", re.compile(ur"(?:\t|\s| )+")) | |
class ApplicationError(Exception):
    ''' Application-level error raised for usage and decoding failures. '''
class Stripper(sgmllib.SGMLParser):
    ''' SGML parser that strips tags from HTML and normalizes whitespace. '''

    @staticmethod
    def normalize_line(text, line=u"\n"):
        ''' Normalize line endings: \r\n, \r and \n all become *line*.

        Fix: the original replaced bare \n with a space regardless of the
        *line* argument; it now honours *line* for all three forms.
        (Output of strip() is unchanged: normalize_space collapses \n to
        a space anyway.)
        '''
        return text.replace(u'\r\n', line).replace(u'\r', line).replace(u'\n', line)

    @staticmethod
    def normalize_space(text):
        ''' Collapse runs of whitespace into a single ASCII space. '''
        return _pattern['tospace'].sub(u" ", text)

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)

    def strip(self, html):
        ''' Parse *html* and return its text content, whitespace-normalized.

        @param html: str: raw HTML document
        @return: unicode-or-str text with tags removed
        '''
        # Fresh buffer per call; handle_data appends text nodes into it.
        self.theString = StringIO.StringIO()
        self.feed(html)
        self.close()
        self.theString.seek(0)
        return self.normalize_space(self.normalize_line(self.theString.read()))

    def handle_data(self, data):
        # SGMLParser callback: invoked for each text node between tags.
        self.theString.write(data)
class HTMLReader():
    ''' Fetches or accepts HTML documents and strips them to plain text. '''

    def __init__(self):
        self.stripper = Stripper()

    def analyze_url(self, url):
        '''
        Fetch the document at the given URL and strip its markup.
        @param url: URL
        @return: str
        '''
        con = urllib2.urlopen(url)
        try:
            return self.stripper.strip(con.read())
        finally:
            # Always release the network connection (the original leaked it).
            con.close()

    def analyze_text(self, text):
        '''
        Strip markup from an in-memory HTML document.
        @param text: str or unicode HTML
        @return: stripped text
        '''
        return self.stripper.strip(text)
def guess_decode(text, encoding_list=("utf8", "shift-jis", "euc-jp")):
    '''
    Decode *text* to unicode, trying each candidate encoding in turn.
    (The original docstring said "encode"; this function decodes.)
    @param text: str or unicode
    @param encoding_list: tuple of candidate codec names
    @return: unicode
    @raise ApplicationError: when no candidate encoding succeeds
    '''
    if isinstance(text, unicode):
        # Already decoded; return as-is.
        return text
    for ecode in encoding_list:
        try:
            return text.decode(ecode)
        # Narrowed from a bare except: only a failed or unknown codec
        # means "try the next candidate"; other errors now propagate.
        except (UnicodeError, LookupError):
            pass
    raise ApplicationError("Can't decode text.")
class Generater(): | |
file_template = u"doc_%d.txt" | |
separator = (u"、", u"。") | |
def __init__(self, source_path, output_path, noise=0): | |
''' | |
@param dir_path: str | |
''' | |
self.hreader = HTMLReader() | |
self.source_path = source_path | |
self.output_path = output_path | |
self.data = list() | |
self.noise = noise | |
self.load_corpus_directory() | |
def load_corpus_directory(self): | |
''' | |
指定したディレクトリからコーパスを読み込みます | |
''' | |
root, _, flist = os.walk(self.source_path).next() | |
for fname in flist: | |
try: | |
file_path = os.path.join(root, fname) | |
self.load_corpus_file(file_path) | |
except: | |
print u"IOError: %s" % os.path.join(root, fname) | |
def load_corpus_file(self, file_path): | |
''' | |
指定したファイルを解析します | |
@param file_path: str | |
''' | |
with open(file_path, "rb") as f: | |
text = guess_decode(f.read()) | |
if file_path.endswith(".html"): | |
result = self.analyze_html(text) | |
else: | |
result = self.analyze_text(text) | |
self.data.extend(result) | |
@staticmethod | |
def analyze_html(text): | |
''' | |
指定したHTMLを解析して、文節ごとに分解したリストにして返します | |
@param text: unicode | |
@return: list | |
''' | |
return _pattern["separator"].split(self.hreader.analyze_text(text)) | |
@staticmethod | |
def analyze_text(text): | |
''' | |
指定したテキストを解析して、文節ごとに分解したリストにして返します | |
@param text: unicode | |
@return: list | |
''' | |
return _pattern["separator"].split(text) | |
def generate(self, file_num=10, token_num=100): | |
''' | |
保持しているデータから文書を新たに生成します | |
@param file_num: int: 生成する文書数 | |
@param token_num: int: 文書ごとの文節の数 | |
''' | |
for i in xrange(file_num): | |
with open(os.path.join(self.output_path, self.file_template % i), "wb") as f: | |
for _ in xrange(token_num): | |
data = self.make_noise(random.choice(self.data), self.noise) | |
separator = random.choice(self.separator) | |
text = data + separator | |
f.write(text.encode("utf8")) | |
@staticmethod | |
def make_noise(data, num=1, is_shuffle=False): | |
''' | |
指定したデータにnoiseを挿入します | |
@param data: unicode 対象の文字列 | |
@param num: int | |
@return: unicode | |
''' | |
# 空文字列はそのまま返す | |
if len(data) <= 0: | |
return data | |
string_list = list(data) | |
if is_shuffle: | |
random.shuffle(string_list) | |
for _ in xrange(num): | |
idx = random.randint(0, len(string_list)-1) | |
string_list[idx] = random.choice(noise_data) | |
return u"".join(string_list) | |
def main():
    ''' Command-line entry point: validate arguments and run generation. '''
    setup()
    args = sys.argv[1:]
    if len(args) < 2:
        raise ApplicationError(__usage__)
    source = os.path.abspath(args[0])
    output = os.path.abspath(args[1])
    if not (os.path.isdir(source) and os.path.isdir(output)):
        raise ApplicationError(__usage__)
    Generater(source, output, 2).generate(file_num=10, token_num=30)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment