Skip to content

Instantly share code, notes, and snippets.

@huangziwei
Last active April 20, 2016 06:24
Show Gist options
  • Save huangziwei/655e5948a729e376de87 to your computer and use it in GitHub Desktop.
Save huangziwei/655e5948a729e376de87 to your computer and use it in GitHub Desktop.
class getSentences(object):
    """Stream the sentences of every file found directly inside a directory.

    Each line of each file is treated as one paragraph and is split into
    sentences with :meth:`split_paragraph`. Iterating over an instance yields
    the sentences one at a time, so a corpus never has to be held in memory.

    Args:
        dirname: Directory whose entries (``dirname/*``) are read.
        encoding: Text encoding used to decode the files. ``None`` (the
            default) keeps the platform default encoding.
    """

    # Punctuation that ends a sentence, in both ASCII and fullwidth forms.
    _STOPS = frozenset('。!?…!?')
    # Closing punctuation that must stay attached to the stop preceding it.
    _CLOSERS = frozenset('))')
    # A stop followed by one of these does not end the sentence yet, so runs
    # such as "?!" or "……" and a trailing ")" stay in one piece.
    _NO_SPLIT_BEFORE = _STOPS | _CLOSERS
    # Characters dropped before splitting: newlines and curly double quotes.
    _STRIP_TABLE = str.maketrans('', '', '\n“”')

    def __init__(self, dirname, encoding=None):
        self.dirname = dirname
        self.encoding = encoding

    def split_paragraph(self, raw):
        """Split one paragraph into a list of sentences.

        Newlines and curly double quotes are removed first. A sentence ends
        at a stop character unless the next character is another stop or a
        closing parenthesis. Text left over after the last split (including
        a stop that is the final character) becomes the last sentence.

        Args:
            raw: The paragraph text.

        Returns:
            The sentences in order; an empty list when nothing is left
            after stripping.
        """
        text = raw.translate(self._STRIP_TABLE)
        stops = self._STOPS
        no_split_before = self._NO_SPLIT_BEFORE
        last = len(text) - 1

        sents = []
        start = 0
        for i, char in enumerate(text):
            # i < last guarantees text[i + 1] exists; the final character is
            # picked up by the remainder handling below.
            if i < last and char in stops and text[i + 1] not in no_split_before:
                sents.append(text[start:i + 1])
                start = i + 1
        if start < len(text):
            sents.append(text[start:])
        return sents

    def __iter__(self):
        """Yield every sentence of every file in ``self.dirname``.

        Files are visited in sorted path order so that iteration is
        reproducible, and each file is closed as soon as it has been read.
        """
        # Imported here so the class does not rely on a module-level import.
        import glob

        for path in sorted(glob.glob(self.dirname + '/*')):
            with open(path, 'r', encoding=self.encoding) as handle:
                for paragraph in handle:
                    for sentence in self.split_paragraph(paragraph):
                        yield sentence
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment