Skip to content

Instantly share code, notes, and snippets.

@shiwano
Created March 1, 2018 03:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shiwano/2b7970933d4fd151830871f63dea9163 to your computer and use it in GitHub Desktop.
Save shiwano/2b7970933d4fd151830871f63dea9163 to your computer and use it in GitHub Desktop.
Japanese Wikipedia corpus modification tools
import glob
import os
import re
import unicodedata
import codecs
import MeCab
class Sentence:
    """Flattened view of one morphological-analysis result.

    The constructor walks a singly linked list of nodes (each exposing
    ``surface``, ``feature`` and ``next`` attributes) and copies the
    surface and feature of every node into two parallel lists.

    Attributes:
        root: The node passed to the constructor (may be ``None``).
        surfaces: ``surface`` of every visited node, in list order.
        features: ``feature`` of every visited node, in list order.
    """

    def __init__(self, root):
        """Collect surfaces and features starting from *root*.

        Args:
            root: First node of the linked list, or a falsy value for an
                empty sentence.
        """
        self.root = root
        self.surfaces = []
        self.features = []
        # A falsy root simply skips the loop, leaving both lists empty.
        node = root
        while node:
            self.surfaces.append(node.surface)
            self.features.append(node.feature)
            node = node.next

    def all_words(self):
        """Yield ``(surface, feature)`` pairs for every collected node."""
        yield from zip(self.surfaces, self.features)

    def word_count(self):
        """Return the number of collected nodes, empty surfaces included."""
        return len(self.surfaces)

    def to_wakati(self):
        """Return the non-empty surfaces joined by single spaces.

        Nodes with an empty surface are skipped (presumably MeCab's
        BOS/EOS sentinel nodes -- confirm against the tagger in use).
        """
        return ' '.join(surface for surface in self.surfaces if surface)
def get_line_generator(wiki_root_dir='./extracted'):
    """Yield cleaned text lines from every file matching ``wiki_root_dir/*/*``.

    Lines starting with ``<doc`` increment a document counter and lines
    starting with ``</doc`` print a progress message; neither is yielded.
    Every other line is NFKC-normalised, stripped of parentheses that
    contain only punctuation/whitespace, has its Japanese corner brackets
    replaced by curly double quotes, and is then yielded (including its
    trailing newline, if any).

    Args:
        wiki_root_dir: Directory whose second-level entries are the input
            files. Defaults to ``'./extracted'``.

    Yields:
        str: One cleaned line per non-marker input line.
    """
    empty_parens = re.compile(r'[(\(][,;。?!\s]*[)\)]')
    opening_quote = re.compile(r'[「『]')
    closing_quote = re.compile(r'[」』]')

    # glob returns paths in arbitrary order; sort so the corpus is
    # reproducible, and skip directories, which open() cannot read.
    candidates = glob.glob(os.path.join(wiki_root_dir, '*/*'))
    fnames = sorted(path for path in candidates if os.path.isfile(path))

    count = 0
    for fname in fnames:
        with open(fname, encoding='utf-8') as lines:
            for line in lines:
                if line.startswith('<doc'):
                    count += 1
                elif line.startswith('</doc'):
                    print('Document: ' + str(count) + ', ' + fname)
                else:
                    # Normalise first so full-width parentheses, punctuation
                    # and half-width corner brackets are folded into the
                    # forms the patterns below actually match.
                    line = unicodedata.normalize('NFKC', line)
                    line = empty_parens.sub('', line)
                    line = opening_quote.sub('“', line)
                    line = closing_quote.sub('”', line)
                    yield line
def main():
    """Tokenise every corpus line with MeCab and write it space-separated.

    Reads lines from ``get_line_generator()``, parses each with a MeCab
    tagger using the mecab-ipadic-neologd dictionary at the hard-coded
    path below, and writes each non-empty tokenised line to
    ``spaced/spaced.txt`` followed by a newline. Prints ``finish`` when done.
    """
    tagger = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
    # Warm-up parse kept from the original, whose comment says it is there
    # "to prevent GC" (presumably a mecab-python binding workaround --
    # confirm before removing).
    tagger.parseToNode('')

    out_path = 'spaced/spaced.txt'
    # Create the output directory up front so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(out_path, 'w', encoding='utf-8') as out:
        for line in get_line_generator():
            wakati = Sentence(tagger.parseToNode(line)).to_wakati()
            if wakati:
                # Terminate each sentence; without a separator the last token
                # of one line is glued to the first token of the next.
                out.write(wakati + '\n')
    print('finish')
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment