Skip to content

Instantly share code, notes, and snippets.

@yingminc
Created June 30, 2017 05:29
Show Gist options
  • Save yingminc/33607c38789903187b98aa241f187f95 to your computer and use it in GitHub Desktop.
Save yingminc/33607c38789903187b98aa241f187f95 to your computer and use it in GitHub Desktop.
prepare english/japanese tokens for word2vec (use mecab for japanese segmentation)
#-*- encoding: utf-8 -*-
import MeCab
import sys
import codecs
import argparse
import re, string
parser =argparse.ArgumentParser()
parser.add_argument('input_dict', help = 'the file of input dictionary')
parser.add_argument('--utf8', help = 'switch to utf-8 mode', action = 'store_true')
args = parser.parse_args()
index = args.input_dict.find('.txt')
output_file = args.input_dict[:index] +'_seg'+ args.input_dict[index:]
input_file = open(args.input_dict, 'r')
#load txt file and remove duplicate
if args.utf8:
lines = input_file.read().decode('utf-8').split(u'\n')
lines = list(set(lines))
else:
lines = input_file.read().split('\n')
lines = list(set(lines))
#mecab for japanese segmentation
if args.utf8: \
#remove punctuation
jpun = re.compile(u'^[\u3000-\u303f\uff00-\uff65]$')
mt = MeCab.Tagger('-F\s%f[6] -U\s%m -E\\n')
line_num = 0
for line in lines:
result = mt.parse(line.encode('utf-8'))
for words in result.decode('utf-8').split('\n'):
for char in words:
if jpun.match(char) != None:
words = words.replace(' '+char, '')
with codecs.open(output_file, 'a', 'utf-8') as output_txt:
output_txt.write(words+' ')
with codecs.open(output_file, 'a', 'utf-8') as output_txt:
output_txt.write('\n')
line_num += 1
if line_num % 100 ==0:
print 'line ', line_num, " processed"
else: #english text
#remove punctuation
pl = list(set(string.punctuation))
pl.remove('-')
pl.extend(['\n', '\r'])
print pl
for line in lines:
nline = []
line_num = 0
for ch in line:
if ch not in pl:
nline.append(ch)
line = ''.join(ch for ch in nline)
line = line.replace('--', ' ')
with open(output_file, 'a') as output_txt:
output_txt.write(line+'\n')
line_num += 1
if line_num % 100 ==0:
print 'line ', line_num, " processed"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment