Creating a Japanese Word2Vec dictionary
from multiprocessing import cpu_count
import MeCab
import wget
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import jaconv
import os
from typing import List
VECTORS_SIZE = 50
wiki_url = 'https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_file_name = 'jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_text_file_name = 'wiki.txt'
token_file = 'tokens.txt'
vector_file = 'ja-MeCab-50.data.model'
## download wiki file:
# this is only one slice of the full dump; you can pick different files from here:
# https://dumps.wikimedia.org/jawiki/latest/
if not os.path.isfile(wiki_file_name):
    wget.download(wiki_url, bar=wget.bar_adaptive)
## read downloaded wiki file
def normalize_text(text: str) -> str:
    # convert half-width characters to full-width and lowercase everything
    return jaconv.h2z(text, digit=True, ascii=True, kana=True).lower()

def read_wiki(wiki_data: str, save_file: str):
    if os.path.isfile(save_file):
        print('Skipping reading wiki file...')
        return
    with open(save_file, 'w') as out:
        wiki = WikiCorpus(wiki_data, lemmatize=False, dictionary={}, processes=cpu_count())
        wiki.metadata = True
        texts = wiki.get_texts()
        for i, article in enumerate(texts):
            text = article[0]  # article[1] holds the article's metadata (page id and title).
            sentences = [normalize_text(line) for line in text]
            text = ' '.join(sentences) + u'\n'
            out.write(text)
            if i % 1000 == 0 and i != 0:
                print('Logged', i, 'articles')
    print('Finished saving wiki text')
read_wiki(wiki_file_name, wiki_text_file_name)
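# Optional sanity check (not part of the original gist): peek at the first saved
# article to confirm the extraction step produced normalized text as expected.
if os.path.isfile(wiki_text_file_name):
    with open(wiki_text_file_name) as f:
        print(f.readline()[:200])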
## tokenize text
def get_words(text: str, mt: MeCab.Tagger) -> List[str]:
    mt.parse("")  # workaround for the MeCab Python binding: without this, node.surface can come back empty
    parsed = mt.parseToNode(text)
    components = []
    while parsed:
        if len(parsed.surface) >= 1:
            components.append(parsed.surface)
        parsed = parsed.next
    return components
def tokenize_text(input_filename: str, output_filename: str, mt: MeCab.Tagger):
    lines = 0
    if os.path.isfile(output_filename):
        lines = count_lines(output_filename)  # resume from where a previous run left off
    batch = []
    with open(input_filename, 'r') as data:
        for i, text in enumerate(data.readlines()):
            if i < lines:
                continue
            tokenized_text = ' '.join(get_words(text, mt))
            batch.append(tokenized_text)
            if i % 10000 == 0 and i != 0:
                write_tokens(batch, output_filename)
                batch = []
                print('Tokenized', i, 'lines')
    write_tokens(batch, output_filename)
    print('Finished tokenizing text')
def write_tokens(batch: List[str], file_name: str):
    with open(file_name, 'a+') as out:
        for out_line in batch:
            out.write(out_line + '\n')  # one tokenized article per line so count_lines() can resume correctly
def count_lines(file: str) -> int:
    count = 0
    with open(file) as d:
        for line in d:
            count += 1
    return count
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # if mecab-ipadic-neologd is not installed, any other MeCab dictionary works too
tokenize_text(wiki_text_file_name, token_file, tagger)
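# Optional sanity check (illustrative, not part of the original gist): the tokenizer
# should split a Japanese sentence into surface forms, e.g. roughly
# ['私', 'は', '東京', 'に', '住ん', 'で', 'い', 'ます']; the exact segmentation
# depends on the MeCab dictionary in use.
print(get_words('私は東京に住んでいます', tagger))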
## generate vectors
def generate_vectors(input_filename, output_filename):
    if os.path.isfile(output_filename):
        return
    model = Word2Vec(LineSentence(input_filename),
                     size=VECTORS_SIZE, window=5, min_count=5,
                     workers=cpu_count(), iter=5)
    model.save(output_filename)
    print('Finished building the vectors.')
generate_vectors(token_file, vector_file)
## try it out!
# Word2Vec is already imported above; load the saved model back from disk.
model = Word2Vec.load(vector_file)
print(model.wv['東京'])
print(model.wv.most_similar(positive='東京', topn=5))
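# A couple of extra optional queries using the standard gensim KeyedVectors API
# (not part of the original gist; assumes both words survived the min_count cutoff).
print(model.wv.similarity('東京', '大阪'))
print(model.wv.most_similar(positive=['東京', '大学'], topn=5))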