Creating a Japanese Word2Vec dictionary
from multiprocessing import cpu_count
import os
from typing import List

import MeCab
import wget
import jaconv
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
VECTORS_SIZE = 50

wiki_url = 'https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_file_name = 'jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_text_file_name = 'wiki.txt'
token_file = 'tokens.txt'
vector_file = 'ja-MeCab-50.data.model'
## download wiki file:
# this is only part of the Japanese Wikipedia dump; you can pick other files from
# https://dumps.wikimedia.org/jawiki/latest/
if not os.path.isfile(wiki_file_name):
    wget.download(wiki_url, bar=wget.bar_adaptive)
## read downloaded wiki file
def normalize_text(text: str) -> str:
    return jaconv.h2z(text, digit=True, ascii=True, kana=True).lower()
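# For illustration, normalize_text folds half-width characters to full-width
# and lowercases; the exact output below is assumed from jaconv's h2z behavior:
#   normalize_text('ABC123ｶﾀｶﾅ')  ->  'ａｂｃ１２３カタカナ'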
def read_wiki(wiki_data: str, save_file: str):
    if os.path.isfile(save_file):
        print('Skipping reading wiki file...')
        return
    with open(save_file, 'w') as out:
        # lemmatize= matches the gensim 3.x WikiCorpus API (removed in gensim 4.0)
        wiki = WikiCorpus(wiki_data, lemmatize=False, dictionary={}, processes=cpu_count())
        wiki.metadata = True
        texts = wiki.get_texts()
        for i, article in enumerate(texts):
            text = article[0]  # with metadata=True, article[1] is the (page id, article title) pair
            sentences = [normalize_text(line) for line in text]
            text = ' '.join(sentences) + u'\n'
            out.write(text)
            if i % 1000 == 0 and i != 0:
                print('Logged', i, 'articles')
    print('Finished saving')
read_wiki(wiki_file_name, wiki_text_file_name)
## tokenize text
def get_words(text: str, mt: MeCab.Tagger) -> List[str]:
    mt.parse('')  # known workaround for a mecab-python bug where .surface comes back empty
    parsed = mt.parseToNode(text)
    components = []
    while parsed:
        if len(parsed.surface) >= 1:
            components.append(parsed.surface)
        parsed = parsed.next
    return components
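# For illustration (the actual segmentation depends on the installed dictionary):
#   get_words('東京都に住んでいます', tagger)
#   might return ['東京都', 'に', '住ん', 'で', 'い', 'ます']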
def tokenize_text(input_filename: str, output_filename: str, mt: MeCab.Tagger):
    lines = 0
    if os.path.isfile(output_filename):
        lines = count_lines(output_filename)  # resume where a previous run stopped
    batch = []
    with open(input_filename, 'r') as data:
        for i, text in enumerate(data.readlines()):
            if i < lines:
                continue
            tokenized_text = ' '.join(get_words(text, mt))
            batch.append(tokenized_text)
            if i % 10000 == 0 and i != 0:
                write_tokens(batch, output_filename)
                batch = []
                print('Tokenized', i, 'lines')
    write_tokens(batch, output_filename)
    print('Finished tokenizing text')
def write_tokens(batch: List[str], file_name: str):
    with open(file_name, 'a+') as out:
        for out_line in batch:
            out.write(out_line + '\n')  # one tokenized line per output line, as LineSentence expects
def count_lines(file: str) -> int:
    count = 0
    with open(file) as d:
        for line in d:
            count += 1
    return count
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # if you don't have neologd, another dictionary is fine
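# Note: the dictionary path differs between installs; `mecab-config --dicdir`
# prints the base directory where MeCab dictionaries live.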
tokenize_text(wiki_text_file_name, token_file, tagger)
## generate vectors
def generate_vectors(input_filename: str, output_filename: str):
    if os.path.isfile(output_filename):
        return
    # gensim < 4.0 parameter names; in gensim 4.x these are
    # vector_size= and epochs= instead of size= and iter=
    model = Word2Vec(LineSentence(input_filename),
                     size=VECTORS_SIZE, window=5, min_count=5,
                     workers=cpu_count(), iter=5)
    model.save(output_filename)
    print('Finished creating vectors.')

generate_vectors(token_file, vector_file)
## try it out!
model = Word2Vec.load(vector_file)
print(model.wv['東京'])
print(model.wv.most_similar(positive=['東京'], topn=5))
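# Optionally, keep just the word vectors for lightweight lookup later on.
# A minimal sketch assuming gensim's KeyedVectors API; the '.kv' filename
# below is an example, not part of the original script.
from gensim.models import KeyedVectors

model.wv.save('ja-MeCab-50.kv')
word_vectors = KeyedVectors.load('ja-MeCab-50.kv')
print(word_vectors.most_similar(positive=['東京'], topn=5))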