Creating a Japanese Word2Vec dictionary
from multiprocessing import cpu_count
import os
from typing import List

import MeCab
import wget
import jaconv
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
VECTORS_SIZE = 50

wiki_url = 'https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_file_name = 'jawiki-latest-pages-articles1.xml-p1p106175.bz2'
wiki_text_file_name = 'wiki.txt'
token_file = 'tokens.txt'
vector_file = 'ja-MeCab-50.data.model'
## download wiki file:
# this is only part of the Japanese Wikipedia dump; you can pick other files from
# https://dumps.wikimedia.org/jawiki/latest/
if not os.path.isfile(wiki_file_name):
    wget.download(wiki_url, bar=wget.bar_adaptive)
## read downloaded wiki file
def normalize_text(text: str) -> str:
    return jaconv.h2z(text, digit=True, ascii=True, kana=True).lower()
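# For illustration, normalize_text folds half-width characters to full-width
# and lowercases; the exact output below is assumed from jaconv's h2z behavior:
#   normalize_text('ABC123ｶﾀｶﾅ')  ->  'ａｂｃ１２３カタカナ'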
def read_wiki(wiki_data: str, save_file: str):
    if os.path.isfile(save_file):
        print('Skipping reading wiki file...')
        return
    with open(save_file, 'w') as out:
        # lemmatize= matches the gensim 3.x WikiCorpus API (removed in gensim 4.0)
        wiki = WikiCorpus(wiki_data, lemmatize=False, dictionary={}, processes=cpu_count())
        wiki.metadata = True
        texts = wiki.get_texts()
        for i, article in enumerate(texts):
            text = article[0]  # with metadata=True, article[1] is the (page id, article title) pair
            sentences = [normalize_text(line) for line in text]
            text = ' '.join(sentences) + u'\n'
            out.write(text)
            if i % 1000 == 0 and i != 0:
                print('Logged', i, 'articles')
    print('Finished saving')
read_wiki(wiki_file_name, wiki_text_file_name)
## tokenize text
def get_words(text: str, mt: MeCab.Tagger) -> List[str]:
    mt.parse('')  # known workaround for a mecab-python bug where .surface comes back empty
    parsed = mt.parseToNode(text)
    components = []
    while parsed:
        if len(parsed.surface) >= 1:
            components.append(parsed.surface)
        parsed = parsed.next
    return components
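# For illustration (the actual segmentation depends on the installed dictionary):
#   get_words('東京都に住んでいます', tagger)
#   might return ['東京都', 'に', '住ん', 'で', 'い', 'ます']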
def tokenize_text(input_filename: str, output_filename: str, mt: MeCab.Tagger):
    lines = 0
    if os.path.isfile(output_filename):
        lines = count_lines(output_filename)  # resume where a previous run stopped
    batch = []
    with open(input_filename, 'r') as data:
        for i, text in enumerate(data.readlines()):
            if i < lines:
                continue
            tokenized_text = ' '.join(get_words(text, mt))
            batch.append(tokenized_text)
            if i % 10000 == 0 and i != 0:
                write_tokens(batch, output_filename)
                batch = []
                print('Tokenized', i, 'lines')
    write_tokens(batch, output_filename)
    print('Finished tokenizing text')
def write_tokens(batch: List[str], file_name: str):
    with open(file_name, 'a+') as out:
        for out_line in batch:
            out.write(out_line + '\n')  # one tokenized line per output line, as LineSentence expects
def count_lines(file: str) -> int:
    count = 0
    with open(file) as d:
        for line in d:
            count += 1
    return count
tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # if you don't have neologd, another dictionary is fine
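# Note: the dictionary path differs between installs; `mecab-config --dicdir`
# prints the base directory where MeCab dictionaries live.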
tokenize_text(wiki_text_file_name, token_file, tagger)
## generate vectors
def generate_vectors(input_filename: str, output_filename: str):
    if os.path.isfile(output_filename):
        return
    # gensim < 4.0 parameter names; in gensim 4.x these are
    # vector_size= and epochs= instead of size= and iter=
    model = Word2Vec(LineSentence(input_filename),
                     size=VECTORS_SIZE, window=5, min_count=5,
                     workers=cpu_count(), iter=5)
    model.save(output_filename)
    print('Finished creating vectors.')

generate_vectors(token_file, vector_file)
## try it out!
model = Word2Vec.load(vector_file)
print(model.wv['東京'])
print(model.wv.most_similar(positive=['東京'], topn=5))
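# Optionally, keep just the word vectors for lightweight lookup later on.
# A minimal sketch assuming gensim's KeyedVectors API; the '.kv' filename
# below is an example, not part of the original script.
from gensim.models import KeyedVectors

model.wv.save('ja-MeCab-50.kv')
word_vectors = KeyedVectors.load('ja-MeCab-50.kv')
print(word_vectors.most_similar(positive=['東京'], topn=5))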