Skip to content

Instantly share code, notes, and snippets.

@bulgakovk
Created January 14, 2019 15:22
Show Gist options
  • Save bulgakovk/4d81cdfb12bc0edab8f0f1fa0c578bc4 to your computer and use it in GitHub Desktop.
Save bulgakovk/4d81cdfb12bc0edab8f0f1fa0c578bc4 to your computer and use it in GitHub Desktop.
"""
Creates a plain-text corpus from a Wikipedia dump file.
Inspired by:
https://github.com/panyang/Wikipedia_Word2vec/blob/master/v1/process_wiki.py
"""
import sys
from gensim.corpora import WikiCorpus
def make_corpus(in_f, out_f):
    """Convert a Wikipedia XML dump file to a plain-text corpus.

    Every article yielded by ``WikiCorpus.get_texts()`` is written to
    ``out_f`` as a single line of space-separated tokens. A progress
    message is printed after every 10000 articles and a completion message
    once the whole dump has been processed.

    Args:
        in_f: Path to the Wikipedia dump file, passed to ``WikiCorpus``.
        out_f: Path of the text file to write; an existing file is
            overwritten.
    """
    # Passing an explicit (empty) dictionary keeps WikiCorpus from scanning
    # the entire dump up front to build a vocabulary this script never uses.
    wiki = WikiCorpus(in_f, dictionary={})
    # The context manager closes the file even if processing fails midway;
    # explicit UTF-8 avoids encode errors under non-UTF-8 default locales.
    with open(out_f, 'w', encoding='utf-8') as output:
        for i, text in enumerate(wiki.get_texts(), start=1):
            output.write(' '.join(text) + '\n')
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')
    print('Processing complete!')
if __name__ == '__main__':
    # Script entry point: requires exactly two positional arguments, the
    # input dump path followed by the output text path.
    if len(sys.argv) == 3:
        in_f, out_f = sys.argv[1:]
        make_corpus(in_f, out_f)
    else:
        # Wrong argument count: show usage and exit with a failure status.
        print('Usage: python make_wiki_corpus.py <wikipedia_dump_file> <processed_text_file>')
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment