Gist kgmyshin/6190f10295126774af4f by @kgmyshin, created January 2, 2015.
# -*- coding: utf-8 -*-
"""
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Convert a Japanese Wikipedia dump (*pages-articles.xml.bz2) into a bag-of-words
corpus, a tf-idf corpus (both in Matrix Market format) and the word-id mapping.
"""
import bz2
import logging
import os.path
import sys

import MeCab

from gensim.corpora.wikicorpus import filter_wiki
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and, from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000

# Articles shorter than this many characters are skipped; this mainly prunes
# redirects (500 is the historical gensim wikicorpus default).
ARTICLE_MIN_CHARS = 500
# MeCab tagger used for Japanese morphological analysis (Python 2 bindings)
tagger = MeCab.Tagger()


def jatokenize(text):
    # walk MeCab's node list and yield only nouns (名詞), lowercased
    node = tagger.parseToNode(text.encode('utf-8')).next
    while node:
        if node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        node = node.next


def tokenize(content):
    # as in gensim's default tokenize(), drop tokens that start with '_'
    return [token for token in jatokenize(content) if not token.startswith('_')]
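# Example (illustrative only; exact segmentation depends on the installed
# MeCab dictionary): with IPADIC, 東京都に住んでいます is typically split into
# 東京 / 都 / に / 住ん / で / い / ます, and jatokenize() keeps only the noun
# tokens 東京 and 都.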
class JaWikiCorpus(WikiCorpus):
    def getArticles(self, return_raw=False):
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>')  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles  # cache corpus length
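

# Rough usage sketch (illustrative only; the dump path is a placeholder):
# the corpus class above can also be streamed on its own, e.g.
#
#   wiki = JaWikiCorpus('jawiki-latest-pages-articles.xml.bz2')
#   for tokens in wiki.getArticles():
#       pass  # each item is the list of noun tokens for one article
#
# Note that constructing the corpus already scans the dump once to build its
# dictionary, so iterating getArticles() afterwards is a second full pass.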
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program
    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)