Skip to content

Instantly share code, notes, and snippets.

@he7d3r
Last active August 29, 2015 14:07
Show Gist options
  • Save he7d3r/82eefda254d416292141 to your computer and use it in GitHub Desktop.
Save he7d3r/82eefda254d416292141 to your computer and use it in GitHub Desktop.
Prints out the frequency of the words (found in a dump.xml) that correspond to each stem in a file
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
Prints out the words (found in a dump.xml) that correspond to each stem in a file.
Example:
python StemsToWords.py stems.txt words.txt dump1.xml dump2.xml
"""
from mw import xml_dump
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import sys
import re
import time
from datetime import timedelta
import operator
# Wall-clock start of the script; used for the progress ETA and the final
# "Done in ..." report printed by run().
startTime = time.time()
# Maps each stem of interest (lowercased) to a Counter of the words seen for
# it; the keys are loaded by run() and the counts are accumulated there too.
globWords = {}
# Memo of token -> stem, so each distinct token is stemmed only once.
cache = {}
# Tokens are runs of 3+ letters drawn from ASCII letters plus the accented
# letters listed in the character class.
# NOTE(review): lowercase 'ü' is listed but uppercase 'Ü' is not -- confirm
# whether that omission is intentional.
tokenizer = RegexpTokenizer('[a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]{3,}')
# Snowball stemmer configured for Portuguese.
stemmer = SnowballStemmer('portuguese')
def page_info(dump, path):
    """
    Yield, for each page of a dump, the tracked words found in its revisions.

    This is the per-dump callback handed to ``xml_dump.map`` by ``run``.

    Parameters:
        dump: iterable of pages; each page has a ``title`` and iterates over
            revisions exposing a ``text`` attribute.
        path: unused here; kept because it is part of the callback signature
            (presumably the path of the dump being processed -- verify
            against the ``xml_dump.map`` documentation).

    Yields:
        ``(title, words)`` once per page, where ``words`` maps each stem that
        is a key of the module-level ``globWords`` to a ``Counter`` of the
        lowercased words of that page which reduce to that stem.
    """
    for page in dump:
        words = {}
        for revision in page:
            text = revision.text
            if not text:
                # A revision without text (None or empty) has nothing to
                # tokenize; passing None to the tokenizer would raise.
                continue
            # No explicit length check is needed: the tokenizer pattern only
            # produces tokens of 3 or more characters.
            for w in tokenizer.tokenize(text):
                stem = cache.get(w)
                if stem is None:
                    stem = cache[w] = stemmer.stem(w)
                stem = stem.lower()
                if stem not in globWords:
                    # Only stems requested in the input file are tracked.
                    continue
                counts = words.get(stem)
                if counts is None:
                    counts = words[stem] = Counter()
                counts[w.lower()] += 1
        yield page.title, words
def run(fileIn, fileOut, dumps, totalPages=3620665):
    """
    Count, for every stem listed in a file, the dump words that reduce to it.

    Parameters:
        fileIn: path of a text file whose lines start with a stem; only the
            first whitespace-delimited token of each line is used, and lines
            starting with whitespace are ignored.
        fileOut: path of the report to write. Each line holds the stem, the
            total number of occurrences and the Counter of words, separated
            by tabs, sorted by decreasing total. Stems never seen are omitted.
        dumps: list of XML dump paths, processed through ``xml_dump.map``.
        totalPages: expected number of pages over all dumps; must be
            positive. Only used for the progress percentage and the remaining
            time estimate. The default is the value used for ptwiki; the
            original script also listed 768 (ocwikibooks) and 11410
            (ptwikiversity).

    Side effects:
        Fills the module-level ``globWords``, prints one progress line per
        page and a final timing line to stdout, and writes ``fileOut``.
    """
    stemPattern = re.compile(r'^(\S+)')
    with open(fileIn) as stemFile:
        for line in stemFile.read().splitlines():
            m = stemPattern.search(line)
            if m:
                globWords[m.group(1).lower()] = Counter()

    pages = xml_dump.map(dumps, page_info)
    for p, (title, words) in enumerate(pages, 1):
        elapsed = time.time() - startTime
        # Remaining time is extrapolated from the average time per page.
        remaining = timedelta(seconds=elapsed * (totalPages - p) / p)
        print('%5.2f%% (+%s) %s' % (100 * p / totalPages, remaining, title))
        for s, counts in words.items():
            globWords[s] += counts

    # Sum each stem once; the totals serve both as sort key and as output.
    totals = {s: sum(counts.values()) for s, counts in globWords.items()}
    with open(fileOut, 'w') as log:
        for s in sorted(globWords, key=totals.get, reverse=True):
            if globWords[s]:
                print('%s\t%s\t%s' % (s, totals[s], globWords[s]), file=log)
    print('Done in %s.' % timedelta(seconds=time.time() - startTime))
if __name__ == "__main__":
    # sys.argv[0] is the script itself, so "in, out and at least one dump"
    # means four or more entries; checking "< 3" let a call without any dump
    # slip through.
    if len(sys.argv) < 4:
        print('Please provide 3+ file names: in, out and dump(s).')
        sys.exit(1)
    fileIn = sys.argv[1]
    fileOut = sys.argv[2]
    dumps = sys.argv[3:]
    run(fileIn, fileOut, dumps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment