Skip to content

Instantly share code, notes, and snippets.

@he7d3r
Last active August 29, 2015 14:07
Show Gist options
  • Save he7d3r/f99482f4f54f97895ccb to your computer and use it in GitHub Desktop.
Save he7d3r/f99482f4f54f97895ccb to your computer and use it in GitHub Desktop.
Prints out badword stems found in a list of XML dumps (by number of removals)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
Print the bad-word stems found in one or more XML dumps, ranked by the
number of times each stem was removed between consecutive revisions.

Example:
    python BadWordsCounter.py bad.txt bad-stats.txt dump1.xml dump2.xml
"""
from mw import xml_dump
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import sys
import time
from datetime import timedelta
# Wall-clock start of the script; used for the ETA and the final
# "Done in ..." report.
startTime = time.time()
# Set of lowercase bad words; assigned by run() before the dumps are mapped.
badWords = None
# Memo of token -> stem, so each distinct token is stemmed only once.
cache = {}
# Tokens are runs of 3+ ASCII or accented (Portuguese) letters.
# NOTE(review): the class lists 'ü' but not 'Ü', so a word containing an
# uppercase 'Ü' is split at that character - confirm whether intended.
tokenizer = RegexpTokenizer('[a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]{3,}')
# Snowball stemmer configured for Portuguese.
stemmer = SnowballStemmer('portuguese')
def page_info(dump, path):
    """Yield, per page, how often each bad-word stem was removed.

    Every revision's text is tokenized and reduced to a set of lowercase
    stems: three-letter tokens are kept as they are, longer ones go through
    the module-level ``stemmer`` (memoized in ``cache``). The bad words
    present in a revision are compared with those present in the previous
    revision of the same page, and each bad word that disappeared counts as
    one removal.

    Args:
        dump: Iterable of pages. Each page has a ``title`` and iterates over
            revisions exposing a ``text`` attribute.
        path: Path of the dump being processed. Unused here; presumably
            required by the ``xml_dump.map`` callback signature.

    Yields:
        ``(page.title, Counter)`` pairs, one per page. The counter maps a
        bad-word stem to its number of removals and is empty when nothing
        was removed in the page's history.
    """
    for page in dump:
        count = Counter()
        # None means "no earlier revision yet": the first revision of a page
        # cannot remove anything.
        previous = None
        for revision in page:
            # A revision may carry no text (presumably deleted/suppressed
            # content); tokenizing None would raise, so treat it as empty.
            text = revision.text or ''
            stems = set()
            for word in tokenizer.tokenize(text):
                length = len(word)
                if length < 3:
                    # Defensive only: the module tokenizer already requires
                    # at least three letters.
                    continue
                if length == 3:
                    # Too short to be worth stemming.
                    stems.add(word.lower())
                    continue
                if word not in cache:
                    cache[word] = stemmer.stem(word)
                stems.add(cache[word].lower())
            present = badWords & stems
            if previous is not None:
                # Each bad word seen in the previous revision but not in
                # this one is one removal.
                count.update(previous - present)
            previous = present
        yield page.title, count
def run(fileIn, fileOut, dumps, totalPages=3617245):
    """Count bad-word removals across dumps and write the ranking to a file.

    Loads the bad-word list into the module-level ``badWords`` set, maps
    ``page_info`` over the dumps, prints a progress/ETA line for every page
    in which at least one bad word was removed, and finally writes one
    ``stem<TAB>removals`` line per stem to ``fileOut``, most removed first.

    Args:
        fileIn: Path of a text file with one bad word per line.
        fileOut: Path of the statistics file to (over)write.
        dumps: Paths of the XML dumps to process.
        totalPages: Expected number of pages, used only for the progress
            percentage and ETA. The default is the figure the script used
            for ptwiki; the original noted 768 for ocwikibooks and 11410
            for ptwikiversity.
    """
    global badWords
    # Both files are handled as UTF-8 so accented words behave the same
    # regardless of the platform's default encoding.
    with open(fileIn, encoding='utf-8') as wordList:
        # Strip surrounding whitespace: an entry with a stray trailing
        # space could never match a stem.
        badWords = {line.strip().lower()
                    for line in wordList.read().splitlines()}
    badWords.discard('')  # blank lines are not words
    total = Counter()
    p = 0
    for title, count in xml_dump.map(dumps, page_info):
        p += 1
        if not count:
            continue
        elapsed = time.time() - startTime
        # Clamp so the ETA never goes negative when the dumps contain more
        # pages than totalPages announced.
        remaining = max(totalPages - p, 0)
        print('%5.2f%% (+%s) %s' % (100 * p / totalPages,
                                    timedelta(seconds=elapsed * remaining / p),
                                    title))
        total.update(count)
    with open(fileOut, 'w', encoding='utf-8') as log:
        for s in total.most_common():
            print('%s\t%s' % s, end='\n', file=log)
    print('Done in %s.' % timedelta(seconds=time.time() - startTime))
if __name__ == "__main__":
    # sys.argv[0] is the script itself, so the three required arguments
    # (word list, output file and at least one dump) mean len(sys.argv) >= 4.
    # The previous "< 3" check let a call without any dump slip through.
    if len(sys.argv) < 4:
        print('Please provide 3+ file names: in, out and dump(s).')
        sys.exit(1)
    fileIn, fileOut = sys.argv[1:3]
    dumps = sys.argv[3:]
    run(fileIn, fileOut, dumps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment