Prints out words (found in an XML dump) that match some rule in a file.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
Prints out words (found in an XML dump) that match some rule in a file.
Example:
python WordsMatchingSalebotRules.py salebot.txt words.txt dump1.xml dump2.xml
"""
from mw import xml_dump
from collections import Counter
import sys
import re
import time
from datetime import timedelta
globRules = set()
startTime = time.time()


def page_info(dump, path):
    global globRules
    for page in dump:
        count = Counter()
        for revision in page:
            # revision.text may be None (e.g. deleted revisions); treat as empty
            text = revision.text or ''
            for rule in globRules:
                m = re.search(rule, text)
                # Ignore wikitext syntax, n-grams, words in other languages, etc.
                if m and not re.search('[^a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]', m.group(0)):
                    # TODO: Compute the stem too, while we are at it
                    # TODO: Convert to lower case?
                    count[m.group(0)] += 1
        yield page.title, count


def run(fileIn, fileOut, dumps):
    global globRules
    total = Counter()
    log = open(fileOut, 'w')
    for line in open(fileIn).read().splitlines():
        # Ignore comments
        if re.search(r'^\s*#', line):
            continue
        # Ignore lines with strange syntax and lines with a positive score (good words)
        m = re.search(r'^\s*(-\d+)\s+/(.+)/', line)
        # Ignore regexes which match 2+ words or which do not contain any letters
        # FIXME: This will ignore rules like "(foo|bar baz)" even if "foo" does not have spaces
        if m and not re.search(r' \w', m.group(2)) and re.search('[a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]', m.group(2)):
            # Reduce infinite repetitions to finite ones, so matches stay bounded
            rule = re.sub(r'\{(\d+),\}', r'{\1}', m.group(2))
            rule = rule.replace('+', '{1,5}').replace('*', '{0,5}')
            globRules.add(re.compile(rule))
    # totalPages = 768  # ocwikibooks
    totalPages = 3617245  # ptwiki
    p = 0
    for title, words in xml_dump.map(dumps, page_info):
        p += 1
        # Progress report with a rough ETA: elapsed time scaled by the pages left
        print('%5.2f%% (+%s) %s' % (100 * p / totalPages,
                                    timedelta(seconds=(time.time() - startTime) * (totalPages - p) / p),
                                    title))
        for w in words:
            total[w] += words[w]
    for s in total.most_common():
        # TODO: Ignore words with a single occurrence in the whole history?
        print('%s\t%s' % s, end='\n', file=log)
    print('Done in %s.' % timedelta(seconds=time.time() - startTime))


if __name__ == "__main__":
    # Need at least 3 arguments: rules file, output file and 1+ dump files
    if len(sys.argv) < 4:
        print('Please provide 3+ file names: in, out and dump(s).')
        sys.exit(1)
    fileIn = sys.argv[1]
    fileOut = sys.argv[2]
    dumps = sys.argv[3:]
    run(fileIn, fileOut, dumps)
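For reference, here is a minimal sketch (separate from the script above) of what the rule-normalization step in run() does. The two sample Salebot-style lines are invented, but the regexes are exactly the ones used above:

import re

for line in ['-10 /stupid{2,}/', '-5 /idiot[as]*/']:
    m = re.search(r'^\s*(-\d+)\s+/(.+)/', line)
    # Cap unbounded repetitions, then bound '+' and '*', as run() does
    rule = re.sub(r'\{(\d+),\}', r'{\1}', m.group(2))
    rule = rule.replace('+', '{1,5}').replace('*', '{0,5}')
    print(m.group(1), rule)

# Prints:
# -10 stupid{2}
# -5 idiot[as]{0,5}

Bounding the quantifiers this way keeps re.search from scanning arbitrarily long runs of text for a single rule, at the cost of missing matches longer than five repetitions.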