Prints out words (found in an XML dump) that match some rule in a file.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
Prints out words (found in an XML dump) that match some rule in a file.
Example:
python WordsMatchingSalebotRules.py salebot.txt words.txt dump1.xml dump2.xml
"""
from mw import xml_dump
from collections import Counter
import sys
import re
import time
from datetime import timedelta
globRules = set()
startTime = time.time()


def page_info(dump, path):
    global globRules
    for page in dump:
        count = Counter()
        for revision in page:
            # revision.text may be None (e.g. deleted revisions); treat as empty
            text = revision.text or ''
            for rule in globRules:
                m = re.search(rule, text)
                # Ignore wikitext syntax, n-grams, words in other languages, etc.
                if m and not re.search('[^a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]', m.group(0)):
                    # TODO: Compute the stem too, while we are at it
                    # TODO: Convert to lower case?
                    count[m.group(0)] += 1
        yield page.title, count


def run(fileIn, fileOut, dumps):
    global globRules
    total = Counter()
    log = open(fileOut, 'w')
    for line in open(fileIn).read().splitlines():
        # Ignore comments
        if re.search(r'^\s*#', line):
            continue
        # Ignore lines with strange syntax and lines with a positive score (good words)
        m = re.search(r'^\s*(-\d+)\s+/(.+)/', line)
        # Ignore regexes which match 2+ words or which do not contain any letters
        # FIXME: This will ignore rules like "(foo|bar baz)" even if "foo" does not have spaces
        if m and not re.search(r' \w', m.group(2)) and re.search('[a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]', m.group(2)):
            # Reduce infinite repetitions to finite ones, so matches stay bounded
            rule = re.sub(r'\{(\d+),\}', r'{\1}', m.group(2))
            rule = rule.replace('+', '{1,5}').replace('*', '{0,5}')
            globRules.add(re.compile(rule))
    # totalPages = 768  # ocwikibooks
    totalPages = 3617245  # ptwiki
    p = 0
    for title, words in xml_dump.map(dumps, page_info):
        p += 1
        # Progress report with a rough ETA: elapsed time scaled by the pages left
        print('%5.2f%% (+%s) %s' % (100 * p / totalPages,
                                    timedelta(seconds=(time.time() - startTime) * (totalPages - p) / p),
                                    title))
        for w in words:
            total[w] += words[w]
    for s in total.most_common():
        # TODO: Ignore words with a single occurrence in the whole history?
        print('%s\t%s' % s, end='\n', file=log)
    print('Done in %s.' % timedelta(seconds=time.time() - startTime))


if __name__ == "__main__":
    # Need at least 3 arguments: rules file, output file and 1+ dump files
    if len(sys.argv) < 4:
        print('Please provide 3+ file names: in, out and dump(s).')
        sys.exit(1)
    fileIn = sys.argv[1]
    fileOut = sys.argv[2]
    dumps = sys.argv[3:]
    run(fileIn, fileOut, dumps)
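For reference, here is a minimal sketch (separate from the script above) of what the rule-normalization step in run() does. The two sample Salebot-style lines are invented, but the regexes are exactly the ones used above:

import re

for line in ['-10 /stupid{2,}/', '-5 /idiot[as]*/']:
    m = re.search(r'^\s*(-\d+)\s+/(.+)/', line)
    # Cap unbounded repetitions, then bound '+' and '*', as run() does
    rule = re.sub(r'\{(\d+),\}', r'{\1}', m.group(2))
    rule = rule.replace('+', '{1,5}').replace('*', '{0,5}')
    print(m.group(1), rule)

# Prints:
# -10 stupid{2}
# -5 idiot[as]{0,5}

Bounding the quantifiers this way keeps re.search from scanning arbitrarily long runs of text for a single rule, at the cost of missing matches longer than five repetitions.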