Created
October 22, 2014 15:11
-
-
Save he7d3r/34f332d0c0523a1bd438 to your computer and use it in GitHub Desktop.
Create a list of words and a list of stems for each regex in the Salebot config
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted from http://utilitymill.com/edit/Regex_inverter
# License: GPL/GFDL
# Extracted from invRegex.py, at http://pyparsing.wikispaces.com
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine, | |
SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc, | |
Suppress, ParseResults, srange) | |
from nltk.stem.snowball import SnowballStemmer | |
import sys | |
import re | |
# Enable pyparsing's packrat parsing (its built-in memoization of parse
# attempts). This runs at import time, i.e. before parser() builds the grammar.
ParserElement.enablePackrat()
class CharacterRangeEmitter(object):
    """Emitter for a set of characters: produces each distinct character once.

    ``charset`` holds the characters of ``chars`` with duplicates dropped,
    keeping the position of the first occurrence of each one.
    """

    def __init__(self, chars):
        unique = []
        already = set()
        for ch in chars:
            if ch in already:
                continue
            already.add(ch)
            unique.append(ch)
        self.charset = "".join(unique)

    def __repr__(self):
        return "[" + self.charset + "]"

    # str() and repr() render identically.
    __str__ = __repr__

    def makeGenerator(self):
        """Return a zero-argument callable that yields the characters one by one."""
        def genChars():
            yield from self.charset
        return genChars
class OptionalEmitter(object):
    """Emitter for an optional expression: the empty string or the wrapped one."""

    def __init__(self, expr):
        # expr: any emitter exposing makeGenerator().
        self.expr = expr

    def makeGenerator(self):
        """Return a callable yielding "" first, then every string of ``expr``."""
        def optionalGen():
            yield ""
            yield from self.expr.makeGenerator()()
        return optionalGen
class DotEmitter(object):
    """Emitter that produces every character of pyparsing's ``printables``."""

    def makeGenerator(self):
        """Return a zero-argument callable yielding one printable character at a time."""
        def dotGen():
            yield from printables
        return dotGen
class GroupEmitter(object):
    """Emitter for a sequence of sub-emitters.

    Produces every concatenation made of one string from each sub-emitter,
    in order; the first sub-emitter varies slowest.
    """

    def __init__(self, exprs):
        self.exprs = ParseResults(exprs)

    def makeGenerator(self):
        """Return a zero-argument callable yielding the concatenated strings."""
        def groupGen():
            def expand(parts):
                first = parts[0].makeGenerator()
                if len(parts) == 1:
                    yield from first()
                    return
                rest = parts[1:]
                for prefix in first():
                    # A fresh generator per prefix keeps the expansion lazy.
                    for suffix in expand(rest):
                        yield prefix + suffix

            # An empty group produces no strings at all.
            if not self.exprs:
                return
            yield from expand(self.exprs)
        return groupGen
class AlternativeEmitter(object):
    """Emitter for alternatives: the strings of each branch, one branch after another."""

    def __init__(self, exprs):
        # exprs: iterable of emitters, each exposing makeGenerator().
        self.exprs = exprs

    def makeGenerator(self):
        """Return a zero-argument callable that chains the branches in order."""
        def altGen():
            for branch in self.exprs:
                yield from branch.makeGenerator()()
        return altGen
class LiteralEmitter(object):
    """Emitter for a fixed piece of text: produces exactly that one string."""

    def __init__(self, lit):
        self.lit = lit

    def __repr__(self):
        return "Lit:" + self.lit

    # str() and repr() render identically.
    __str__ = __repr__

    def makeGenerator(self):
        """Return a zero-argument callable yielding the literal once."""
        def litGen():
            yield self.lit
        return litGen
def handleRange(toks):
    """Parse action for a bracketed range: expand it with srange and wrap it in an emitter."""
    bracket_expr = toks[0]
    members = srange(bracket_expr)
    return CharacterRangeEmitter(members)
def handleRepetition(toks):
    """Parse action for a postfix repetition operator.

    ``toks[0]`` holds the repeated emitter followed by the operator tokens:
    ``*``, ``+``, ``?``, ``{count}`` or ``{minCount,maxCount}`` (the named
    results are set by the grammar built in ``parser()``).

    Returns an emitter producing the operand repeated the allowed number of
    times. When no branch matches, falls through and returns None.

    Raises:
        ParseFatalException: for the unbounded operators ``*`` and ``+``, and
            for a ``{min,max}`` whose minimum exceeds its maximum.
    """
    toks = toks[0]
    operand = toks[0]

    def exactly(n):
        # An empty GroupEmitter yields no strings at all, which would wipe out
        # every string of the enclosing sequence; zero repetitions must match
        # the empty string instead.
        if n == 0:
            return LiteralEmitter("")
        return GroupEmitter([operand] * n)

    if toks[1] in "*+":
        raise ParseFatalException("",0,"unbounded repetition operators not supported")
    if toks[1] == "?":
        return OptionalEmitter(operand)
    if "count" in toks:
        return exactly(int(toks.count))
    if "minCount" in toks:
        mincount = int(toks.minCount)
        maxcount = int(toks.maxCount)
        if maxcount < mincount:
            # A negative difference used to be treated like "one optional copy".
            raise ParseFatalException("",0,"min repeat greater than max repeat")
        optcount = maxcount - mincount
        if not optcount:
            # {n,n} is the same as {n}: return one emitter, not a bare list
            # (a list would be spliced into an enclosing alternation as
            # separate branches).
            return exactly(mincount)
        # Nest the optional copies: (x(x(x)?)?)? for three optional repeats.
        opt = OptionalEmitter(operand)
        for _ in range(1, optcount):
            opt = OptionalEmitter(GroupEmitter([operand, opt]))
        return GroupEmitter([operand] * mincount + [opt])
def handleLiteral(toks):
    """Parse action for literal characters: build the text they stand for.

    Each token is either a plain character or a backslash followed by one
    character. The whitespace escapes ``\\t``, ``\\n``, ``\\r``, ``\\f`` and
    ``\\v`` are decoded to the control character they denote; any other
    escaped character stands for itself.

    Returns a LiteralEmitter for the resulting text.
    """
    # Previously only "\t" was decoded, so "\n" was emitted as the letter "n".
    whitespace_escapes = {"t": "\t", "n": "\n", "r": "\r", "f": "\f", "v": "\v"}
    pieces = []
    for t in toks:
        if t[0] == "\\":
            pieces.append(whitespace_escapes.get(t[1], t[1]))
        else:
            pieces.append(t)
    return LiteralEmitter("".join(pieces))
def handleMacro(toks):
    """Parse action for the macros ``\\d``, ``\\w`` and ``\\s``.

    ``\\d`` and ``\\w`` become character-range emitters; ``\\s`` becomes a
    literal single space. Any other macro character raises
    ParseFatalException.
    """
    macroChar = toks[0][1]
    if macroChar == "s":
        return LiteralEmitter(" ")
    if macroChar == "d":
        members = "0123456789"
    elif macroChar == "w":
        members = srange("[A-Za-z0-9_]")
    else:
        raise ParseFatalException("",0,"unsupported macro character (" + macroChar + ")")
    return CharacterRangeEmitter(members)
def handleSequence(toks):
    """Parse action for adjacent terms: wrap the grouped items in a GroupEmitter."""
    members = toks[0]
    return GroupEmitter(members)
def handleDot():
    """Parse action for ``.``: return an emitter for the characters it may match.

    The optional module-level settings ``USE_ALL_PRINTABLES`` and ``DOT_CHARS``
    are honoured when present. If ``USE_ALL_PRINTABLES`` is missing or true, or
    no ``DOT_CHARS`` range is configured, every character of pyparsing's
    ``printables`` is used; otherwise ``DOT_CHARS`` is expanded with srange.
    """
    # Look the settings up at call time: reading them as plain globals raised
    # NameError whenever they were not defined in the module.
    settings = globals()
    dot_chars = settings.get("DOT_CHARS")
    if settings.get("USE_ALL_PRINTABLES", True) or dot_chars is None:
        return CharacterRangeEmitter(printables)
    return CharacterRangeEmitter(srange(dot_chars))
def handleAlternative(toks):
    """Parse action for ``|``: wrap the grouped branches in an AlternativeEmitter."""
    branches = toks[0]
    return AlternativeEmitter(branches)
# Cached grammar; built on the first call to parser() and reused afterwards.
_parser = None
def parser():
    """Return the pyparsing grammar for the supported regex subset.

    The grammar is constructed once, stored in the module-level ``_parser``
    and returned unchanged on later calls. Its parse actions turn the parsed
    regex into emitter objects (see the handle* functions).
    """
    global _parser
    if _parser is None:
        # Set an empty default whitespace set before creating any element, so
        # the elements built below do not treat spaces/tabs as skippable.
        ParserElement.setDefaultWhitespaceChars("")
        # NOTE(review): lparen/rparen are not referenced again in this function.
        lbrack,rbrack,lbrace,rbrace,lparen,rparen = map(Literal,"[]{}()")
        # Backslash followed by d, w or s.
        reMacro = Combine("\\" + oneOf(list("dws")))
        # Backslash followed by any printable character, except the macros above.
        escapedChar = ~reMacro + Combine("\\" + oneOf(list(printables)))
        # Printable characters without the regex metacharacters, plus space and tab.
        reLiteralChar = "".join(c for c in printables if c not in r"\[]{}().*?+|") + " \t"
        # "[" ... "]" captured as one string; escaped characters are ignored
        # while searching for the closing bracket.
        reRange = Combine(lbrack + SkipTo(rbrack,ignore=escapedChar) + rbrack)
        reLiteral = ( escapedChar | oneOf(list(reLiteralChar)) )
        reDot = Literal(".")
        # Postfix repetition: {count}, {minCount,maxCount}, or one of * + ?
        repetition = (
            ( lbrace + Word(nums).setResultsName("count") + rbrace ) |
            ( lbrace + Word(nums).setResultsName("minCount")+","+ Word(nums).setResultsName("maxCount") + rbrace ) |
            oneOf(list("*+?"))
            )
        # Attach the actions that convert matched tokens into emitters.
        reRange.setParseAction(handleRange)
        reLiteral.setParseAction(handleLiteral)
        reMacro.setParseAction(handleMacro)
        reDot.setParseAction(handleDot)
        reTerm = ( reLiteral | reRange | reMacro | reDot )
        # Operator levels as listed: unary repetition, then implicit
        # (operator-less) binary sequencing, then binary "|" alternation.
        reExpr = operatorPrecedence( reTerm,
            [
            (repetition, 1, opAssoc.LEFT, handleRepetition),
            (None, 2, opAssoc.LEFT, handleSequence),
            (Suppress('|'), 2, opAssoc.LEFT, handleAlternative),
            ]
            )
        _parser = reExpr
    return _parser
def count(gen):
    """Return how many items iterating over ``gen`` produces (this consumes it)."""
    return sum(1 for _ in gen)
def invert(regex):
    r"""Return a generator over all the strings matching ``regex``.

    Iterate over the result to obtain the strings one at a time:

        for s in invert("[A-Z]{3}\d{3}"):
            print(s)
    """
    parsed = parser().parseString(regex)
    make_strings = GroupEmitter(parsed).makeGenerator()
    return make_strings()
from pyparsing import __version__ as pyparsing_version | |
def run(fRegexes, fWords, fStems, max_out=500000):
    """Expand the negatively scored regexes of a config file into words and stems.

    Every line of ``fRegexes`` shaped like ``<negative score> /<regex>/`` is
    expanded with invert() after dropping ``\\b`` and bounding ``*`` and ``+``
    to ``{0,2}`` and ``{1,2}``. Generated strings made only of (accented)
    letters are collected, together with their Portuguese Snowball stems.

    Args:
        fRegexes: path of the input file, read as UTF-8.
        fWords: path of the output file receiving the sorted words, one per line.
        fStems: path of the output file receiving the sorted stems, one per line.
        max_out: how many strings of a single regex may be generated before a
            notice is printed and the expansion of that regex stops.
    """
    # Compile the patterns once instead of on every line / generated string.
    commentLine = re.compile(r'^\s*#')
    scoredRegex = re.compile(r'^\s*(-\d+)\s+\/(.+)\/')
    nonLetter = re.compile('[^a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]')
    words = set()
    stems = set()
    stemmer = SnowballStemmer('portuguese')
    # Explicit UTF-8 keeps the accented characters intact whatever the locale;
    # the context managers make sure every file handle is closed.
    with open(fRegexes, encoding='utf-8') as regexFile:
        lines = regexFile.read().splitlines()
    for line in lines:
        if commentLine.search(line):
            continue
        # Ignore lines containing strange syntax and lines with a positive score (good words)
        m = scoredRegex.search(line)
        if not m:
            continue
        regex = m.group(2).replace('\\b','').replace('*','{0,2}').replace('+','{1,2}')
        for i, s in enumerate(invert(regex), 1):
            if not nonLetter.search(s):
                words.add(s)
                stems.add(stemmer.stem(s))
            if i > max_out:
                print("Regex has more than %d matching strings..." % max_out)
                break
    with open(fWords, 'w', encoding='utf-8') as logWords:
        for s in sorted(words):
            print(s, end='\n', file=logWords)
    with open(fStems, 'w', encoding='utf-8') as logStems:
        for s in sorted(stems):
            print(s, end='\n', file=logStems)
if __name__ == "__main__":
    # Command-line entry point: <regexes file> <words file> <stems file>.
    # Compare only the leading digits of the first three version components,
    # so suffixed versions such as "3.0.0rc1" do not crash the check.
    installedVersion = []
    for piece in pyparsing_version.split('.')[:3]:
        digits = re.match(r'\d+', piece)
        if not digits:
            break
        installedVersion.append(int(digits.group()))
    if tuple(installedVersion) < (1,5,1):
        print("Pyparsing version installed (%s), must be 1.5.1 or later" % pyparsing_version)
    elif len(sys.argv) < 4:
        # All three file names are required; checking for fewer than 2
        # arguments let argv[2] / argv[3] raise IndexError.
        print('Please provide 3 file names (input: regexes; output: words and stems).')
        sys.exit(1)
    else:
        fileRegexes = sys.argv[1]
        fileWords = sys.argv[2]
        fileStems = sys.argv[3]
        run(fileRegexes, fileWords, fileStems)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment