Skip to content

Instantly share code, notes, and snippets.

@he7d3r
Created October 22, 2014 15:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save he7d3r/34f332d0c0523a1bd438 to your computer and use it in GitHub Desktop.
Save he7d3r/34f332d0c0523a1bd438 to your computer and use it in GitHub Desktop.
Create a list of words and a list of stems for each regex in the Salebot config
# Adapted from http://utilitymill.com/edit/Regex_inverter
# License: GPL/GFDL
# Extracted from invRegex.py, at http://pyparsing.wikispaces.com
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine,
SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc,
Suppress, ParseResults, srange)
from nltk.stem.snowball import SnowballStemmer
import sys
import re
ParserElement.enablePackrat()
class CharacterRangeEmitter(object):
    """Emitter for a set of characters; generates each distinct character once."""

    def __init__(self, chars):
        # Drop repeated characters while keeping the order of first appearance.
        already_seen = set()
        unique_chars = []
        for ch in chars:
            if ch in already_seen:
                continue
            already_seen.add(ch)
            unique_chars.append(ch)
        self.charset = "".join(unique_chars)

    def __str__(self):
        return "[%s]" % (self.charset,)

    def __repr__(self):
        return "[%s]" % (self.charset,)

    def makeGenerator(self):
        """Return a generator function yielding the characters one at a time."""
        def genChars():
            yield from self.charset
        return genChars
class OptionalEmitter(object):
    """Emitter for an optional expression: the empty string, then the expression's strings."""

    def __init__(self, expr):
        self.expr = expr

    def makeGenerator(self):
        """Return a generator function yielding "" followed by every string of ``expr``."""
        def optionalGen():
            # The "absent" case comes first, then everything the wrapped emitter produces.
            yield ""
            yield from self.expr.makeGenerator()()
        return optionalGen
class DotEmitter(object):
    """Emitter that generates every character in pyparsing's ``printables``."""

    def makeGenerator(self):
        """Return a generator function yielding each printable character in turn."""
        def dotGen():
            yield from printables
        return dotGen
class GroupEmitter(object):
    """Emitter for a sequence of expressions: generates every concatenation of their strings."""

    def __init__(self, exprs):
        self.exprs = ParseResults(exprs)

    def makeGenerator(self):
        """Return a generator function yielding the concatenations, last element varying fastest."""
        def groupGen():
            def expand(remaining):
                head_strings = remaining[0].makeGenerator()
                if len(remaining) == 1:
                    # Last element: its strings are the suffixes themselves.
                    yield from head_strings()
                    return
                tail = remaining[1:]
                for prefix in head_strings():
                    # A fresh expansion of the tail is started for every prefix.
                    for suffix in expand(tail):
                        yield prefix + suffix

            # An empty group produces no strings at all.
            if not self.exprs:
                return
            yield from expand(self.exprs)
        return groupGen
class AlternativeEmitter(object):
    """Emitter for alternatives: generates the strings of each expression, one expression after another."""

    def __init__(self, exprs):
        self.exprs = exprs

    def makeGenerator(self):
        """Return a generator function that chains the strings of every alternative in order."""
        def altGen():
            for alternative in self.exprs:
                yield from alternative.makeGenerator()()
        return altGen
class LiteralEmitter(object):
    """Emitter for a fixed piece of text: generates exactly that one string."""

    def __init__(self, lit):
        self.lit = lit

    def __str__(self):
        return "Lit:" + self.lit

    def __repr__(self):
        return "Lit:" + self.lit

    def makeGenerator(self):
        """Return a generator function yielding the literal once."""
        def litGen():
            yield from (self.lit,)
        return litGen
def handleRange(toks):
    """Parse action: expand a bracketed range token with ``srange`` into a CharacterRangeEmitter."""
    expanded_chars = srange(toks[0])
    return CharacterRangeEmitter(expanded_chars)
def handleRepetition(toks):
    """Parse action for a repetition suffix: ``?``, ``{n}`` or ``{m,n}``.

    ``toks[0]`` is the matched group; its first element is the emitter being
    repeated and its second element is the first token of the repetition
    operator.

    Returns an OptionalEmitter for ``?``, a GroupEmitter for ``{n}`` and for
    ``{m,n}`` with ``m < n``, and a plain list of ``m`` copies of the operand
    for ``{m,m}``.

    Raises ParseFatalException for ``*`` / ``+`` (unbounded) and for
    ``{m,n}`` where ``n < m``.
    """
    toks = toks[0]
    operand = toks[0]
    if toks[1] in "*+":
        raise ParseFatalException("", 0, "unbounded repetition operators not supported")
    if toks[1] == "?":
        return OptionalEmitter(operand)
    if "count" in toks:
        return GroupEmitter([operand] * int(toks.count))
    if "minCount" in toks:
        mincount = int(toks.minCount)
        maxcount = int(toks.maxCount)
        if maxcount < mincount:
            # A negative difference is truthy, so without this check {3,2}
            # would quietly generate 3 and 4 repetitions.
            raise ParseFatalException(
                "", 0,
                "min repeat greater than max repeat ({%d,%d})" % (mincount, maxcount))
        optcount = maxcount - mincount
        if optcount:
            # Build nested optionals: x?, (x(x?))?, ... so that between 0 and
            # optcount extra copies can follow the mandatory ones.
            opt = OptionalEmitter(operand)
            for _ in range(1, optcount):
                opt = OptionalEmitter(GroupEmitter([operand, opt]))
            return GroupEmitter([operand] * mincount + [opt])
        else:
            return [operand] * mincount
def handleLiteral(toks):
    """Parse action: turn literal tokens into one LiteralEmitter.

    A token starting with a backslash is an escape: ``\\t`` becomes a tab and
    any other escaped character stands for itself. Other tokens are used
    verbatim.
    """
    pieces = []
    for tok in toks:
        if tok[0] != "\\":
            pieces.append(tok)
        elif tok[1] == "t":
            pieces.append('\t')
        else:
            pieces.append(tok[1])
    return LiteralEmitter("".join(pieces))
def handleMacro(toks):
    """Parse action: map a ``\\d``, ``\\w`` or ``\\s`` macro token to its emitter.

    Raises ParseFatalException for any other macro character.
    """
    macroChar = toks[0][1]
    if macroChar == "d":
        return CharacterRangeEmitter("0123456789")
    if macroChar == "w":
        word_chars = srange("[A-Za-z0-9_]")
        return CharacterRangeEmitter(word_chars)
    if macroChar == "s":
        # Whitespace is represented by a single space only.
        return LiteralEmitter(" ")
    raise ParseFatalException("",0,"unsupported macro character (" + macroChar + ")")
def handleSequence(toks):
    """Parse action: wrap the operands of an implicit concatenation in a GroupEmitter."""
    operands = toks[0]
    return GroupEmitter(operands)
def handleDot():
    """Parse action for ``.``: return a CharacterRangeEmitter for the dot's character set.

    If a module-level ``USE_ALL_PRINTABLES`` flag exists and is false, the set
    is taken from the module-level ``DOT_CHARS`` range expression (expanded
    with ``srange``). Otherwise all of pyparsing's ``printables`` are used.
    """
    # NOTE(review): neither USE_ALL_PRINTABLES nor DOT_CHARS is defined in the
    # visible part of this file, so the unguarded lookup raised NameError for
    # any regex containing a dot. Default to every printable character when
    # the flag is absent.
    try:
        use_all_printables = USE_ALL_PRINTABLES
    except NameError:
        use_all_printables = True
    if use_all_printables:
        return CharacterRangeEmitter(printables)
    return CharacterRangeEmitter(srange(DOT_CHARS))
def handleAlternative(toks):
    """Parse action: wrap the operands of a ``|`` expression in an AlternativeEmitter."""
    branches = toks[0]
    return AlternativeEmitter(branches)
_parser = None


def parser():
    """Return the regex grammar, building it on the first call and caching it in ``_parser``."""
    global _parser
    if _parser is not None:
        return _parser

    # Spaces and tabs are literal characters of the regex, so no whitespace
    # may be skipped; this must be set before any element is created.
    ParserElement.setDefaultWhitespaceChars("")
    lbrack, rbrack, lbrace, rbrace, lparen, rparen = [Literal(ch) for ch in "[]{}()"]

    reMacro = Combine("\\" + oneOf(list("dws")))
    escapedChar = ~reMacro + Combine("\\" + oneOf(list(printables)))
    plainChars = "".join(c for c in printables if c not in r"\[]{}().*?+|")
    reLiteralChar = plainChars + " \t"

    reRange = Combine(lbrack + SkipTo(rbrack,ignore=escapedChar) + rbrack)
    reLiteral = ( escapedChar | oneOf(list(reLiteralChar)) )
    reDot = Literal(".")

    # Repetition suffixes: {n}, {m,n}, or one of * + ?
    exactCount = lbrace + Word(nums).setResultsName("count") + rbrace
    boundedCount = (lbrace + Word(nums).setResultsName("minCount") + ","
                    + Word(nums).setResultsName("maxCount") + rbrace)
    repetition = exactCount | boundedCount | oneOf(list("*+?"))

    for element, action in (
            (reRange, handleRange),
            (reLiteral, handleLiteral),
            (reMacro, handleMacro),
            (reDot, handleDot)):
        element.setParseAction(action)

    reTerm = ( reLiteral | reRange | reMacro | reDot )
    operators = [
        (repetition, 1, opAssoc.LEFT, handleRepetition),
        (None, 2, opAssoc.LEFT, handleSequence),
        (Suppress('|'), 2, opAssoc.LEFT, handleAlternative),
    ]
    _parser = operatorPrecedence(reTerm, operators)
    return _parser
def count(gen):
    """Return how many items the iterable ``gen`` produces, consuming it in the process."""
    return sum(1 for _ in gen)
def invert(regex):
    r"""Return a generator over the strings produced for the regular expression ``regex``.

    Example::

        for s in invert("[A-Z]{3}\d{3}"):
            print(s)
    """
    parsed = parser().parseString(regex)
    make_strings = GroupEmitter(parsed).makeGenerator()
    return make_strings()
from pyparsing import __version__ as pyparsing_version
def run(fRegexes, fWords, fStems):
    """Expand the negatively scored regexes in a file into word and stem lists.

    Each line of ``fRegexes`` is examined in turn. Lines whose first
    non-blank character is ``#`` are skipped, as are lines that do not look
    like ``-<digits> /<regex>/``. For the remaining lines, ``\\b`` is
    removed from the regex and ``*`` / ``+`` are bounded to ``{0,2}`` /
    ``{1,2}`` before the regex is enumerated with ``invert``. Enumeration of
    one regex stops once more than 500000 strings have been produced.

    Generated strings made up only of the accepted letters are collected,
    together with their Portuguese Snowball stems.

    Args:
        fRegexes: path of the input file containing the scored regexes.
        fWords: path of the output file receiving the sorted words, one per line.
        fStems: path of the output file receiving the sorted stems, one per line.
    """
    words = set()
    stems = set()
    stemmer = SnowballStemmer('portuguese')
    max_out = 500000

    # Loop-invariant patterns, compiled once.
    comment_line = re.compile('^\\s*#')
    scored_regex = re.compile('^\\s*(-\\d+)\\s+\\/(.+)\\/')
    non_letter = re.compile('[^a-zA-ZáàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ]')

    # Close the input as soon as it has been read instead of leaking the handle.
    with open(fRegexes) as regex_file:
        lines = regex_file.read().splitlines()

    for line in lines:
        if comment_line.search(line):
            continue
        # Ignore lines containing strange syntax and lines with a positive score (good words)
        m = scored_regex.search(line)
        if not m:
            continue
        regex = m.group(2).replace('\\b','').replace('*','{0,2}').replace('+','{1,2}')
        for i, s in enumerate(invert(regex), 1):
            if not non_letter.search(s):
                words.add(s)
                stems.add(stemmer.stem(s))
            if i > max_out:
                print("Regex has more than %d matching strings..." % max_out)
                break

    # Context managers guarantee both outputs are flushed and closed.
    with open(fWords, 'w') as logWords:
        for s in sorted(words):
            print(s, end='\n', file=logWords)
    with open(fStems, 'w') as logStems:
        for s in sorted(stems):
            print(s, end='\n', file=logStems)
# Script entry point: check the pyparsing version, then run on the three
# file names given on the command line.
if __name__ == "__main__":
    if tuple(map(int,pyparsing_version.split('.'))) < (1,5,1):
        print("Pyparsing version installed (%s), must be 1.5.1 or later" % pyparsing_version)
    else:
        # argv[0] is the script itself, so three file names mean at least four
        # entries; the previous "< 2" check let one or two arguments through
        # and failed with IndexError below.
        if len(sys.argv) < 4:
            print('Please provide 3 file names (input: regexes; output: words and stems).')
            sys.exit(1)
        fileRegexes = sys.argv[1]
        fileWords = sys.argv[2]
        fileStems = sys.argv[3]
        run(fileRegexes, fileWords, fileStems)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment