Created
July 3, 2017 15:49
-
-
Save anonymous/c6896e647840ff90a18f6b82489e2c69 to your computer and use it in GitHub Desktop.
Exploiting Python's sre_parse internals to generate random text from regular expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import sre_parse | |
# Upper bound used by Generator.gen_repeat when clamping a quantifier's
# maximum count, so open-ended repeats such as `*` and `+` stay finite.
MAX_ALLOWED_REPEATS = 100
def randre(re_text):
    """Return a random string generated from the regular expression *re_text*.

    The expression is parsed with ``sre_parse.parse`` and the resulting
    pattern is handed to a fresh ``Generator``, so group captures recorded
    during one call are never visible to another call.

    Args:
        re_text: Regular expression source text.

    Returns:
        The generated string.
    """
    pattern = sre_parse.parse(re_text)
    return Generator().gen_pattern(pattern)
class Generator:
    """Walk a parsed regular expression and emit random text for it.

    A parsed pattern is a sequence of ``(opcode, argument)`` pairs as
    produced by ``sre_parse.parse``. Each opcode is dispatched through
    ``RAND_GENERATORS`` to a ``gen_*`` method that returns a string; the
    pieces are concatenated in order.

    Assertions (lookahead/lookbehind) and anchors contribute no text and
    are not enforced, so the output is a best-effort sample.
    """

    # Inclusive code point range from which "any character" is drawn.
    _ANY_RANGE = (32, 127)

    # Membership tests for category escapes that appear inside a negated
    # character class. The definitions mirror the ASCII classes used by the
    # module-level CATEGORIES table.
    _CATEGORY_TESTS = {
        sre_parse.CATEGORY_DIGIT: lambda c: "0" <= c <= "9",
        sre_parse.CATEGORY_NOT_DIGIT: lambda c: not ("0" <= c <= "9"),
        sre_parse.CATEGORY_SPACE: lambda c: c in " \t\n\r\f\v",
        sre_parse.CATEGORY_NOT_SPACE: lambda c: c not in " \t\n\r\f\v",
        sre_parse.CATEGORY_WORD: lambda c: c == "_" or c.isalnum(),
        sre_parse.CATEGORY_NOT_WORD: lambda c: not (c == "_" or c.isalnum()),
    }

    def __init__(self):
        # Text generated for each capturing group, keyed by group number,
        # so that a later backreference can replay exactly the same text.
        self.groups = {}

    def gen_pattern(self, pattern):
        """Return the concatenated output of every item in *pattern*."""
        return ''.join(self.gen_item(op, args) for op, args in pattern)

    def gen_item(self, op, args):
        """Dispatch one ``(op, args)`` pair to its generator method.

        Raises:
            KeyError: if *op* has no entry in ``RAND_GENERATORS``.
        """
        return self.RAND_GENERATORS[op](self, args)

    def gen_literal(self, lit):
        """Return the character whose code point is *lit*."""
        return chr(lit)

    def gen_category(self, category):
        """Expand a category escape via the module-level CATEGORIES table."""
        return self.gen_pattern(CATEGORIES[category])

    def gen_any(self, _):
        """Return one random character from ``_ANY_RANGE`` (inclusive)."""
        low, high = self._ANY_RANGE
        return chr(random.randint(low, high))

    def _check_negate(self, generation, options):
        """Return True if *generation* is excluded by a negated class.

        Args:
            generation: Candidate single-character string.
            options: Items of an ``IN`` node; entries other than LITERAL,
                RANGE and CATEGORY (such as the NEGATE marker) are ignored.
        """
        # Parsed literals and range bounds are integer code points, so the
        # candidate must be compared by code point, not as a string.
        code = ord(generation)
        for op, arg in options:
            if op == sre_parse.LITERAL:
                if code == arg:
                    return True
            elif op == sre_parse.RANGE:
                if arg[0] <= code <= arg[1]:
                    return True
            elif op == sre_parse.CATEGORY:
                belongs = self._CATEGORY_TESTS.get(arg)
                if belongs is not None and belongs(generation):
                    return True
        return False

    def gen_in(self, options):
        """Generate one character for a character class.

        A positive class picks one of its members at random. A negated
        class picks uniformly among the characters of ``_ANY_RANGE`` that
        the class does not exclude.

        Raises:
            ValueError: if a negated class excludes every candidate.
        """
        if options[0][0] != sre_parse.NEGATE:
            op, args = random.choice(options)
            return self.gen_item(op, args)
        low, high = self._ANY_RANGE
        # Enumerate the survivors up front: rejection sampling would spin
        # forever when the class excludes the whole candidate range.
        allowed = [
            chr(code)
            for code in range(low, high + 1)
            if not self._check_negate(chr(code), options)
        ]
        if not allowed:
            raise ValueError(
                "negated character class excludes every candidate character"
            )
        return random.choice(allowed)

    def gen_repeat(self, args):
        """Repeat a sub-pattern a random number of times.

        Args:
            args: ``(min_repeats, max_repeats, subpattern)``.

        The maximum is clamped to ``MAX_ALLOWED_REPEATS`` but never below
        the minimum, so the required minimum count is always honoured.
        """
        min_repeats, max_repeats, subpattern = args
        max_repeats = max(min_repeats, min(max_repeats, MAX_ALLOWED_REPEATS))
        repeats = random.randint(min_repeats, max_repeats)
        return ''.join(self.gen_pattern(subpattern) for _ in range(repeats))

    def gen_range(self, args):
        """Return a random character between two code points, inclusive."""
        min_range, max_range = args
        return chr(random.randint(min_range, max_range))

    def gen_subpattern(self, args):
        """Generate a group and remember its text for backreferences.

        The group id is read from the first element of *args* and the
        sub-pattern from the last, so any elements in between are ignored.
        """
        groupid, subpattern = args[0], args[-1]
        generation = self.gen_pattern(subpattern)
        # Non-capturing groups have no id and cannot be referenced later.
        if groupid is not None:
            self.groups[groupid] = generation
        return generation

    def gen_branch(self, args):
        """Generate one randomly chosen alternative of an alternation."""
        # The first element is unused here.
        _, subpatterns = args
        subpattern = random.choice(subpatterns)
        return self.gen_pattern(subpattern)

    def gen_groupref(self, groupid):
        """Return the text previously generated for group *groupid*.

        Raises:
            KeyError: if that group has not been generated yet.
        """
        return self.groups[groupid]

    def gen_not_literal(self, lit):
        """Return a random character other than code point *lit*."""
        generation = self.gen_any(None)
        # *lit* is an integer code point; compare by code point.
        while ord(generation) == lit:
            generation = self.gen_any(None)
        return generation

    def gen_assert(self, args):
        """Lookaround assertions produce no text."""
        return ''

    def gen_assert_not(self, args):
        """Negative lookaround assertions produce no text."""
        return ''

    def gen_at(self, args):
        """Anchors produce no text."""
        return ''

    # Opcode -> generator function. Values are plain functions, which is
    # why gen_item passes ``self`` explicitly.
    RAND_GENERATORS = {
        sre_parse.LITERAL: gen_literal,
        sre_parse.CATEGORY: gen_category,
        sre_parse.ANY: gen_any,
        sre_parse.IN: gen_in,
        sre_parse.MAX_REPEAT: gen_repeat,
        sre_parse.MIN_REPEAT: gen_repeat,
        sre_parse.RANGE: gen_range,
        sre_parse.SUBPATTERN: gen_subpattern,
        sre_parse.BRANCH: gen_branch,
        sre_parse.GROUPREF: gen_groupref,
        sre_parse.NOT_LITERAL: gen_not_literal,
        sre_parse.ASSERT: gen_assert,
        sre_parse.ASSERT_NOT: gen_assert_not,
        sre_parse.AT: gen_at,
    }
# Maps each sre_parse category code (\d, \D, \s, \S, \w, \W) to a
# pre-parsed ASCII character class. Generator.gen_category expands a
# category by generating from the corresponding parsed pattern.
CATEGORIES = {
    sre_parse.CATEGORY_DIGIT: sre_parse.parse("[0-9]"),
    sre_parse.CATEGORY_NOT_DIGIT: sre_parse.parse("[^0-9]"),
    sre_parse.CATEGORY_SPACE: sre_parse.parse("[ \t\n\r\f\v]"),
    sre_parse.CATEGORY_NOT_SPACE: sre_parse.parse("[^ \t\n\r\f\v]"),
    sre_parse.CATEGORY_WORD: sre_parse.parse("[a-zA-Z0-9_]"),
    sre_parse.CATEGORY_NOT_WORD: sre_parse.parse("[^a-zA-Z0-9_]"),
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment