-
-
Save sorcio/25ce2913ec8786ad10b279bc5edfd247 to your computer and use it in GitHub Desktop.
Exploiting Python's sre_parse internals to generate random text from regular expressions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import sre_parse | |
# Upper bound applied to repetition counts, so that open-ended quantifiers
# such as ``*`` and ``+`` still produce output of bounded length.
MAX_ALLOWED_REPEATS = 100
def randre(re_text):
    """Return a random string generated from the regular expression *re_text*.

    The expression is parsed with ``sre_parse.parse`` and the resulting
    pattern is handed to a fresh ``Generator``, so group state is never
    shared between calls.
    """
    parsed = sre_parse.parse(re_text)
    generator = Generator()
    return generator.gen_pattern(parsed)
class Generator:
    """Produce random text from a parsed regular expression.

    A pattern is a sequence of ``(op, args)`` items as returned by
    ``sre_parse.parse``.  Each item is dispatched through
    ``RAND_GENERATORS`` to the method that knows how to emit text for
    that opcode.  The text produced for every group is stored in
    ``self.groups`` so back-references and conditional groups can use it.
    """

    # ASCII membership tables used to honour categories (``\d``, ``\s``,
    # ``\w`` and their negations) that appear inside a negated set.  They
    # describe the same character sets as the module-level CATEGORIES.
    _DIGIT_CHARS = frozenset("0123456789")
    _SPACE_CHARS = frozenset(" \t\n\r\f\v")
    _WORD_CHARS = frozenset(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789_"
    )
    # category -> (member characters, True when the category is negated)
    _CATEGORY_MEMBERS = {
        sre_parse.CATEGORY_DIGIT: (_DIGIT_CHARS, False),
        sre_parse.CATEGORY_NOT_DIGIT: (_DIGIT_CHARS, True),
        sre_parse.CATEGORY_SPACE: (_SPACE_CHARS, False),
        sre_parse.CATEGORY_NOT_SPACE: (_SPACE_CHARS, True),
        sre_parse.CATEGORY_WORD: (_WORD_CHARS, False),
        sre_parse.CATEGORY_NOT_WORD: (_WORD_CHARS, True),
    }

    def __init__(self):
        # Maps a group id to the text most recently generated for it.
        self.groups = {}

    def gen_pattern(self, pattern):
        """Generate text for every item of *pattern* and concatenate it."""
        return ''.join(self.gen_item(op, args) for op, args in pattern)

    def gen_item(self, op, args):
        """Dispatch one ``(op, args)`` item to its generator.

        Raises ``KeyError`` for an opcode missing from ``RAND_GENERATORS``.
        """
        return self.RAND_GENERATORS[op](self, args)

    def gen_literal(self, lit):
        """Return the character whose code point is *lit*."""
        return chr(lit)

    def gen_category(self, category):
        """Generate a character for *category* via the CATEGORIES table."""
        return self.gen_pattern(CATEGORIES[category])

    def gen_any(self, _):
        """Return a random character with code point in 32..127 inclusive."""
        return chr(random.randint(32, 127))

    def _in_category(self, char, category):
        """Return True when *char* belongs to *category*.

        Categories without an entry in ``_CATEGORY_MEMBERS`` are reported
        as not matching, so they never exclude a candidate character.
        """
        entry = self._CATEGORY_MEMBERS.get(category)
        if entry is None:
            return False
        members, negated = entry
        return (char in members) != negated

    def _check_negate(self, generation, options):
        """Return True when *generation* is listed in a negated set.

        *generation* is a single character; *options* are the items of an
        ``IN`` set.  Literals and ranges hold integer code points, so the
        candidate is compared by its code point rather than as a string.
        Items with any other opcode (including the leading ``NEGATE``
        marker) are skipped.
        """
        code = ord(generation)
        for op, arg in options:
            if op == sre_parse.LITERAL:
                if code == arg:
                    return True
            elif op == sre_parse.RANGE:
                if arg[0] <= code <= arg[1]:
                    return True
            elif op == sre_parse.CATEGORY:
                if self._in_category(generation, arg):
                    return True
        return False

    def gen_in(self, options):
        """Generate one character for a character set.

        For a plain set, one member item is picked at random and
        generated.  For a negated set, candidates from ``gen_any`` are
        drawn until one is not excluded; this never terminates if the set
        excludes every character ``gen_any`` can produce.
        """
        if options[0][0] != sre_parse.NEGATE:
            op, args = random.choice(options)
            return self.gen_item(op, args)
        generation = self.gen_any(None)
        while self._check_negate(generation, options):
            generation = self.gen_any(None)
        return generation

    def gen_repeat(self, args):
        """Generate a random number of repetitions of a sub-pattern.

        *args* is ``(min_repeats, max_repeats, subpattern)``.  The upper
        bound is capped at ``MAX_ALLOWED_REPEATS`` but never below
        ``min_repeats``, so a required minimum above the cap is still
        honoured instead of handing ``randint`` an empty range.
        """
        min_repeats, max_repeats, subpattern = args
        upper = max(min_repeats, min(max_repeats, MAX_ALLOWED_REPEATS))
        repeats = random.randint(min_repeats, upper)
        return ''.join(self.gen_pattern(subpattern) for _ in range(repeats))

    def gen_range(self, args):
        """Return a random character between two code points, inclusive."""
        min_range, max_range = args
        return chr(random.randint(min_range, max_range))

    def gen_subpattern(self, args):
        """Generate a group's text and record it under the group id.

        Only the first and last elements of *args* are used (group id and
        sub-pattern), so any extra elements in between are ignored.
        """
        groupid = args[0]
        subpattern = args[-1]
        generation = self.gen_pattern(subpattern)
        self.groups[groupid] = generation
        return generation

    def gen_branch(self, args):
        """Generate text for one randomly chosen alternative.

        The first element of *args* is ignored.
        """
        _, subpatterns = args
        subpattern = random.choice(subpatterns)
        return self.gen_pattern(subpattern)

    def gen_groupref(self, groupid):
        """Return the text generated earlier for group *groupid*.

        Raises ``KeyError`` when that group has not been generated.
        """
        return self.groups[groupid]

    def gen_groupref_exists(self, args):
        """Generate the yes/no part of a conditional group.

        *args* is ``(groupid, yes_pattern, no_pattern)``.  The test is
        whether the group was generated at all, so a group that produced
        the empty string still selects the yes-pattern.
        """
        groupid, yes_pattern, no_pattern = args
        if groupid in self.groups:
            return self.gen_pattern(yes_pattern)
        elif no_pattern:
            return self.gen_pattern(no_pattern)
        else:
            return ''

    def gen_not_literal(self, lit):
        """Return a random character other than the one with code point *lit*."""
        excluded = chr(lit)
        generation = self.gen_any(None)
        while generation == excluded:
            generation = self.gen_any(None)
        return generation

    def gen_assert(self, args):
        """Emit nothing for a lookaround assertion."""
        return ''

    def gen_assert_not(self, args):
        """Emit nothing for a negative lookaround assertion."""
        return ''

    def gen_at(self, args):
        """Emit nothing for a position anchor."""
        return ''

    # Opcode -> generator function.  The values are plain functions, so
    # gen_item passes ``self`` explicitly when calling them.
    RAND_GENERATORS = {
        sre_parse.LITERAL: gen_literal,
        sre_parse.CATEGORY: gen_category,
        sre_parse.ANY: gen_any,
        sre_parse.IN: gen_in,
        sre_parse.MAX_REPEAT: gen_repeat,
        sre_parse.MIN_REPEAT: gen_repeat,
        sre_parse.RANGE: gen_range,
        sre_parse.SUBPATTERN: gen_subpattern,
        sre_parse.BRANCH: gen_branch,
        sre_parse.GROUPREF: gen_groupref,
        sre_parse.GROUPREF_EXISTS: gen_groupref_exists,
        sre_parse.NOT_LITERAL: gen_not_literal,
        sre_parse.ASSERT: gen_assert,
        sre_parse.ASSERT_NOT: gen_assert_not,
        sre_parse.AT: gen_at,
    }
# Maps each regex category to a parsed character-set pattern that can be
# fed back into Generator.gen_pattern to produce a matching character.
CATEGORIES = {
    category: sre_parse.parse(charset)
    for category, charset in (
        (sre_parse.CATEGORY_DIGIT, "[0-9]"),
        (sre_parse.CATEGORY_NOT_DIGIT, "[^0-9]"),
        (sre_parse.CATEGORY_SPACE, "[ \t\n\r\f\v]"),
        (sre_parse.CATEGORY_NOT_SPACE, "[^ \t\n\r\f\v]"),
        (sre_parse.CATEGORY_WORD, "[a-zA-Z0-9_]"),
        (sre_parse.CATEGORY_NOT_WORD, "[^a-zA-Z0-9_]"),
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment