Skip to content

Instantly share code, notes, and snippets.

@phith0n
Last active May 21, 2021 10:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save phith0n/e31ba266ec6fff45bc8b316b1101b723 to your computer and use it in GitHub Desktop.
Save phith0n/e31ba266ec6fff45bc8b316b1101b723 to your computer and use it in GitHub Desktop.
A simple regexp generator: it derives a regular expression from a sample string.
import re
from enum import Enum
class State(Enum):
    """Scanner states used while walking a sample string character by character.

    Each character class comes as a pair: the plain state means exactly one
    character of that class has been seen so far, the ``*Rest`` state means
    two or more have been seen in a row.
    """

    # No character has been consumed yet.
    Initial = 0

    # Digit run: first digit / further digits.
    D = 10
    DRest = 11

    # Whitespace run: first whitespace character / further whitespace.
    S = 20
    SRest = 21

    # Word-character run: first word character / further word characters.
    W = 30
    WRest = 31

    # Any character that is not a digit, word character or whitespace.
    Other = 100
class ReGenerate(object):
    r"""Derive a generalising regular expression from a sample string.

    The sample is scanned one character at a time. Consecutive digits
    collapse into ``\d+``, consecutive word characters into ``\w+`` and
    consecutive whitespace into ``\s+``; a lone character of one of those
    classes becomes ``\d``, ``\w`` or ``\s``. Every other character is
    emitted on its own, escaped with ``re.escape``.

    Example: ``"My name is Bob, I'm 25 years old"`` yields
    ``\w+\s\w+\s\w+\s\w+,\s\w'\w\s\d+\s\w+\s\w+``.
    """

    def __init__(self):
        # Finished pattern pieces, in output order.
        self.tokens = []
        # State of the scanner after the most recently consumed character.
        self.current_state: State = State.Initial
        # Pattern piece for the run currently being scanned.
        self.fragment = ''

    def flush(self, ch: str) -> None:
        """Close the current fragment and start a new one for ``ch``.

        The pending fragment (if any) is appended to ``tokens``. ``ch`` is
        then classified: digits are tested before word characters because
        ``\\w`` also matches digits. An empty ``ch`` leaves an empty fragment
        and the ``Other`` state, which is how ``generate`` emits the final
        fragment at end of input.
        """
        if self.fragment:
            self.tokens.append(self.fragment)
            self.fragment = ''

        if self.is_d(ch):
            self.fragment = r'\d'
            self.current_state = State.D
        elif self.is_w(ch):
            self.fragment = r'\w'
            self.current_state = State.W
        elif self.is_space(ch):
            self.fragment = r'\s'
            self.current_state = State.S
        else:
            self.fragment = re.escape(ch)
            self.current_state = State.Other

    def generate(self, data: str) -> str:
        """Return a regular expression pattern generalising ``data``.

        The scanner is reset on every call, so one instance can be reused
        for several samples without earlier output leaking into later
        results. An empty ``data`` yields an empty pattern.
        """
        # Reset: without this a second call would return the previous
        # tokens followed by the new ones.
        self.tokens = []
        self.fragment = ''
        self.current_state = State.Initial

        # One character of a class seen: a second one upgrades the fragment
        # to a "+" run and moves to the matching *Rest state.
        run_start = {
            State.D: (self.is_d, State.DRest),
            State.W: (self.is_w, State.WRest),
            State.S: (self.is_space, State.SRest),
        }
        # Inside a "+" run: further characters of the class are absorbed.
        run_rest = {
            State.DRest: self.is_d,
            State.WRest: self.is_w,
            State.SRest: self.is_space,
        }

        for ch in data:
            state = self.current_state
            if state in run_start:
                same_class, rest_state = run_start[state]
                if same_class(ch):
                    self.current_state = rest_state
                    self.fragment += '+'
                else:
                    self.flush(ch)
            elif state in run_rest:
                if not run_rest[state](ch):
                    self.flush(ch)
            else:
                # Initial and Other: every character starts a new fragment.
                self.flush(ch)

        # Emit whatever fragment is still pending at end of input.
        self.flush('')
        return ''.join(self.tokens)

    def is_d(self, ch: str) -> bool:
        """Return True if ``ch`` starts with a character matched by ``\\d``."""
        return re.match(r'[\d]', ch) is not None

    def is_space(self, ch: str) -> bool:
        """Return True if ``ch`` starts with a character matched by ``\\s``."""
        return re.match(r'[\s]', ch) is not None

    def is_w(self, ch: str) -> bool:
        """Return True if ``ch`` starts with a character matched by ``\\w``."""
        return re.match(r'[\w]', ch) is not None
if __name__ == '__main__':
    # Demo: derive a pattern from a sample sentence and print it.
    # Expected output: \w+\s\w+\s\w+\s\w+,\s\w'\w\s\d+\s\w+\s\w+
    sample = 'My name is Bob, I\'m 25 years old'
    generator = ReGenerate()
    print(generator.generate(sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment