EZLiang/regex.md

## regex.md

      
    Raw
  

              regex.md
            
          
    Regex

is a very simple esoteric programming language.
The program is of the form
c/filter/flags/halt/flags2 comments

Note that /s in c must be escaped, and special chars in /filter/flags and /halt/flags2 must be escaped, as they are regular expressions and subject to their rules. Flags are case insensitive. Supported flags:

n - also consider non-printable characters (if enabled on either regex, it is automatically enabled on the other)
i - case-insensitive
m - also consider multiline strings (ditto)
l - enable multiline mode, automatically implying m
s - dot matches all
f - match the entire string
x - only a flag in halt, make program run infinitely
+group - only print a group (reference by number). Default is equivalent to +0
-group - same, but by name
In addition, verbose mode is enabled, allowing comments.

Execution

The regex interpreter, if c is present (non-empty), takes an input, and then runs through strings, and for each string:

If /halt/flags2 matches input + c + string (the three concatenated) if there is input, or string if there is none, it halts.
If /filter/flags matches input + c + string if there is input, or string if there is none, it prints string.
It counts using Pyth's base256 encoding.


## regex.py
import re

class RegexSyntaxError(Exception):
  """Signals a malformed program: an invalid regex or conflicting print modes."""

# Compiled empty pattern; nothing in the visible part of this file refers to it.
filter_printable = re.compile(r"")

def basedecode(n, charset):
  """Write the non-negative integer *n* in base ``len(charset)``.

  Each digit ``d`` is rendered as ``charset[d]``, most significant digit
  first. Zero is rendered as the empty string, so no result ever starts
  with ``charset[0]``.

  Args:
    n: The non-negative integer to convert.
    charset: Indexable sequence of string symbols, one per digit value.

  Returns:
    The concatenated symbols, or ``""`` when ``n == 0``.

  Raises:
    ValueError: If ``n`` is negative, or if ``n`` is non-zero and
      ``charset`` has fewer than two symbols (the conversion would never
      terminate).
  """
  if n < 0:
    raise ValueError("basedecode() requires a non-negative integer")
  if n == 0:
    return ""
  base = len(charset)
  if base < 2:
    raise ValueError("charset must contain at least two symbols")
  # Iterate instead of recursing so that long results are not limited by
  # the interpreter's recursion depth.
  symbols = []
  while n:
    n, digit = divmod(n, base)
    symbols.append(charset[digit])
  symbols.reverse()  # digits were produced least-significant first
  return "".join(symbols)

# Alphabet switches. compile() turns them on as a side effect when it sees
# the n / m / l flags; run() reads them when it builds its character set.
multiline, nonprintable = False, False
def compile(regex, flags, error_context=""):
  """Turn a regex source string and its flag letters into a matcher callable.

  Flag letters are interpreted case-insensitively:
    n      sets the module-level ``nonprintable`` switch
    m, l   set the module-level ``multiline`` switch
    i      adds re.IGNORECASE
    l      adds re.MULTILINE
    s      adds re.DOTALL
    f      makes the matcher use ``fullmatch`` instead of ``match``
    x      returns a matcher that never matches; the regex is not compiled

  Args:
    regex: Regular expression source.
    flags: String of flag letters (may be empty).
    error_context: Label inserted into the error message.

  Returns:
    A one-argument callable: the compiled pattern's bound ``fullmatch`` or
    ``match`` method, or for ``x`` a function that always returns False.

  Raises:
    RegexSyntaxError: If ``regex`` is not a valid regular expression.
  """
  global multiline, nonprintable
  # Normalise once so that uppercase letters behave like lowercase ones.
  letters = flags.lower()
  if "n" in letters:
    nonprintable = True
  if "m" in letters or "l" in letters:
    multiline = True
  if "x" in letters:
    return lambda x: False
  # NOTE(review): the README says verbose mode is enabled, but re.VERBOSE is
  # not set here - confirm which of the two is intended.
  options = re.U
  if "i" in letters:
    options |= re.IGNORECASE
  if "l" in letters:
    options |= re.MULTILINE
  if "s" in letters:
    options |= re.DOTALL
  try:
    pattern = re.compile(regex, options)
  except re.error as e:
    # Report the flags exactly as the user wrote them.
    raise RegexSyntaxError("Invalid regex in " + error_context + "(/" + regex + "/" + flags + "): " + str(e)) from e
  return pattern.fullmatch if "f" in letters else pattern.match

# Splits a program into: separator c, filter regex, filter flags, optional
# +N / -name print selector, halt regex, halt flags. Anything after the halt
# flags is left unmatched. The flag classes include "l", which compile()
# understands.
extraction = re.compile(r"^((?:[^\\/]|\\.)*)/((?:[^\\/]|\\.)*)/([nimlsfNIMLSF]*)((?:\+[0-9]+)?(?:\-\w+)?)/((?:[^\\/]|\\.)*)/([nimlsfxNIMLSFX]*)") # ironic
def parse(program):
  """Split a program of the form ``c/filter/flags/halt/flags2`` into its parts.

  Args:
    program: The program source text.

  Returns:
    A tuple ``(sep, uinput, filt, halt, group)``: the separator text, whether
    the separator is non-empty, the filter matcher, the halt matcher, and the
    group to print (an int for ``+N`` or the default 0, a str for ``-name``).

  Raises:
    RegexSyntaxError: If the program does not have the expected shape, if
      either regex is invalid, or if both ``+`` and ``-`` selectors are given.
  """
  global multiline, nonprintable
  matches = extraction.match(program)
  if matches is None:
    raise RegexSyntaxError("Program is not of the form c/filter/flags/halt/flags2")
  # compile() only ever switches these on, so clear them first; otherwise the
  # flags of a previously parsed program would leak into this one.
  multiline, nonprintable = False, False
  # NOTE(review): sep keeps backslash escapes exactly as written (e.g. "\/")
  # - confirm whether it should be unescaped before use.
  sep = matches.group(1)
  uinput = bool(sep)
  filt = compile(matches.group(2), matches.group(3), "filter")
  halt = compile(matches.group(5), matches.group(6), "halting")
  printing = matches.group(4)
  if "+" in printing and "-" in printing:
    raise RegexSyntaxError("You cannot use two priting modes")
  if printing.startswith("+"):
    group = int(printing[1:])
  elif printing.startswith("-"):
    group = printing[1:]
  else:
    group = 0
  return sep, uinput, filt, halt, group

def enum(charset):
  """Endlessly yield basedecode(k, charset) for k = 0, 1, 2, ..."""
  counter = -1
  while True:
    counter += 1
    yield basedecode(counter, charset)

def run(regex, get_input=input):
  """Execute a program, yielding one output per candidate the filter accepts.

  The program is parsed first. If it has a non-empty separator, one input is
  read with ``get_input`` and ``input + separator`` is prepended to every
  candidate before matching. Candidates come from ``enum`` over a character
  set chosen by the module-level ``nonprintable`` and ``multiline`` switches.
  For each candidate the halt matcher is tried first; if it matches, the
  generator finishes. Otherwise, if the filter matches, the selected group of
  that match is yielded.

  Args:
    regex: Program source, passed to ``parse``.
    get_input: Zero-argument callable returning the input string; only
      called when the program has a separator.

  Yields:
    ``match.group(group)`` for each candidate accepted by the filter.
  """
  sep, uinput, filt, halt, group = parse(regex)
  prefix = get_input() + sep if uinput else ""
  # The switches are read only after parse(), which is what sets them.
  if nonprintable and multiline:
    charset = [chr(code) for code in range(128)]
  elif nonprintable:
    charset = [chr(code) for code in range(128) if code != 10]
  elif multiline:
    # Newline must be the character "\n", not the integer 10, or building a
    # candidate string fails with a TypeError.
    charset = ["\n"] + [chr(code) for code in range(32, 127)]
  else:
    charset = [chr(code) for code in range(32, 127)]
  for candidate in enum(charset):
    subject = prefix + candidate
    if halt(subject):
      return
    found = filt(subject)  # evaluated once and reused for the output
    if found:
      yield found.group(group)
	import re

	class RegexSyntaxError(Exception):
	  """Signals a malformed program: an invalid regex or conflicting print modes."""

	# Compiled empty pattern; nothing in the visible part of this file refers to it.
	filter_printable = re.compile(r"")

	def basedecode(n, charset):
	  """Write the non-negative integer *n* in base ``len(charset)``.

	  Each digit ``d`` is rendered as ``charset[d]``, most significant digit
	  first. Zero is rendered as the empty string, so no result ever starts
	  with ``charset[0]``.

	  Args:
	    n: The non-negative integer to convert.
	    charset: Indexable sequence of string symbols, one per digit value.

	  Returns:
	    The concatenated symbols, or ``""`` when ``n == 0``.

	  Raises:
	    ValueError: If ``n`` is negative, or if ``n`` is non-zero and
	      ``charset`` has fewer than two symbols (the conversion would never
	      terminate).
	  """
	  if n < 0:
	    raise ValueError("basedecode() requires a non-negative integer")
	  if n == 0:
	    return ""
	  base = len(charset)
	  if base < 2:
	    raise ValueError("charset must contain at least two symbols")
	  # Iterate instead of recursing so that long results are not limited by
	  # the interpreter's recursion depth.
	  symbols = []
	  while n:
	    n, digit = divmod(n, base)
	    symbols.append(charset[digit])
	  symbols.reverse()  # digits were produced least-significant first
	  return "".join(symbols)

	# Alphabet switches. compile() turns them on as a side effect when it sees
	# the n / m / l flags; run() reads them when it builds its character set.
	multiline, nonprintable = False, False
	def compile(regex, flags, error_context=""):
	  """Turn a regex source string and its flag letters into a matcher callable.

	  Flag letters are interpreted case-insensitively:
	    n      sets the module-level ``nonprintable`` switch
	    m, l   set the module-level ``multiline`` switch
	    i      adds re.IGNORECASE
	    l      adds re.MULTILINE
	    s      adds re.DOTALL
	    f      makes the matcher use ``fullmatch`` instead of ``match``
	    x      returns a matcher that never matches; the regex is not compiled

	  Args:
	    regex: Regular expression source.
	    flags: String of flag letters (may be empty).
	    error_context: Label inserted into the error message.

	  Returns:
	    A one-argument callable: the compiled pattern's bound ``fullmatch`` or
	    ``match`` method, or for ``x`` a function that always returns False.

	  Raises:
	    RegexSyntaxError: If ``regex`` is not a valid regular expression.
	  """
	  global multiline, nonprintable
	  # Normalise once so that uppercase letters behave like lowercase ones.
	  letters = flags.lower()
	  if "n" in letters:
	    nonprintable = True
	  if "m" in letters or "l" in letters:
	    multiline = True
	  if "x" in letters:
	    return lambda x: False
	  # NOTE(review): the README says verbose mode is enabled, but re.VERBOSE is
	  # not set here - confirm which of the two is intended.
	  options = re.U
	  if "i" in letters:
	    options |= re.IGNORECASE
	  if "l" in letters:
	    options |= re.MULTILINE
	  if "s" in letters:
	    options |= re.DOTALL
	  try:
	    pattern = re.compile(regex, options)
	  except re.error as e:
	    # Report the flags exactly as the user wrote them.
	    raise RegexSyntaxError("Invalid regex in " + error_context + "(/" + regex + "/" + flags + "): " + str(e)) from e
	  return pattern.fullmatch if "f" in letters else pattern.match

	# Splits a program into: separator c, filter regex, filter flags, optional
	# +N / -name print selector, halt regex, halt flags. Anything after the halt
	# flags is left unmatched. The flag classes include "l", which compile()
	# understands.
	extraction = re.compile(r"^((?:[^\\/]|\\.)*)/((?:[^\\/]|\\.)*)/([nimlsfNIMLSF]*)((?:\+[0-9]+)?(?:\-\w+)?)/((?:[^\\/]|\\.)*)/([nimlsfxNIMLSFX]*)") # ironic
	def parse(program):
	  """Split a program of the form ``c/filter/flags/halt/flags2`` into its parts.

	  Args:
	    program: The program source text.

	  Returns:
	    A tuple ``(sep, uinput, filt, halt, group)``: the separator text, whether
	    the separator is non-empty, the filter matcher, the halt matcher, and the
	    group to print (an int for ``+N`` or the default 0, a str for ``-name``).

	  Raises:
	    RegexSyntaxError: If the program does not have the expected shape, if
	      either regex is invalid, or if both ``+`` and ``-`` selectors are given.
	  """
	  global multiline, nonprintable
	  matches = extraction.match(program)
	  if matches is None:
	    raise RegexSyntaxError("Program is not of the form c/filter/flags/halt/flags2")
	  # compile() only ever switches these on, so clear them first; otherwise the
	  # flags of a previously parsed program would leak into this one.
	  multiline, nonprintable = False, False
	  # NOTE(review): sep keeps backslash escapes exactly as written (e.g. "\/")
	  # - confirm whether it should be unescaped before use.
	  sep = matches.group(1)
	  uinput = bool(sep)
	  filt = compile(matches.group(2), matches.group(3), "filter")
	  halt = compile(matches.group(5), matches.group(6), "halting")
	  printing = matches.group(4)
	  if "+" in printing and "-" in printing:
	    raise RegexSyntaxError("You cannot use two priting modes")
	  if printing.startswith("+"):
	    group = int(printing[1:])
	  elif printing.startswith("-"):
	    group = printing[1:]
	  else:
	    group = 0
	  return sep, uinput, filt, halt, group

	def enum(charset):
	  """Endlessly yield basedecode(k, charset) for k = 0, 1, 2, ..."""
	  i = 0
	  while True:
	    yield basedecode(i, charset)
	    i += 1

	def run(regex, get_input=input):
	  """Execute a program, yielding one output per candidate the filter accepts.

	  The program is parsed first. If it has a non-empty separator, one input is
	  read with ``get_input`` and ``input + separator`` is prepended to every
	  candidate before matching. Candidates come from ``enum`` over a character
	  set chosen by the module-level ``nonprintable`` and ``multiline`` switches.
	  For each candidate the halt matcher is tried first; if it matches, the
	  generator finishes. Otherwise, if the filter matches, the selected group of
	  that match is yielded.

	  Args:
	    regex: Program source, passed to ``parse``.
	    get_input: Zero-argument callable returning the input string; only
	      called when the program has a separator.

	  Yields:
	    ``match.group(group)`` for each candidate accepted by the filter.
	  """
	  sep, uinput, filt, halt, group = parse(regex)
	  prefix = get_input() + sep if uinput else ""
	  # The switches are read only after parse(), which is what sets them.
	  if nonprintable and multiline:
	    charset = [chr(code) for code in range(128)]
	  elif nonprintable:
	    charset = [chr(code) for code in range(128) if code != 10]
	  elif multiline:
	    # Newline must be the character "\n", not the integer 10, or building a
	    # candidate string fails with a TypeError.
	    charset = ["\n"] + [chr(code) for code in range(32, 127)]
	  else:
	    charset = [chr(code) for code in range(32, 127)]
	  for candidate in enum(charset):
	    subject = prefix + candidate
	    if halt(subject):
	      return
	    found = filt(subject)  # evaluated once and reused for the output
	    if found:
	      yield found.group(group)