Veedrac/SliceRegex.py

## SliceRegex.py
"""
This is hereby released completely and irrevocably into the Public Domain.
Because, you know, I'm awesome.

See "raw.github.com/Veedrac/Sublime-Extras/master/license%20information.txt"
for details of what this means if you're not confident about it.
"""

# TODO:
#
# 1.
# There needs to be an option for case-insensitivity, but I
# don't think regex can support partial case-insensitivity
# without *loads* of difficulty...
#
# 2.
# Move to the new, extremely more powerful "regex" module
# on PyPI that's meant to replace "re".
#
# 3.
# Python 3 doesn't need the re.UNICODE flag, and re.UNICODE
# prevents usage of re.ASCII (which is sometimes desirable
# for some odd reason), so I should remove re.UNICODE and
# allow re.ASCII flags.
#
# 4.
# Everything. Really :P.

import re
from numbers import Integral

# A helper function that SliceRegex relies on heavily
def _to_regex_str(x):
	"""
	Take a load of different inputs and "translate" them into
	regex strings.

	Approximately:
		slice -> "(?:x.start){x.stop,x.step}"
		regex -> x.pattern
		str   -> re.escape(x)
		list  -> "(?:" + "|".join(x) + ")"

	A slice is read as value:minimum:maximum, where None or Ellipsis
	means "no limit" on that side. With only value:count given (no
	second colon) the count is exact.

	Compiled regexes are embedded by their pattern text. re.I and re.L
	are carried over as a scoped inline-flag group, "(?i:...)", so they
	affect only the embedded pattern (this syntax needs Python 3.6+).
	Patterns without those flags are embedded verbatim.

	Raises ValueError for an unsupported input type, for a slice
	boundary that is not a non-negative integer, and for regexes
	compiled with re.ASCII or re.VERBOSE.
	"""

	if isinstance(x, slice):
		# We want to give "x.start{x.stop,x.step}"
		# We need to treat None or Ellipsis as a no-value
		# Which implies no limit
		#
		# However if x.step is None it overrides
		# and "x.start{x.stop}" will be used for
		# an exact count

		def sanitise(n):
			if n is None or n is Ellipsis:
				return ""

			if isinstance(n, Integral):
				n = int(n)

				# "{-1}" is not a repeat at all: re silently reads it
				# as literal text, so refuse it here instead.
				if n < 0:
					raise ValueError("Slice boundary", n, "is negative!")

				return n

			raise ValueError("Slice boundary", n, "is not an integral!")

		value = _to_regex_str(x.start)
		start = sanitise(x.stop)
		stop  = sanitise(x.step)

		# Not when x.step is Ellipsis!
		# This has to be special-cased
		#
		# Compare against "" rather than testing truthiness, otherwise
		# an exact count of 0 would fall through and become "{0,}".
		if start != "" and x.step is None:
			return "(?:{}){{{}}}".format(value, start)

		# This isn't needed per-se, but it's nicer to look at
		if start == stop == "":
			return "(?:{})*".format(value)

		return "(?:{}){{{},{}}}".format(value, start, stop)

	elif isinstance(x, type(re.compile(""))):
		# Regex be nice, they give their pattern
		# Unfortunately this method means that things
		# like the flags would be lost
		#
		# Because this is internal we can ignore
		# re.M, re.S and re.U – they'll be added
		# if we like it or not!
		#
		# re.DEBUG cannot be passed easily, so is
		# dropped.
		#
		# re.A is rejected as it clashes with re.U # TODO: Remove, re.U is not needed as it's default.
		# re.X raises errors as it's impossible to support.
		# re.L and re.I are passed through.

		pattern = x.pattern
		flags = x.flags

		flags_lst = []

		if flags & re.ASCII:
			raise ValueError("The re.ASCII flag is not yet supported.")

		if flags & re.IGNORECASE:
			flags_lst.append("i")

		if flags & re.LOCALE:
			flags_lst.append("L")

		if flags & re.VERBOSE:
			raise ValueError("The re.VERBOSE flag is not supported.")

		if flags_lst:
			# A scoped group keeps the flags local to this pattern. A
			# global "(?i)" would apply to the whole joined expression
			# and is an error when it is not at the very start.
			pattern = "(?{}:{})".format("".join(flags_lst), pattern)

		return pattern

	elif isinstance(x, str):
		return re.escape(x)
		# "Got away safely!"

	elif isinstance(x, list):
		return "(?:{})".format("|".join(map(_to_regex_str, x)))

	else:
		raise ValueError(x, "is not able to be parsed into regex.")


class SliceRegex:
	"""
	Build compiled regexes out of readable subscript expressions.

	Usage: SliceRegex()[*cool, readable stuff*]

	Subscript items are translated by _to_regex_str and concatenated
	in order; the result is a compiled pattern.

	There is no important initialisation.
	"""

	# At all points the flags re.M, re.S and re.U
	# are to be set. It's the only way to support
	# all options at once, AFAICT, and the non-flag
	# versions can be done with other means.
	_FLAGS = re.M | re.S | re.U

	# Additionally, *all* results need to be atomic
	# to safely allow for transformations.

	# At all points input should be treated as
	# non-atomic.

	def __getitem__(self, item):
		"""Compile the concatenation of the subscripted item(s)."""

		# "item" will either be an item (s[foo])
		# a tuple of items (s[foo, bar])
		#
		# If the former, change to the second.
		# We are *very* explicit about being a tuple
		# here because it minimizes confusion.
		if type(item) is not tuple:
			item = (item,)

		regex_str = "".join(map(_to_regex_str, item))
		return re.compile(regex_str, flags=self._FLAGS)

	def any_of_chars(self, characters):
		"""Match any one of the given characters."""
		return re.compile("[{}]".format(re.escape(characters)), flags=self._FLAGS)

	def not_any_of_chars(self, characters):
		"""Match anything other than one of the given characters."""
		return re.compile("[^{}]".format(re.escape(characters)), flags=self._FLAGS)

	def store(self, regexable, *, name=None):
		"""
		Wrap *regexable* in a capturing group and compile it.

		regexable is anything _to_regex_str accepts. When *name* is
		given the group is a named one, "(?P<name>...)"; otherwise it
		is a plain "(...)".
		"""
		if name is None:
			formatter = "({regex_str})"

		else:
			formatter = "(?P<{name}>{regex_str})"
			name = re.escape(name)

		regex_str = formatter.format(name=name, regex_str=_to_regex_str(regexable))

		# The flags belong to re.compile(). Passed to str.format() they
		# are silently ignored and the result loses re.M and re.S.
		return re.compile(regex_str, flags=self._FLAGS)

# Shorthand table: one "<pattern> <attribute name>" pair per row.
# Every pattern here assumes "(?msu)" is in effect.
regexes = [
	line
	for line in r"""
	\A    start
	\b    word_boundary
	\B    not_word_boundary
	\d    digit
	\D    not_digit
	\s    space
	\S    not_space
	\w    word_character
	\W    not_word_character
	\Z    end
	.     anything
	[^\n] any_in_line
	^     line_start
	$     line_end
""".split("\n")
	if line
]

# Publish every row as a class attribute holding the compiled pattern,
# so that SliceRegex.not_space == re.compile("\S", flags=re.M|re.S|re.U)
for regex in regexes:
	expression, name = regex.split()
	setattr(SliceRegex, name, re.compile(expression, flags=re.M | re.S | re.U))

# This is the object through which the SliceRegex class is used
r = SliceRegex()

#>>>
###################################################################
# Just some quick "tests" that I like to run inline while working #
#                                                                 #
# If you're wondering why the stuff below is formatted how it is, #
# it's a SublimeText plugin I'm working on that I'll release once #
# I'm more confident it's stable and once some big bugs are gone. #
#                                                                 #
# It's already awesome, though, because my REPL is inside my code #
# so it's like an IDE with its integrated console... but better!  #
#                                                                 #
# Are you impressed that my blocks align perfectly at the border? #
###################################################################

# "I is ", then any number of ("really " up to 7 times, "quite ") groups,
# then a captured "cool" or "modest", anchored to the whole string.
truth_asserter = r[
	r.start,
	"I is ",
	r["really "::7, "quite "]:,
	r.store(["cool", "modest"], name="attitude"),
	r.end,
]

print(truth_asserter.pattern)

statements = [
	"I is cool",
	"I is really cool",
	"I is really really quite modest",
	"I is quite modest",
	"I is really quite really cool",
	"I is quite quite quite cool",
	"I is quite quite quite really modest",
	"I is quite really cool",
	"I is really really really really really really really quite really "
	"really really really really really really quite really really really "
	"really really really really quite cool"
]

for statement in statements:
	# Abbreviate very long statements so every report stays on one line
	if len(statement) <= 120:
		statement_repr = repr(statement)
	else:
		statement_repr = repr(statement[:78] + "..." + statement[-19:])

	match = truth_asserter.match(statement)
	print("It is {1} that {0}".format(statement_repr, bool(match)))

	if match and match.group("attitude") == "modest":
		print("... Hey, wait... I'm not modest!")

#>>> \AI\ is\ (?:(?:really\ ){,7}quite\ )*(?P<attitude>(?:cool|modest))\Z
#>>> It is True that 'I is cool'
#>>> It is False that 'I is really cool'
#>>> It is True that 'I is really really quite modest'
#>>> ... Hey, wait... I'm not modest!
#>>> It is True that 'I is quite modest'
#>>> ... Hey, wait... I'm not modest!
#>>> It is False that 'I is really quite really cool'
#>>> It is True that 'I is quite quite quite cool'
#>>> It is False that 'I is quite quite quite really modest'
#>>> It is False that 'I is quite really cool'
#>>> It is True that 'I is really really really really really really really quite really really real...y really quite cool'

# You can trust me, these (might) work!

# Patterns are compiled with re.S, so r.anything (.) also matches a newline,
# while r.any_in_line ([^\n]) never does. The results are discarded; the
# "#>>>" lines record what each search returned.
# Here "\n" is taken by r.anything and "!" by r.any_in_line: a match.
r[r.start, r.anything, r.any_in_line].search("\n!")
#>>> <_sre.SRE_Match object at 0x7ff312e3c2a0>

# Anchored at the start, "z" is followed by "\n", which r.any_in_line rejects.
r[r.start, r.anything, r.any_in_line].search("z\n!")
#>>>

# Without the anchor the search may begin at the "\n" and match "\n!".
r[r.anything, r.any_in_line].search("z\n!")
#>>> <_sre.SRE_Match object at 0x7ff312e3c3d8>
	"""
	This is hereby released completely and irrevocably into the Public Domain.
	Because, you know, I'm awesome.

	See "raw.github.com/Veedrac/Sublime-Extras/master/license%20information.txt"
	for details of what this means if you're not confident about it.
	"""

	# TODO:
	#
	# 1.
	# There needs to be an option for case-insensitivity, but I
	# don't think regex can support partial case-insensitivity
	# without loads of difficulty...
	#
	# 2.
	# Move to the new, extremely more powerful "regex" module
	# on PyPI that's meant to replace "re".
	#
	# 3.
	# Python 3 doesn't need the re.UNICODE flag, and re.UNICODE
	# prevents usage of re.ASCII (which is sometimes desirable
	# for some odd reason), so I should remove re.UNICODE and
	# allow re.ASCII flags.
	#
	# 4.
	# Everything. Really :P.

	import re
	from numbers import Integral

	# A helper function that SliceRegex relies on heavily
	def _to_regex_str(x):
		"""
		Take a load of different inputs and "translate" them into
		regex strings.

		Approximately:
			slice -> "(?:x.start){x.stop,x.step}"
			regex -> x.pattern
			str   -> re.escape(x)
			list  -> "(?:" + "|".join(x) + ")"

		A slice is read as value:minimum:maximum, where None or Ellipsis
		means "no limit" on that side. With only value:count given (no
		second colon) the count is exact.

		Compiled regexes are embedded by their pattern text. re.I and re.L
		are carried over as a scoped inline-flag group, "(?i:...)", so they
		affect only the embedded pattern (this syntax needs Python 3.6+).
		Patterns without those flags are embedded verbatim.

		Raises ValueError for an unsupported input type, for a slice
		boundary that is not a non-negative integer, and for regexes
		compiled with re.ASCII or re.VERBOSE.
		"""

		if isinstance(x, slice):
			# We want to give "x.start{x.stop,x.step}"
			# We need to treat None or Ellipsis as a no-value
			# Which implies no limit
			#
			# However if x.step is None it overrides
			# and "x.start{x.stop}" will be used for
			# an exact count

			def sanitise(n):
				if n is None or n is Ellipsis:
					return ""

				if isinstance(n, Integral):
					n = int(n)

					# "{-1}" is not a repeat at all: re silently reads it
					# as literal text, so refuse it here instead.
					if n < 0:
						raise ValueError("Slice boundary", n, "is negative!")

					return n

				raise ValueError("Slice boundary", n, "is not an integral!")

			value = _to_regex_str(x.start)
			start = sanitise(x.stop)
			stop = sanitise(x.step)

			# Not when x.step is Ellipsis!
			# This has to be special-cased
			#
			# Compare against "" rather than testing truthiness, otherwise
			# an exact count of 0 would fall through and become "{0,}".
			if start != "" and x.step is None:
				return "(?:{}){{{}}}".format(value, start)

			# This isn't needed per-se, but it's nicer to look at
			if start == stop == "":
				return "(?:{})*".format(value)

			return "(?:{}){{{},{}}}".format(value, start, stop)

		elif isinstance(x, type(re.compile(""))):
			# Regex be nice, they give their pattern
			# Unfortunately this method means that things
			# like the flags would be lost
			#
			# Because this is internal we can ignore
			# re.M, re.S and re.U – they'll be added
			# if we like it or not!
			#
			# re.DEBUG cannot be passed easily, so is
			# dropped.
			#
			# re.A is rejected as it clashes with re.U # TODO: Remove, re.U is not needed as it's default.
			# re.X raises errors as it's impossible to support.
			# re.L and re.I are passed through.

			pattern = x.pattern
			flags = x.flags

			flags_lst = []

			if flags & re.ASCII:
				raise ValueError("The re.ASCII flag is not yet supported.")

			if flags & re.IGNORECASE:
				flags_lst.append("i")

			if flags & re.LOCALE:
				flags_lst.append("L")

			if flags & re.VERBOSE:
				raise ValueError("The re.VERBOSE flag is not supported.")

			if flags_lst:
				# A scoped group keeps the flags local to this pattern. A
				# global "(?i)" would apply to the whole joined expression
				# and is an error when it is not at the very start.
				pattern = "(?{}:{})".format("".join(flags_lst), pattern)

			return pattern

		elif isinstance(x, str):
			return re.escape(x)
			# "Got away safely!"

		elif isinstance(x, list):
			return "(?:{})".format("|".join(map(_to_regex_str, x)))

		else:
			raise ValueError(x, "is not able to be parsed into regex.")



	class SliceRegex:
		"""
		Build compiled regexes out of readable subscript expressions.

		Usage: SliceRegex()[*cool, readable stuff*]

		Subscript items are translated by _to_regex_str and concatenated
		in order; the result is a compiled pattern.

		There is no important initialisation.
		"""

		# At all points the flags re.M, re.S and re.U
		# are to be set. It's the only way to support
		# all options at once, AFAICT, and the non-flag
		# versions can be done with other means.
		_FLAGS = re.M | re.S | re.U

		# Additionally, all results need to be atomic
		# to safely allow for transformations.

		# At all points input should be treated as
		# non-atomic.

		def __getitem__(self, item):
			"""Compile the concatenation of the subscripted item(s)."""

			# "item" will either be an item (s[foo])
			# a tuple of items (s[foo, bar])
			#
			# If the former, change to the second.
			# We are very explicit about being a tuple
			# here because it minimizes confusion.
			if type(item) is not tuple:
				item = (item,)

			regex_str = "".join(map(_to_regex_str, item))
			return re.compile(regex_str, flags=self._FLAGS)

		def any_of_chars(self, characters):
			"""Match any one of the given characters."""
			return re.compile("[{}]".format(re.escape(characters)), flags=self._FLAGS)

		def not_any_of_chars(self, characters):
			"""Match anything other than one of the given characters."""
			return re.compile("[^{}]".format(re.escape(characters)), flags=self._FLAGS)

		def store(self, regexable, *, name=None):
			"""
			Wrap *regexable* in a capturing group and compile it.

			regexable is anything _to_regex_str accepts. When *name* is
			given the group is a named one, "(?P<name>...)"; otherwise it
			is a plain "(...)".
			"""
			if name is None:
				formatter = "({regex_str})"

			else:
				formatter = "(?P<{name}>{regex_str})"
				name = re.escape(name)

			regex_str = formatter.format(name=name, regex_str=_to_regex_str(regexable))

			# The flags belong to re.compile(). Passed to str.format() they
			# are silently ignored and the result loses re.M and re.S.
			return re.compile(regex_str, flags=self._FLAGS)

	# Shorthand table: one "<pattern> <attribute name>" pair per row.
	# Every pattern here assumes "(?msu)" is in effect.
	#
	# Whitespace-only rows are skipped, so the layout of the opening and
	# closing quotes cannot produce a row that fails to unpack below.
	regexes = [
		line
		for line in r"""
		\A    start
		\b    word_boundary
		\B    not_word_boundary
		\d    digit
		\D    not_digit
		\s    space
		\S    not_space
		\w    word_character
		\W    not_word_character
		\Z    end
		.     anything
		[^\n] any_in_line
		^     line_start
		$     line_end
		""".splitlines()
		if line.strip()
	]

	# Add on to SliceRegex, so that SliceRegex.not_space == re.compile("\S", flags=re.M|re.S|re.U)
	for regex in regexes:
		expression, name = regex.split()
		setattr(SliceRegex, name, re.compile(expression, flags=re.M | re.S | re.U))

	# This is the object through which the SliceRegex class is used
	r = SliceRegex()

	#>>>
	###################################################################
	# Just some quick "tests" that I like to run inline while working #
	#                                                                 #
	# If you're wondering why the stuff below is formatted how it is, #
	# it's a SublimeText plugin I'm working on that I'll release once #
	# I'm more confident it's stable and once some big bugs are gone. #
	#                                                                 #
	# It's already awesome, though, because my REPL is inside my code #
	# so it's like an IDE with its integrated console... but better!  #
	#                                                                 #
	# Are you impressed that my blocks align perfectly at the border? #
	###################################################################

	# "I is ", then any number of ("really " up to 7 times, "quite ") groups,
	# then a captured "cool" or "modest", anchored to the whole string.
	truth_asserter = r[
		r.start,
		"I is ",
		r["really "::7, "quite "]:,
		r.store(["cool", "modest"], name="attitude"),
		r.end,
	]

	print(truth_asserter.pattern)

	statements = [
		"I is cool",
		"I is really cool",
		"I is really really quite modest",
		"I is quite modest",
		"I is really quite really cool",
		"I is quite quite quite cool",
		"I is quite quite quite really modest",
		"I is quite really cool",
		"I is really really really really really really really quite really "
		"really really really really really really quite really really really "
		"really really really really quite cool"
	]

	for statement in statements:
		# Abbreviate very long statements so every report stays on one line
		if len(statement) <= 120:
			statement_repr = repr(statement)
		else:
			statement_repr = repr(statement[:78] + "..." + statement[-19:])

		match = truth_asserter.match(statement)
		print("It is {1} that {0}".format(statement_repr, bool(match)))

		if match and match.group("attitude") == "modest":
			print("... Hey, wait... I'm not modest!")

	#>>> \AI\ is\ (?:(?:really\ ){,7}quite\ )*(?P<attitude>(?:cool|modest))\Z
	#>>> It is True that 'I is cool'
	#>>> It is False that 'I is really cool'
	#>>> It is True that 'I is really really quite modest'
	#>>> ... Hey, wait... I'm not modest!
	#>>> It is True that 'I is quite modest'
	#>>> ... Hey, wait... I'm not modest!
	#>>> It is False that 'I is really quite really cool'
	#>>> It is True that 'I is quite quite quite cool'
	#>>> It is False that 'I is quite quite quite really modest'
	#>>> It is False that 'I is quite really cool'
	#>>> It is True that 'I is really really really really really really really quite really really real...y really quite cool'

	# You can trust me, these (might) work!

	# r.anything is "." and r.any_in_line is "[^\n]" in the shorthand table.
	# The results are discarded; the "#>>>" lines record what each search
	# returned. Here "\n" is taken by r.anything and "!" by r.any_in_line.
	r[r.start, r.anything, r.any_in_line].search("\n!")
	#>>> <_sre.SRE_Match object at 0x7ff312e3c2a0>

	# Anchored at the start, "z" is followed by "\n", which r.any_in_line rejects.
	r[r.start, r.anything, r.any_in_line].search("z\n!")
	#>>>

	# Without the anchor the search may begin at the "\n" and match "\n!".
	r[r.anything, r.any_in_line].search("z\n!")
	#>>> <_sre.SRE_Match object at 0x7ff312e3c3d8>