Skip to content

Instantly share code, notes, and snippets.

@lyxal
Created July 5, 2021 13:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lyxal/19a8c7650d6f7bb8ad08c4aa39554118 to your computer and use it in GitHub Desktop.
Save lyxal/19a8c7650d6f7bb8ad08c4aa39554118 to your computer and use it in GitHub Desktop.
import string
# Keys accepted by Token.__getitem__ as string aliases for indices 0 and 1.
NAME = "CONSTANT_TOKEN_NAME"
VALUE = "CONSTANT_TOKEN_VALUE"

# Structure / token type identifiers.
IF_STMT = "STRUCTURE_IF"
FOR_STMT = "STRUCTURE_FOR"
WHILE_STMT = "STRUCTURE_WHILE"
FUNCTION_STMT = "STRUCTURE_FUNCTION"
NO_STMT = "STRUCTURE_NONE"
STRING_STMT = "STRUCTURE_STRING"
INTEGER = "STRUCTURE_INTEGER"
CHARACTER = "STRUCTURE_CHARACTER"
# NOTE: LAMBDA_STMT used to be assigned twice ("STRUCTURE_LAMBDA" first, then
# "LAMBDA_STMT"). Only the second assignment was ever observable, so the dead
# first one has been removed and the effective value is kept.
LAMBDA_STMT = "LAMBDA_STMT"
LAMBDA_MAP = "LAMBDA_MAP"
LAMBDA_FILTER = "LAMBDA_FILTER"
LAMBDA_SORT = "LAMBDA_SORT"
LIST_STMT = "LIST_STMT"
VARIABLE_GET = "VARIABLE_GET"
VARIABLE_SET = "VARIABLE_SET"
FUNCTION_REFERENCE = "FUNCTION_REFERENCE"
COMPRESSED_NUMBER = "COMPRESSED_NUMBER"
COMPRESSED_STRING = "COMPRESSED_STRING"
VARIABLES = [VARIABLE_GET, VARIABLE_SET]

# Key names for the per-structure data dictionaries (see DEFAULT_KEYS below).
STRING_CONTENTS = "string_contents"
INTEGER_CONTENTS = "integer_contents"
IF_ON_TRUE = "if_on_true"
IF_ON_FALSE = "if_on_false"
FOR_VARIABLE = "for_variable"
FOR_BODY = "for_body"
WHILE_CONDITION = "while_condition"
WHILE_BODY = "while_body"
FUNCTION_NAME = "function_name"
FUNCTION_BODY = "function_body"
LAMBDA_BODY = "lambda_body"
LIST_ITEM = "list_item"
LIST_ITEMS = "list_items"
VARIABLE_NAME = "variable_name"
LAMBDA_ARGUMENTS = "lambda_arguments"
COMPRESSED_NUMBER_VALUE = "compressed_number_value"
COMPRESSED_STRING_VALUE = "compressed_string_value"
TWO_CHAR_STUFF = "two_char_data_idk"

# Characters Tokenise treats as prefixes: a ONE_CHARS entry wraps the next
# token, a TWO_CHARS entry wraps the next two tokens.
ONE_CHARS = list("kv⁽∆ø⁺Þ¨&~ß‘")
TWO_CHARS = list("₌‡₍")

# Individual special characters.
CONSTANT_CHAR = "k"
VECTORISATION_CHAR = "v"
CODEPAGE_INDEX = "⁺"
ONE_CHAR_FUNCTION_REFERENCE = "⁽"
TWO_BYTE_MATH = "∆"
TWO_BYTE_STRING = "ø"
TWO_BYTE_LIST = "Þ"
TWO_BYTE_MISC = "¨"
STRING_DELIMITER = "`"
REGISTER_MODIFIER = "&"
DONT_POP = "~"
CONDITIONAL_EXECUTION = "ß"
VAR_SET = "→"
VAR_GET = "←"
PARA_APPLY = "₌"
PARA_APPLY_COLLECT = "₍"
TWO_CHAR_LAMBDA = "‡"
THREE_CHAR_LAMBDA = "≬"
DECIMAL = "."

# Structure type -> character that opens it.
OPENING = {
    NO_STMT: "",
    IF_STMT: "[",
    FOR_STMT: "(",
    WHILE_STMT: "{",
    FUNCTION_STMT: "@",
    LAMBDA_STMT: "λ",
    LAMBDA_MAP: "ƛ",
    LAMBDA_FILTER: "'",
    LAMBDA_SORT: "µ",
    LIST_STMT: "⟨",
    FUNCTION_REFERENCE: "°",
    COMPRESSED_NUMBER: "»",
    COMPRESSED_STRING: "«",
}
# Opening character -> structure type (every opening character is unique).
inv_OPENING = {v: k for k, v in OPENING.items()}

# Structure type -> character that closes it.
CLOSING = {
    NO_STMT: "",
    IF_STMT: "]",
    FOR_STMT: ")",
    WHILE_STMT: "}",
    FUNCTION_STMT: ";",
    LAMBDA_STMT: ";",
    LAMBDA_MAP: ";",
    LAMBDA_FILTER: ";",
    LAMBDA_SORT: ";",
    LIST_STMT: "⟩",
    FUNCTION_REFERENCE: ";",
    COMPRESSED_NUMBER: "»",
    COMPRESSED_STRING: "«",
}
# Closing character -> structure type. CLOSING is not one-to-one (";" closes
# several structures), so this inverse keeps only the LAST structure listed
# for each character: inv_CLOSING[";"] is FUNCTION_REFERENCE.
inv_CLOSING = {v: k for k, v in CLOSING.items()}

# Structure type -> the structure-data key used by default for that structure.
DEFAULT_KEYS = {
    IF_STMT: IF_ON_TRUE,
    FOR_STMT: FOR_BODY,
    WHILE_STMT: WHILE_BODY,
    INTEGER: INTEGER_CONTENTS,
    FUNCTION_STMT: FUNCTION_NAME,
    LAMBDA_STMT: LAMBDA_BODY,
    LAMBDA_MAP: LAMBDA_BODY,
    LAMBDA_FILTER: LAMBDA_BODY,
    LAMBDA_SORT: LAMBDA_BODY,
    LIST_STMT: LIST_ITEM,
    FUNCTION_REFERENCE: FUNCTION_NAME,
    COMPRESSED_NUMBER: COMPRESSED_NUMBER_VALUE,
    COMPRESSED_STRING: COMPRESSED_STRING_VALUE,
}
class StringDelimiters:
    """Delimiter characters recognised by the string-grouping passes."""

    NORMAL = "`"
    DICTIONARY = "“"
    COM_NUMBER = "»"
    COM_STRING = "«"
    TWO_CHAR = "‛"

    # Every delimiter above, in declaration order.
    DELIM_TUPLE = (NORMAL, DICTIONARY, COM_NUMBER, COM_STRING, TWO_CHAR)
class Token:
    """A lexed unit: a ``name`` (the token type) paired with a ``value``.

    Both parts are available as attributes and through tuple-style indexing
    (``token[0]`` / ``token[1]``) or the ``NAME`` / ``VALUE`` string keys.
    """

    def __init__(self, name: str, value: object):
        self.name = name
        self.value = value

    def __getitem__(self, key: "int | str"):
        """Return ``name`` for ``0``/``NAME`` and ``value`` for ``1``/``VALUE``.

        Raises:
            IndexError: for any other key. Because index 2 raises IndexError,
                a Token can also be iterated/unpacked like a 2-tuple.
        """
        if key in (0, NAME):
            return self.name
        if key in (1, VALUE):
            return self.value
        raise IndexError("Token value not in the range of 0/1")

    def __str__(self):
        return str(self.name) + "|" + str(self.value)

    def __repr__(self):
        # Without this, tokens nested inside lists/dicts print as
        # "<Token object at 0x...>", which makes token trees unreadable.
        return "Token({!r}, {!r})".format(self.name, self.value)
def group_two_byte_strings(source):
    """Group two-character string literals (``‛xy``) into single components.

    Args:
        source: an iterable of components, normally the program text as a
            ``str`` (iterated character by character).

    Returns:
        A list in which each two-character string has become the list
        ``[two_chars, "`"]`` and everything else is passed through as-is.
        The character after ``\\`` or ``⁺`` is copied through untouched, so an
        escaped ``‛`` does not start a string. If the input ends after only
        one character of a two-character string, that character is emitted
        as a plain component.
    """
    components = []
    pending = ""  # first character of a two-char string still being read
    in_string = False
    escaped = False
    for character in source:
        if isinstance(character, list):
            # Already-grouped component: pass it through like the other
            # grouping passes do instead of failing on `list in str`.
            components.append(character)
        elif escaped:
            components.append(character)
            escaped = False
        elif pending:
            components.append([pending + character, "`"])
            pending = ""
            in_string = False
        elif in_string:
            pending = character
        elif character in "\\⁺":
            components.append(character)
            escaped = True
        elif character == StringDelimiters.TWO_CHAR:
            in_string = True
        else:
            components.append(character)
    if pending:
        components.append(pending)
    return components
def group_strings(source):
    """Group delimited string literals into ``[contents, delimiter]`` lists.

    Args:
        source: an iterable of components: single characters, plus list
            components produced by an earlier grouping pass.

    Returns:
        A list of components where every run that starts with a character in
        ``StringDelimiters.DELIM_TUPLE`` and ends with the same character has
        been replaced by ``[contents, delimiter]``. An unterminated string is
        emitted with whatever contents were collected. Outside strings, the
        character after ``\\`` or ``⁺`` is copied through untouched.
    """
    components = []
    escaped = False
    in_string = False
    contents = ""
    delimiter = StringDelimiters.NORMAL
    for character in source:
        if isinstance(character, list):
            if in_string:
                # `contents += character` would raise TypeError (str + list).
                # Assumes list components come from group_two_byte_strings,
                # i.e. [two_chars, "`"] for the source text "‛" + two_chars,
                # so that original text is restored into the open string.
                contents += StringDelimiters.TWO_CHAR + character[0]
            else:
                components.append(character)
        elif in_string:
            if escaped:
                if character in (StringDelimiters.NORMAL, StringDelimiters.DICTIONARY):
                    # Escaped delimiter: drop the backslash stored just before.
                    contents = contents[:-1]
                contents += character
                escaped = False
            elif character == delimiter:
                components.append([contents, delimiter])
                in_string = False
                contents = ""
                delimiter = StringDelimiters.NORMAL
            elif character == "\\":
                # Keep the backslash for now; it is removed again only if the
                # next character turns out to be a delimiter.
                escaped = True
                contents += character
            else:
                contents += character
        elif escaped:
            components.append(character)
            escaped = False
        elif character in "\\⁺":
            components.append(character)
            escaped = True
        elif character in StringDelimiters.DELIM_TUPLE:
            in_string = True
            contents = ""
            delimiter = character
        else:
            components.append(character)
    if in_string:
        components.append([contents, delimiter])
    return components
def group_digraphs(source, vars=False):
    """Merge each digraph lead character with the character that follows it.

    Args:
        source: an iterable of components: single characters, plus list
            components (grouped strings) that are passed through unchanged.
        vars: when true, ``→`` and ``←`` are treated as digraph leads too.

    Returns:
        A list of components in source order. A lead character (``k``, ``∆``,
        ``ø``, ``¨``) and its following character become one two-character
        string. The character after ``\\`` or ``⁺`` is copied through
        untouched. A lead with nothing to pair with (end of input, or
        directly before a grouped string) is emitted on its own.
    """
    components = []
    pending = ""  # lead character waiting for its partner
    escaped = False
    # Sets/tuples give exact-match membership; `x in "k∆ø¨"` is a substring
    # test and would misclassify components longer than one character.
    leads = set("k∆ø¨")
    if vars:
        leads |= set("→←")
    for character in source:
        if isinstance(character, list):
            if pending:
                # Flush first so the lead is not emitted after the string.
                components.append(pending)
                pending = ""
            components.append(character)
        elif escaped:
            components.append(character)
            escaped = False
        elif pending:
            components.append(pending + character)
            pending = ""
        elif character in ("\\", "⁺"):
            components.append(character)
            escaped = True
        elif character in leads:
            pending = character
        else:
            components.append(character)
    if pending:
        # A trailing lead used to be dropped silently; keep it, matching the
        # trailing flush in group_two_byte_strings.
        components.append(pending)
    return components
def Tokenise(source: str, variables_are_digraphs=False):
    """Convert program text into a list of ``Token`` objects.

    The text is first run through the three grouping passes (two-character
    strings, delimited strings, digraphs); the grouped components are then
    tokenised.

    Args:
        source: the program text.
        variables_are_digraphs: forwarded to ``group_digraphs`` as ``vars``.

    Returns:
        A list of ``Token``. Plain components become ``Token(NO_STMT, c)``,
        grouped strings ``Token(STRING_STMT, [contents, delimiter])``, a
        backslash-escaped component ``Token(CHARACTER, c)``. A character in
        ``ONE_CHARS`` wraps the following token, a character in ``TWO_CHARS``
        wraps the following (up to) two tokens; ``TWO_CHAR_LAMBDA`` yields
        ``Token(LAMBDA_STMT, {LAMBDA_BODY: [...]})``.

    Raises:
        IndexError: if a ``ONE_CHARS`` character has no token after it.
    """
    components = group_two_byte_strings(source)
    components = group_strings(components)
    components = group_digraphs(components, variables_are_digraphs)
    return _tokenise_components(components)


def _tokenise_components(components):
    """Tokenise components that have ALREADY been through the grouping passes.

    Recursion happens here rather than through ``Tokenise`` so the tail of
    the program is not grouped a second time (re-grouping fails on grouped
    strings and can merge existing digraphs with their neighbours).
    """
    tokens = []
    escaped = False
    comment = False  # NOTE(review): nothing sets this to True yet.
    structure = NO_STMT  # NOTE(review): nothing enters INTEGER/VARIABLE yet.
    structure_data = {}
    active_key = ""  # The key which is currently being dealt with
    token_pointer = 0
    while token_pointer < len(components):
        character = components[token_pointer]
        # Advance immediately: every `continue` below must make progress,
        # otherwise the same component would be processed forever.
        token_pointer += 1

        if comment:
            # A comment runs until (and including) the next newline.
            comment = character != "\n"
            continue

        if escaped:
            if structure != NO_STMT:
                structure_data[active_key] += "\\" + character
            else:
                tokens.append(Token(CHARACTER, character))
            escaped = False
            continue

        if structure == INTEGER:
            if isinstance(character, str) and character in (string.digits + "."):
                # Still inside the number: keep accumulating.
                structure_data[active_key] += character
                continue
            tokens.append(Token(INTEGER, structure_data[active_key]))
            structure, structure_data, active_key = NO_STMT, {}, ""

        if structure in (VARIABLE_GET, VARIABLE_SET):
            if isinstance(character, str) and character in string.ascii_letters + "_":
                # Still inside the variable name: keep accumulating.
                structure_data[active_key] += character
                continue
            tokens.append(Token(structure, structure_data[active_key]))
            structure, structure_data, active_key = NO_STMT, {}, ""

        # Backslash is the escape character: the grouping passes protect the
        # component after "\\" and the escaped branch above re-emits "\\".
        # (This used to test for "'", which OPENING assigns to LAMBDA_FILTER.)
        if character == "\\":
            escaped = True
            continue

        if isinstance(character, list):
            # Wrapped in Token (not a bare tuple) like every other entry.
            tokens.append(Token(STRING_STMT, character))
            continue

        if character in ONE_CHARS:
            everything_after = _tokenise_components(components[token_pointer:])
            if not everything_after:
                raise IndexError(
                    "modifier {!r} has no token to apply to".format(character)
                )
            tokens.append(Token(character, everything_after[0]))
            tokens += everything_after[1:]
            break

        if character in TWO_CHARS:
            everything_after = _tokenise_components(components[token_pointer:])
            if character == TWO_CHAR_LAMBDA:
                tokens.append(
                    Token(LAMBDA_STMT, {LAMBDA_BODY: everything_after[0:2]})
                )
            else:
                tokens.append(Token(character, everything_after[0:2]))
            tokens += everything_after[2:]
            break

        tokens.append(Token(NO_STMT, character))
    return tokens
if __name__ == "__main__":
    # Demo: print the (name, value) pair of every top-level token. Guarded so
    # that importing this module does not run the tokeniser or print.
    print([(n[0], n[1]) for n in Tokenise("‡₌*ġḭ")])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment