# Gist lyxal/19a8c7650d6f7bb8ad08c4aa39554118 (created July 5, 2021 13:04).
# NOTE: GitHub flags this file as containing bidirectional Unicode text that may
# be interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
import string | |
# Keys accepted by Token.__getitem__ as aliases for indices 0 and 1.
NAME = "CONSTANT_TOKEN_NAME"
VALUE = "CONSTANT_TOKEN_VALUE"

# Structure / token kind identifiers. These are used as Token names and as the
# keys of the OPENING, CLOSING and DEFAULT_KEYS tables below.
IF_STMT = "STRUCTURE_IF"
FOR_STMT = "STRUCTURE_FOR"
WHILE_STMT = "STRUCTURE_WHILE"
FUNCTION_STMT = "STRUCTURE_FUNCTION"
NO_STMT = "STRUCTURE_NONE"
STRING_STMT = "STRUCTURE_STRING"
INTEGER = "STRUCTURE_INTEGER"
CHARACTER = "STRUCTURE_CHARACTER"
# LAMBDA_STMT used to be assigned "STRUCTURE_LAMBDA" first and then
# re-assigned here; the first value was always overwritten, so only the
# effective one is kept.
LAMBDA_STMT = "LAMBDA_STMT"
LAMBDA_MAP = "LAMBDA_MAP"
LAMBDA_FILTER = "LAMBDA_FILTER"
LAMBDA_SORT = "LAMBDA_SORT"
LIST_STMT = "LIST_STMT"
VARIABLE_GET = "VARIABLE_GET"
VARIABLE_SET = "VARIABLE_SET"
FUNCTION_REFERENCE = "FUNCTION_REFERENCE"
COMPRESSED_NUMBER = "COMPRESSED_NUMBER"
COMPRESSED_STRING = "COMPRESSED_STRING"
VARIABLES = [VARIABLE_GET, VARIABLE_SET]

# Keys used inside a structure's data dictionary (see DEFAULT_KEYS below and
# the LAMBDA_BODY entry built by Tokenise).
STRING_CONTENTS = "string_contents"
INTEGER_CONTENTS = "integer_contents"
IF_ON_TRUE = "if_on_true"
IF_ON_FALSE = "if_on_false"
FOR_VARIABLE = "for_variable"
FOR_BODY = "for_body"
WHILE_CONDITION = "while_condition"
WHILE_BODY = "while_body"
FUNCTION_NAME = "function_name"
FUNCTION_BODY = "function_body"
LAMBDA_BODY = "lambda_body"
LIST_ITEM = "list_item"
LIST_ITEMS = "list_items"
VARIABLE_NAME = "variable_name"
LAMBDA_ARGUMENTS = "lambda_arguments"
COMPRESSED_NUMBER_VALUE = "compressed_number_value"
COMPRESSED_STRING_VALUE = "compressed_string_value"
TWO_CHAR_STUFF = "two_char_data_idk"

# Prefix characters handled by Tokenise: a ONE_CHARS element wraps the single
# token that follows it, a TWO_CHARS element wraps the next two tokens.
ONE_CHARS = list("kv⁽∆ø⁺Þ¨&~ß‘")
TWO_CHARS = list("₌‡₍")

# Named single characters. Most of them also appear in ONE_CHARS or TWO_CHARS.
CONSTANT_CHAR = "k"
VECTORISATION_CHAR = "v"
CODEPAGE_INDEX = "⁺"
ONE_CHAR_FUNCTION_REFERENCE = "⁽"
TWO_BYTE_MATH = "∆"
TWO_BYTE_STRING = "ø"
TWO_BYTE_LIST = "Þ"
TWO_BYTE_MISC = "¨"
STRING_DELIMITER = "`"
REGISTER_MODIFIER = "&"
DONT_POP = "~"
CONDITIONAL_EXECUTION = "ß"
VAR_SET = "→"
VAR_GET = "←"
PARA_APPLY = "₌"
PARA_APPLY_COLLECT = "₍"
TWO_CHAR_LAMBDA = "‡"
THREE_CHAR_LAMBDA = "≬"
DECIMAL = "."

# Structure kind -> the character that opens it.
OPENING = {
    NO_STMT: "",
    IF_STMT: "[",
    FOR_STMT: "(",
    WHILE_STMT: "{",
    FUNCTION_STMT: "@",
    LAMBDA_STMT: "λ",
    LAMBDA_MAP: "ƛ",
    LAMBDA_FILTER: "'",
    LAMBDA_SORT: "µ",
    LIST_STMT: "⟨",
    FUNCTION_REFERENCE: "°",
    COMPRESSED_NUMBER: "»",
    COMPRESSED_STRING: "«",
}
# Opening character -> structure kind (every opening character is unique).
inv_OPENING = {v: k for k, v in OPENING.items()}

# Structure kind -> the character that closes it.
CLOSING = {
    NO_STMT: "",
    IF_STMT: "]",
    FOR_STMT: ")",
    WHILE_STMT: "}",
    FUNCTION_STMT: ";",
    LAMBDA_STMT: ";",
    LAMBDA_MAP: ";",
    LAMBDA_FILTER: ";",
    LAMBDA_SORT: ";",
    LIST_STMT: "⟩",
    FUNCTION_REFERENCE: ";",
    COMPRESSED_NUMBER: "»",
    COMPRESSED_STRING: "«",
}
# Closing character -> structure kind. Several structures share ";" as their
# closer, so this inversion is lossy: the last entry in CLOSING wins and ";"
# maps to FUNCTION_REFERENCE.
inv_CLOSING = {v: k for k, v in CLOSING.items()}

# Structure kind -> the data key used by default for that structure.
DEFAULT_KEYS = {
    IF_STMT: IF_ON_TRUE,
    FOR_STMT: FOR_BODY,
    WHILE_STMT: WHILE_BODY,
    INTEGER: INTEGER_CONTENTS,
    FUNCTION_STMT: FUNCTION_NAME,
    LAMBDA_STMT: LAMBDA_BODY,
    LAMBDA_MAP: LAMBDA_BODY,
    LAMBDA_FILTER: LAMBDA_BODY,
    LAMBDA_SORT: LAMBDA_BODY,
    LIST_STMT: LIST_ITEM,
    FUNCTION_REFERENCE: FUNCTION_NAME,
    COMPRESSED_NUMBER: COMPRESSED_NUMBER_VALUE,
    COMPRESSED_STRING: COMPRESSED_STRING_VALUE,
}
class StringDelimiters:
    """Delimiter characters for the different kinds of string literal.

    ``group_strings`` opens a string on any character in ``DELIM_TUPLE`` and
    closes it on the same character; ``group_two_byte_strings`` looks for
    ``TWO_CHAR``.
    """

    NORMAL = "`"
    DICTIONARY = "“"
    COM_NUMBER = "»"
    COM_STRING = "«"
    TWO_CHAR = "‛"

    # Every delimiter above, in declaration order.
    DELIM_TUPLE = (NORMAL, DICTIONARY, COM_NUMBER, COM_STRING, TWO_CHAR)
class Token:
    """A lexed element: a ``name`` (its kind) paired with a ``value``.

    Besides attribute access, a token can be indexed like a two-item pair:
    ``token[0]`` / ``token[NAME]`` give the name and ``token[1]`` /
    ``token[VALUE]`` give the value.
    """

    def __init__(self, name: str, value: object):
        self.name = name
        self.value = value

    def __getitem__(self, key: "int | str"):
        """Return the name for ``0``/``NAME`` or the value for ``1``/``VALUE``.

        Raises:
            IndexError: for any other key. Because the class defines no
                ``__iter__``, this is also what ends Python's index-based
                iteration, so ``name, value = token`` works.
        """
        # Positional keys are checked first; the named aliases come second.
        if key in (0, 1):
            return self.value if key else self.name
        if key == NAME:
            return self.name
        if key == VALUE:
            return self.value
        raise IndexError("Token value not in the range of 0/1")

    def __str__(self):
        return str(self.name) + "|" + str(self.value)

    def __repr__(self):
        # Makes lists of tokens readable when printed or inspected.
        return "Token({!r}, {!r})".format(self.name, self.value)
def group_two_byte_strings(source):
    """Group two-character string literals into ``[text, "`"]`` components.

    Walks ``source`` one character at a time. After an unescaped
    ``StringDelimiters.TWO_CHAR`` the next two characters are collected into
    a single ``[text, "`"]`` list; every other character is passed through
    unchanged. A character following ``\\`` or ``⁺`` is passed through
    verbatim, so an escaped delimiter does not start a string.

    A string that is cut short by the end of the input is still emitted as a
    string component (holding zero or one characters), matching how
    ``group_strings`` closes unterminated strings, rather than leaking its
    contents as a bare character or dropping the delimiter silently.

    Args:
        source: iterable of single characters (normally the program text).

    Returns:
        A list mixing single characters and ``[text, "`"]`` lists.
    """
    components = []
    first_char = ""      # first character of a string awaiting its second
    in_string = False    # True from the delimiter until the string completes
    escaped = False
    for character in source:
        if escaped:
            components.append(character)
            escaped = False
        elif first_char:
            components.append([first_char + character, "`"])
            first_char = ""
            in_string = False
        elif in_string:
            first_char = character
        elif character in "\\⁺":
            components.append(character)
            escaped = True
        elif character == StringDelimiters.TWO_CHAR:
            in_string = True
        else:
            components.append(character)
    if in_string:
        # Input ended mid-string: keep what was collected, as a string.
        components.append([first_char, "`"])
    return components
def group_strings(source):
    """Group delimited string literals into ``[contents, delimiter]`` lists.

    A string opens on any character in ``StringDelimiters.DELIM_TUPLE`` and
    closes on the same character. Inside a string a backslash escapes the
    next character: the backslash is kept, unless the escaped character is
    the normal or dictionary delimiter, in which case only that character is
    kept. Outside a string, a character following ``\\`` or ``⁺`` is passed
    through verbatim. A string still open at the end of the input is emitted
    with whatever was collected.

    List components (two-character strings grouped earlier) are passed
    through untouched outside a string. Inside an open string they are
    expanded back into their raw characters, because the earlier pass grouped
    them without knowing they were string contents. The original code tried
    to concatenate the list onto the string and raised ``TypeError``.

    Args:
        source: iterable of characters and ``[text, delimiter]`` lists.

    Returns:
        A list mixing single characters and ``[contents, delimiter]`` lists.
    """
    components = []
    escaped = False
    in_string = False
    contents = ""
    delimiter = ""  # only meaningful while in_string is True
    for item in source:
        if isinstance(item, list):
            if not in_string:
                components.append(item)
                continue
            # Undo the premature grouping: delimiter followed by its text.
            units = [StringDelimiters.TWO_CHAR] + list(item[0])
        else:
            units = [item]
        for character in units:
            if in_string:
                if escaped:
                    if character in (
                        StringDelimiters.NORMAL,
                        StringDelimiters.DICTIONARY,
                    ):
                        # Drop the backslash that was stored one step ago.
                        contents = contents[:-1]
                    contents += character
                    escaped = False
                elif character == delimiter:
                    components.append([contents, delimiter])
                    in_string = False
                    contents = ""
                elif character == "\\":
                    escaped = True
                    contents += character
                else:
                    contents += character
            elif escaped:
                components.append(character)
                escaped = False
            elif character in "\\⁺":
                components.append(character)
                escaped = True
            elif character in StringDelimiters.DELIM_TUPLE:
                in_string = True
                contents = ""
                delimiter = character
            else:
                components.append(character)
    if in_string:
        components.append([contents, delimiter])
    return components
def group_digraphs(source, vars=False):
    """Merge each digraph lead character with the character that follows it.

    The lead characters are ``k``, ``∆``, ``ø`` and ``¨``; when ``vars`` is
    true ``→`` and ``←`` are leads as well. A character following ``\\`` or
    ``⁺`` is passed through verbatim and never starts a digraph. List
    components (already-grouped strings) are passed through unchanged.

    Fixes over the previous behaviour:

    * a lead character at the very end of the input is emitted on its own
      instead of being silently dropped;
    * a lead character directly followed by a string component is emitted
      before that string, instead of after it and glued to whatever
      character came next;
    * membership is tested against individual characters, so an item that is
      already a multi-character string is never mistaken for a lead.

    Args:
        source: iterable of characters and list components.
        vars: treat the variable get/set characters as digraph leads.

    Returns:
        A list of characters, two-character digraph strings and lists.
    """
    escape_chars = ("\\", "⁺")
    digraph_leads = tuple("k∆ø¨")
    if vars:
        digraph_leads += tuple("→←")

    components = []
    lead = ""  # digraph lead waiting for its second character
    escaped = False
    for character in source:
        if isinstance(character, list):
            if lead:
                # A string cannot complete a digraph; keep source order.
                components.append(lead)
                lead = ""
            components.append(character)
        elif escaped:
            components.append(character)
            escaped = False
        elif lead:
            components.append(lead + character)
            lead = ""
        elif character in escape_chars:
            components.append(character)
            escaped = True
        elif character in digraph_leads:
            lead = character
        else:
            components.append(character)
    if lead:
        components.append(lead)
    return components
def _tokenise_components(components):
    """Turn already-grouped components into Tokens (worker for Tokenise).

    Recurses on the remainder of ``components`` whenever a prefix character
    from ONE_CHARS or TWO_CHARS is met, so the grouping passes are never
    re-applied to components that have already been grouped.
    """
    tokens = []
    escaped = False
    # NOTE(review): nothing below switches comment mode on, and no branch sets
    # `structure` to INTEGER / VARIABLE_GET / VARIABLE_SET yet, so those
    # branches are currently unreachable scaffolding.
    comment = False
    structure = NO_STMT
    structure_data = {}
    active_key = ""  # The key which is currently being dealt with

    for position, character in enumerate(components):
        if comment:
            # Stay in comment mode until the end of the line.
            comment = character != "\n"
            continue

        if escaped:
            if structure != NO_STMT:
                structure_data[active_key] += "\\" + character
            else:
                tokens.append(Token(CHARACTER, character))
            escaped = False
            continue

        if structure == INTEGER:
            # Digits and "." keep extending the number being built.
            if isinstance(character, str) and character in (string.digits + "."):
                structure_data[active_key] += character
                continue
            tokens.append(Token(INTEGER, structure_data[active_key]))
            structure, structure_data, active_key = NO_STMT, {}, ""

        if structure == VARIABLE_GET or structure == VARIABLE_SET:
            # Letters and "_" keep extending the variable name.
            if isinstance(character, str) and character in string.ascii_letters + "_":
                structure_data[active_key] += character
                continue
            tokens.append(Token(structure, structure_data[active_key]))
            structure, structure_data, active_key = NO_STMT, {}, ""

        if character == "'":
            escaped = True
            continue

        if isinstance(character, list):
            # Grouped string: [contents, delimiter].
            tokens.append(Token(STRING_STMT, character))
            continue

        if character in ONE_CHARS:
            # The prefix wraps the single token that follows it.
            rest = _tokenise_components(components[position + 1:])
            tokens.append(Token(character, rest[0]))
            tokens.extend(rest[1:])
            break

        if character in TWO_CHARS:
            # The prefix wraps the next two tokens (fewer if input runs out).
            rest = _tokenise_components(components[position + 1:])
            if character == TWO_CHAR_LAMBDA:
                tokens.append(Token(LAMBDA_STMT, {LAMBDA_BODY: rest[0:2]}))
            else:
                tokens.append(Token(character, rest[0:2]))
            tokens.extend(rest[2:])
            break

        tokens.append(Token(NO_STMT, character))

    return tokens


def Tokenise(source: str, variables_are_digraphs=False):
    """Lex ``source`` into a list of Token objects.

    The text is first grouped by ``group_two_byte_strings``,
    ``group_strings`` and ``group_digraphs`` (in that order), then each
    resulting component becomes a token:

    * a component following ``'`` becomes a CHARACTER token;
    * a grouped string becomes a STRING_STMT token whose value is the
      ``[contents, delimiter]`` list;
    * a ONE_CHARS prefix becomes a token named after the prefix whose value
      is the next token;
    * a TWO_CHARS prefix becomes a token whose value is the list of the next
      two tokens; for ``‡`` the token is a LAMBDA_STMT holding them under
      LAMBDA_BODY;
    * anything else becomes a NO_STMT token.

    Fixes over the previous implementation: the scan always advances (the
    old loop never moved past ``'`` or a string component and so never
    terminated), strings are wrapped in Token like every other element
    instead of a bare tuple, and the grouping passes run exactly once instead
    of again on every recursive call.

    Args:
        source: the program text.
        variables_are_digraphs: forwarded to ``group_digraphs`` as ``vars``.

    Returns:
        A list of Token objects.

    Raises:
        IndexError: if a ONE_CHARS prefix is the last element of the input,
            because there is no following token for it to wrap.
    """
    components = group_two_byte_strings(source)
    components = group_strings(components)
    components = group_digraphs(components, variables_are_digraphs)
    return _tokenise_components(components)
# Demo: tokenise a sample program and print each token as a (name, value) pair.
print([(token[0], token[1]) for token in Tokenise("‡₌*ġḭ")])
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.")