Last active
June 16, 2018 13:28
-
-
Save lemilonkh/d0af83d54cf60312c3cfe18b86c508aa to your computer and use it in GitHub Desktop.
Parse molecule descriptions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
from ply import lex | |
import ply.yacc as yacc | |
# Container for one parsed (ELEMENT, COUNT) group
class Atom:
    """An element symbol paired with the number of times it occurs."""

    def __init__(self, element, count):
        """Store the element symbol and its occurrence count."""
        self.element, self.count = element, count

    def __repr__(self):
        """Return a constructor-like representation, e.g. Atom('H', 2)."""
        fields = (self.element, self.count)
        return "Atom(%r, %r)" % fields

    def multiply(self, count):
        """Scale the stored count in place by the given factor."""
        self.count *= count
### TOKENIZATION ###
# Terminal symbols (token types) and their regex / function definitions.
# `tokens` lists every token type; each `t_<NAME>` string below holds the
# regular expression for the token type of the same name.
tokens = ("ELEMENT", "COUNT", "OPEN_BRACKET", "CLOSED_BRACKET")
# One uppercase letter optionally followed by one lowercase letter
t_ELEMENT = r"[A-Z][a-z]?"  # TODO validate that this is a correct element?
# Any one of the opening brackets: { [ (
t_OPEN_BRACKET = r"[\{\[\(]"
# Any one of the closing brackets: } ] )
t_CLOSED_BRACKET = r"[\}\]\)]"
def t_COUNT(t):
    r"\d+"
    # The docstring above is the token's regex; it must stay the first statement.
    # Replace the matched digit string with its integer value.
    number = int(t.value)
    t.value = number
    return t
def t_error(t):
    # Lexer error hook: abort with a TypeError that quotes the token's value.
    message = "Incorrect text '%s'" % t.value
    raise TypeError(message)
### PARSING ### | |
# Two adjacent chemical_equation results are joined into one list
def p_species_list_list(p):
    "chemical_equation : chemical_equation chemical_equation"
    left, right = p[1], p[2]
    p[0] = left + right
# Remove brackets from a group (and multiply by the count, if one is given).
# Raises ValueError when the closing bracket does not match the opening one,
# e.g. "(H2O]" -- the token regexes alone accept any bracket combination.
def p_bracketed_species_list(p):
    """
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET COUNT
    """
    matching = {"(": ")", "[": "]", "{": "}"}
    opening, closing = p[1], p[3]
    # Validate before touching any atom so a rejected group is left unmodified
    if matching.get(opening) != closing:
        raise ValueError(
            "Mismatched brackets '%s' and '%s'" % (opening, closing)
        )
    # p has 5 entries only for the rule variant with a trailing COUNT:
    # multiply all atoms contained in the brackets by that count
    if len(p) == 5:
        for atom in p[2]:
            atom.multiply(p[4])
    p[0] = p[2]
# Extend an already-parsed equation with the species that follows it
def p_species_list(p):
    "chemical_equation : chemical_equation species"
    extended = list(p[1])
    extended.append(p[2])
    p[0] = extended
# Base case: a lone species becomes a one-element equation list
def p_species(p):
    "chemical_equation : species"
    only = p[1]
    p[0] = [only]
# Leaf case: build a new Atom instance for every element; the count is 1
# unless an explicit COUNT follows the ELEMENT
def p_single_species(p):
    """
    species : ELEMENT
    species : ELEMENT COUNT
    """
    size = len(p)
    if size not in (2, 3):
        return
    # size 2 -> bare ELEMENT, size 3 -> ELEMENT followed by COUNT
    count = 1 if size == 2 else p[2]
    p[0] = Atom(p[1], count)
# Parser error hook: print a description of the syntax error.
# `p` is the offending token, or None when the error is at the end of the
# input (e.g. an empty string or an unclosed bracket); None has no `.value`,
# so that case is reported separately instead of raising AttributeError.
def p_error(p):
    if p is None:
        print("Syntax error at end of input")
    else:
        print("Syntax error at '%s'" % p.value)
### MAIN PROGRAM ### | |
def parse_molecule(molecule):
    """Parse a molecule formula and total the atoms per element.

    Args:
        molecule: the formula text to tokenize and parse.

    Returns:
        A dict mapping each element symbol to its summed count.

    Raises:
        ValueError: if the parser produced no result for the input.
        Exceptions raised while tokenizing or parsing propagate unchanged.
    """
    # execute lex and yacc on the rules defined in this module
    lex.lex()
    yacc.yacc()
    # run the tokenization and parsing steps
    atoms = yacc.parse(molecule)
    # Without this check a failed parse surfaces as an opaque
    # "'NoneType' object is not iterable" TypeError from the loop below.
    if atoms is None:
        raise ValueError("Could not parse molecule '%s'" % molecule)
    # sum up the atoms by type
    result = {}
    for atom in atoms:
        result[atom.element] = result.get(atom.element, 0) + atom.count
    return result
# Script entry point: read one formula from stdin and print the element
# totals, or an error line if anything goes wrong.
if __name__ == "__main__":
    molecule = input("Enter a molecule: ")
    try:
        counts = parse_molecule(molecule)
        print(counts)
    except Exception as error:
        print("Error: " + str(error))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Download the ply repo and copy its
ply
folder to the same folder as the script. Execute it using:
python3 parse_molecule.py
Enter your molecule string.