Skip to content

Instantly share code, notes, and snippets.

@lemilonkh
Last active June 16, 2018 13:28
Show Gist options
  • Save lemilonkh/d0af83d54cf60312c3cfe18b86c508aa to your computer and use it in GitHub Desktop.
Save lemilonkh/d0af83d54cf60312c3cfe18b86c508aa to your computer and use it in GitHub Desktop.
Parse molecule descriptions
#!/usr/bin/python3
from ply import lex
import ply.yacc as yacc
# data storage class for parsed groups of ELEMENT and COUNT
class Atom:
    """One parsed element symbol together with how often it occurs."""

    def __init__(self, element, count):
        """Store the element symbol and its occurrence count."""
        self.element = element
        self.count = count

    def __repr__(self):
        """Return a debug representation of the form ``Atom(<element>, <count>)``."""
        fields = (self.element, self.count)
        return "Atom(%r, %r)" % fields

    def multiply(self, count):
        """Scale the stored count in place by ``count``; returns nothing."""
        self.count *= count
### TOKENIZATION ###
# terminal symbols (tokens) and their regex / function definitions
# Names of the token types produced by the lexer.
tokens = (
    "ELEMENT",
    "COUNT",
    "OPEN_BRACKET",
    "CLOSED_BRACKET",
)
# Element symbol: one capital letter, optionally followed by one lowercase letter.
t_ELEMENT = r"[A-Z][a-z]?"  # TODO validate that this is a correct element?
# Any one of the opening brackets: { [ (
t_OPEN_BRACKET = r"[\{\[\(]"
# Any one of the closing brackets: } ] )
t_CLOSED_BRACKET = r"[\}\]\)]"
def t_COUNT(t):
    r"\d+"
    # The string above is the token's regex (a run of digits), not documentation.
    # Convert the matched digit text to an int so later rules can do arithmetic.
    number = int(t.value)
    t.value = number
    return t
def t_error(t):
    # Lexer error hook: abort with a TypeError that quotes the offending text
    # carried by the token.
    message = "Incorrect text '%s'" % t.value
    raise TypeError(message)
### PARSING ###
# Allow however many groups of brackets after each other and concatenate them
def p_species_list_list(p):
    "chemical_equation : chemical_equation chemical_equation"
    # The string above is the grammar production, not documentation.
    # Two adjacent atom lists are joined into one new list.
    left, right = p[1], p[2]
    p[0] = left + right
# Remove brackets from group (and multiply by count if given)
def p_bracketed_species_list(p):
    """
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET COUNT
    """
    # The string above holds the grammar productions, not documentation.
    # NOTE: the opening and closing bracket kinds are not compared here, so a
    # mixed pair such as "(" ... "]" is accepted.
    group = p[2]
    if len(p) == 5:
        # A trailing COUNT applies to every atom inside the brackets.
        factor = p[4]
        for member in group:
            member.multiply(factor)
    # The brackets are dropped; the (possibly scaled) inner list is the result.
    p[0] = group
# Parse lists of ELEMENT SPECIES sequentially from the back
def p_species_list(p):
    "chemical_equation : chemical_equation species"
    # The string above is the grammar production, not documentation.
    # Build a new list: the atoms collected so far followed by the new species.
    collected, latest = p[1], p[2]
    p[0] = collected + [latest]
# Edge case when there is only one element (left)
def p_species(p):
    "chemical_equation : species"
    # The string above is the grammar production, not documentation.
    # A lone species becomes a one-element list so that list rules can extend it.
    only = p[1]
    p[0] = [only]
# Leaf case: Create a new Atom instance for every element (with associated count)
def p_single_species(p):
    """
    species : ELEMENT
    species : ELEMENT COUNT
    """
    # The string above holds the grammar productions, not documentation.
    size = len(p)
    if size == 3:
        # ELEMENT COUNT: the count was written out explicitly.
        p[0] = Atom(p[1], p[2])
    elif size == 2:
        # Bare ELEMENT: it occurs once.
        p[0] = Atom(p[1], 1)
def p_error(p):
    # Parser error hook: report where the syntax error occurred.
    #
    # p is the offending token, or None when the parser hit the end of the
    # input unexpectedly (e.g. an empty string or an unclosed bracket). The
    # None case must be handled separately because there is no token whose
    # .value could be printed.
    if p is None:
        print("Syntax error at end of input")
    else:
        print("Syntax error at '%s'" % p.value)
### MAIN PROGRAM ###
def parse_molecule(molecule):
    """Count the atoms of each element in a molecule description.

    Args:
        molecule: The formula text to tokenize and parse.

    Returns:
        A dict mapping each element symbol to its summed count.

    Raises:
        TypeError: If the lexer rejects part of the text (raised by
            ``t_error``), or if the parser produced no result because of a
            syntax error.
    """
    # Build the lexer and parser from the rules defined in this module.
    lex.lex()
    yacc.yacc()
    # Run the tokenization and parsing steps.
    atoms = yacc.parse(molecule)
    if atoms is None:
        # No parse result (the syntax error itself was already reported by
        # p_error). Previously this fell through to the loop below and failed
        # with "'NoneType' object is not iterable"; the exception type is kept
        # as TypeError so existing handlers still match.
        raise TypeError("Could not parse molecule '%s'" % (molecule,))
    # Sum up the atoms by element.
    result = {}
    for atom in atoms:
        result[atom.element] = result.get(atom.element, 0) + atom.count
    return result
if __name__ == "__main__":
    # Interactive entry point: read one formula and print its atom counts.
    molecule = input("Enter a molecule: ")
    try:
        counts = parse_molecule(molecule)
        print(counts)
    except Exception as err:
        # Top-level boundary: show any failure as a one-line message.
        print("Error: " + str(err))
@lemilonkh
Copy link
Author

lemilonkh commented Jun 16, 2018

Download the ply repo and copy its ply folder to the same folder as the script.
Execute it using: python3 parse_molecule.py
Enter your molecule string.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment