# lemilonkh/parse_molecule.py

## parse_molecule.py
#!/usr/bin/python3

from ply import lex
import ply.yacc as yacc

# data storage class for parsed groups of ELEMENT and COUNT
class Atom:
    """One parsed ELEMENT together with the number of times it occurs.

    Both attributes are plain and public:
      element -- the element symbol handed over by the parser
      count   -- the multiplicity of that element
    """

    def __init__(self, element, count):
        self.element, self.count = element, count

    def __repr__(self):
        # Reads like a constructor call, e.g. Atom('H', 2).
        fields = (self.element, self.count)
        return "Atom(%r, %r)" % fields

    def multiply(self, count):
        """Scale this atom's count in place by ``count``; returns None."""
        self.count *= count

### TOKENIZATION ###

# terminal (token) classes and their regex / function definitions
# Names of every token class the lexer can emit.
tokens = (
    "ELEMENT",
    "COUNT",
    "OPEN_BRACKET",
    "CLOSED_BRACKET",
)
# One capital letter, optionally followed by one lowercase letter.
# TODO validate that this is a correct element?
t_ELEMENT = r"[A-Z][a-z]?"
# Curly, square and round brackets all share the same two token classes.
t_OPEN_BRACKET = r"[\{\[\(]"
t_CLOSED_BRACKET = r"[\}\]\)]"
def t_COUNT(t):
    r"\d+"
    # The raw-string docstring above is this token's regex (PLY convention),
    # so it has to remain the first statement of the function.
    # Hand the parser an integer rather than the matched digit string.
    digits = t.value
    t.value = int(digits)
    return t

def t_error(t):
    # Lexer error hook: stop tokenizing and report the offending text.
    message = "Incorrect text '%s'" % t.value
    raise TypeError(message)

### PARSING ###

# Allow however many groups of brackets after each other and concatenate them
def p_species_list_list(p):
    "chemical_equation : chemical_equation chemical_equation"
    # The docstring is the grammar rule. p[1] and p[2] hold the atom lists of
    # the two adjacent sub-formulas; they are joined in reading order.
    # NOTE(review): a rule of the form X : X X is ambiguous by construction;
    # check the parser generator's conflict report.
    first_part, second_part = p[1], p[2]
    p[0] = first_part + second_part

# Remove brackets from group (and multiply by count if given)
def p_bracketed_species_list(p):
    """
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET
    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET COUNT
    """
    # p[2] is the atom list found between the brackets; it is passed on as
    # the result, so the brackets themselves simply disappear.
    group = p[2]

    # Five slots (result + four symbols) means the COUNT variant matched:
    # scale every atom of the group in place by that count.
    has_multiplier = len(p) == 5
    if has_multiplier:
        factor = p[4]
        for member in group:
            member.multiply(factor)

    p[0] = group

# Parse lists of ELEMENT SPECIES sequentially from the back
def p_species_list(p):
    "chemical_equation : chemical_equation species"
    # Build a new list: everything parsed so far followed by the trailing
    # species. The list held in p[1] is not modified.
    trailing = [p[2]]
    p[0] = p[1] + trailing

# Edge case when there is only one element (left)
def p_species(p):
    "chemical_equation : species"
    # Base case of the list rules: wrap the lone species in a one-item list.
    lone_species = p[1]
    p[0] = [lone_species]

# Leaf case: Create a new Atom instance for every element (with associated count)
def p_single_species(p):
    """
    species : ELEMENT
    species : ELEMENT COUNT
    """
    # len(p) includes the result slot p[0], so 2 means a bare ELEMENT and
    # 3 means ELEMENT followed by COUNT.
    slot_count = len(p)
    if slot_count == 3:
        p[0] = Atom(p[1], p[2])
    elif slot_count == 2:
        # No explicit count: the element occurs once.
        p[0] = Atom(p[1], 1)

def p_error(p):
    # Parser error hook, called by yacc with the offending token.
    #
    # Two fixes over the print-only version:
    #  * yacc passes None when the error is at the end of the input, so
    #    reading p.value unconditionally failed with AttributeError there.
    #  * Merely printing lets yacc try to resynchronise and carry on, which
    #    can hand the caller None or a partial atom list. Raising stops the
    #    parse instead. TypeError is used to match the lexer hook t_error,
    #    so callers have one exception type for invalid input.
    if p is None:
        raise TypeError("Syntax error at end of input")
    raise TypeError("Syntax error at '%s'" % p.value)

### MAIN PROGRAM ###

def parse_molecule(molecule):
    """Count the atoms of each element in a molecular formula.

    Args:
        molecule: the formula text, e.g. "Mg(OH)2".

    Returns:
        A dict mapping each element symbol to its total count.

    Raises:
        TypeError: if the text cannot be tokenized (raised by ``t_error``)
            or the parser produces no result for it.
    """
    # Build the lexer and parser from the t_* / p_* rules of this module and
    # keep explicit handles on both, rather than depending on PLY's
    # module-level "most recently built" lexer and parser.
    lexer = lex.lex()
    parser = yacc.yacc()

    # run the tokenization and parsing steps
    atoms = parser.parse(molecule, lexer=lexer)
    if atoms is None:
        # Without this check the loop below failed with an opaque
        # "'NoneType' object is not iterable"; keep the exception type but
        # say what actually went wrong.
        raise TypeError("Could not parse molecule %r" % (molecule,))

    # sum up the atoms by type
    totals = {}
    for atom in atoms:
        totals[atom.element] = totals.get(atom.element, 0) + atom.count
    return totals

if __name__ == "__main__":
    # Script entry point: read one formula and print its element counts,
    # or a one-line error message if anything goes wrong.
    molecule = input("Enter a molecule: ")
    try:
        counts = parse_molecule(molecule)
        print(counts)
    except Exception as error:
        print("Error: " + str(error))
	#!/usr/bin/python3

	from ply import lex
	import ply.yacc as yacc

	# data storage class for parsed groups of ELEMENT and COUNT
	class Atom:
	    """One parsed ELEMENT together with the number of times it occurs.

	    Both attributes are plain and public:
	      element -- the element symbol handed over by the parser
	      count   -- the multiplicity of that element
	    """

	    def __init__(self, element, count):
	        self.element = element
	        self.count = count

	    def __repr__(self):
	        # Reads like a constructor call, e.g. Atom('H', 2).
	        return "Atom(%r, %r)" % (self.element, self.count)

	    def multiply(self, count):
	        """Scale this atom's count in place by ``count``; returns None."""
	        self.count *= count

	### TOKENIZATION ###

	# terminal (token) classes and their regex / function definitions
	# Names of every token class the lexer can emit.
	tokens = (
	    "ELEMENT",
	    "COUNT",
	    "OPEN_BRACKET",
	    "CLOSED_BRACKET",
	)
	# One capital letter, optionally followed by one lowercase letter.
	# TODO validate that this is a correct element?
	t_ELEMENT = r"[A-Z][a-z]?"
	# Curly, square and round brackets all share the same two token classes.
	t_OPEN_BRACKET = r"[\{\[\(]"
	t_CLOSED_BRACKET = r"[\}\]\)]"
	def t_COUNT(t):
	    r"\d+"
	    # The raw-string docstring above is this token's regex (PLY
	    # convention), so it has to remain the first statement.
	    # Hand the parser an integer rather than the matched digit string.
	    t.value = int(t.value)
	    return t

	def t_error(t):
	    # Lexer error hook: stop tokenizing and report the offending text.
	    raise TypeError("Incorrect text '%s'" % t.value)

	### PARSING ###

	# Allow however many groups of brackets after each other and concatenate them
	def p_species_list_list(p):
	    "chemical_equation : chemical_equation chemical_equation"
	    # The docstring is the grammar rule. p[1] and p[2] hold the atom lists
	    # of the two adjacent sub-formulas; they are joined in reading order.
	    p[0] = p[1] + p[2]

	# Remove brackets from group (and multiply by count if given)
	def p_bracketed_species_list(p):
	    """
	    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET
	    chemical_equation : OPEN_BRACKET chemical_equation CLOSED_BRACKET COUNT
	    """
	    # p[2] is the atom list found between the brackets; it is passed on
	    # as the result, so the brackets themselves simply disappear.

	    # multiply all atoms contained in the brackets by the count after them
	    # (five slots = result + four symbols, i.e. the COUNT variant matched)
	    if len(p) == 5:
	        for atom in p[2]:
	            atom.multiply(p[4])
	    p[0] = p[2]

	# Parse lists of ELEMENT SPECIES sequentially from the back
	def p_species_list(p):
	    "chemical_equation : chemical_equation species"
	    # Build a new list: everything parsed so far followed by the trailing
	    # species. The list held in p[1] is not modified.
	    p[0] = p[1] + [p[2]]

	# Edge case when there is only one element (left)
	def p_species(p):
	    "chemical_equation : species"
	    # Base case of the list rules: wrap the lone species in a one-item list.
	    p[0] = [p[1]]

	# Leaf case: Create a new Atom instance for every element (with associated count)
	def p_single_species(p):
	    """
	    species : ELEMENT
	    species : ELEMENT COUNT
	    """
	    # len(p) includes the result slot p[0], so 2 means a bare ELEMENT and
	    # 3 means ELEMENT followed by COUNT.
	    if len(p) == 2:
	        # No explicit count: the element occurs once.
	        p[0] = Atom(p[1], 1)
	    elif len(p) == 3:
	        p[0] = Atom(p[1], p[2])

	def p_error(p):
	    # Parser error hook, called by yacc with the offending token.
	    #
	    # yacc passes None when the error is at the end of the input, so
	    # p.value must not be read unconditionally. Raising (instead of only
	    # printing) stops yacc from resynchronising and returning None or a
	    # partial atom list. TypeError matches the lexer hook t_error.
	    if p is None:
	        raise TypeError("Syntax error at end of input")
	    raise TypeError("Syntax error at '%s'" % p.value)

	### MAIN PROGRAM ###

	def parse_molecule(molecule):
	    """Count the atoms of each element in a molecular formula.

	    Args:
	        molecule: the formula text, e.g. "Mg(OH)2".

	    Returns:
	        A dict mapping each element symbol to its total count.

	    Raises:
	        TypeError: if the text cannot be tokenized (raised by ``t_error``)
	            or the parser produces no result for it.
	    """
	    # Build the lexer and parser from the t_* / p_* rules of this module
	    # and keep explicit handles on both, rather than depending on PLY's
	    # module-level "most recently built" lexer and parser.
	    lexer = lex.lex()
	    parser = yacc.yacc()

	    # run the tokenization and parsing steps
	    atoms = parser.parse(molecule, lexer=lexer)
	    if atoms is None:
	        # Without this check the loop below failed with an opaque
	        # "'NoneType' object is not iterable".
	        raise TypeError("Could not parse molecule %r" % (molecule,))

	    # sum up the atoms by type
	    totals = {}
	    for atom in atoms:
	        totals[atom.element] = totals.get(atom.element, 0) + atom.count
	    return totals

	if __name__ == "__main__":
	    # Script entry point: read one formula and print its element counts,
	    # or a one-line error message if anything goes wrong.
	    molecule = input("Enter a molecule: ")
	    try:
	        print(parse_molecule(molecule))
	    except Exception as e:
	        print("Error: " + str(e))