Skip to content

Instantly share code, notes, and snippets.

@wilsonfreitas
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wilsonfreitas/7f6450343b3958a67f4e to your computer and use it in GitHub Desktop.
Save wilsonfreitas/7f6450343b3958a67f4e to your computer and use it in GitHub Desktop.
textparser—For simply converting small chunks of text into anything useful

textparser

I frequently have to parse text into floats, ints and dates, to name just a few examples. I wrote this class to isolate the parsing task in one place instead of having it spread all over the code. It is a fairly simple class, but it has helped me a great deal.

I borrowed the idea of storing each rule's regular expression in the method's __doc__ from PLY.

import re
from types import MethodType
class TextParser(object):
    """Rule-based converter from text to Python values.

    Subclasses declare rules as methods whose name starts with ``parse``,
    that take ``(self, text, match)`` and whose docstring is the regular
    expression the text must match (an idea borrowed from PLY).

    Rules are collected from ``dir(self)``, so they are tried in
    alphabetical order of their method names.
    """

    def __init__(self):
        """Collect the rule methods available on this instance."""
        self.parsers = self.__createMethodAnalyzers()

    def __createMethodAnalyzers(self):
        """Return one regex-guarded callable per rule method.

        A rule is any bound method whose name starts with ``parse`` and
        that has a non-empty docstring. ``parse`` and ``parseText`` are
        part of the framework, not rules: they are skipped so that they
        (or overrides of them in subclasses) can carry real documentation
        without that text being compiled as a regular expression.
        """
        reserved = ('parse', 'parseText')
        analyzers = []
        for name in dir(self):
            # Filter on the name first so unrelated attributes (for
            # example properties) are never evaluated.
            if not name.startswith('parse') or name in reserved:
                continue
            method = getattr(self, name)
            if isinstance(method, MethodType) and method.__doc__:
                analyzers.append(buildparser(method.__doc__, method))
        return analyzers

    def parse(self, text):
        """Convert *text* with the first rule that changes it.

        Each rule returns the text unchanged when its regex does not
        match, so a result equal to *text* is treated as "no match" and
        the next rule is tried. When no rule converts the text the result
        of ``parseText(text)`` is returned.
        """
        for parser in self.parsers:
            val = parser(text)
            if val != text:
                return val
        return self.parseText(text)

    def parseText(self, text):
        """Fallback for text that no rule converted; returns it as is."""
        return text
class BooleanParser(TextParser):
    """Rule that converts the words ``true`` / ``false`` (any case) to bool."""

    # The string that opens the method body is the rule's regular
    # expression, not documentation. The alternation is grouped so that
    # both anchors apply to both words; ungrouped, '^true' alone matched
    # inputs such as 'truely', which then failed during conversion.
    def parseBoolean(self, text, match):
        r'^(?:[Tt][Rr][Uu][eE]|[Ff][Aa][Ll][Ss][Ee])$'
        # Plain comparison instead of eval(); strip() drops the trailing
        # newline that '$' tolerates.
        return text.strip().lower() == 'true'
class NumberParser(TextParser):
    """Rules for integers and decimals written with '.' as decimal mark.

    The string that opens each method body is the rule's regular
    expression. Conversions use int()/float() rather than eval(): the
    input text is never executed, and integers with leading zeros such as
    '007' convert to 7 instead of failing or being read as octal.
    Whitespace (allowed by the patterns between the sign and the digits)
    is removed before converting.
    """

    def parseInteger(self, text, match):
        r'^-?\s*\d+$'
        return int(''.join(text.split()))

    def parse_number_decimal(self, text, match):
        r'^-?\s*\d+\.\d+?$'
        return float(''.join(text.split()))

    def parse_number_with_thousands(self, text, match):
        r'^-?\s*(\d+[,])+\d+[\.]\d+?$'
        # ',' is only a thousands separator here, so it is simply dropped.
        return float(''.join(text.split()).replace(',', ''))
class PortugueseRulesParser(TextParser):
    """Rules for Brazilian Portuguese booleans and number notation.

    The string that opens each method body is the rule's regular
    expression. Numbers use '.' as thousands separator and ',' as decimal
    mark; they are normalised and converted with float() rather than
    eval(), so the input text is never executed.
    """

    def parseBoolean_ptBR(self, text, match):
        r'^(sim|Sim|SIM|n.o|N.o|N.O)$'
        # The '.' in 'n.o' accepts any single character, so both the
        # accented and the unaccented spelling match. Only the first
        # letter is needed to tell the two words apart.
        return text[0].lower() == 's'

    def parseBoolean_ptBR2(self, text, match):
        r'^(verdadeiro|VERDADEIRO|falso|FALSO|V|F|v|f)$'
        return text[0].lower() == 'v'

    def parse_number_with_thousands_ptBR(self, text, match):
        r'^-?\s*(\d+\.)+\d+,\d+?$'
        # Drop whitespace and thousands dots, then turn the decimal comma
        # into the '.' that float() expects.
        compact = ''.join(text.split())
        return float(compact.replace('.', '').replace(',', '.'))

    def parse_number_decimal_ptBR(self, text, match):
        r'^-?\s*\d+,\d+?$'
        return float(''.join(text.split()).replace(',', '.'))
def textparse(text, regex, func):
    """Apply a single ad-hoc rule to *text*.

    Builds a parser from *regex* and *func* with ``buildparser`` and
    immediately calls it, returning ``func(text, match)`` when the regex
    matches and *text* unchanged otherwise.
    """
    return buildparser(regex, func)(text)
def buildparser(regex, func):
    """Wrap *func* so that it only runs on text matching *regex*.

    The pattern is compiled once. The returned callable takes a text,
    matches the pattern at the start of it and returns
    ``func(text, match)`` on success; on failure the text is returned
    unchanged.
    """
    pattern = re.compile(regex)

    def _func(text):
        found = pattern.match(text)
        if found is None:
            return text
        return func(text, found)

    return _func
class GenericParser(NumberParser, BooleanParser):
    """Parser that combines the NumberParser and BooleanParser rules."""
# Ready-to-use module-level parser: the bound ``parse`` method of one shared
# GenericParser instance (number and boolean rules).
parse = GenericParser().parse
if __name__ == '__main__':
    # Self-test, run only when the module is executed as a script.

    # Default rules: booleans plus numbers with '.' as decimal mark.
    assert parse('true')
    for raw, expected in (('1.1', 1.1), ('11', 11), ('1,100.01', 1100.01)):
        assert parse(raw) == expected

    # Brazilian Portuguese notation; unmatched text comes back unchanged.
    parser = PortugueseRulesParser()
    for raw, expected in (
            ('1,1', 1.1),
            ('-1,1', -1.1),
            ('- 1,1', -1.1),
            ('Wálson', 'Wálson'),
            ('1.100,01', 1100.01),
    ):
        assert parser.parse(raw) == expected

    # Ad-hoc rules built directly from a regex and a conversion function.
    assert textparse(
        'TRUe',
        r'^[Tt][Rr][Uu][eE]|[Ff][Aa][Ll][Ss][Ee]$',
        lambda t, m: eval(t.lower().capitalize()),
    )
    decimal_regex = r'^-?\s*\d+[\.,]\d+?$'
    assert textparse(
        '1,1', decimal_regex, lambda t, m: eval(t.replace(',', '.'))
    ) == 1.1
    num_parser = buildparser(
        decimal_regex, lambda t, m: eval(t.replace(',', '.'))
    )
    assert num_parser('1,1') == 1.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment