Skip to content

Instantly share code, notes, and snippets.

@dhondta
Last active April 27, 2020 19:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dhondta/c217ed25efd5f9ff95227f999423b68b to your computer and use it in GitHub Desktop.
Save dhondta/c217ed25efd5f9ff95227f999423b68b to your computer and use it in GitHub Desktop.
Simple lexer module for parsing a line of arguments and keyword-arguments, useful for CLI tools

Arguments Lexer

Simple arguments lexer for parsing a line of arguments and keyword-arguments.

The ValueLexer evaluates strings (delimited by single or double quotes), booleans, floats, integers and also binary, octal or hexadecimal (to an integer).

The ArgumentsLexer ensures that the input line of arguments has the form:

arg1 arg2 ... argN kw1=val1 kw2=val2 ... kwM=valM

It returns `(None, None)` if parsing fails.

Analyze input command line in a CLI tool

from arglexer import ArgumentsLexer

lexer = ArgumentsLexer()

def process(command):
    name, line = command.split(" ", 1)
    args, kwargs = lexer.parse(line)
    if args is None and kwargs is None:
        raise ValueError("Bad command")
    # command processing code

Example usage:

>>> from arglexer import ArgumentsLexer
>>> lexer = ArgumentsLexer()
>>> lexer.parse("'test string' 123 b=true f=1.23")
(('test string', 123), {'b': True, 'f': 1.23})
>>> lexer.parse("0o132 b=0b0101 h=0x123")
((90,), {'b': 5, 'h': 291})
>>> lexer.parse("123 b=true error")
(None, None)
# -*- coding: utf-8 -*-
import ast
from pygments.lexer import RegexLexer, bygroups, using
from pygments.token import Error, Keyword, Name, Number, Operator, String, Whitespace
__all__ = ["ArgumentsLexer", "ValueLexer"]
class ValueLexer(RegexLexer):
    """ A small lexer to analyze string, number, boolean and variable name.

    Each anchored rule matches the *whole* input, so a well-formed token
    yields a single pygments token; anything else yields Error tokens. """
    tokens = {
        'root': [
            # quoted string (matching single- or double-quote pair)
            (r"^(?P<delimiter>['\"])(.*?)(?P=delimiter)$", String),
            # boolean literal (only the first letter may be capitalized)
            (r'^([Ff]alse|[Tt]rue)$', Keyword),
            # float, with optional exponent part
            (r'^(\d+\.\d*|\d*\.\d+)(e[+-]?[0-9]+)?$', Number.Float),
            (r'^0o[0-7]+$', Number.Oct),
            (r'^0x[a-fA-F0-9]+$', Number.Hex),
            (r'^0b[01]+$', Number.Bin),
            (r'^\d+$', Number.Integer),
            # fallback: any other non-whitespace token is kept as a string
            (r'^[^\s]+$', String),
        ],
    }

    def parse(self, text):
        """ Parse input text and convert it to its base type.

        :param text: raw token to evaluate
        :return:     evaluated value (str, bool, int or float), or None when
                     the input could not be tokenized (e.g. it contains
                     whitespace)
        """
        # tokenize once (the previous version ran get_tokens() twice: once
        # for the error check and once to read the value)
        tokens = list(self.get_tokens(text))
        if any(token is Error for token, _ in tokens):
            return  # implicit None signals a parsing failure
        token, value = tokens[0]
        if token is Keyword:
            # only the four boolean spellings are emitted as Keyword
            if value in ["true", "True", "false", "False"]:
                value = value in ["true", "True"]
        elif token is String:
            # NOTE(review): strips *all* leading/trailing quote characters,
            # not just the delimiter pair — e.g. "''x''" becomes "x"
            value = value.strip("'\"")
        elif token is Number.Integer:
            value = int(value)
        elif token is Number.Float:
            value = float(value)
        elif token in [Number.Oct, Number.Hex, Number.Bin]:
            # safe evaluation of 0o.../0x.../0b... literals to int
            value = ast.literal_eval(value)
        return value
class ArgumentsLexer(RegexLexer):
    """ A lexer to analyze command arguments with the following structure:
         arg1 arg2 ... argN kw1=val1 kw2=val2 ... kwM=valM

    Positional arguments must all come before keyword-arguments; once the
    'kwargs' state is entered, a bare positional argument produces an Error
    token and parse() reports a failure. """
    tokens = {
        'root': [
            # kwarg with a quoted value
            (r'((?:[a-zA-Z]|\_{1,2})(?:[a-zA-Z0-9-_]*[a-zA-Z0-9])?)(=)((?P<delimiter>[\'"]).*?(?P=delimiter))',
             bygroups(Name, Operator, using(ValueLexer)), ('kwargs', '#push')),
            # kwarg with an unquoted value
            (r'((?:[a-zA-Z]|\_{1,2})(?:[a-zA-Z0-9-_]*[a-zA-Z0-9])?)(=)([^\s]+)',
             bygroups(Name, Operator, using(ValueLexer)), ('kwargs', '#push')),
            # positional argument, quoted then unquoted form
            (r'(?P<delimiter>[\'"])(.*?)(?P=delimiter)', using(ValueLexer), '#push'),
            (r'[^\s]+', using(ValueLexer), '#push'),
            (r'\s+', Whitespace, '#pop'),
        ],
        'kwargs': [
            (r'((?:[a-zA-Z]|\_{1,2})(?:[a-zA-Z0-9-_]*[a-zA-Z0-9])?)(=)((?P<delimiter>[\'"]).*?(?P=delimiter))',
             bygroups(Name, Operator, using(ValueLexer)), '#push'),
            (r'((?:[a-zA-Z]|\_{1,2})(?:[a-zA-Z0-9-_]*[a-zA-Z0-9])?)(=)([^\s]+)',
             bygroups(Name, Operator, using(ValueLexer)), '#push'),
            (r'\s+', Whitespace, '#pop'),
        ],
    }

    def parse(self, text):
        """ Parse the input text and return a tuple of arguments and a
        dictionary of keyword-arguments.

        :param text: line of arguments to parse
        :return:     (args_tuple, kwargs_dict), or (None, None) on failure
        """
        # tokenize once (previously get_tokens() ran twice: error check then
        # value extraction)
        tokens = list(self.get_tokens(text))
        if any(token is Error for token, _ in tokens):
            return None, None  # explicit failure sentinel
        args, kwargs = [], {}
        vl = ValueLexer()
        it = iter(tokens)
        for token, value in it:
            if token is Whitespace:
                continue
            elif token is Name:
                # a kwarg is emitted as three tokens: Name, Operator, value
                next(it)            # skip the Operator '='
                _, v = next(it)     # the kwarg's raw value
                kwargs[value] = vl.parse(v)
            else:
                args.append(vl.parse(value))
        return tuple(args), kwargs
#!/usr/bin/env python
"""Arglexer module's tests.
"""
from unittest import TestCase
from arglexer import *
# shared lexer instances, reused by every test case below
al = ArgumentsLexer()
vl = ValueLexer()
class TestValueLexer(TestCase):
    """ Unit tests for ValueLexer.parse. """
    def test_good_expressions(self):
        # well-formed tokens and the typed values they must evaluate to
        cases = [
            ("01234", 1234),
            ("1.234", 1.234),
            ("1.23e4", 12300.0),
            ("0b100", 4),
            ("0xbabe", 47806),
            ("0o1234", 668),
            ("'this is a test'", "this is a test"),
            ("true", True),
            ("True", True),
            ("false", False),
            ("False", False),
        ]
        for text, expected in cases:
            self.assertEqual(vl.parse(text), expected)

    def test_bad_expressions(self):
        # malformed numbers fall back to the plain-string rule
        for text in ("1..23", "1.23ee4"):
            self.assertEqual(vl.parse(text), text)
        # whitespace cannot be tokenized at all
        self.assertIsNone(vl.parse("this is a test"))
class TestArgumentsLexer(TestCase):
    """ Unit tests for ArgumentsLexer.parse. """
    def test_good_expressions(self):
        # valid lines: positionals first, then keyword-arguments
        good = (
            "test",
            "'test string' false",
            "'test string' 1.234 12.e3",
            "'test string' var1=true",
            "'test string' var1=true var2=\"string\"",
            "a=1 b=test c=0x123",
        )
        for line in good:
            self.assertIsNotNone(al.parse(line))

    def test_bad_expressions(self):
        # a positional argument appearing after a kwarg makes parsing fail
        bad = (
            "kw1=arg1 arg2",
            "kw1=arg1 'arg2 string'",
            "arg1 arg2 kw3=arg3 arg4",
        )
        for line in bad:
            self.assertIsNone(al.parse(line)[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment