Skip to content

Instantly share code, notes, and snippets.

@pragma-

pragma-/Lexer.pm Secret

Created June 26, 2020 18:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pragma-/8860629fa5a02b7561f39d0cb90cf006 to your computer and use it in GitHub Desktop.
Save pragma-/8860629fa5a02b7561f39d0cb90cf006 to your computer and use it in GitHub Desktop.
int main(void) {
int i = 42;
char c = 'b'; // this is a 'single quoted' literal
char *s = "cat";
i++; // increment "i" by one
puts("string /* with comment */");
/* "comment with string" */
printf("\"i\" = %d, c = '%c'%c", i, c, '\n');
/* one
two */
float/* test */j = 42;
/*******************
* here we return *
* the value for a *
* successful exit *
*******************/
return EXIT_SUCCESS;
}
#!/usr/bin/env perl
use warnings;
use strict;
package Lexer;
sub new {
my ($proto, %conf) = @_;
my $class = ref($proto) || $proto;
my $self = bless {}, $class;
$self->initialize(%conf);
return $self;
}
sub initialize {
my ($self, %conf) = @_;
$self->{tokentypes} = [];
}
# define our tokentypes
sub define_tokens {
my $self = shift;
$self->{tokentypes} = [];
foreach my $arg (@_) {
push @{$self->{tokentypes}}, $arg;
}
}
# append new tokentypes definitions
sub add_token {
my ($self, $tokentype) = @_;
push @{$self->{tokentypes}}, $tokentype;
}
sub tokens {
my ($self, $input, $tokentypes) = @_;
# allow overriding list of tokentypes and matchers
$tokentypes //= $self->{tokentypes};
# the current line being lexed
my $text;
# closures are neat
return sub {
INPUT: {
# get next line if we don't have a line
$text = $input->() if not defined $text;
# all done when there's no more input
return undef if not defined $text;
# go through each tokentype
foreach my $tokentype (@$tokentypes) {
# does this bit of text match this tokentype?
if ($text =~ /$tokentype->[1]/gc) {
# got a token
my $literal = $1;
# do we have a specific function to continue lexing this token?
if (defined $tokentype->[3]) {
return $tokentype->[3]->($input, \$text, $tokentype->[0], $literal);
}
# do we have a specific function to return this token?
if (defined $tokentype->[2]) {
return $tokentype->[2]->();
}
# return this token
return [ $tokentype->[0], $literal ];
}
}
# end of this input
$text = undef;
redo INPUT;
}
}
}
1;
HERE'S A TOKEN: ["IDENT","int"]
HERE'S A TOKEN: ["IDENT","main"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["IDENT","void"]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["OTHER","{"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","int"]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["NUM",42]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["IDENT","char"]
HERE'S A TOKEN: ["IDENT","c"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["SQUOTE_STRING","'b'"]
HERE'S A TOKEN: ["TERM",";"]
HERE'S A TOKEN: ["COMMENT_EOL","// this is a 'single quoted' literal"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","char"]
HERE'S A TOKEN: ["STAR","*"]
HERE'S A TOKEN: ["IDENT","s"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["DQUOTE_STRING","\"cat\""]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["PLUS_PLUS","++"]
HERE'S A TOKEN: ["TERM",";"]
HERE'S A TOKEN: ["COMMENT_EOL","// increment \"i\" by one"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","puts"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["DQUOTE_STRING","\"string /* with comment */\""]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_INLINE","/* \"comment with string\" */"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","printf"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["DQUOTE_STRING","\"\\\"i\\\" = %d, c = '%c'%c\""]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["IDENT","c"]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["SQUOTE_STRING","'\\n'"]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_MULTI","/* one\n two */"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","float"]
HERE'S A TOKEN: ["COMMENT_INLINE","/* test */"]
HERE'S A TOKEN: ["IDENT","j"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["NUM",42]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_MULTI","/*******************\n * here we return * * the value for a * * successful exit * *******************/"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","return"]
HERE'S A TOKEN: ["IDENT","EXIT_SUCCESS"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["OTHER","}"]
HERE'S A TOKEN: ["TERM","\n"]
#!/bin/env perl
use warnings;
use strict;
use Lexer;
my $lexer = Lexer->new(debug => $ENV{debug});
sub multiline_comment {
my ($input, $text, $tokentype, $buf) = @_;
while (1) {
$$text = $input->() if not defined $$text;
return [$tokentype, $buf] if not defined $$text;
if ($$text =~ m{\G( .*? \*/ )}gcx) {
$buf .= $1;
return [$tokentype, $buf];
} else {
$$text =~ m{\G( .* )}gxc;
$buf .= $1;
$$text = undef;
}
}
}
$lexer->define_tokens(
['COMMENT_EOL', qr{\G( (?://|\#).*$ )}x], #, sub {''}],
['COMMENT_INLINE', qr{\G( /\* .*? \*/ )}x], #, sub {''}],
['COMMENT_MULTI', qr{\G( /\* .*?(?!\*/)\s+$ )}x, undef, sub { multiline_comment(@_) }],
['DQUOTE_STRING', qr{\G( "(?:[^"\\]|\\.)*" )}x],
['SQUOTE_STRING', qr{\G( '(?:[^'\\]|\\.)*' )}x],
['EQ_EQ', qr{\G( == )}x],
['PLUS_EQ', qr{\G( \+= )}x],
['PLUS_PLUS', qr{\G( \+\+ )}x],
['EQ', qr{\G( = )}x],
['PLUS', qr{\G( \+ )}x],
['MINUS', qr{\G( - )}x],
['COMMA', qr{\G( , )}x],
['STAR', qr{\G( \* )}x],
['BSLASH', qr{\G( / )}x],
['FSLASH', qr{\G( \\ )}x],
['L_PAREN', qr{\G( \( )}x],
['R_PAREN', qr{\G( \) )}x],
['NUM', qr{\G( [0-9.]+ )}x],
['IDENT', qr{\G( [A-Za-z_]\w* )}x],
['TERM', qr{\G( ;\n* | \n+ )}x],
['WHITESPACE', qr{\G( \s+ )}x, sub {''}],
['OTHER', qr{\G( . )}x],
);
use Data::Dumper;
$Data::Dumper::Terse = 1;
$Data::Dumper::Indent = 0;
$Data::Dumper::Useqq = 1;
sub stream_iter {
my $handle = $_[0];
sub {
return <$handle>;
}
};
my $input_iter = stream_iter(\*STDIN);
my $tokens = $lexer->tokens($input_iter);
while (defined (my $token = $tokens->())) {
next if not length $token;
print "HERE'S A TOKEN: ", (Dumper $token), "\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment