-
-
Save pragma-/8860629fa5a02b7561f39d0cb90cf006 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int main(void) { | |
int i = 42; | |
char c = 'b'; // this is a 'single quoted' literal | |
char *s = "cat"; | |
i++; // increment "i" by one | |
puts("string /* with comment */"); | |
/* "comment with string" */ | |
printf("\"i\" = %d, c = '%c'%c", i, c, '\n'); | |
/* one | |
two */ | |
float/* test */j = 42; | |
/******************* | |
* here we return * | |
* the value for a * | |
* successful exit * | |
*******************/ | |
return EXIT_SUCCESS; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use warnings; | |
use strict; | |
package Lexer; | |
sub new { | |
my ($proto, %conf) = @_; | |
my $class = ref($proto) || $proto; | |
my $self = bless {}, $class; | |
$self->initialize(%conf); | |
return $self; | |
} | |
sub initialize { | |
my ($self, %conf) = @_; | |
$self->{tokentypes} = []; | |
} | |
# define our tokentypes | |
sub define_tokens { | |
my $self = shift; | |
$self->{tokentypes} = []; | |
foreach my $arg (@_) { | |
push @{$self->{tokentypes}}, $arg; | |
} | |
} | |
# append new tokentypes definitions | |
sub add_token { | |
my ($self, $tokentype) = @_; | |
push @{$self->{tokentypes}}, $tokentype; | |
} | |
sub tokens { | |
my ($self, $input, $tokentypes) = @_; | |
# allow overriding list of tokentypes and matchers | |
$tokentypes //= $self->{tokentypes}; | |
# the current line being lexed | |
my $text; | |
# closures are neat | |
return sub { | |
INPUT: { | |
# get next line if we don't have a line | |
$text = $input->() if not defined $text; | |
# all done when there's no more input | |
return undef if not defined $text; | |
# go through each tokentype | |
foreach my $tokentype (@$tokentypes) { | |
# does this bit of text match this tokentype? | |
if ($text =~ /$tokentype->[1]/gc) { | |
# got a token | |
my $literal = $1; | |
# do we have a specific function to continue lexing this token? | |
if (defined $tokentype->[3]) { | |
return $tokentype->[3]->($input, \$text, $tokentype->[0], $literal); | |
} | |
# do we have a specific function to return this token? | |
if (defined $tokentype->[2]) { | |
return $tokentype->[2]->(); | |
} | |
# return this token | |
return [ $tokentype->[0], $literal ]; | |
} | |
} | |
# end of this input | |
$text = undef; | |
redo INPUT; | |
} | |
} | |
} | |
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
HERE'S A TOKEN: ["IDENT","int"] | |
HERE'S A TOKEN: ["IDENT","main"] | |
HERE'S A TOKEN: ["L_PAREN","("] | |
HERE'S A TOKEN: ["IDENT","void"] | |
HERE'S A TOKEN: ["R_PAREN",")"] | |
HERE'S A TOKEN: ["OTHER","{"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","int"] | |
HERE'S A TOKEN: ["IDENT","i"] | |
HERE'S A TOKEN: ["EQ","="] | |
HERE'S A TOKEN: ["NUM",42] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["IDENT","char"] | |
HERE'S A TOKEN: ["IDENT","c"] | |
HERE'S A TOKEN: ["EQ","="] | |
HERE'S A TOKEN: ["SQUOTE_STRING","'b'"] | |
HERE'S A TOKEN: ["TERM",";"] | |
HERE'S A TOKEN: ["COMMENT_EOL","// this is a 'single quoted' literal"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","char"] | |
HERE'S A TOKEN: ["STAR","*"] | |
HERE'S A TOKEN: ["IDENT","s"] | |
HERE'S A TOKEN: ["EQ","="] | |
HERE'S A TOKEN: ["DQUOTE_STRING","\"cat\""] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","i"] | |
HERE'S A TOKEN: ["PLUS_PLUS","++"] | |
HERE'S A TOKEN: ["TERM",";"] | |
HERE'S A TOKEN: ["COMMENT_EOL","// increment \"i\" by one"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","puts"] | |
HERE'S A TOKEN: ["L_PAREN","("] | |
HERE'S A TOKEN: ["DQUOTE_STRING","\"string /* with comment */\""] | |
HERE'S A TOKEN: ["R_PAREN",")"] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["COMMENT_INLINE","/* \"comment with string\" */"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","printf"] | |
HERE'S A TOKEN: ["L_PAREN","("] | |
HERE'S A TOKEN: ["DQUOTE_STRING","\"\\\"i\\\" = %d, c = '%c'%c\""] | |
HERE'S A TOKEN: ["COMMA",","] | |
HERE'S A TOKEN: ["IDENT","i"] | |
HERE'S A TOKEN: ["COMMA",","] | |
HERE'S A TOKEN: ["IDENT","c"] | |
HERE'S A TOKEN: ["COMMA",","] | |
HERE'S A TOKEN: ["SQUOTE_STRING","'\\n'"] | |
HERE'S A TOKEN: ["R_PAREN",")"] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["COMMENT_MULTI","/* one\n two */"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","float"] | |
HERE'S A TOKEN: ["COMMENT_INLINE","/* test */"] | |
HERE'S A TOKEN: ["IDENT","j"] | |
HERE'S A TOKEN: ["EQ","="] | |
HERE'S A TOKEN: ["NUM",42] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["COMMENT_MULTI","/*******************\n * here we return * * the value for a * * successful exit * *******************/"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["TERM","\n"] | |
HERE'S A TOKEN: ["IDENT","return"] | |
HERE'S A TOKEN: ["IDENT","EXIT_SUCCESS"] | |
HERE'S A TOKEN: ["TERM",";\n"] | |
HERE'S A TOKEN: ["OTHER","}"] | |
HERE'S A TOKEN: ["TERM","\n"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env perl | |
use warnings; | |
use strict; | |
use Lexer; | |
my $lexer = Lexer->new(debug => $ENV{debug}); | |
sub multiline_comment { | |
my ($input, $text, $tokentype, $buf) = @_; | |
while (1) { | |
$$text = $input->() if not defined $$text; | |
return [$tokentype, $buf] if not defined $$text; | |
if ($$text =~ m{\G( .*? \*/ )}gcx) { | |
$buf .= $1; | |
return [$tokentype, $buf]; | |
} else { | |
$$text =~ m{\G( .* )}gxc; | |
$buf .= $1; | |
$$text = undef; | |
} | |
} | |
} | |
$lexer->define_tokens( | |
['COMMENT_EOL', qr{\G( (?://|\#).*$ )}x], #, sub {''}], | |
['COMMENT_INLINE', qr{\G( /\* .*? \*/ )}x], #, sub {''}], | |
['COMMENT_MULTI', qr{\G( /\* .*?(?!\*/)\s+$ )}x, undef, sub { multiline_comment(@_) }], | |
['DQUOTE_STRING', qr{\G( "(?:[^"\\]|\\.)*" )}x], | |
['SQUOTE_STRING', qr{\G( '(?:[^'\\]|\\.)*' )}x], | |
['EQ_EQ', qr{\G( == )}x], | |
['PLUS_EQ', qr{\G( \+= )}x], | |
['PLUS_PLUS', qr{\G( \+\+ )}x], | |
['EQ', qr{\G( = )}x], | |
['PLUS', qr{\G( \+ )}x], | |
['MINUS', qr{\G( - )}x], | |
['COMMA', qr{\G( , )}x], | |
['STAR', qr{\G( \* )}x], | |
['BSLASH', qr{\G( / )}x], | |
['FSLASH', qr{\G( \\ )}x], | |
['L_PAREN', qr{\G( \( )}x], | |
['R_PAREN', qr{\G( \) )}x], | |
['NUM', qr{\G( [0-9.]+ )}x], | |
['IDENT', qr{\G( [A-Za-z_]\w* )}x], | |
['TERM', qr{\G( ;\n* | \n+ )}x], | |
['WHITESPACE', qr{\G( \s+ )}x, sub {''}], | |
['OTHER', qr{\G( . )}x], | |
); | |
use Data::Dumper; | |
$Data::Dumper::Terse = 1; | |
$Data::Dumper::Indent = 0; | |
$Data::Dumper::Useqq = 1; | |
sub stream_iter { | |
my $handle = $_[0]; | |
sub { | |
return <$handle>; | |
} | |
}; | |
my $input_iter = stream_iter(\*STDIN); | |
my $tokens = $lexer->tokens($input_iter); | |
while (defined (my $token = $tokens->())) { | |
next if not length $token; | |
print "HERE'S A TOKEN: ", (Dumper $token), "\n"; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment