pragma-/Lexer.pm Secret

## input.txt
int main(void) {
   int i = 42;
   char c = 'b';  // this is a 'single quoted' literal
   char *s = "cat";

   i++; // increment "i" by one

   puts("string /* with comment */");

   /* "comment with string" */

   printf("\"i\" = %d, c = '%c'%c", i, c, '\n');

   /* one
      two */

   float/* test */j = 42;

  /*******************
   * here we return  *
   * the value for a *
   * successful exit *
   *******************/

   return EXIT_SUCCESS;
}

## Lexer.pm
#!/usr/bin/env perl

use warnings;
use strict;

package Lexer;

sub new {
    my ($proto, %conf) = @_;
    my $class = ref($proto) || $proto;
    my $self  = bless {}, $class;
    $self->initialize(%conf);
    return $self;
}

sub initialize {
    my ($self, %conf) = @_;

    $self->{tokentypes} = [];
}

# define our tokentypes
sub define_tokens {
    my $self = shift;

    $self->{tokentypes} = [];

    foreach my $arg (@_) {
        push @{$self->{tokentypes}}, $arg;
    }
}

# append new tokentypes definitions
sub add_token {
    my ($self, $tokentype) = @_;
    push @{$self->{tokentypes}}, $tokentype;
}

sub tokens {
    my ($self, $input, $tokentypes) = @_;

    # allow overriding list of tokentypes and matchers
    $tokentypes //= $self->{tokentypes};

    # the current line being lexed
    my $text;

    # closures are neat
    return sub {
        INPUT: {
            # get next line if we don't have a line
            $text = $input->() if not defined $text;

            # all done when there's no more input
            return undef if not defined $text;

            # go through each tokentype
            foreach my $tokentype (@$tokentypes) {
                # does this bit of text match this tokentype?
                if ($text =~ /$tokentype->[1]/gc) {
                    # got a token
                    my $literal = $1;

                    # do we have a specific function to continue lexing this token?
                    if (defined $tokentype->[3]) {
                        return $tokentype->[3]->($input, \$text, $tokentype->[0], $literal);
                    }

                    # do we have a specific function to return this token?
                    if (defined $tokentype->[2]) {
                        return $tokentype->[2]->();
                    }

                    # return this token
                    return [ $tokentype->[0], $literal ];
                }
            }

            # end of this input
            $text = undef;
            redo INPUT;
        }
    }
}

1;

## output.txt
HERE'S A TOKEN: ["IDENT","int"]
HERE'S A TOKEN: ["IDENT","main"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["IDENT","void"]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["OTHER","{"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","int"]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["NUM",42]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["IDENT","char"]
HERE'S A TOKEN: ["IDENT","c"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["SQUOTE_STRING","'b'"]
HERE'S A TOKEN: ["TERM",";"]
HERE'S A TOKEN: ["COMMENT_EOL","// this is a 'single quoted' literal"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","char"]
HERE'S A TOKEN: ["STAR","*"]
HERE'S A TOKEN: ["IDENT","s"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["DQUOTE_STRING","\"cat\""]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["PLUS_PLUS","++"]
HERE'S A TOKEN: ["TERM",";"]
HERE'S A TOKEN: ["COMMENT_EOL","// increment \"i\" by one"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","puts"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["DQUOTE_STRING","\"string /* with comment */\""]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_INLINE","/* \"comment with string\" */"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","printf"]
HERE'S A TOKEN: ["L_PAREN","("]
HERE'S A TOKEN: ["DQUOTE_STRING","\"\\\"i\\\" = %d, c = '%c'%c\""]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["IDENT","i"]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["IDENT","c"]
HERE'S A TOKEN: ["COMMA",","]
HERE'S A TOKEN: ["SQUOTE_STRING","'\\n'"]
HERE'S A TOKEN: ["R_PAREN",")"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_MULTI","/* one\n      two */"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","float"]
HERE'S A TOKEN: ["COMMENT_INLINE","/* test */"]
HERE'S A TOKEN: ["IDENT","j"]
HERE'S A TOKEN: ["EQ","="]
HERE'S A TOKEN: ["NUM",42]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["COMMENT_MULTI","/*******************\n   * here we return  *   * the value for a *   * successful exit *   *******************/"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["TERM","\n"]
HERE'S A TOKEN: ["IDENT","return"]
HERE'S A TOKEN: ["IDENT","EXIT_SUCCESS"]
HERE'S A TOKEN: ["TERM",";\n"]
HERE'S A TOKEN: ["OTHER","}"]
HERE'S A TOKEN: ["TERM","\n"]

## test.pl
#!/bin/env perl

use warnings;
use strict;

use Lexer;

my $lexer = Lexer->new(debug => $ENV{debug});

sub multiline_comment {
    my ($input, $text, $tokentype, $buf) = @_;

    while (1) {
        $$text = $input->() if not defined $$text;
        return [$tokentype, $buf] if not defined $$text;

        if ($$text =~ m{\G( .*? \*/ )}gcx) {
            $buf .= $1;
            return [$tokentype, $buf];
        } else {
            $$text =~ m{\G( .* )}gxc;
            $buf .= $1;
            $$text = undef;
        }
    }
}

$lexer->define_tokens(
    ['COMMENT_EOL',    qr{\G(   (?://|\#).*$        )}x], #,  sub {''}],
    ['COMMENT_INLINE', qr{\G(   /\* .*? \*/         )}x], #,  sub {''}],
    ['COMMENT_MULTI',  qr{\G(   /\* .*?(?!\*/)\s+$  )}x, undef, sub { multiline_comment(@_) }],
    ['DQUOTE_STRING',  qr{\G(   "(?:[^"\\]|\\.)*"   )}x],
    ['SQUOTE_STRING',  qr{\G(   '(?:[^'\\]|\\.)*'   )}x],
    ['EQ_EQ',          qr{\G(   ==                  )}x],
    ['PLUS_EQ',        qr{\G(   \+=                 )}x],
    ['PLUS_PLUS',      qr{\G(   \+\+                )}x],
    ['EQ',             qr{\G(   =                   )}x],
    ['PLUS',           qr{\G(   \+                  )}x],
    ['MINUS',          qr{\G(   -                   )}x],
    ['COMMA',          qr{\G(   ,                   )}x],
    ['STAR',           qr{\G(   \*                  )}x],
    ['BSLASH',         qr{\G(   /                   )}x],
    ['FSLASH',         qr{\G(   \\                  )}x],
    ['L_PAREN',        qr{\G(   \(                  )}x],
    ['R_PAREN',        qr{\G(   \)                  )}x],
    ['NUM',            qr{\G(   [0-9.]+             )}x],
    ['IDENT',          qr{\G(   [A-Za-z_]\w*        )}x],
    ['TERM',           qr{\G(   ;\n* | \n+          )}x],
    ['WHITESPACE',     qr{\G(   \s+                 )}x,  sub {''}],
    ['OTHER',          qr{\G(   .                   )}x],
);


use Data::Dumper;
$Data::Dumper::Terse = 1;
$Data::Dumper::Indent = 0;
$Data::Dumper::Useqq = 1;

sub stream_iter {
    my $handle = $_[0];

    sub {
        return <$handle>;
    }
};

my $input_iter = stream_iter(\*STDIN);

my $tokens = $lexer->tokens($input_iter);

while (defined (my $token = $tokens->())) {
    next if not length $token;
    print "HERE'S A TOKEN: ", (Dumper $token), "\n";
}
	int main(void) {
	int i = 42;
	char c = 'b'; // this is a 'single quoted' literal
	char *s = "cat";

	i++; // increment "i" by one

	puts("string /* with comment */");

	/* "comment with string" */

	printf("\"i\" = %d, c = '%c'%c", i, c, '\n');

	/* one
	two */

	float/* test */j = 42;

	/*******************
	* here we return *
	* the value for a *
	* successful exit *
	*******************/

	return EXIT_SUCCESS;
	}
	#!/usr/bin/env perl

	use warnings;
	use strict;

	package Lexer;

	sub new {
	my ($proto, %conf) = @_;
	my $class = ref($proto) \|\| $proto;
	my $self = bless {}, $class;
	$self->initialize(%conf);
	return $self;
	}

	sub initialize {
	my ($self, %conf) = @_;

	$self->{tokentypes} = [];
	}

	# define our tokentypes
	sub define_tokens {
	my $self = shift;

	$self->{tokentypes} = [];

	foreach my $arg (@_) {
	push @{$self->{tokentypes}}, $arg;
	}
	}

	# append new tokentypes definitions
	sub add_token {
	my ($self, $tokentype) = @_;
	push @{$self->{tokentypes}}, $tokentype;
	}

	sub tokens {
	my ($self, $input, $tokentypes) = @_;

	# allow overriding list of tokentypes and matchers
	$tokentypes //= $self->{tokentypes};

	# the current line being lexed
	my $text;

	# closures are neat
	return sub {
	INPUT: {
	# get next line if we don't have a line
	$text = $input->() if not defined $text;

	# all done when there's no more input
	return undef if not defined $text;

	# go through each tokentype
	foreach my $tokentype (@$tokentypes) {
	# does this bit of text match this tokentype?
	if ($text =~ /$tokentype->[1]/gc) {
	# got a token
	my $literal = $1;

	# do we have a specific function to continue lexing this token?
	if (defined $tokentype->[3]) {
	return $tokentype->[3]->($input, \$text, $tokentype->[0], $literal);
	}

	# do we have a specific function to return this token?
	if (defined $tokentype->[2]) {
	return $tokentype->[2]->();
	}

	# return this token
	return [ $tokentype->[0], $literal ];
	}
	}

	# end of this input
	$text = undef;
	redo INPUT;
	}
	}
	}

	1;
	HERE'S A TOKEN: ["IDENT","int"]
	HERE'S A TOKEN: ["IDENT","main"]
	HERE'S A TOKEN: ["L_PAREN","("]
	HERE'S A TOKEN: ["IDENT","void"]
	HERE'S A TOKEN: ["R_PAREN",")"]
	HERE'S A TOKEN: ["OTHER","{"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","int"]
	HERE'S A TOKEN: ["IDENT","i"]
	HERE'S A TOKEN: ["EQ","="]
	HERE'S A TOKEN: ["NUM",42]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["IDENT","char"]
	HERE'S A TOKEN: ["IDENT","c"]
	HERE'S A TOKEN: ["EQ","="]
	HERE'S A TOKEN: ["SQUOTE_STRING","'b'"]
	HERE'S A TOKEN: ["TERM",";"]
	HERE'S A TOKEN: ["COMMENT_EOL","// this is a 'single quoted' literal"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","char"]
	HERE'S A TOKEN: ["STAR","*"]
	HERE'S A TOKEN: ["IDENT","s"]
	HERE'S A TOKEN: ["EQ","="]
	HERE'S A TOKEN: ["DQUOTE_STRING","\"cat\""]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","i"]
	HERE'S A TOKEN: ["PLUS_PLUS","++"]
	HERE'S A TOKEN: ["TERM",";"]
	HERE'S A TOKEN: ["COMMENT_EOL","// increment \"i\" by one"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","puts"]
	HERE'S A TOKEN: ["L_PAREN","("]
	HERE'S A TOKEN: ["DQUOTE_STRING","\"string /* with comment */\""]
	HERE'S A TOKEN: ["R_PAREN",")"]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["COMMENT_INLINE","/* \"comment with string\" */"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","printf"]
	HERE'S A TOKEN: ["L_PAREN","("]
	HERE'S A TOKEN: ["DQUOTE_STRING","\"\\\"i\\\" = %d, c = '%c'%c\""]
	HERE'S A TOKEN: ["COMMA",","]
	HERE'S A TOKEN: ["IDENT","i"]
	HERE'S A TOKEN: ["COMMA",","]
	HERE'S A TOKEN: ["IDENT","c"]
	HERE'S A TOKEN: ["COMMA",","]
	HERE'S A TOKEN: ["SQUOTE_STRING","'\\n'"]
	HERE'S A TOKEN: ["R_PAREN",")"]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["COMMENT_MULTI","/* one\n two */"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","float"]
	HERE'S A TOKEN: ["COMMENT_INLINE","/* test */"]
	HERE'S A TOKEN: ["IDENT","j"]
	HERE'S A TOKEN: ["EQ","="]
	HERE'S A TOKEN: ["NUM",42]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["COMMENT_MULTI","/******************\n here we return * * the value for a * * successful exit * *******************/"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["TERM","\n"]
	HERE'S A TOKEN: ["IDENT","return"]
	HERE'S A TOKEN: ["IDENT","EXIT_SUCCESS"]
	HERE'S A TOKEN: ["TERM",";\n"]
	HERE'S A TOKEN: ["OTHER","}"]
	HERE'S A TOKEN: ["TERM","\n"]
	#!/bin/env perl

	use warnings;
	use strict;

	use Lexer;

	my $lexer = Lexer->new(debug => $ENV{debug});

	sub multiline_comment {
	my ($input, $text, $tokentype, $buf) = @_;

	while (1) {
	$$text = $input->() if not defined $$text;
	return [$tokentype, $buf] if not defined $$text;

	if ($$text =~ m{\G( .? \/ )}gcx) {
	$buf .= $1;
	return [$tokentype, $buf];
	} else {
	$$text =~ m{\G( .* )}gxc;
	$buf .= $1;
	$$text = undef;
	}
	}
	}

	$lexer->define_tokens(
	['COMMENT_EOL', qr{\G( (?://\|\#).*$ )}x], #, sub {''}],
	['COMMENT_INLINE', qr{\G( /\* .? \/ )}x], #, sub {''}],
	['COMMENT_MULTI', qr{\G( /\* .?(?!\/)\s+$ )}x, undef, sub { multiline_comment(@_) }],
	['DQUOTE_STRING', qr{\G( "(?:[^"\\]\|\\.)*" )}x],
	['SQUOTE_STRING', qr{\G( '(?:[^'\\]\|\\.)*' )}x],
	['EQ_EQ', qr{\G( == )}x],
	['PLUS_EQ', qr{\G( \+= )}x],
	['PLUS_PLUS', qr{\G( \+\+ )}x],
	['EQ', qr{\G( = )}x],
	['PLUS', qr{\G( \+ )}x],
	['MINUS', qr{\G( - )}x],
	['COMMA', qr{\G( , )}x],
	['STAR', qr{\G( \* )}x],
	['BSLASH', qr{\G( / )}x],
	['FSLASH', qr{\G( \\ )}x],
	['L_PAREN', qr{\G( \( )}x],
	['R_PAREN', qr{\G( \) )}x],
	['NUM', qr{\G( [0-9.]+ )}x],
	['IDENT', qr{\G( [A-Za-z_]\w* )}x],
	['TERM', qr{\G( ;\n* \| \n+ )}x],
	['WHITESPACE', qr{\G( \s+ )}x, sub {''}],
	['OTHER', qr{\G( . )}x],
	);


	use Data::Dumper;
	$Data::Dumper::Terse = 1;
	$Data::Dumper::Indent = 0;
	$Data::Dumper::Useqq = 1;

	sub stream_iter {
	my $handle = $_[0];

	sub {
	return <$handle>;
	}
	};

	my $input_iter = stream_iter(\*STDIN);

	my $tokens = $lexer->tokens($input_iter);

	while (defined (my $token = $tokens->())) {
	next if not length $token;
	print "HERE'S A TOKEN: ", (Dumper $token), "\n";
	}