Skip to content

Instantly share code, notes, and snippets.

@jddurand
Last active March 20, 2016 07:24
Show Gist options
  • Save jddurand/5052023 to your computer and use it in GitHub Desktop.
Save jddurand/5052023 to your computer and use it in GitHub Desktop.
C to AST using Marpa::R2. Update to update version of Marpa -;
#!env perl
# Execute this script with no parameter to get the online help
use strict;
use diagnostics;
use Marpa::R2;
use File::Slurp;
use Data::Dumper;
use POSIX;
############################
# main
############################
my $file = shift || '';
if (! $file) {
print STDERR "Usage: $^X $0 file.c\n";
print STDERR "\n";
print STDERR "Warning, file.c must contain only 2011 ISO C\n";
print STDERR "With GNU cpp, here is how to get it from a general .c file:\n";
print STDERR "cpp -ansi -U__GNUC__ old.c > new.c\n";
exit(EXIT_FAILURE);
}
my $source = do { local $/; <DATA> };
my $grammar = Marpa::R2::Scanless::G->new(
{
bless_package => 'C::AST',
source => \$source
}
);
my $input = read_file($file);
my $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } );
if ( ! defined eval { $recce->read(\$input); 1 }) {
my $eval_error = "$@";
chomp $eval_error;
die show_last_expression(), "\n", $eval_error, "\n";
}
my $value_ref = $recce->value();
if ( ! defined $value_ref ) {
die show_last_expression(), "\n",
"No parse was found, after reading the entire input\n";
}
print STDERR Dumper($value_ref);
exit(EXIT_SUCCESS);
############################
# show_last_expression
############################
sub show_last_expression {
my ( $start, $end ) = $recce->last_completed_range('translationUnit');
return 'No expression was successfully parsed' if not defined $start;
my $last_expression = $recce->range_to_string( $start, $end );
return "Last expression successfully parsed was: $last_expression";
}
__DATA__
####################################
# Defaults
####################################
:default ::= action => [values] bless => ::lhs
#####################################################################
## 2011 ISO C, as of http://www.quut.com/c/ANSI-C-grammar-l.html
## http://www.quut.com/c/ANSI-C-grammar-y-2011.html
#####################################################################
######################################################################
## G1 (grammar), c.f. http://www.quut.com/c/ANSI-C-grammar-y-2011.html
######################################################################
:start ::= translationUnit
primaryExpression
::= IDENTIFIER
| constant
| string
| '(' expression ')'
| genericSelection
# I_CONSTANT: includes character_constant
# ENUMERATION_CONSTANT : after it has been defined as such
constant
::= I_CONSTANT
| F_CONSTANT
# | ENUMERATION_CONSTANT
| IDENTIFIER
# before it has been defined as such
enumerationConstant
::= IDENTIFIER
string
::= STRING_LITERAL
| FUNC_NAME
genericSelection
::= GENERIC '(' assignmentExpression ',' genericAssocList ')'
genericAssocList
::= genericAssociation
| genericAssocList ',' genericAssociation
genericAssociation
::= typeName ':' assignmentExpression
| DEFAULT ':' assignmentExpression
postfixExpression
::= primaryExpression
| postfixExpression LBRACKET expression RBRACKET
| postfixExpression '(' ')'
| postfixExpression '(' argumentExpressionList ')'
| postfixExpression '.' IDENTIFIER
| postfixExpression PTR_OP IDENTIFIER
| postfixExpression INC_OP
| postfixExpression DEC_OP
| '(' typeName ')' LCURLY initializerList RCURLY
| '(' typeName ')' LCURLY initializerList ',' RCURLY
argumentExpressionList
::= assignmentExpression
| argumentExpressionList ',' assignmentExpression
unaryExpression
::= postfixExpression
| INC_OP unaryExpression
| DEC_OP unaryExpression
| unaryOperator castExpression
| SIZEOF unaryExpression
| SIZEOF '(' typeName ')'
| ALIGNOF '(' typeName ')'
unaryOperator
::= '&'
| '*'
| '+'
| '-'
| '~'
| '!'
castExpression
::= unaryExpression
| '(' typeName ')' castExpression
multiplicativeExpression
::= castExpression
| multiplicativeExpression '*' castExpression
| multiplicativeExpression '/' castExpression
| multiplicativeExpression '%' castExpression
additiveExpression
::= multiplicativeExpression
| additiveExpression '+' multiplicativeExpression
| additiveExpression '-' multiplicativeExpression
shiftExpression
::= additiveExpression
| shiftExpression LEFT_OP additiveExpression
| shiftExpression RIGHT_OP additiveExpression
relationalExpression
::= shiftExpression
| relationalExpression '<' shiftExpression
| relationalExpression '>' shiftExpression
| relationalExpression LE_OP shiftExpression
| relationalExpression GE_OP shiftExpression
equalityExpression
::= relationalExpression
| equalityExpression EQ_OP relationalExpression
| equalityExpression NE_OP relationalExpression
andExpression
::= equalityExpression
| andExpression '&' equalityExpression
exclusiveOrExpression
::= andExpression
| exclusiveOrExpression '^' andExpression
inclusiveOrExpression
::= exclusiveOrExpression
| inclusiveOrExpression '|' exclusiveOrExpression
logicalAndExpression
::= inclusiveOrExpression
| logicalAndExpression AND_OP inclusiveOrExpression
logicalOrExpression
::= logicalAndExpression
| logicalOrExpression OR_OP logicalAndExpression
conditionalExpression
::= logicalOrExpression
| logicalOrExpression '?' expression ':' conditionalExpression
assignmentExpression
::= conditionalExpression
| unaryExpression assignmentOperator assignmentExpression
assignmentOperator
::= '='
| MUL_ASSIGN
| DIV_ASSIGN
| MOD_ASSIGN
| ADD_ASSIGN
| SUB_ASSIGN
| LEFT_ASSIGN
| RIGHT_ASSIGN
| AND_ASSIGN
| XOR_ASSIGN
| OR_ASSIGN
expression
::= assignmentExpression
| expression ',' assignmentExpression
# with constraints
constantExpression
::= conditionalExpression
declaration
::= declarationSpecifiers ';'
| declarationSpecifiers initDeclaratorList ';'
| staticAssertDeclaration
declarationSpecifiers
::= storageClassSpecifier declarationSpecifiers
| storageClassSpecifier
| typeSpecifier declarationSpecifiers
| typeSpecifier
| typeQualifier declarationSpecifiers
| typeQualifier
| functionSpecifier declarationSpecifiers
| functionSpecifier
| alignmentSpecifier declarationSpecifiers
| alignmentSpecifier
initDeclaratorList
::= initDeclarator
| initDeclaratorList ',' initDeclarator
initDeclarator
::= declarator '=' initializer
| declarator
# TYPEDEF: identifiers must be flagged as TYPEDEF_NAME
storageClassSpecifier
::= TYPEDEF
| EXTERN
| STATIC
| THREAD_LOCAL
| AUTO
| REGISTER
# IMAGINARY: non-mandated extension
# TYPEDEF_NAME : after it has been defined as such
typeSpecifier
::= VOID
| CHAR
| SHORT
| INT
| LONG
| FLOAT
| DOUBLE
| SIGNED
| UNSIGNED
| BOOL
| COMPLEX
| IMAGINARY
| atomicTypeSpecifier
| structOrUnionSpecifier
| enumSpecifier
# | TYPEDEF_NAME
| IDENTIFIER
structOrUnionSpecifier
::= structOrUnion LCURLY structDeclarationList RCURLY
| structOrUnion IDENTIFIER LCURLY structDeclarationList RCURLY
| structOrUnion IDENTIFIER
structOrUnion
::= STRUCT
| UNION
structDeclarationList
::= structDeclaration
| structDeclarationList structDeclaration
# specifierQualifierList ';' : for anonymous struct/union
structDeclaration
::= specifierQualifierList ';'
| specifierQualifierList structDeclaratorList ';'
| staticAssertDeclaration
specifierQualifierList
::= typeSpecifier specifierQualifierList
| typeSpecifier
| typeQualifier specifierQualifierList
| typeQualifier
structDeclaratorList
::= structDeclarator
| structDeclaratorList ',' structDeclarator
structDeclarator
::= ':' constantExpression
| declarator ':' constantExpression
| declarator
enumSpecifier
::= ENUM LCURLY enumeratorList RCURLY
| ENUM LCURLY enumeratorList ',' RCURLY
| ENUM IDENTIFIER LCURLY enumeratorList RCURLY
| ENUM IDENTIFIER LCURLY enumeratorList ',' RCURLY
| ENUM IDENTIFIER
enumeratorList
::= enumerator
| enumeratorList ',' enumerator
# identifiers must be flagged as ENUMERATION_CONSTANT
enumerator
::= enumerationConstant '=' constantExpression
| enumerationConstant
atomicTypeSpecifier
::= ATOMIC '(' typeName ')'
typeQualifier
::= CONST
| RESTRICT
| VOLATILE
| ATOMIC
functionSpecifier
::= INLINE
| NORETURN
alignmentSpecifier
::= ALIGNAS '(' typeName ')'
| ALIGNAS '(' constantExpression ')'
declarator
::= pointer directDeclarator
| directDeclarator
directDeclarator
::= IDENTIFIER
| '(' declarator ')'
| directDeclarator LBRACKET RBRACKET
| directDeclarator LBRACKET '*' RBRACKET
| directDeclarator LBRACKET STATIC typeQualifierList assignmentExpression RBRACKET
| directDeclarator LBRACKET STATIC assignmentExpression RBRACKET
| directDeclarator LBRACKET typeQualifierList '*' RBRACKET
| directDeclarator LBRACKET typeQualifierList STATIC assignmentExpression RBRACKET
| directDeclarator LBRACKET typeQualifierList assignmentExpression RBRACKET
| directDeclarator LBRACKET typeQualifierList RBRACKET
| directDeclarator LBRACKET assignmentExpression RBRACKET
| directDeclarator '(' parameterTypeList ')'
| directDeclarator '(' ')'
| directDeclarator '(' identifierList ')'
pointer
::= '*' typeQualifierList pointer
| '*' typeQualifierList
| '*' pointer
| '*'
typeQualifierList
::= typeQualifier
| typeQualifierList typeQualifier
parameterTypeList
::= parameterList ',' ELLIPSIS
| parameterList
parameterList
::= parameterDeclaration
| parameterList ',' parameterDeclaration
parameterDeclaration
::= declarationSpecifiers declarator
| declarationSpecifiers abstractDeclarator
| declarationSpecifiers
identifierList
::= IDENTIFIER
| identifierList ',' IDENTIFIER
typeName
::= specifierQualifierList abstractDeclarator
| specifierQualifierList
abstractDeclarator
::= pointer directAbstractDeclarator
| pointer
| directAbstractDeclarator
directAbstractDeclarator
::= '(' abstractDeclarator ')'
| LBRACKET RBRACKET
| LBRACKET '*' RBRACKET
| LBRACKET STATIC typeQualifierList assignmentExpression RBRACKET
| LBRACKET STATIC assignmentExpression RBRACKET
| LBRACKET typeQualifierList STATIC assignmentExpression RBRACKET
| LBRACKET typeQualifierList assignmentExpression RBRACKET
| LBRACKET typeQualifierList RBRACKET
| LBRACKET assignmentExpression RBRACKET
| directAbstractDeclarator LBRACKET RBRACKET
| directAbstractDeclarator LBRACKET '*' RBRACKET
| directAbstractDeclarator LBRACKET STATIC typeQualifierList assignmentExpression RBRACKET
| directAbstractDeclarator LBRACKET STATIC assignmentExpression RBRACKET
| directAbstractDeclarator LBRACKET typeQualifierList assignmentExpression RBRACKET
| directAbstractDeclarator LBRACKET typeQualifierList STATIC assignmentExpression RBRACKET
| directAbstractDeclarator LBRACKET typeQualifierList RBRACKET
| directAbstractDeclarator LBRACKET assignmentExpression RBRACKET
| '(' ')'
| '(' parameterTypeList ')'
| directAbstractDeclarator '(' ')'
| directAbstractDeclarator '(' parameterTypeList ')'
initializer
::= LCURLY initializerList RCURLY
| LCURLY initializerList ',' RCURLY
| assignmentExpression
initializerList
::= designation initializer
| initializer
| initializerList ',' designation initializer
| initializerList ',' initializer
designation
::= designatorList '='
designatorList
::= designator
| designatorList designator
designator
::= LBRACKET constantExpression RBRACKET
| '.' IDENTIFIER
staticAssertDeclaration
::= STATIC_ASSERT '(' constantExpression ',' STRING_LITERAL ')' ';'
statement
::= labeledStatement
| compoundStatement
| expressionStatement
| selectionStatement
| iterationStatement
| jumpStatement
labeledStatement
::= IDENTIFIER ':' statement
| CASE constantExpression ':' statement
| DEFAULT ':' statement
compoundStatement
::= LCURLY RCURLY
| LCURLY blockItemList RCURLY
blockItemList
::= blockItem
| blockItemList blockItem
blockItem
::= declaration
| statement
expressionStatement
::= ';'
| expression ';'
selectionStatement
::= IF '(' expression ')' statement ELSE statement
| IF '(' expression ')' statement
| SWITCH '(' expression ')' statement
iterationStatement
::= WHILE '(' expression ')' statement
| DO statement WHILE '(' expression ')' ';'
| FOR '(' expressionStatement expressionStatement ')' statement
| FOR '(' expressionStatement expressionStatement expression ')' statement
| FOR '(' declaration expressionStatement ')' statement
| FOR '(' declaration expressionStatement expression ')' statement
jumpStatement
::= GOTO IDENTIFIER ';'
| CONTINUE ';'
| BREAK ';'
| RETURN ';'
| RETURN expression ';'
translationUnit
::= externalDeclaration
| translationUnit externalDeclaration
externalDeclaration
::= functionDefinition
| declaration
functionDefinition
::= declarationSpecifiers declarator declarationList compoundStatement
| declarationSpecifiers declarator compoundStatement
declarationList
::= declaration
| declarationList declaration
###############################################################
# G0 (tokens), c.f. http://www.quut.com/c/ANSI-C-grammar-l.html
###############################################################
O ~ [0-7]
D ~ [0-9]
D_many ~ D+
NZ ~ [1-9]
L ~ [a-zA-Z_]
A ~ [a-zA-Z_0-9]
H ~ [a-fA-F0-9]
xX ~ [xX]
HP ~ '0' xX
Ee ~ [Ee]
S ~ [-+]
E ~ Ee S D_many
| Ee D_many
Pp ~ [Pp]
P ~ Pp S D_many
| Pp D_many
FS ~ [fFlL]
Uu ~ [Uu]
l ~ 'l'
L ~ 'L'
ll ~ 'll'
LL ~ 'LL'
Lvariant ~ l|L|ll|LL
IS ~ Uu Lvariant
| Uu
| Lvariant Uu
| Lvariant
CP ~ [uUL]
SP ~ 'u8' | CP
XDIGITS ~ [a-fA-F0-9]+
HEXA ~ 'x' XDIGITS
ODIGIT ~ [0-7]
OCTAL ~ ODIGIT ODIGIT ODIGIT
| ODIGIT ODIGIT
| ODIGIT
RESERVED ~ ['"\?\\abfnrtv]
BACKSLASH ~ '\'
ES ~ BACKSLASH RESERVED
| BACKSLASH OCTAL
| BACKSLASH HEXA
WS ~ [ \t\v\n\f]
#
## Discard of a C comment, c.f. https://gist.github.com/jeffreykegler/5015057
# --------------------------------------------------------------------------
<C style comment> ~ '/*' <comment interior> '*/'
<comment interior> ~
<optional non stars>
<optional star prefixed segments>
<optional pre final stars>
<optional non stars> ~ [^*]*
<optional star prefixed segments> ~ <star prefixed segment>*
<star prefixed segment> ~ <stars> [^/*] <optional star free text>
<stars> ~ [*]+
<optional star free text> ~ [^*]*
<optional pre final stars> ~ [*]*
:discard ~ <C style comment>
#
## Discard of a C++ comment
# ------------------------
<Cplusplus style comment> ~ '//' <Cplusplus comment interior>
<Cplusplus comment interior> ~ [^\n]*
:discard ~ <Cplusplus style comment>
AUTO ~ 'auto'
BREAK ~ 'break'
CASE ~ 'case'
CHAR ~ 'char'
CONST ~ 'const'
CONTINUE ~ 'continue'
DEFAULT ~ 'default'
DO ~ 'do'
DOUBLE ~ 'double'
ELSE ~ 'else'
ENUM ~ 'enum'
EXTERN ~ 'extern'
FLOAT ~ 'float'
FOR ~ 'for'
GOTO ~ 'goto'
IF ~ 'if'
INLINE ~ 'inline'
INT ~ 'int'
LONG ~ 'long'
REGISTER ~ 'register'
RESTRICT ~ 'restrict'
RETURN ~ 'return'
SHORT ~ 'short'
SIGNED ~ 'signed'
SIZEOF ~ 'sizeof'
STATIC ~ 'static'
STRUCT ~ 'struct'
SWITCH ~ 'switch'
TYPEDEF ~ 'typedef'
UNION ~ 'union'
UNSIGNED ~ 'unsigned'
VOID ~ 'void'
VOLATILE ~ 'volatile'
WHILE ~ 'while'
ALIGNAS ~ '_Alignas'
ALIGNOF ~ '_Alignof'
ATOMIC ~ '_Atomic'
BOOL ~ '_Bool'
COMPLEX ~ '_Complex'
GENERIC ~ '_Generic'
IMAGINARY ~ '_Imaginary'
NORETURN ~ '_Noreturn'
STATIC_ASSERT ~ '_Static_assert'
THREAD_LOCAL ~ '_Thread_local'
FUNC_NAME ~ '__func__'
# Here we should put contextual tokens like TYPEDEF_NAME or ENUMERATION_CONSTANT
# Until this is known any typedef or enumeration is considered matching the same
# thing as an identifier
A_any ~ A*
IDENTIFIER ~ L A_any
# TYPEDEF_NAME ~ L A_any
# ENUMERATION_CONSTANT ~ L A_any
H_many ~ H+
I_CONSTANT ~ HP H_many IS
| HP H_many
D_any ~ D*
I_CONSTANT ~ NZ D_any IS
| NZ D_any
O_any ~ O*
I_CONSTANT ~ '0' O_any IS
| '0' O_any
ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE ~ [^'\\\n]
ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE_OR_ES ~ ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE
| ES
ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE_OR_ES_many ~ ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE_OR_ES+
SQUOTE ~ [']
I_CONSTANT ~ CP SQUOTE ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE_OR_ES_many SQUOTE
| SQUOTE ANYTHING_EXCEPT_SQUOTE_BACKSLASH_NEWLINE_OR_ES_many SQUOTE
DOT ~ '.'
F_CONSTANT ~ D_many E FS
| D_many E
F_CONSTANT ~ D_any DOT D_many E FS
| D_any DOT D_many E
| D_any DOT D_many FS
| D_any DOT D_many
F_CONSTANT ~ D_many DOT E FS
| D_many DOT E
| D_many DOT FS
| D_many DOT
F_CONSTANT ~ HP H_many P FS
| HP H_many P
H_any ~ H*
F_CONSTANT ~ HP H_any DOT H_many P FS
| HP H_any DOT H_many P
F_CONSTANT ~ HP H_many DOT P FS
| HP H_many DOT P
WS_any ~ WS*
ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE ~ [^"\\\n]
ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE_OR_ES ~ ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE
| ES
ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE_OR_ES_any ~ ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE_OR_ES*
DQUOTE ~ '"'
STRING_LITERAL_UNIT ~ SP DQUOTE ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE_OR_ES_any DQUOTE WS_any
| DQUOTE ANYTHING_EXCEPT_DQUOTE_BACKSLASH_NEWLINE_OR_ES_any DQUOTE WS_any
STRING_LITERAL ~ STRING_LITERAL_UNIT+
ELLIPSIS ~ '...'
RIGHT_ASSIGN ~ '>>='
LEFT_ASSIGN ~ '<<='
ADD_ASSIGN ~ '+='
SUB_ASSIGN ~ '-='
MUL_ASSIGN ~ '*='
DIV_ASSIGN ~ '/='
MOD_ASSIGN ~ '%='
AND_ASSIGN ~ '&='
XOR_ASSIGN ~ '^='
OR_ASSIGN ~ '|='
RIGHT_OP ~ '>>'
LEFT_OP ~ '<<'
INC_OP ~ '++'
DEC_OP ~ '--'
PTR_OP ~ '->'
AND_OP ~ '&&'
OR_OP ~ '||'
LE_OP ~ '<='
GE_OP ~ '>='
EQ_OP ~ '=='
NE_OP ~ '!='
LCURLY ~ '{' | '<%'
RCURLY ~ '}' | '%>'
LBRACKET ~ '[' | '<:'
RBRACKET ~ ']' | ':>'
:discard ~ WS
<hash comment> ~ <terminated hash comment> | <unterminated final hash comment>
<terminated hash comment> ~ '#' <hash comment body> <vertical space char>
<unterminated final hash comment> ~ '#' <hash comment body>
<hash comment body> ~ <hash comment char>*
<vertical space char> ~ [\x{A}\x{B}\x{C}\x{D}\x{2028}\x{2029}]
<hash comment char> ~ [^\x{A}\x{B}\x{C}\x{D}\x{2028}\x{2029}]
:discard ~ <hash comment>
# This will eat any extension to the grammar, for example __attribute__ from GNU C
# BAD_CHARACTER ~ [.]
# :discard ~ BAD_CHARACTER
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment