Skip to content

Instantly share code, notes, and snippets.

@tron1point0
Created May 8, 2014 20:11
Show Gist options
  • Save tron1point0/fb9691a611163e8bbd00 to your computer and use it in GitHub Desktop.
Save tron1point0/fb9691a611163e8bbd00 to your computer and use it in GitHub Desktop.
Java tokenizer/lexer/parser
:- module(parse,[parse/3]).
:- use_module(library(utils)).
% parse(-Units)//
%
% Entry point of the grammar. Consumes a list of tokens of the form
% k(Keyword), i(Identifier), s(Separator), o(Operator) and yields the
% list of compilation units found in it.
%
% Fix: the body used to call the misspelled `complation_units`, which is
% not defined anywhere, so every parse raised an existence error.
parse(Units) --> compilation_units(Units).
% compilation_units(-Units)//
%
% Zero or more compilation units. The recursive clause is tried first,
% so the longest sequence is produced before shorter ones.
compilation_units([Unit|Rest]) -->
    compilation_unit(Unit),
    compilation_units(Rest).
compilation_units([]) --> [].
% compilation_unit(-unit(Items))//
%
% One compilation unit: an optional package declaration, any number of
% import declarations and any number of type declarations. Items is the
% concatenation of those three parts, in that order.
%
% Fix: Package is a single package(_) term (or still unbound when the
% declaration is absent). The old code defaulted it to [] but otherwise
% handed the bare term to append/2, which fails on a non-list, so any
% file that had a package declaration could not be parsed.
compilation_unit(unit(Unit)) -->
    optional(package_declaration(Package)),
    list_of(import_declaration,Imports),
    list_of(type_declaration,Types),
    {   (   var(Package)
        ->  Packages = []
        ;   Packages = [Package]
        ),
        append([Packages,Imports,Types],Unit)
    }.
/***********
* Package
***********/
% package_declaration(-package(Parts))//
%
% Parses `<modifiers> package a.b.c ;`. Parts is the list of modifiers
% followed by the identifiers of the dotted package name.
package_declaration(package(Parts)) -->
    list_of(package_modifier,Mods),
    [k(package)],
    [i(First)],
    list_of(package_identifier,More),
    [s(';')],
    { append([Mods,[First|More]],Parts) }.
% One `.name` continuation segment of a dotted package name.
package_identifier(Name) -->
    [s('.')],
    [i(Name)].
% A package modifier is an annotation, wrapped as modifier(_).
package_modifier(modifier(Ann)) -->
    annotation(Ann).
/***********
* Imports
***********/
% import_declaration(-import(Decl))//
%
% Any of the four import forms, tried in this order: single type,
% on-demand type, single static, on-demand static.
import_declaration(import(Decl)) -->
    (   single_type_import_declaration(Decl)
    ;   type_import_declaration(Decl)
    ;   single_static_import_declaration(Decl)
    ;   static_import_declaration(Decl)
    ).
% `import a.b.C;` -- Name is whatever type_name//1 returns.
single_type_import_declaration(Name) -->
    [k(import)],
    type_name(Name),
    [s(';')].
% `import a.b.*;` -- Name is the dotted prefix followed by '*'.
type_import_declaration(Name) -->
    [k(import)],
    package_or_type_name(Prefix),
    [s('.')], [o('*')], [s(';')],
    { append(Prefix,['*'],Name) }.
% `import static a.B.member;` -- Name is the type name followed by the
% member identifier.
single_static_import_declaration(Name) -->
    [k(import)], [k(static)],
    type_name(Prefix),
    [s('.')], [i(Member)], [s(';')],
    { append(Prefix,[Member],Name) }.
% `import static a.B.*;` -- Name is the type name followed by '*'.
static_import_declaration(Name) -->
    [k(import)], [k(static)],
    type_name(Prefix),
    [s('.')], [o('*')], [s(';')],
    { append(Prefix,['*'],Name) }.
/***********
* Types
***********/
% type_declaration(-Decl)//
%
% A class declaration is passed through unchanged; an interface
% declaration is wrapped as interface(_).
type_declaration(Class) -->
    class_declaration(Class).
type_declaration(interface(Iface)) -->
    interface_declaration(Iface).
% type_parameter_list(-Params)//
%
% One or more type parameters separated by commas, e.g. `K, V`.
%
% Fix: the tail used to be list_of(type_parameter, Ps), which expects
% the parameters to follow each other with no `,` between them. The
% private helper below consumes the separator, following the same
% pattern as '$interface_type'//1 and '$variable_declarator'//1.
type_parameter_list([P|Ps]) -->
    type_parameter(P),
    list_of('$type_parameter',Ps).
% A `, <type parameter>` continuation.
'$type_parameter'(P) --> [s(',')], type_parameter(P).
% type_parameter(-type_parameter(Name,[m-Modifiers,b-Bound]))//
%
% Annotations, the parameter name and an optional bound. When no bound
% is written, Bound defaults to [extends('Object')].
% NOTE(review): a parsed bound is an extends(_) term while the default
% is a one-element list -- confirm which shape consumers expect.
type_parameter(type_parameter(Name,[m-Modifiers,b-Bound])) -->
    list_of(type_parameter_modifier,Modifiers),
    [i(Name)],
    optional(type_bound(Bound)),
    { default_value(Bound,[extends('Object')]) }.
% A type-parameter modifier is simply an annotation.
type_parameter_modifier(Ann) -->
    annotation(Ann).
% type_bound(-extends(Bound))//
%
% Either `extends <type variable>` or `extends <class/interface type>`
% followed by any number of `& <interface type>` bounds; in the second
% form Bound is a list.
type_bound(extends(Var)) -->
    [k(extends)],
    type_variable(Var).
type_bound(extends([First|More])) -->
    [k(extends)],
    class_or_interface_type(First),
    list_of(additional_bound,More).
% One `& <interface type>` continuation of a bound.
additional_bound(Iface) -->
    [o('&')],
    interface_type(Iface).
/***********
* Classes
***********/
% class_declaration(-Decl)//
%
% A normal class is passed through unchanged; an enum declaration is
% wrapped as enum(_).
class_declaration(Class) -->
    normal_class_declaration(Class).
class_declaration(enum(Enum)) -->
    enum_declaration(Enum).
% normal_class_declaration(-class(Name,Attrs,Body))//
%
% Parses `<modifiers> class Name [<params>] [extends ...]
% [implements ...] { ... }`. Attrs is the list
% [m-Modifiers, t-Params, s-Super, i-Interfaces]; absent optional parts
% default to [], 'Object' and [] respectively.
normal_class_declaration(class(Name,[m-Modifiers,t-Params,s-Super,i-Interfaces],Body)) -->
    list_of(class_modifier,Modifiers),
    [k(class)],
    [i(Name)],
    optional(type_parameters(Params)),
    optional(superclass(Super)),
    optional(superinterfaces(Interfaces)),
    class_body(Body),
    % Fill in whatever the optional parts left unbound.
    {   default_value(Interfaces,[]),
        default_value(Super,'Object'),
        default_value(Params,[])
    }.
% class_modifier(-Modifier)//
%
% Either an annotation or a keyword token whose keyword is listed in
% is_class_modifier/1.
class_modifier(Mod) -->
    (   annotation(Mod)
    ;   { is_class_modifier(Mod) },
        [k(Mod)]
    ).
% type_parameters(-Params)//
%
% A type parameter list enclosed in angle brackets.
%
% Fix: the brackets were matched as separator tokens s('<') / s('>'),
% but '<' and '>' are listed under is_operator/1 (not is_separator/1),
% so the token stream carries them as o('<') / o('>') and the old rule
% could never match.
type_parameters(Ps) --> [o('<')], type_parameter_list(Ps), [o('>')].
% `extends <class type>`.
superclass(Class) -->
    [k(extends)],
    class_type(Class).
% `implements <interface type list>`.
superinterfaces(Ifaces) -->
    [k(implements)],
    interface_type_list(Ifaces).
% One or more interface types separated by commas.
interface_type_list([First|More]) -->
    interface_type(First),
    list_of('$interface_type',More).
% A `, <interface type>` continuation.
'$interface_type'(Iface) -->
    [s(',')],
    interface_type(Iface).
% class_body(-Declarations)//
%
% Zero or more body declarations between braces.
class_body(Decls) -->
    [s('{')],
    list_of(class_body_declaration,Decls),
    [s('}')].
% A body declaration is a member, an instance initializer, a static
% initializer or a constructor, tried in that order.
class_body_declaration(Decl) -->
    (   class_member_declaration(Decl)
    ;   instance_initializer(Decl)
    ;   static_initializer(Decl)
    ;   constructor_declaration(Decl)
    ).
% class_member_declaration(-Member)//
%
% A field, a method (wrapped as method(_)), a nested class, a nested
% interface, or a stray `;` (returned as the atom ';').
class_member_declaration(Field) -->
    field_declaration(Field).
class_member_declaration(method(Method)) -->
    method_declaration(Method).
class_member_declaration(Class) -->
    class_declaration(Class).
class_member_declaration(Iface) -->
    interface_declaration(Iface).
class_member_declaration(';') -->
    [s(';')].
% field_declaration(-field(Type,[m-Modifiers],Declarators))//
%
% Parses `<modifiers> <type> <declarator>, ... ;`.
field_declaration(field(Type,[m-Modifiers],Declarators)) -->
    list_of(field_modifier,Modifiers),
    unann_type(Type),
    variable_declarator_list(Declarators),
    [s(';')].
% Either an annotation or a keyword token whose keyword is listed in
% is_field_modifier/1.
field_modifier(Mod) -->
    (   annotation(Mod)
    ;   { is_field_modifier(Mod) },
        [k(Mod)]
    ).
% variable_declarator_list(-Declarators)//
%
% One or more variable declarators separated by commas.
variable_declarator_list([First|More]) -->
    variable_declarator(First),
    list_of('$variable_declarator',More).
% A `, <declarator>` continuation.
'$variable_declarator'(Decl) -->
    [s(',')],
    variable_declarator(Decl).
% variable_declarator(-(Id=Value))//
%
% A declarator id with an optional `= <initializer>`. Value is the atom
% `null` when no initializer is written.
%
% Fix: the optional part used to be written as the list
% [o('='), variable_initializer(V)]. A list in a grammar body is a run
% of terminals, so this never invoked variable_initializer//1. The
% sequence now lives in a private helper nonterminal that optional//1
% can call.
variable_declarator(D=V) -->
    variable_declarator_id(D),
    optional('$variable_initializer'(V)),
    { default_value(V,null) }.
% `= <initializer>`.
'$variable_initializer'(V) --> [o('=')], variable_initializer(V).
% variable_declarator_id(-Id)//
%
% A plain identifier yields single(Name); an identifier followed by
% array dimensions yields array(Name,Dims).
variable_declarator_id(single(Name)) -->
    [i(Name)].
variable_declarator_id(array(Name,Dims)) -->
    [i(Name)],
    dims(Dims).
% An initializer is an expression or, failing that, an array
% initializer.
variable_initializer(Init) -->
    (   expression(Init)
    ;   array_initializer(Init)
    ).
/**************
* Interfaces
**************/
% annotation(-annotation(A))//
%
% Only marker annotations (`@Name`) are recognised at present; the two
% alternatives below are disabled.
% annotation(annotation(A)) --> normal_annotation(A).
annotation(annotation(Ann)) -->
    marker_annotation(Ann).
% annotation(annotation(A)) --> single_element_annotation(A).
% `@` followed by a type name.
marker_annotation(Name) -->
    [s('@')],
    type_name(Name).
% type_name(-Parts)//
%
% A dotted name such as `java.util.List`, returned as the flat list of
% its identifiers ([java,util,'List']). The shortest match is offered
% first; longer ones are found on backtracking.
%
% Fix: the recursive clause used to produce the malformed structure
% [[a]|b] (a list as head, an atom as tail), which the append/3 calls in
% the import rules cannot handle, and it delegated to a left-recursive
% helper. It is now right-recursive and self-contained.
type_name([T]) --> [i(T)].
type_name([T|Ts]) --> [i(T), s('.')], type_name(Ts).
% package_or_type_name(-Parts)//
%
% A dotted package or type name, returned as the flat list of its
% identifiers. The shortest match is offered first; longer ones are
% found on backtracking.
%
% Fix: the recursive clause called itself before consuming any token
% (left recursion), so it recursed without bound once the base clause
% was exhausted, and it built [[a]|b] instead of [a,b]. It is now
% right-recursive.
package_or_type_name([T]) --> [i(T)].
package_or_type_name([T|Ts]) --> [i(T), s('.')], package_or_type_name(Ts).
/*******************
* Class modifiers
*******************/
% is_class_modifier(?Keyword)
%
% Keywords accepted as class modifiers by class_modifier//1.
is_class_modifier(public).
is_class_modifier(protected).
is_class_modifier(private).
is_class_modifier(abstract).
is_class_modifier(static).
is_class_modifier(final).
is_class_modifier(strictfp).
% is_field_modifier(?Keyword)
%
% Keywords accepted as field modifiers by field_modifier//1.
is_field_modifier(public).
is_field_modifier(protected).
is_field_modifier(private).
is_field_modifier(static).
is_field_modifier(final).
is_field_modifier(transient).
is_field_modifier(volatile).
% vim: set filetype=prolog:
:- module(tokenize,[tokenize/2]).
:- use_module(library(utils)).
% tokenize(+Chars, -Elements)
%
% Splits the whole code list Chars into input elements and keeps only
% the tokens; white space and comments are dropped.
tokenize(Chars,Elements) :-
    input(InputElements,Chars,[]),
    only_tokens(InputElements,Elements).
% only_tokens(+InputElements, -Tokens)
%
% Keeps the payload of every t(_) element and discards everything else.
only_tokens([t(Token)|Elements],[Token|Tokens]) :-
    !,
    only_tokens(Elements,Tokens).
only_tokens([_Other|Elements],Tokens) :-
    !,
    only_tokens(Elements,Tokens).
only_tokens([],[]).
% input(-Elements)//
%
% A sequence of input elements, optionally followed by a SUB
% (code 26) character.
input(Elements) -->
    list_of(input_element,Elements).
input(Elements) -->
    list_of(input_element,Elements),
    sub.
% input_element(-Element)//
%
% White space yields s(_), a comment c(_) and a token t(_). Each clause
% commits to its first match.
input_element(s(Space)) -->
    white_space(Space),
    !.
input_element(c(Text)) -->
    comment(Text),
    !.
input_element(t(Token)) -->
    token(Token),
    !.
% white_space(-Kind)//
%
% Space, horizontal tab, form feed, or any line terminator (whose kind
% is reported by line_terminator//1).
white_space(sp) -->
    sp.
white_space(ht) -->
    ht.
white_space(ff) -->
    ff.
white_space(Kind) -->
    line_terminator(Kind).
% comment(-Text)//
%
% A traditional comment or, failing that, an end-of-line comment.
comment(Text) -->
    (   traditional_comment(Text)
    ;   end_of_line_comment(Text)
    ).
% token(-Token)//
%
% Tries, in order: identifier i(_), literal l(_), keyword k(_),
% separator s(_), operator o(_).
token(i(Name)) -->
    identifier(Name).
token(l(Value)) -->
    literal(Value).
token(k(Word)) -->
    keyword(Word).
token(s(Sep)) -->
    separator(Sep).
token(o(Op)) -->
    operator(Op).
/**************
* Comments
**************/
% traditional_comment(-String)//
%
% A slash-star comment. String is the atom built from the characters
% that comment_tail//1 collects.
traditional_comment(String) -->
    input_character('/'),
    input_character('*'),
    comment_tail(Chars),
    {   breakpoint(Chars),              % no-op debugging hook
        atomic_list_concat(Chars,String)
    }.
% comment_tail(-Chars)//
%
% Body of a traditional comment up to and including the closing
% star-slash. A star switches to comment_tail_star//1; the star itself
% is not added to Chars.
comment_tail(Chars) -->
    input_character('*'),
    comment_tail_star(Chars).
comment_tail([Char|Chars]) -->
    not_star(Char),
    comment_tail(Chars).
% comment_tail_star(-Chars)//
%
% State after at least one star: a slash ends the comment, another star
% stays in this state, anything else returns to comment_tail//1.
comment_tail_star([]) -->
    input_character('/').
comment_tail_star(Chars) -->
    input_character('*'),
    comment_tail_star(Chars).
comment_tail_star([Char|Chars]) -->
    not_star_not_slash(Char),
    comment_tail(Chars).
% Any input character except '*'; a line terminator is reported as
% '\n'.
not_star(Char) -->
    input_character(Char),
    { Char \= '*' }.
not_star('\n') -->
    line_terminator(_).
% Any input character except '*' and '/'; a line terminator is
% reported as '\n'.
not_star_not_slash(Char) -->
    input_character(Char),
    { Char \= '*' },
    { Char \= '/' }.
not_star_not_slash('\n') -->
    line_terminator(_).
% end_of_line_comment(-String)//
%
% Two slashes followed by input characters (which exclude line
% terminators), collected into the atom String.
end_of_line_comment(String) -->
    input_character('/'),
    input_character('/'),
    string_of(input_character,String).
/***************
* Keywords
***************/
% keyword(-Keyword)//
%
% Reads a whole identifier-shaped word and succeeds when that word is
% listed in is_keyword/1.
%
% Fix: the previous definition, word_of(is_keyword, E), accepted the
% first keyword that was a prefix of the input. Since `do` is listed
% before `double` (likewise `final`/`finally`, `int`/`interface`,
% `throw`/`throws`), the input `double` was tokenized as the keyword
% `do` followed by the identifier `uble`. The word is now read in full
% first, and committed with a cut exactly as identifier//1 does.
keyword(E) -->
    identifier_chars(E), !,
    { is_keyword(E) }.
/***************
* Separators
***************/
% separator(-Separator)//
%
% Matches the first entry of is_separator/1 found at the current
% position.
separator(Sep) -->
    word_of(is_separator,Sep).
/***************
* Operators
***************/
% operator(-Operator)//
%
% Matches the first entry of is_operator/1 found at the current
% position.
operator(Op) -->
    word_of(is_operator,Op).
/***************
* Literals
***************/
% literal(-Literal)//
%
% Literal is one of: a floating-point term as built by
% floating_point_literal//1, l(Int) for an integer with an l/L suffix,
% i(Int), b(Bool), c(Char), s(String) or the atom null.
%
% Fix: clause order. input_element//1 commits to the first token it
% finds, and the plain integer clause used to come first, so `1.5` was
% cut off after `1` and `12L` after `12`. The alternatives that consume
% more input (floating point, then suffixed integer) are now tried
% before the plain integer.
literal(E) --> floating_point_literal(E).
literal(l(E)) --> integer_literal(E), integer_type_suffix.
literal(i(E)) --> integer_literal(E).
literal(b(E)) --> boolean_literal(E).
literal(c(E)) --> character_literal(E).
literal(s(E)) --> string_literal(E).
literal(null) --> null_literal.
% integer_literal(-Value)//
%
% An integer in hexadecimal, binary, octal or decimal notation.
%
% Fix: clause order. Decimal used to be tried first, and dec_numeral//1
% accepts a lone `0`, so the leading zero of `0x1F`, `0b101` or `017`
% was taken as the complete literal. The prefixed notations are now
% tried before decimal.
integer_literal(E) --> hex_numeral(E).
integer_literal(E) --> bin_numeral(E).
integer_literal(E) --> oct_numeral(E).
integer_literal(E) --> dec_numeral(E).
% Decimal
% dec_numeral(-Value)//
%
% A decimal numeral: a non-zero digit optionally followed by
% underscores and/or further digits, or a lone `0`.
dec_numeral(Value) -->
    non_zero_digit(Lead),
    underscores,
    digits(Tail),
    { number_value(10,[Lead|Tail],Value) }.
dec_numeral(Value) -->
    non_zero_digit(Lead),
    digits(Tail),
    { number_value(10,[Lead|Tail],Value) }.
dec_numeral(Value) -->
    non_zero_digit(Value).
dec_numeral(0) -->
    input_character('0').
% non_zero_digit(-Weight)//
%
% A decimal digit other than zero; Weight is its numeric value.
non_zero_digit(Weight) -->
    input_character(Char),
    { char_type(Char,digit(Weight)) },
    { Weight \= 0 }.
% digits(-Digits)//
%
% One or more decimal digits, with underscores allowed between (never
% before or after) them. Digits holds the digit weights, with '_' atoms
% where underscores occurred. The longest match is produced first.
%
% Fix: the middle part used to be wrapped in optional//1, which cuts
% after its first (greedy) solution. That solution swallowed every
% remaining digit, the mandatory trailing digit(W) then failed, and the
% cut prevented a shorter middle part from being tried -- so only the
% one-digit clause ever succeeded and `123` was read as `12` then `3`.
% The clauses now mirror oct_digits//1 and bin_digits//1, which
% backtrack over the middle part.
digits([V|Vs]) -->
    digit(V),
    digits_and_underscores(Ws),
    digit(W),
    { append(Ws,[W],Vs) }.
digits([V,W]) --> digit(V), digit(W).
digits([V]) --> digit(V).
% A single decimal digit; Weight is its numeric value.
digit(Weight) -->
    input_character(Char),
    { char_type(Char,digit(Weight)) }.
% One or more digits/underscores, longest sequence first.
digits_and_underscores([Item|Items]) -->
    digit_or_underscore(Item),
    digits_and_underscores(Items).
digits_and_underscores([Item]) -->
    digit_or_underscore(Item).
% A digit weight, or the atom '_' for an underscore.
digit_or_underscore(Weight) -->
    digit(Weight).
digit_or_underscore('_') -->
    input_character('_').
% Hexadecimal
% hex_numeral(-Value)//
%
% `0x` or `0X` followed by hexadecimal digits.
hex_numeral(Value) -->
    input_character('0'),
    (   input_character('x')
    ;   input_character('X')
    ),
    hex_digits(Digits),
    { number_value(16,Digits,Value) }.
% hex_digits(-Digits)//
%
% One or more hexadecimal digits, with underscores allowed between
% them. Digits holds the digit weights, with '_' atoms where
% underscores occurred. The longest match is produced first.
%
% Fix: the middle part used to be wrapped in optional//1, which cuts
% after its first (greedy) solution; the trailing hex_digit(W) then
% always failed and only the one-digit clause could succeed. The
% clauses now mirror oct_digits//1 and bin_digits//1.
hex_digits([V|Vs]) -->
    hex_digit(V),
    hex_digits_and_underscores(Ws),
    hex_digit(W),
    { append(Ws,[W],Vs) }.
hex_digits([V,W]) --> hex_digit(V), hex_digit(W).
hex_digits([V]) --> hex_digit(V).
% One or more hex digits/underscores, longest sequence first.
hex_digits_and_underscores([Item|Items]) -->
    hex_digit_or_underscore(Item),
    hex_digits_and_underscores(Items).
hex_digits_and_underscores([Item]) -->
    hex_digit_or_underscore(Item).
% A hex digit weight, or the atom '_' for an underscore.
hex_digit_or_underscore(Weight) -->
    hex_digit(Weight).
hex_digit_or_underscore('_') -->
    input_character('_').
% Octal
% oct_numeral(-Value)//
%
% `0` followed by octal digits.
oct_numeral(Value) -->
    input_character('0'),
    oct_digits(Digits),
    { number_value(8,Digits,Value) }.
% One or more octal digits with underscores allowed between them;
% clauses cover three-or-more, exactly two, and one item.
oct_digits([First|Rest]) -->
    oct_digit(First),
    oct_digits_and_underscores(Middle),
    oct_digit(Last),
    { append(Middle,[Last],Rest) }.
oct_digits([First,Second]) -->
    oct_digit(First),
    oct_digit(Second).
oct_digits([Only]) -->
    oct_digit(Only).
% One or more octal digits/underscores, longest sequence first.
oct_digits_and_underscores([Item|Items]) -->
    oct_digit_or_underscore(Item),
    oct_digits_and_underscores(Items).
oct_digits_and_underscores([Item]) -->
    oct_digit_or_underscore(Item).
% An octal digit weight, or the atom '_' for an underscore.
oct_digit_or_underscore(Weight) -->
    oct_digit(Weight).
oct_digit_or_underscore('_') -->
    input_character('_').
% Binary
% bin_numeral(-Value)//
%
% `0b` or `0B` followed by binary digits.
bin_numeral(Value) -->
    input_character('0'),
    (   input_character('b')
    ;   input_character('B')
    ),
    bin_digits(Digits),
    { number_value(2,Digits,Value) }.
% One or more binary digits with underscores allowed between them;
% clauses cover three-or-more, exactly two, and one item.
bin_digits([First|Rest]) -->
    bin_digit(First),
    bin_digits_and_underscores(Middle),
    bin_digit(Last),
    { append(Middle,[Last],Rest) }.
bin_digits([First,Second]) -->
    bin_digit(First),
    bin_digit(Second).
bin_digits([Only]) -->
    bin_digit(Only).
% One or more binary digits/underscores, longest sequence first.
bin_digits_and_underscores([Item|Items]) -->
    bin_digit_or_underscore(Item),
    bin_digits_and_underscores(Items).
bin_digits_and_underscores([Item]) -->
    bin_digit_or_underscore(Item).
% A binary digit weight, or the atom '_' for an underscore.
bin_digit_or_underscore(Weight) -->
    bin_digit(Weight).
bin_digit_or_underscore('_') -->
    input_character('_').
% One or more underscores, shortest run first.
underscores -->
    input_character('_').
underscores -->
    input_character('_'),
    underscores.
% The `l` / `L` suffix that marks a long integer literal.
integer_type_suffix -->
    (   input_character('l')
    ;   input_character('L')
    ).
% Float
% floating_point_literal(-Literal)//
%
% A decimal or hexadecimal floating-point literal. Literal is the term
% Type(Value), where Type is `f` or `d` (default `d`) and Value is
% (IntegerPart + FractionPart) * Base ** Exponent with Base 10 or 16.
% The fraction is evaluated by number_value/3 with the reciprocal base
% (0.1 or 0.0625).
%
% Fix: for input such as `1.` the decimal grammar leaves Right unbound,
% and number_value(0.1, Right, R) then called reverse/2 on an unbound
% list. Right now defaults to [], like the other optional parts.
floating_point_literal(V) -->
    dec_floating_point_literal(Left,Right,Exp,Type),
    {   default_value(Exp, 0),
        default_value(Type, d),
        default_value(Right, []),
        number_value(10, Left, L),
        number_value(0.1, Right, R),
        Value is (L + R) * 10 ** Exp,
        V =.. [Type, Value]
    }.
floating_point_literal(V) -->
    hex_floating_point_literal(Left,Right,Exp,Type),
    {   default_value(Type, d),
        % The integer part is optional in the `0x.8p1` form.
        default_value(Left, []),
        number_value(16, Left, L),
        number_value(0.0625, Right, R),
        Value is (L + R) * 16 ** Exp,
        V =.. [Type, Value]
    }.
% Decimal Float
% dec_floating_point_literal(-Left, -Right, -Exp, -Type)//
%
% The four shapes of a decimal floating-point literal:
%   digits `.` [digits] [exponent] [suffix]
%   `.` digits [exponent] [suffix]
%   digits exponent [suffix]
%   digits [exponent] suffix
% Parts that are optional and absent are left unbound.
dec_floating_point_literal(Whole,Fraction,Exp,Type) -->
    digits(Whole),
    input_character('.'),
    optional(digits(Fraction)),
    optional(exponent_part(Exp)),
    optional(float_type_suffix(Type)).
dec_floating_point_literal([],Fraction,Exp,Type) -->
    input_character('.'),
    digits(Fraction),
    optional(exponent_part(Exp)),
    optional(float_type_suffix(Type)).
dec_floating_point_literal(Whole,[],Exp,Type) -->
    digits(Whole),
    exponent_part(Exp),
    optional(float_type_suffix(Type)).
dec_floating_point_literal(Whole,[],Exp,Type) -->
    digits(Whole),
    optional(exponent_part(Exp)),
    float_type_suffix(Type).
% `e`/`E` followed by a signed integer.
exponent_part(Exp) -->
    exponent_indicator,
    signed_integer(Exp).
exponent_indicator -->
    (   input_character('e')
    ;   input_character('E')
    ).
% signed_integer(-Value)//
%
% Decimal digits with an optional leading sign; no sign means positive.
signed_integer(Value) -->
    optional(sign(Sign)),
    digits(Digits),
    {   default_value(Sign,1),
        number_value(10,Digits,Magnitude),
        Value is Magnitude * Sign
    }.
% Multiplier for an explicit sign character.
sign(1) -->
    input_character('+').
sign(-1) -->
    input_character('-').
% float_type_suffix(-Type)//
%
% `f`/`F` gives f, `d`/`D` gives d.
float_type_suffix(f) -->
    (   input_character('f')
    ;   input_character('F')
    ).
float_type_suffix(d) -->
    (   input_character('d')
    ;   input_character('D')
    ).
% Hexadecimal Float
% hex_floating_point_literal(-Left, -Right, -Exp, -Type)//
%
% A hexadecimal significand, a mandatory binary exponent and an
% optional type suffix. Type is left unbound when no suffix is present.
%
% Fix: the suffix was parsed with `floating_type_suffix`, which is not
% defined; the nonterminal is called float_type_suffix//1.
hex_floating_point_literal(Left, Right, Exp, Type) -->
    hex_significand(Left,Right),
    binary_exponent(Exp),
    optional(float_type_suffix(Type)).
% hex_significand(-Left, -Right)//
%
% Either `0x` digits with an optional trailing `.` (Right is []), or
% `0x` [digits] `.` digits (Left is unbound when its digits are
% absent).
hex_significand(Whole,[]) -->
    input_character('0'),
    (   input_character('x')
    ;   input_character('X')
    ),
    hex_digits(Whole),
    optional(input_character('.')).
hex_significand(Whole,Fraction) -->
    input_character('0'),
    (   input_character('x')
    ;   input_character('X')
    ),
    optional(hex_digits(Whole)),
    input_character('.'),
    hex_digits(Fraction).
% `p`/`P` followed by a signed integer.
binary_exponent(Exp) -->
    binary_exponent_indicator,
    signed_integer(Exp).
binary_exponent_indicator -->
    (   input_character('p')
    ;   input_character('P')
    ).
% Boolean literal
% boolean_literal(-Bool)//
%
% Matches one of the words listed in is_boolean_literal/1.
boolean_literal(Bool) -->
    word_of(is_boolean_literal,Bool).
% Character literal
% character_literal(-Char)//
%
% A single character or an escape sequence between single quotes.
character_literal(Char) -->
    input_character('\''),
    (   single_character(Char)
    ;   escape_sequence(Char)
    ),
    input_character('\'').
% Any input character except a single quote or a backslash.
single_character(Char) -->
    input_character(Char),
    { Char \= '\'' },
    { Char \= '\\' }.
% String literal
% string_literal(-String)//
%
% String characters between double quotes, collected into the atom
% String.
string_literal(String) -->
    input_character('"'),
    string_of(string_character,String),
    input_character('"').
% Any input character except a double quote or a backslash, or an
% escape sequence.
string_character(Char) -->
    input_character(Char),
    { Char \= '"' },
    { Char \= '\\' }.
string_character(Char) -->
    escape_sequence(Char).
% escape_sequence(-Char)//
%
% A backslash escape. Each clause hands input_word//1 a two-element
% list holding the backslash atom and the escape letter, and yields the
% character the escape denotes; the last clause delegates to
% octal_escape//1.
escape_sequence('\b') --> input_word([\,b]).
escape_sequence('\t') --> input_word([\,t]).
escape_sequence('\n') --> input_word([\,n]).
escape_sequence('\f') --> input_word([\,f]).
escape_sequence('\r') --> input_word([\,r]).
% Code 34 is the double quote; Q is computed first so that the quote
% character does not have to be written inside this clause.
escape_sequence(Q) --> { char_code(Q,34) }, input_word([\,Q]).
escape_sequence('\'') --> input_word([\,'\'']).
escape_sequence('\\') --> input_word([\,\]).
escape_sequence(C) --> octal_escape(C).
% octal_escape(-Char)//
%
% A backslash followed by one, two or three octal digits (the
% three-digit form must start with 0-3). Char is the character with
% that code. Shorter forms are tried first.
octal_escape(Char) -->
    input_character('\\'),
    oct_digit(Code),
    { char_code(Char,Code) }.
octal_escape(Char) -->
    input_character('\\'),
    oct_digit(D1),
    oct_digit(D2),
    {   number_value(8,[D1,D2],Code),
        char_code(Char,Code)
    }.
octal_escape(Char) -->
    input_character('\\'),
    zero_to_three(D1),
    oct_digit(D2),
    oct_digit(D3),
    {   number_value(8,[D1,D2,D3],Code),
        char_code(Char,Code)
    }.
% An octal digit whose value is below 4.
zero_to_three(Weight) -->
    oct_digit(Weight),
    { Weight < 4 }.
% Null literal
% null_literal//
%
% Matches the word listed in is_null_literal/1.
null_literal -->
    { is_null_literal(Word) },
    input_word(Word).
/****************
* Identifiers
****************/
% identifier(-Name)//
%
% Reads the longest identifier-shaped word, commits to it, and fails if
% the word is a keyword, a boolean literal or the null literal.
identifier(Name) -->
    identifier_chars(Name), !,
    {   \+ is_keyword(Name),
        \+ is_boolean_literal(Name),
        \+ is_null_literal(Name)
    }.
% identifier_chars(-Word)//
%
% A Java letter followed by any number of letters or digits, joined
% into the atom Word.
identifier_chars(Word) -->
    java_letter(First),
    list_of(java_letter_or_digit,Rest),
    { atomic_list_concat([First|Rest],Word) }.
% A character of char_type/2 class csymf.
java_letter(Char) -->
    input_character(Char),
    { char_type(Char,csymf) }.
% A character of char_type/2 class csym.
java_letter_or_digit(Char) -->
    input_character(Char),
    { char_type(Char,csym) }.
/*******************************
* Helpers / Basic definitions
*******************************/
% input_word(+Word)//
%
% Matches Word literally against the input code list. Word is either an
% atomic term, matched through its character codes, or a list of
% one-character atoms (the form used by escape_sequence//1).
%
% Fix: the list clause used to emit the atoms themselves as terminals.
% The input is a list of character codes, so an atom never matched a
% code and every list-form call failed. The characters are now
% converted to codes first. [] is routed to the list clause, where it
% matches the empty string.
input_word(Atom) -->
    { atomic(Atom), Atom \== [], atom_codes(Atom,Codes) },
    Codes.
input_word(List) -->
    { is_list(List), maplist(char_code,List,Codes) },
    Codes.
% word_of(:Fn, ?Word)//
%
% Generates (or checks) Word with call(Fn, Word) and then matches it
% against the input with input_word//1.
word_of(Fn,Word) -->
    { call(Fn,Word) },
    input_word(Word).
% input_character(-Char)//
%
% Any unicode character that is neither a line feed nor a carriage
% return.
input_character(Char) -->
    unicode_character(Char),
    {   char_code(Char,Code),
        \+ lf([Code],[]),
        \+ cr([Code],[])
    }.
% line_terminator(-Kind)//
%
% A line terminator: CR LF (crlf), LF alone (lf) or CR alone (cr).
%
% Fix: clause order. The crlf clause used to come after the cr clause,
% so a CR LF pair was first reported as `cr`; white_space//1 is reached
% through input_element//1, which commits to that first answer, so a
% Windows line ending was split into two terminators and `crlf` was
% never produced. The two-character form is now tried first.
line_terminator(crlf) --> cr, lf.
line_terminator(lf) --> lf.
line_terminator(cr) --> cr.
% unicode_character(-Char)//
%
% A raw character, or (on backtracking) a unicode escape.
unicode_character(Char) -->
    (   byte(Char)
    ;   unicode_escape(Char)
    ).
% unicode_escape(-Char)//
%
% A backslash, one or more `u`, and exactly four hex digits.
unicode_escape(Char) -->
    byte('\\'),
    unicode_marker,
    times(4,hex_digit,Digits),
    {   number_value(16,Digits,Code),
        char_code(Char,Code)
    }.
% One or more `u` characters, shortest run first.
unicode_marker -->
    byte(u),
    (   []
    ;   unicode_marker
    ).
% A hexadecimal digit; Weight is its numeric value.
hex_digit(Weight) -->
    input_character(Char),
    { char_type(Char,xdigit(Weight)) }.
% A decimal digit whose value is below 8.
oct_digit(Weight) -->
    input_character(Char),
    { char_type(Char,digit(Weight)) },
    { Weight < 8 }.
% A decimal digit whose value is below 2.
bin_digit(Weight) -->
    input_character(Char),
    { char_type(Char,digit(Weight)) },
    { Weight < 2 }.
% number_value(+Base, +Digits, -Value)
%
% Value of a digit list in positional notation. Elements that are not
% numbers (the '_' atoms left by underscores) are skipped.
%
% For Base >= 1 the list is an integer part, most significant digit
% first. For Base < 1 the list is a fraction part: it is reversed and
% given a trailing 0 so that the first fraction digit gets weight
% Base ** 1.
number_value(Base,Digits,Value) :-
    (   Base >= 1
    ->  Ordered = Digits
    ;   reverse([0|Digits],Ordered)
    ),
    number_value(_,Base,Ordered,Value).
% number_value(-Count, +Base, +Digits, -Value)
%
% Worker: Count is the number of numeric digits in Digits, and the
% digit that has K numeric digits after it contributes Digit * Base**K.
number_value(0,_,[],0).
number_value(Count,Base,[Digit|Digits],Value) :-
    (   number(Digit)
    ->  number_value(Below,Base,Digits,Partial),
        Value is Partial + Base ** Below * Digit,
        Count is Below + 1
    ;   number_value(Count,Base,Digits,Value)
    ).
/**
* Unifies C with the next character if C is between A and B (inclusive).
*/
% range(:Fn, +A-B, -C)//
%
% Reads C with call(Fn, C) and succeeds when its character code lies
% between the codes of A and B, inclusive.
range(Fn,A-B,C) -->
    {   char_code(A,Low),
        char_code(B,High)
    },
    call(Fn,C),
    {   char_code(C,Code),
        between(Low,High,Code)
    }.
% range//2 reads through unicode_character//1, byte_range//2 through
% byte//1.
range(A-B,C) -->
    range(unicode_character,A-B,C).
byte_range(A-B,C) -->
    range(byte,A-B,C).
% Single control/blank characters, matched by code.
ht  --> [9].     % horizontal tab
lf  --> [10].    % line feed
ff  --> [12].    % form feed
cr  --> [13].    % carriage return
sub --> [26].    % substitute
sp  --> [32].    % space
% byte(?Char)//
%
% Consumes one code from the input and relates it to the one-character
% atom Char.
byte(Char) -->
    [Code],
    { char_code(Char,Code) }.
/**
* Unifies [R|Rs] with the next N results of calling What.
*/
% times(+N, :What, -Results)//
%
% Calls What exactly N times in sequence; Results collects the first
% argument of each call.
%
% Fix: the base case was written as times(0,_,[],[],[]), which pins both
% hidden list arguments to [] and therefore only succeeds when the
% repetition ends exactly at the end of the input. It now leaves the
% remaining input untouched.
times(0,_,[]) --> [].
times(N,What,[R|Rs]) -->
    { N > 0, M is N - 1 },
    call(What,R),
    times(M,What,Rs).
% is_keyword(?Word)
%
% The reserved words the tokenizer recognises, in alphabetical order.
% Used to produce k(_) tokens and by identifier//1 to reject reserved
% names.
is_keyword(abstract).
is_keyword(assert).
is_keyword(boolean).
is_keyword(break).
is_keyword(byte).
is_keyword(case).
is_keyword(catch).
is_keyword(char).
is_keyword(class).
is_keyword(const).
is_keyword(continue).
is_keyword(default).
is_keyword(do).
is_keyword(double).
is_keyword(else).
is_keyword(enum).
is_keyword(extends).
is_keyword(final).
is_keyword(finally).
is_keyword(float).
is_keyword(for).
is_keyword(goto).
is_keyword(if).
is_keyword(implements).
is_keyword(import).
is_keyword(instanceof).
is_keyword(int).
is_keyword(interface).
is_keyword(long).
is_keyword(native).
is_keyword(new).
is_keyword(package).
is_keyword(private).
is_keyword(protected).
is_keyword(public).
is_keyword(return).
is_keyword(short).
is_keyword(static).
is_keyword(strictfp).
is_keyword(super).
is_keyword(switch).
is_keyword(synchronized).
is_keyword(this).
is_keyword(throw).
is_keyword(throws).
is_keyword(transient).
is_keyword(try).
is_keyword(void).
is_keyword(volatile).
is_keyword(while).
% Words that are literals rather than identifiers: the two boolean
% values and null. identifier//1 rejects all three.
is_boolean_literal(true).
is_boolean_literal(false).
is_null_literal(null).
% is_separator(?Separator)
%
% Separator tokens. Multi-character separators are listed before the
% single-character ones: word_of//2 enumerates these facts in order and
% the first one that matches the input is used, so '...' has to be
% tried before '.'.
is_separator('...').
is_separator('::').
is_separator('(').
is_separator(')').
is_separator(',').
is_separator('.').
is_separator(';').
is_separator('@').
is_separator('[').
is_separator(']').
is_separator('{').
is_separator('}').
% is_operator(?Operator)
%
% Operator tokens, ordered from longest to shortest. word_of//2
% enumerates these facts in order and the first one that matches the
% input is used, so an operator must appear before any operator that is
% a prefix of it ('>>>=' before '>>>' before '>>' before '>').
is_operator('>>>=').
is_operator('>>>').
is_operator('>>=').
is_operator('<<=').
is_operator('!=').
is_operator('%=').
is_operator('&&').
is_operator('&=').
is_operator('*=').
is_operator('++').
is_operator('+=').
is_operator('--').
is_operator('-=').
is_operator('->').
is_operator('/=').
is_operator('<<').
is_operator('<=').
is_operator('==').
is_operator('>=').
is_operator('>>').
is_operator('^=').
is_operator('|=').
is_operator('||').
is_operator('!').
is_operator('%').
is_operator('&').
is_operator('+').
is_operator('-').
is_operator('/').
is_operator(':').
is_operator('<').
is_operator('=').
is_operator('>').
is_operator('?').
is_operator('^').
is_operator('*').
is_operator('|').
is_operator('~').
% vim: set filetype=prolog:
:- module(utils,[
list_of/4,
string_of/4,
optional/3,
default_value/2,
breakpoint/0,
breakpoint/1
]).
:- meta_predicate list_of(3,?,+,-).
:- meta_predicate string_of(3,?,+,-).
:- meta_predicate optional(2,+,-).
% list_of(:What, -Results)//
%
% Zero or more repetitions of What; Results collects the first argument
% of each call. The longest sequence is produced first, shorter ones on
% backtracking.
list_of(What,[Item|Items]) -->
    call(What,Item),
    list_of(What,Items).
list_of(_,[]) --> [].
% string_of(:What, -String)//
%
% Like list_of//2, but the collected items are concatenated into the
% atom String.
string_of(What,String) -->
    list_of(What,Items),
    { atomic_list_concat(Items,String) }.
% optional(:Fn)//
%
% Tries Fn once. If it succeeds, its first solution is kept and is not
% reconsidered on backtracking; if it fails, nothing is consumed.
optional(Fn) -->
    (   call(Fn)
    ->  []
    ;   []
    ).
% default_value(?Var, +Default)
%
% Binds Var to Default when Var is still unbound; otherwise leaves it
% alone and succeeds.
default_value(Var,Default) :-
    (   var(Var)
    ->  Var = Default
    ;   true
    ).
% breakpoint/0, breakpoint/1
%
% Always succeed and do nothing else.
% NOTE(review): presumably these exist as places to attach the debugger
% (e.g. a spy point) -- confirm.
breakpoint :-
    true.
breakpoint(_Term) :-
    true.
% vim: set filetype=prolog:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment