grammars
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import fuzzingbook | |
from fuzzingbook import GrammarFuzzer | |
# EBNF grammar (fuzzingbook format) describing HTML documents.
#
# Keys are nonterminals written as '<name>'; each value is the list of
# alternative expansions for that nonterminal.  The postfix operators
# '*', '+' and '?' bind to the single nonterminal (or parenthesized group)
# immediately before them, so a repeated multi-symbol sequence must be
# wrapped in parentheses.
#
# Literal angle brackets cannot appear directly in an expansion because they
# delimit nonterminals, so they are produced through helper symbols:
#   <_l_>  -> '<'      <_r_> -> '>'      <_cl_> -> '</'
# '<d>' expands to an optional, whitespace-separated attribute list.
HTML_GRAMMAR = {
    '<start>': ['<_l_>!DOCTYPE html<_r_><html_document>'],
    '<_l_>': ['<'],
    '<_r_>': ['>'],
    '<_cl_>': ['</'],

    '<a_tag>': ['<_l_>a<d><_r_><a_content>*<_cl_>a<_r_>'],
    '<a_content>': ['<heading>',
                    '<text>'],
    '<abbr_tag>': ['<_l_>abbr<d><_r_><text><_cl_>abbr<_r_>'],
    '<acronym_tag>': ['<_l_>acronym<d><_r_><text><_cl_>acronym<_r_>'],
    '<address_tag>': ['<_l_>address<d><_r_><address_content>*<_cl_>address<_r_>'],
    '<address_content>': ['<p_tag>',
                          '<text>'],
    '<applet_content>': ['<param>*<body_content>'],
    '<area>': ['<_l_>area<d><_r_>'],
    '<applet_tag>': ['<_l_>applet<d><_r_><applet_content><_cl_>applet<_r_>'],
    '<b_tag>': ['<_l_>b<d><_r_><text><_cl_>b<_r_>'],
    # Tag name was misspelled 'basefront'; the rule key names 'basefont'.
    '<basefont_tag>': ['<_l_>basefont<d><_r_><body_content><_cl_>basefont<_r_>'],
    '<bdo_tag>': ['<_l_>bdo<d><_r_><text><_cl_>bdo<_r_>'],
    '<big_tag>': ['<_l_>big<d><_r_><text><_cl_>big<_r_>'],
    '<blink_tag>': ['<_l_>blink<d><_r_><text><_cl_>blink<_r_>'],
    '<block>': ['<block_content>*'],
    '<block_content>': [
        # '<_l_>isindex<d><_r_>',  # -- HTML5LIB
        '<basefont_tag>', '<blockquote_tag>', '<center_tag>',
        '<dir_tag>', '<div_tag>', '<dl_tag>',
        '<form_tag>', '<listing_tag>', '<menu_tag>',
        '<multicol_tag>', '<nobr_tag>', '<ol_tag>',
        '<p_tag>', '<pre_tag>', '<table_tag>',
        '<ul_tag>', '<xmp_tag>'],
    '<blockquote_tag>': ['<_l_>blockquote<d><_r_><body_content><_cl_>blockquote<_r_>'],
    '<body_content>': ['<_l_>bgsound<d><_r_>', '<_l_>hr<_r_>',
                       '<address_tag>', '<block>', '<del_tag>',
                       '<heading>', '<ins_tag>', '<layer_tag>',
                       '<map_tag>', '<marquee_tag>', '<text>'],
    '<body_tag>': ['<_l_>body<d><_r_><body_content>*<_cl_>body<_r_>'],
    '<caption_tag>': ['<_l_>caption<d><_r_><body_content>*<_cl_>caption<_r_>'],
    '<center_tag>': ['<_l_>center<d><_r_><body_content>*<_cl_>center<_r_>'],
    '<cite_tag>': ['<_l_>cite<d><_r_><text><_cl_>cite<_r_>'],
    '<code_tag>': ['<_l_>code<d><_r_><text><_cl_>code<_r_>'],
    # Parenthesized so that '*' repeats the whole <col ...> element; without
    # the group it only repeated the closing '>'.
    '<colgroup_content>': ['(<_l_>col<d><_r_>)*'],
    '<colgroup_tag>': ['<_l_>colgroup<d><_r_><colgroup_content>'],
    '<content_style>': ['<abbr_tag>', '<acronym_tag>', '<cite_tag>',
                        '<code_tag>', '<dfn_tag>', '<em_tag>',
                        '<kbd_tag>', '<q_tag>', '<strong_tag>',
                        '<var_tag>'],
    '<dd_tag>': ['<_l_>dd<d><_r_><flow><_cl_>dd<_r_>'],
    '<del_tag>': ['<_l_>del<d><_r_><flow><_cl_>del<_r_>'],
    '<dfn_tag>': ['<_l_>dfn<d><_r_><text><_cl_>dfn<_r_>'],
    '<dir_tag>': ['<_l_>dir<d><_r_><li_tag>+<_cl_>dir<_r_>'],
    '<div_tag>': ['<_l_>div<d><_r_><body_content><_cl_>div<_r_>'],
    '<dl_content>': ['<dt_tag><dd_tag>'],
    '<dl_tag>': ['<_l_>dl<d><_r_><dl_content>+<_cl_>dl<_r_>'],
    '<dt_tag>': ['<_l_>dt<d><_r_><text><_cl_>dt<_r_>'],
    '<em_tag>': ['<_l_>em<d><_r_><text><_cl_>em<_r_>'],
    '<fieldset_tag>': ['<_l_>fieldset<d><_r_><legend_tag>*<form_content>*<_cl_>fieldset<_r_>'],
    '<flow>': ['<flow_content>*'],
    '<flow_content>': ['<block>',
                       '<text>'],
    '<font_tag>': ['<_l_>font<d><_r_><style_text><_cl_>font<_r_>'],
    '<form_content>': ['<_l_>input<d><_r_>', '<_l_>keygen<d><_r_>',
                       '<body_content>', '<fieldset_tag>', '<label_tag>',
                       '<select_tag>', '<textarea_tag>'],
    '<form_tag>': ['<_l_>form<d><_r_><form_content>*<_cl_>form<_r_>'],
    '<frameset_content>': ['<_l_>frame<d><_r_>',
                           '<noframes_tag>'],
    '<frameset_tag>': ['<_l_>frameset<d><_r_><frameset_content>*<_cl_>frameset<_r_>'],
    '<h1_tag>': ['<_l_>h1<d><_r_><text><_cl_>h1<_r_>'],
    '<h2_tag>': ['<_l_>h2<d><_r_><text><_cl_>h2<_r_>'],
    '<h3_tag>': ['<_l_>h3<d><_r_><text><_cl_>h3<_r_>'],
    '<h4_tag>': ['<_l_>h4<d><_r_><text><_cl_>h4<_r_>'],
    '<h5_tag>': ['<_l_>h5<d><_r_><text><_cl_>h5<_r_>'],
    '<h6_tag>': ['<_l_>h6<d><_r_><text><_cl_>h6<_r_>'],
    '<head_content>': [
        '<_l_>base<d><_r_>',
        # '<_l_>isindex<d><_r_>',  # HTML5LIB
        '<_l_>link<d><_r_>',
        '<_l_>meta<d><_r_>',
        # '<_l_>nextid<d><_r_>',  # HTML5LIB
        '<style_tag>',
        '<title_tag>', '<script_tag>'],
    '<head_tag>': ['<_l_>head<d><_r_><head_content>*<_cl_>head<_r_>'],
    '<heading>': ['<h1_tag>', '<h2_tag>', '<h3_tag>',
                  '<h4_tag>', '<h5_tag>', '<h6_tag>'],
    '<html_content>': ['<head_tag><body_tag>',
                       '<head_tag><frameset_tag>'],
    '<html_document>': ['<html_tag>'],
    '<html_tag>': ['<_l_>html<_r_><html_content><_cl_>html<_r_>'],
    '<i_tag>': ['<_l_>i<d><_r_><text><_cl_>i<_r_>'],
    '<ilayer_tag>': ['<_l_>ilayer<d><_r_><body_content><_cl_>ilayer<_r_>'],
    '<ins_tag>': ['<_l_>ins<d><_r_><flow><_cl_>ins<_r_>'],
    '<kbd_tag>': ['<_l_>kbd<d><_r_><text><_cl_>kbd<_r_>'],
    '<label_content>': ['<_l_>input<d><_r_>',
                        '<body_content>', '<select_tag>', '<textarea_tag>'],
    '<label_tag>': ['<_l_>label<d><_r_><label_content>*<_cl_>label<_r_>'],
    '<layer_tag>': [
        '<_l_>layer<d><_r_><body_content><_cl_>layer<_r_>'
    ],
    '<legend_tag>': ['<_l_>legend<d><_r_><text><_cl_>legend<_r_>'],
    '<li_tag>': ['<_l_>li<d><_r_><flow><_cl_>li<_r_>'],
    '<literal_text>': ['<plain_text>'],
    '<listing_tag>': ['<_l_>listing<d><_r_><literal_text><_cl_>listing<_r_>'],
    '<map_content>': ['<area>*'],
    '<map_tag>': ['<_l_>map<d><_r_><map_content><_cl_>map<_r_>'],
    '<marquee_tag>': [
        '<_l_>marquee<d><_r_><style_text><_cl_>marquee<_r_>'
    ],
    '<menu_tag>': ['<_l_>menu<d><_r_><li_tag>*<_cl_>menu<_r_>'],
    '<multicol_tag>': ['<_l_>multicol<d><_r_><body_content><_cl_>multicol<_r_>'],
    '<nobr_tag>': ['<_l_>nobr<d><_r_><text><_cl_>nobr<_r_>'],
    '<noembed_tag>': ['<_l_>noembed<d><_r_><text><_cl_>noembed<_r_>'],
    '<noframes_tag>': ['<_l_>noframes<d><_r_><body_content>*<_cl_>noframes<_r_>'],
    '<noscript_tag>': ['<_l_>noscript<d><_r_><text><_cl_>noscript<_r_>'],
    '<object_content>': ['<param>*<body_content>'],
    '<object_tag>': ['<_l_>object<d><_r_><object_content><_cl_>object<_r_>'],
    '<ol_tag>': ['<_l_>ol<d><_r_><li_tag>+<_cl_>ol<_r_>'],
    '<optgroup_tag>': ['<_l_>optgroup<d><_r_><option_tag>*<_cl_>optgroup<_r_>'],
    '<option_tag>': ['<_l_>option<d><_r_><plain_text>+<_cl_>option<_r_>'],
    '<p_tag>': ['<_l_>p<_r_><text><_cl_>p<_r_>'],
    '<param>': ['<_l_>param<_r_>'],

    # Plain character data.  '&' is excluded from <char> so it can only be
    # produced through <ampersand>; vertical tab (\x0b) is excluded as well.
    # The lists are sorted so the order of alternatives does not depend on
    # per-process set iteration order.
    '<plain_text>': ['<entity>*'],
    '<entity>': ['<char>', '<ampersand>'],
    '<char>': sorted(set(string.printable) - {'\x0b', '&'}),
    # NOTE(review): this expands to a plain space rather than an '&'-based
    # character reference; the original literal may have been lost -- confirm.
    '<ampersand>': [' '],

    '<physical_style>': ['<b_tag>', '<bdo_tag>', '<big_tag>',
                         '<blink_tag>', '<font_tag>', '<i_tag>',
                         '<s_tag>', '<small_tag>', '<span_tag>',
                         '<strike_tag>', '<sub_tag>', '<sup_tag>',
                         '<tt_tag>', '<u_tag>'],
    '<pre_content>': ['<_l_>br<_r_>', '<_l_>hr<_r_>',
                      '<a_tag>', '<style_text>'],
    '<pre_tag>': ['<_l_>pre<_r_><pre_content>*<_cl_>pre<_r_>'],
    '<q_tag>': ['<_l_>q<_r_><text><_cl_>q<_r_>'],
    '<s_tag>': ['<_l_>s<_r_><text><_cl_>s<_r_>'],
    # '<samp_tag>': ['<_l_>samp<_r_><text><_cl_>samp<_r_>'],
    '<script_tag>': ['<_l_>script<d><_r_><plain_text><_cl_>script<_r_>'],
    '<select_content>': ['<optgroup_tag>', '<option_tag>'],
    '<select_tag>': ['<_l_>select<d><_r_><select_content>*<_cl_>select<_r_>'],
    # '<server_tag>': ['<_l_>server<d><_r_><plain_text><_cl_>server<_r_>'],
    '<small_tag>': ['<_l_>small<d><_r_><text><_cl_>small<_r_>'],
    '<span_tag>': ['<_l_>span<d><_r_><text><_cl_>span<_r_>'],
    '<strike_tag>': ['<_l_>strike<d><_r_><text><_cl_>strike<_r_>'],
    '<strong_tag>': ['<_l_>strong<d><_r_><text><_cl_>strong<_r_>'],
    '<style_tag>': ['<_l_>style<d><_r_><plain_text><_cl_>style<_r_>'],
    '<style_text>': ['<plain_text>'],
    '<sub_tag>': ['<_l_>sub<d><_r_><text><_cl_>sub<_r_>'],
    '<sup_tag>': ['<_l_>sup<d><_r_><text><_cl_>sup<_r_>'],
    '<table_cell>': ['<td_tag>', '<th_tag>'],
    '<table_content>': ['<_l_>tbody<d><_r_>', '<_l_>tfoot<d><_r_>', '<_l_>thead<d><_r_>',
                        '<tr_tag>'],
    '<table_tag>': ['<_l_>table<d><_r_><caption_tag>*<colgroup_tag>*<table_content>*<_cl_>table<_r_>'],
    '<td_tag>': ['<_l_>td<d><_r_><body_content><_cl_>td<_r_>'],
    '<text>': ['<text_content>*'],
    '<text_content>': ['<_l_>br<d><_r_>', '<_l_>embed<d><_r_>', '<_l_>iframe<d><_r_>',
                       '<_l_>img<d><_r_>', '<_l_>spacer<d><_r_>', '<_l_>wbr<d><_r_>',
                       '<a_tag>', '<applet_tag>', '<content_style>',
                       '<ilayer_tag>', '<noembed_tag>', '<noscript_tag>',
                       '<object_tag>', '<plain_text>', '<physical_style>'],
    '<textarea_tag>': ['<_l_>textarea<d><_r_><plain_text><_cl_>textarea<_r_>'],
    '<th_tag>': ['<_l_>th<d><_r_><body_content><_cl_>th<_r_>'],
    '<title_tag>': ['<_l_>title<d><_r_><plain_text><_cl_>title<_r_>'],
    '<tr_tag>': ['<_l_>tr<d><_r_><table_cell>*<_cl_>tr<_r_>'],
    '<tt_tag>': ['<_l_>tt<d><_r_><text><_cl_>tt<_r_>'],
    '<u_tag>': ['<_l_>u<d><_r_><text><_cl_>u<_r_>'],
    '<ul_tag>': ['<_l_>ul<d><_r_><li_tag>*<_cl_>ul<_r_>'],
    '<var_tag>': ['<_l_>var<d><_r_><text><_cl_>var<_r_>'],
    '<xmp_tag>': ['<_l_>xmp<d><_r_><literal_text><_cl_>xmp<_r_>'],

    # Attribute list inside a start tag: either empty, or leading whitespace
    # followed by zero or more attribute sequences and optional whitespace.
    '<d>': ['<space>+<attributes>*<space>*', ''],
    '<attribute>': [
        '<key>',
        '<key>="<value>"',
        '<key>=\'<value>\'',
        '<key>=<uqvalue>',
    ],
    ## https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
    # An attribute name is a single character here (one <allchars> symbol).
    '<key>': ['<allchars>'],
    # '<letter>': list(string.ascii_letters),
    '<allchars>': sorted(set(string.printable) - set(' \t\n"\'<>/=\x0b')),
    '<value>': ['<anychars>'],
    '<anychar>': list(string.printable),
    '<anychars>': ['<anychar>*'],
    '<uqvalue>': ['<uqchars>'],
    # Unquoted values use the <allchars> exclusions plus the backtick.
    '<uqchar>': sorted(set(string.printable) - set(' \t\n"\'<>`/=\x0b')),
    '<uqchars>': ['<uqchar>+'],
    '<attributes>': ['<attribute>', '<attribute><space>+<attributes>'],
    '<space>': [' ', '\t', '\n'],
}
# Driver: generate Max documents from HTML_GRAMMAR and count how many of them
# html5lib accepts when parsing in strict mode.

# Sanity-check the grammar before fuzzing; the result is not acted upon here.
GrammarFuzzer.is_valid_grammar(HTML_GRAMMAR)

from html.parser import HTMLParser
from lxml import etree
from io import StringIO
import html5lib

# Alternative parsers that were tried previously:
#   parser = etree.HTMLParser(recover=False); etree.parse(StringIO(p), parser)
#   parser = HTMLParser(); parser.feed(p)
#   document = html5lib.parse(p)

# The fuzzer works on plain BNF, so the EBNF operators are expanded first.
gf = GrammarFuzzer.GrammarFuzzer(
    GrammarFuzzer.convert_ebnf_grammar(HTML_GRAMMAR),
    min_nonterminals=20,
)

Max = 1000   # number of documents to generate
count = 0    # number of documents parsed without an exception

for i in range(Max):
    p = gf.fuzz()

    # Keep the most recent input on disk so it can be inspected afterwards.
    with open('_.html', 'w', encoding='utf-8') as f:
        print(p, file=f)

    # A fresh strict parser per input, as in the original script.
    parser = html5lib.HTMLParser(strict=True)
    try:
        print(parser.parse(p))
    except Exception:
        # Any parser failure counts as a rejected input.  Exception (not a
        # bare except) lets Ctrl-C / SystemExit still stop the run.
        print('-', repr(p))
    else:
        print('+', repr(p))
        count += 1

print(count, Max)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment