# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @vrthra
# Last active May 27, 2019
# Embed
# What would you like to do?
# grammars
import string
import fuzzingbook
from fuzzingbook import GrammarFuzzer
# Grammar describing (largely legacy) HTML documents for grammar-based fuzzing.
#
# Keys are nonterminals; each value lists the alternative expansions.  The
# expansions use EBNF suffix operators (`*`, `+`), so the grammar is passed
# through convert_ebnf_grammar before it is handed to the fuzzer.  A suffix
# operator binds only to the nonterminal immediately before it.
#
# `<_l_>`, `<_r_>` and `<_cl_>` stand for the literal `<`, `>` and `</` so that
# tag delimiters are not mistaken for nonterminal brackets.  `<d>` is the
# optional attribute list that follows a tag name.
HTML_GRAMMAR = {
    '<start>': ['<_l_>!DOCTYPE html<_r_><html_document>'],
    '<_l_>': ['<'],
    '<_r_>': ['>'],
    '<_cl_>': ['</'],
    '<a_tag>': ['<_l_>a<d><_r_><a_content>*<_cl_>a<_r_>'],
    '<a_content>': ['<heading>',
                    '<text>'],
    '<abbr_tag>': ['<_l_>abbr<d><_r_><text><_cl_>abbr<_r_>'],
    '<acronym_tag>': ['<_l_>acronym<d><_r_><text><_cl_>acronym<_r_>'],
    '<address_tag>': ['<_l_>address<d><_r_><address_content>*<_cl_>address<_r_>'],
    '<address_content>': ['<p_tag>',
                          '<text>'],
    '<applet_content>': ['<param>*<body_content>'],
    '<area>': ['<_l_>area<d><_r_>'],
    '<applet_tag>': ['<_l_>applet<d><_r_><applet_content><_cl_>applet<_r_>'],
    '<b_tag>': ['<_l_>b<d><_r_><text><_cl_>b<_r_>'],
    # Tag name was misspelled "basefront" in both the start and the end tag.
    # NOTE(review): basefont is listed as a void element in HTML references;
    # confirm whether content plus a closing tag is really wanted here.
    '<basefont_tag>': ['<_l_>basefont<d><_r_><body_content><_cl_>basefont<_r_>'],
    '<bdo_tag>': ['<_l_>bdo<d><_r_><text><_cl_>bdo<_r_>'],
    '<big_tag>': ['<_l_>big<d><_r_><text><_cl_>big<_r_>'],
    '<blink_tag>': ['<_l_>blink<d><_r_><text><_cl_>blink<_r_>'],
    '<block>': ['<block_content>*'],
    '<block_content>': [  # '<_l_>isindex<d><_r_>', # -- HTML5LIB
        '<basefont_tag>', '<blockquote_tag>', '<center_tag>',
        '<dir_tag>', '<div_tag>', '<dl_tag>',
        '<form_tag>', '<listing_tag>', '<menu_tag>',
        '<multicol_tag>', '<nobr_tag>', '<ol_tag>',
        '<p_tag>', '<pre_tag>', '<table_tag>',
        '<ul_tag>', '<xmp_tag>'],
    '<blockquote_tag>': ['<_l_>blockquote<d><_r_><body_content><_cl_>blockquote<_r_>'],
    '<body_content>': ['<_l_>bgsound<d><_r_>', '<_l_>hr<_r_>',
                       '<address_tag>', '<block>', '<del_tag>',
                       '<heading>', '<ins_tag>', '<layer_tag>',
                       '<map_tag>', '<marquee_tag>', '<text>'],
    '<body_tag>': ['<_l_>body<d><_r_><body_content>*<_cl_>body<_r_>'],
    '<caption_tag>': ['<_l_>caption<d><_r_><body_content>*<_cl_>caption<_r_>'],
    '<center_tag>': ['<_l_>center<d><_r_><body_content>*<_cl_>center<_r_>'],
    '<cite_tag>': ['<_l_>cite<d><_r_><text><_cl_>cite<_r_>'],
    '<code_tag>': ['<_l_>code<d><_r_><text><_cl_>code<_r_>'],
    # A whole <col ...> tag gets its own nonterminal so that `*` repeats the
    # tag.  Written inline as '<_l_>col<d><_r_>*' the operator applied to
    # `<_r_>` alone and produced "<col" followed by zero or more ">".
    '<col_tag>': ['<_l_>col<d><_r_>'],
    '<colgroup_content>': ['<col_tag>*'],
    '<colgroup_tag>': ['<_l_>colgroup<d><_r_><colgroup_content>'],
    '<content_style>': ['<abbr_tag>', '<acronym_tag>', '<cite_tag>',
                        '<code_tag>', '<dfn_tag>', '<em_tag>',
                        '<kbd_tag>', '<q_tag>', '<strong_tag>',
                        '<var_tag>'],
    '<dd_tag>': ['<_l_>dd<d><_r_><flow><_cl_>dd<_r_>'],
    '<del_tag>': ['<_l_>del<d><_r_><flow><_cl_>del<_r_>'],
    '<dfn_tag>': ['<_l_>dfn<d><_r_><text><_cl_>dfn<_r_>'],
    '<dir_tag>': ['<_l_>dir<d><_r_><li_tag>+<_cl_>dir<_r_>'],
    '<div_tag>': ['<_l_>div<d><_r_><body_content><_cl_>div<_r_>'],
    '<dl_content>': ['<dt_tag><dd_tag>'],
    '<dl_tag>': ['<_l_>dl<d><_r_><dl_content>+<_cl_>dl<_r_>'],
    '<dt_tag>': ['<_l_>dt<d><_r_><text><_cl_>dt<_r_>'],
    '<em_tag>': ['<_l_>em<d><_r_><text><_cl_>em<_r_>'],
    '<fieldset_tag>': ['<_l_>fieldset<d><_r_><legend_tag>*<form_content>*<_cl_>fieldset<_r_>'],
    '<flow>': ['<flow_content>*'],
    '<flow_content>': ['<block>',
                       '<text>'],
    '<font_tag>': ['<_l_>font<d><_r_><style_text><_cl_>font<_r_>'],
    '<form_content>': ['<_l_>input<d><_r_>', '<_l_>keygen<d><_r_>',
                       '<body_content>', '<fieldset_tag>', '<label_tag>',
                       '<select_tag>', '<textarea_tag>'],
    '<form_tag>': ['<_l_>form<d><_r_><form_content>*<_cl_>form<_r_>'],
    '<frameset_content>': ['<_l_>frame<d><_r_>',
                           '<noframes_tag>'],
    '<frameset_tag>': ['<_l_>frameset<d><_r_><frameset_content>*<_cl_>frameset<_r_>'],
    '<h1_tag>': ['<_l_>h1<d><_r_><text><_cl_>h1<_r_>'],
    '<h2_tag>': ['<_l_>h2<d><_r_><text><_cl_>h2<_r_>'],
    '<h3_tag>': ['<_l_>h3<d><_r_><text><_cl_>h3<_r_>'],
    '<h4_tag>': ['<_l_>h4<d><_r_><text><_cl_>h4<_r_>'],
    '<h5_tag>': ['<_l_>h5<d><_r_><text><_cl_>h5<_r_>'],
    '<h6_tag>': ['<_l_>h6<d><_r_><text><_cl_>h6<_r_>'],
    '<head_content>': [
        '<_l_>base<d><_r_>',
        # '<_l_>isindex<d><_r_>', # HTML5LIB
        '<_l_>link<d><_r_>',
        '<_l_>meta<d><_r_>',
        # '<_l_>nextid<d><_r_>', # HTML5LIB
        '<style_tag>',
        '<title_tag>', '<script_tag>'],
    '<head_tag>': ['<_l_>head<d><_r_><head_content>*<_cl_>head<_r_>'],
    '<heading>': ['<h1_tag>', '<h2_tag>', '<h3_tag>',
                  '<h4_tag>', '<h5_tag>', '<h6_tag>'],
    '<html_content>': ['<head_tag><body_tag>',
                       '<head_tag><frameset_tag>'],
    '<html_document>': ['<html_tag>'],
    '<html_tag>': ['<_l_>html<_r_><html_content><_cl_>html<_r_>'],
    '<i_tag>': ['<_l_>i<d><_r_><text><_cl_>i<_r_>'],
    '<ilayer_tag>': ['<_l_>ilayer<d><_r_><body_content><_cl_>ilayer<_r_>'],
    '<ins_tag>': ['<_l_>ins<d><_r_><flow><_cl_>ins<_r_>'],
    '<kbd_tag>': ['<_l_>kbd<d><_r_><text><_cl_>kbd<_r_>'],
    '<label_content>': ['<_l_>input<d><_r_>',
                        '<body_content>', '<select_tag>', '<textarea_tag>'],
    '<label_tag>': ['<_l_>label<d><_r_><label_content>*<_cl_>label<_r_>'],
    '<layer_tag>': [
        '<_l_>layer<d><_r_><body_content><_cl_>layer<_r_>'
    ],
    '<legend_tag>': ['<_l_>legend<d><_r_><text><_cl_>legend<_r_>'],
    '<li_tag>': ['<_l_>li<d><_r_><flow><_cl_>li<_r_>'],
    '<literal_text>': ['<plain_text>'],
    '<listing_tag>': ['<_l_>listing<d><_r_><literal_text><_cl_>listing<_r_>'],
    '<map_content>': ['<area>*'],
    '<map_tag>': ['<_l_>map<d><_r_><map_content><_cl_>map<_r_>'],
    '<marquee_tag>': [
        '<_l_>marquee<d><_r_><style_text><_cl_>marquee<_r_>'
    ],
    '<menu_tag>': ['<_l_>menu<d><_r_><li_tag>*<_cl_>menu<_r_>'],
    '<multicol_tag>': ['<_l_>multicol<d><_r_><body_content><_cl_>multicol<_r_>'],
    '<nobr_tag>': ['<_l_>nobr<d><_r_><text><_cl_>nobr<_r_>'],
    '<noembed_tag>': ['<_l_>noembed<d><_r_><text><_cl_>noembed<_r_>'],
    '<noframes_tag>': ['<_l_>noframes<d><_r_><body_content>*<_cl_>noframes<_r_>'],
    '<noscript_tag>': ['<_l_>noscript<d><_r_><text><_cl_>noscript<_r_>'],
    '<object_content>': ['<param>*<body_content>'],
    '<object_tag>': ['<_l_>object<d><_r_><object_content><_cl_>object<_r_>'],
    '<ol_tag>': ['<_l_>ol<d><_r_><li_tag>+<_cl_>ol<_r_>'],
    '<optgroup_tag>': ['<_l_>optgroup<d><_r_><option_tag>*<_cl_>optgroup<_r_>'],
    '<option_tag>': ['<_l_>option<d><_r_><plain_text>+<_cl_>option<_r_>'],
    '<p_tag>': ['<_l_>p<_r_><text><_cl_>p<_r_>'],
    '<param>': ['<_l_>param<_r_>'],
    '<plain_text>': ['<entity>*'],
    '<entity>': ['<char>', '<ampersand>'],
    # Every printable character except vertical tab and '&'.  Built with an
    # order-preserving comprehension rather than list(set(...)): set order
    # changes between interpreter runs, which made seeded runs irreproducible.
    '<char>': [c for c in string.printable if c not in '\x0b&'],
    '<ampersand>': ['&nbsp;'],
    '<physical_style>': ['<b_tag>', '<bdo_tag>', '<big_tag>',
                         '<blink_tag>', '<font_tag>', '<i_tag>',
                         '<s_tag>', '<small_tag>', '<span_tag>',
                         '<strike_tag>', '<sub_tag>', '<sup_tag>',
                         '<tt_tag>', '<u_tag>'],
    '<pre_content>': ['<_l_>br<_r_>', '<_l_>hr<_r_>',
                      '<a_tag>', '<style_text>'],
    '<pre_tag>': ['<_l_>pre<_r_><pre_content>*<_cl_>pre<_r_>'],
    '<q_tag>': ['<_l_>q<_r_><text><_cl_>q<_r_>'],
    '<s_tag>': ['<_l_>s<_r_><text><_cl_>s<_r_>'],
    # '<samp_tag>': ['<_l_>samp<_r_><text><_cl_>samp<_r_>'],
    '<script_tag>': ['<_l_>script<d><_r_><plain_text><_cl_>script<_r_>'],
    '<select_content>': ['<optgroup_tag>', '<option_tag>'],
    '<select_tag>': ['<_l_>select<d><_r_><select_content>*<_cl_>select<_r_>'],
    # '<server_tag>': ['<_l_>server<d><_r_><plain_text><_cl_>server<_r_>'],
    '<small_tag>': ['<_l_>small<d><_r_><text><_cl_>small<_r_>'],
    '<span_tag>': ['<_l_>span<d><_r_><text><_cl_>span<_r_>'],
    '<strike_tag>': ['<_l_>strike<d><_r_><text><_cl_>strike<_r_>'],
    '<strong_tag>': ['<_l_>strong<d><_r_><text><_cl_>strong<_r_>'],
    '<style_tag>': ['<_l_>style<d><_r_><plain_text><_cl_>style<_r_>'],
    '<style_text>': ['<plain_text>'],
    '<sub_tag>': ['<_l_>sub<d><_r_><text><_cl_>sub<_r_>'],
    '<sup_tag>': ['<_l_>sup<d><_r_><text><_cl_>sup<_r_>'],
    '<table_cell>': ['<td_tag>', '<th_tag>'],
    '<table_content>': ['<_l_>tbody<d><_r_>', '<_l_>tfoot<d><_r_>', '<_l_>thead<d><_r_>',
                        '<tr_tag>'],
    '<table_tag>': ['<_l_>table<d><_r_><caption_tag>*<colgroup_tag>*<table_content>*<_cl_>table<_r_>'],
    '<td_tag>': ['<_l_>td<d><_r_><body_content><_cl_>td<_r_>'],
    '<text>': ['<text_content>*'],
    '<text_content>': ['<_l_>br<d><_r_>', '<_l_>embed<d><_r_>', '<_l_>iframe<d><_r_>',
                       '<_l_>img<d><_r_>', '<_l_>spacer<d><_r_>', '<_l_>wbr<d><_r_>',
                       '<a_tag>', '<applet_tag>', '<content_style>',
                       '<ilayer_tag>', '<noembed_tag>', '<noscript_tag>',
                       '<object_tag>', '<plain_text>', '<physical_style>'],
    '<textarea_tag>': ['<_l_>textarea<d><_r_><plain_text><_cl_>textarea<_r_>'],
    '<th_tag>': ['<_l_>th<d><_r_><body_content><_cl_>th<_r_>'],
    '<title_tag>': ['<_l_>title<d><_r_><plain_text><_cl_>title<_r_>'],
    '<tr_tag>': ['<_l_>tr<d><_r_><table_cell>*<_cl_>tr<_r_>'],
    '<tt_tag>': ['<_l_>tt<d><_r_><text><_cl_>tt<_r_>'],
    '<u_tag>': ['<_l_>u<d><_r_><text><_cl_>u<_r_>'],
    '<ul_tag>': ['<_l_>ul<d><_r_><li_tag>*<_cl_>ul<_r_>'],
    '<var_tag>': ['<_l_>var<d><_r_><text><_cl_>var<_r_>'],
    '<xmp_tag>': ['<_l_>xmp<d><_r_><literal_text><_cl_>xmp<_r_>'],
    # Attribute list after a tag name: either whitespace-separated attributes
    # or nothing at all.
    '<d>': ['<space>+<attributes>*<space>*', ''],
    '<attribute>': [
        '<key>',
        '<key>="<value>"',
        '<key>=\'<value>\'',
        '<key>=<uqvalue>',
    ],
    ## https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
    # NOTE(review): a key expands to exactly one character from <allchars>;
    # confirm whether multi-character attribute names were intended.
    '<key>': ['<allchars>'],
    #'<letter>': list(string.ascii_letters),
    # Printable characters minus whitespace, quotes, angle brackets, '/' and
    # '='; order-preserving for the same reproducibility reason as <char>.
    '<allchars>': [c for c in string.printable if c not in ' \t\n"\'<>/=\x0b'],
    '<value>': ['<anychars>'],
    '<anychar>': list(string.printable),
    '<anychars>': ['<anychar>*'],
    '<uqvalue>': ['<uqchars>'],
    # Same exclusions as <allchars>, plus the backtick.
    '<uqchar>': [c for c in string.printable if c not in ' \t\n"\'<>`/=\x0b'],
    '<uqchars>': ['<uqchar>+'],
    '<attributes>': ['<attribute>', '<attribute><space>+<attributes>'],
    '<space>': [' ', '\t', '\n'],
}
# Driver: generate Max documents from HTML_GRAMMAR and count how many of them
# html5lib accepts when parsing in strict mode.
from html.parser import HTMLParser
from lxml import etree
from io import StringIO
import html5lib

# Alternative parsers that were left disabled in the original script:
#   parser = etree.HTMLParser(recover=False); etree.parse(StringIO(p), parser)
#   parser = HTMLParser(); parser.feed(p)
#   document = html5lib.parse(p)

# Stop early on a malformed grammar.  The boolean returned by is_valid_grammar
# used to be discarded, so the check could never influence the run.
if not GrammarFuzzer.is_valid_grammar(HTML_GRAMMAR):
    raise ValueError('HTML_GRAMMAR is not a valid grammar')

# The grammar uses EBNF operators, so it is converted before fuzzing.
gf = GrammarFuzzer.GrammarFuzzer(
    GrammarFuzzer.convert_ebnf_grammar(HTML_GRAMMAR), min_nonterminals=20)
Max = 1000   # number of documents to generate
count = 0    # number of documents parsed without an exception
for _ in range(Max):
    p = gf.fuzz()
    # Keep the most recent input on disk so it can be inspected afterwards.
    with open('_.html', 'w+') as f:
        print(p, file=f)
    # A fresh strict-mode parser is created for every input, as before.
    parser = html5lib.HTMLParser(strict=True)
    try:
        document = parser.parse(p)
    except Exception:
        # Any parser failure counts as a rejection.  Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit stop the run.
        print('-', repr(p))
    else:
        print(document)
        print('+', repr(p))
        count += 1
print(count, Max)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment