Skip to content

Instantly share code, notes, and snippets.

@flavorjones
Last active October 10, 2020 19:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flavorjones/7164487b5af273bef73fcb1a79f89d2d to your computer and use it in GitHub Desktop.
Save flavorjones/7164487b5af273bef73fcb1a79f89d2d to your computer and use it in GitHub Desktop.
Reproduce parse failure in `xmlParseCharData`
ok.xml
fail.xml
run-me.o
run-me

Originally reported via sparklemotion/nokogiri#2028

Reported upstream to libxml2 in https://gitlab.gnome.org/GNOME/libxml2/-/issues/192

To reproduce this issue:

  1. edit Makefile so that it knows where to find (and link) libxml2
  2. run make to generate the input documents and the executable run-me
  3. run run-me

Files:

  • run-me.c - the source code!
  • ok.xml and fail.xml (generated by make inputs) - example input documents
  • nokogiri-reproduction.rb - reproduce this problem using Ruby and the Nokogiri gem

The documents ok.xml and fail.xml are identical except for byte 4001, which is a newline (0x0a) in fail.xml but a "normal" printable ASCII character ("-") in ok.xml. Parsing fail.xml results in the fatal error "Huge input lookup" because xmlParseCharData falls back unnecessarily to xmlParseCharDataComplex.

# Default goal: generate both input documents and build the reproduction binary.
default: inputs run-me

# libxml2 compile/link flags, discovered through pkg-config.
CFLAGS=$(shell pkg-config libxml-2.0 --cflags)
LDFLAGS=$(shell pkg-config libxml-2.0 --libs)
# Alternative: point at a locally built libxml2 instead of the system one.
# CFLAGS=-I/home/flavorjones/tmp/libxml2/include/libxml2
# LDFLAGS=-lxml2 -L/home/flavorjones/tmp/libxml2/lib

# LDFLAGS carries the -l options here, so it must follow the object file.
run-me: run-me.o
	$(CC) -o run-me run-me.o $(LDFLAGS)

run-me.o: run-me.c
	$(CC) $(CFLAGS) -g -c -o run-me.o run-me.c

inputs: ok.xml fail.xml

# Both documents are assembled piecewise with printf rather than
# `echo -n` / `echo -e`, whose behaviour differs between /bin/sh
# implementations. Each recipe starts by truncating its target so that a
# forced rebuild never appends to a stale copy.

# ok.xml: byte 4001 is a printable character, so the bug is not triggered.
ok.xml:
	> $@
	printf '%s\n' '<root><text>' >> $@ # 13 bytes, including newline
	yes 12345678901234567890123456789012345678901234567890 | head -n 79 | tr -d '\n' >> $@ # 50*79=3950 (3963 total)
	printf '%s' 1234567890123456789012345678901234567 >> $@ # 37 (4000 total)
	printf '%s' '-' >> $@ # this is the critical byte, we'll make it a printable character
	yes 12345678901234567890123456789012345678901234567890 | head -n 200000 | tr -d '\n' >> $@ # 50*200000=10,000,000
	printf '\n%s\n' '</text></root>' >> $@

# fail.xml: identical to ok.xml except that byte 4001 is a newline.
fail.xml:
	> $@
	printf '%s\n' '<root><text>' >> $@ # 13 bytes, including newline
	yes 12345678901234567890123456789012345678901234567890 | head -n 79 | tr -d '\n' >> $@ # 50*79=3950 (3963 total)
	printf '%s' 1234567890123456789012345678901234567 >> $@ # 37 (4000 total)
	printf '\n' >> $@ # this is the critical byte, we'll make it a newline
	yes 12345678901234567890123456789012345678901234567890 | head -n 200000 | tr -d '\n' >> $@ # 50*200000=10,000,000
	printf '\n%s\n' '</text></root>' >> $@

.PHONY: default inputs clean
.NOTPARALLEL: clean
clean:
	rm -f ok.xml fail.xml run-me.o run-me
#! /usr/bin/env ruby
# encoding: utf-8
require 'nokogiri'
require 'stringio'
# SAX document handler whose only job is to turn a reported parse error
# into a Ruby exception, so the failure surfaces in the caller.
class Handler < Nokogiri::XML::SAX::Document
  # Raises the given error text (a String is raised as a RuntimeError).
  def error(text)
    raise text
  end
end
# Builds an in-memory XML document whose <boom> text node places
# +critical_byte+ at byte 4001 of the document (0-based offset 4000),
# followed by +pad_bytes+ bytes of "x" filler.
#
# critical_byte - one-character String placed at byte 4001.
# pad_bytes     - Integer number of filler bytes appended after it.
#
# Returns a StringIO positioned at the start of the document.
def make_stringio(critical_byte, pad_bytes)
  template = <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <root>
    <boom>%s</boom>
    </root>
  XML

  # The filler length is derived from the template instead of being
  # hard-coded: a fixed count of 3946 is only right when exactly 54 bytes
  # precede the text node, so any whitespace change in the template would
  # silently move the critical byte away from offset 4000.
  critical_offset = 4000
  lead_in = template.index("%s")

  node = "x" * (critical_offset - lead_in)
  node << critical_byte
  node << ("x" * pad_bytes)

  StringIO.new(template % node)
end
# Control document: the critical byte is the printable character "-".
def ok_stringio(pad_bytes)
  printable_byte = "-"
  make_stringio(printable_byte, pad_bytes)
end
# Trigger document: the critical byte is a newline.
def fail_stringio(pad_bytes)
  newline_byte = "\n"
  make_stringio(newline_byte, pad_bytes)
end
# Runs the SAX parser over +xml_io+ and reports the outcome on STDERR:
# "OK" when parsing completes, "raises <message>" when it raises.
#
# xml_io - an IO-like object holding the XML document.
#
# Returns nil.
def try_parse(xml_io)
  parser = Nokogiri::XML::SAX::Parser.new(Handler.new)
  begin
    parser.parse(xml_io)
  # StandardError rather than Exception: Interrupt (Ctrl-C), SystemExit and
  # NoMemoryError must not be swallowed while chewing through 10 MB inputs.
  rescue StandardError => e
    STDERR.puts "raises #{e}"
  else
    STDERR.puts "OK"
  end
end
# In-memory documents: the failure needs both the newline at the critical
# byte and a sufficiently large text node.
try_parse ok_stringio(9_999_000)    # OK - bug not triggered
try_parse fail_stringio(9_999_000)  # OK - node small enough
try_parse ok_stringio(10_000_000)   # OK - bug not triggered
try_parse fail_stringio(10_000_000) # FAIL - bug triggered and node large enough

# On-disk documents generated by the Makefile. The block form closes each
# file as soon as its parse attempt finishes.
File.open("ok.xml") { |io| try_parse io }   # OK
File.open("fail.xml") { |io| try_parse io } # FAIL
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <unistd.h>

#include <libxml/parser.h>
/* Set to 1 to trace every callback invocation on stderr. */
#define VERBOSE 0

/*
 * Input-read callback: fills `buffer` with up to `len` bytes read from the
 * file descriptor that `ctx` points to.
 *
 * ctx    - pointer to an int holding an open file descriptor
 * buffer - destination for the bytes read
 * len    - capacity of `buffer` in bytes
 *
 * Returns the number of bytes read, 0 at end of file, or -1 on a read
 * error or invalid arguments.
 */
int readCallback(void * ctx, char * buffer, int len)
{
    /* A negative len would turn into an enormous size_t in the read() call. */
    if (ctx == NULL || buffer == NULL || len < 0) {
        return -1;
    }

    int fd = *(int *)ctx;
    ssize_t read_length = read(fd, buffer, (size_t)len);

#if VERBOSE
    fprintf(stderr, "readCallback: read %zd bytes (of %d)\n", read_length, len);
#endif

    /* read() never returns more than len, so the result fits in an int. */
    return (int)read_length;
}
/*
 * Input-close callback. It performs no cleanup of its own (the descriptor
 * behind `ctx` is left untouched) and always reports success.
 *
 * Returns 0.
 */
int closeCallback(void * ctx)
{
    (void)ctx; /* intentionally unused */

#if VERBOSE
    fputs("closeCallback\n", stderr);
#endif

    return 0;
}
/*
 * SAX startDocument hook. Does nothing except emit a trace line when
 * VERBOSE is enabled.
 */
void startDocCallback(void * ctx)
{
    (void)ctx; /* intentionally unused */

#if VERBOSE
    fputs("startDocCallback\n", stderr);
#endif
}
/*
 * SAX endDocument hook. Does nothing except emit a trace line when
 * VERBOSE is enabled.
 */
void endDocCallback(void * ctx)
{
    (void)ctx; /* intentionally unused */

#if VERBOSE
    fputs("endDocCallback\n", stderr);
#endif
}
/*
 * SAX characters hook. The character data itself is discarded; only its
 * length is traced, and only when VERBOSE is enabled.
 *
 * ch  - the character data handed over by the parser (unused)
 * len - number of characters in `ch`
 */
void charactersCallback(void * ctx, const xmlChar * ch, int len)
{
    (void)ctx; /* intentionally unused */
    (void)ch;  /* content is irrelevant to the reproduction */
    (void)len; /* only read when tracing */

#if VERBOSE
    fprintf(stderr, "charactersCallback: received %d characters\n", len);
#endif
}
/* Size of the buffer an error message is formatted into; longer text is cut. */
#define ERROR_MESSAGE_LENGTH 256

/*
 * SAX error hook: formats the printf-style message and writes it to stderr
 * with an "errorCallback: " prefix. No newline is appended; whatever line
 * ending the message carries is printed as is.
 *
 * msg - printf-style format string
 * ... - arguments consumed by `msg`
 */
void errorCallback(void *ctx, const char *msg, ...)
{
    char formatted[ERROR_MESSAGE_LENGTH];
    va_list args;

    (void)ctx; /* intentionally unused */

    va_start(args, msg);
    vsnprintf(formatted, sizeof formatted, msg, args);
    va_end(args);

    fprintf(stderr, "errorCallback: %s", formatted);
}
void parse(char* filename)
{
int fd;
fd = open(filename, O_RDONLY, 0);
fprintf(stderr, "parse: file %s\n", filename);
xmlSAXHandler handler = { 0 } ;
handler.startDocument = &startDocCallback;
handler.endDocument = &endDocCallback;
handler.characters = &charactersCallback;
handler.error = &errorCallback;
xmlParserCtxtPtr ctxt;
ctxt = xmlCreateIOParserCtxt(NULL, NULL,
(xmlInputReadCallback)readCallback,
(xmlInputCloseCallback)closeCallback,
(void *)(&fd),
XML_CHAR_ENCODING_UTF8);
ctxt->sax = &handler;
xmlParseDocument(ctxt);
}
/*
 * Entry point: parses the control document first, then the document that
 * triggers the failure, so the two outcomes can be compared on stderr.
 */
int main(void)
{
    parse("ok.xml");
    parse("fail.xml");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment