Skip to content

Instantly share code, notes, and snippets.

@adilosa
Last active September 22, 2016 08:13
C vs Ruby for SAX Parsing
# The data file is the protein sequence database psd7003.xml (683MB) from http://www.cs.washington.edu/research/xmldatasets/www/repository.html#pir
# Its structure is a list of ~260,000 <ProteinEntry> elements.
# The goal is to read the data file and write each <ProteinEntry> onto its own line of output.
# This simulates filtering/splitting a large list of similar XML elements.
$ time ruby xmlsplit.rb test.xml > output_rb
real 2m35.965s
user 2m28.158s
sys 0m3.340s
$ time ./xmlsplit test.xml > output_c
real 0m18.755s
user 0m17.029s
sys 0m1.667s
#include <stdio.h>
#include <string.h>
#include <libxml/parser.h>
/* Nonzero while the parser is inside a <ProteinEntry> element; the SAX
 * callbacks below only record output while it is set. */
int append = 0;
/* Accumulates the text of the entry currently being recorded
 * (fixed capacity of 100000 bytes). */
char data[100000];
/* Offset into data at which the next fragment is written. */
int idx = 0;
/*
 * SAX2 start-element callback.
 *
 * Switches recording on when a <ProteinEntry> element opens and, while
 * recording is on, appends "<localname>" to the global data buffer at idx.
 * Prefix, namespace and attribute information is not used.
 *
 * The append is bounded by the capacity of data, so an oversized entry is
 * truncated instead of writing past the end of the buffer.
 */
void OnStartElementNs(
    void *ctx,
    const xmlChar *localname,
    const xmlChar *prefix,
    const xmlChar *URI,
    int nb_namespaces,
    const xmlChar **namespaces,
    int nb_attributes,
    int nb_defaulted,
    const xmlChar **attributes
)
{
    /* xmlChar is unsigned char; cast explicitly for the C string API. */
    if (strcmp((const char *)localname, "ProteinEntry") == 0) {
        append = 1;
    }
    if (!append) return;

    /* Nothing can be added once only the terminating NUL fits (or idx is bad). */
    if (idx < 0 || (size_t)idx >= sizeof(data) - 1) return;

    size_t room = sizeof(data) - (size_t)idx;
    int written = snprintf(data + idx, room, "<%s>", (const char *)localname);
    if (written < 0) return;

    /* snprintf reports the untruncated length; clamp so idx stays in bounds. */
    idx += ((size_t)written < room) ? written : (int)(room - 1);
}
/*
 * SAX2 end-element callback.
 *
 * While recording is on, appends "</localname>" to the global data buffer.
 * When the closing tag is </ProteinEntry>, the accumulated entry is printed
 * on its own line, recording is switched off and the buffer is reset.
 *
 * The append is bounded by the capacity of data, so an oversized entry is
 * truncated instead of writing past the end of the buffer.
 */
void OnEndElementNs(void* ctx, const xmlChar* localname,
                    const xmlChar* prefix, const xmlChar* URI)
{
    if (!append) return;

    /* Only append while there is room for more than the terminating NUL. */
    if (idx >= 0 && (size_t)idx < sizeof(data) - 1) {
        size_t room = sizeof(data) - (size_t)idx;
        int written = snprintf(data + idx, room, "</%s>", (const char *)localname);
        if (written > 0) {
            /* snprintf reports the untruncated length; clamp to what fit. */
            idx += ((size_t)written < room) ? written : (int)(room - 1);
        }
    }

    if (strcmp((const char *)localname, "ProteinEntry") == 0) {
        printf("%s\n", data);
        append = 0;
        idx = 0;
        data[0] = '\0'; /* do not leave the flushed entry readable in the buffer */
    }
}
/*
 * SAX characters callback.
 *
 * While recording is on, appends the len bytes at ch to the global data
 * buffer. ch is not NUL-terminated, so the length is passed to snprintf as a
 * precision instead of copying the text into a temporary heap buffer (the
 * earlier temporary copy was never freed and leaked on every call).
 *
 * Chunks of exactly one byte are skipped, presumably to drop the lone
 * newlines between elements. NOTE(review): this also drops genuine
 * one-character text content; confirm that is intended.
 *
 * The append is bounded by the capacity of data, so an oversized entry is
 * truncated instead of writing past the end of the buffer.
 */
void OnCharacters(void *ctx, const xmlChar *ch, int len)
{
    if (!append || len == 1) return;
    if (len <= 0) return; /* nothing to copy */

    /* Nothing can be added once only the terminating NUL fits (or idx is bad). */
    if (idx < 0 || (size_t)idx >= sizeof(data) - 1) return;

    size_t room = sizeof(data) - (size_t)idx;
    int written = snprintf(data + idx, room, "%.*s", len, (const char *)ch);
    if (written < 0) return;

    /* snprintf reports the untruncated length; clamp so idx stays in bounds. */
    idx += ((size_t)written < room) ? written : (int)(room - 1);
}
xmlSAXHandler make_sax_handler()
{
xmlSAXHandler handler;
memset(&handler, 0, sizeof(xmlSAXHandler));
handler.initialized = XML_SAX2_MAGIC;
handler.startElementNs = OnStartElementNs;
handler.endElementNs = OnEndElementNs;
handler.characters = OnCharacters;
return handler;
}
/*
 * Streams the XML document in f through a libxml2 push parser wired to the
 * handlers from make_sax_handler().
 *
 * The first 4 bytes are handed to xmlCreatePushParserCtxt (presumably so
 * libxml2 can detect the document encoding); the rest of the file is fed in
 * 1024-byte chunks.
 *
 * Returns 0 on success and 1 if f is NULL or empty, the parser context cannot
 * be created, or any chunk (including the terminating one) fails to parse.
 * The parser context is released on every path after it has been created.
 */
int read_xmlfile(FILE *f)
{
    char chars[1024];
    int status = 0;

    if (f == NULL) {
        return 1;
    }

    int res = fread(chars, 1, 4, f);
    if (res <= 0) {
        return 1;
    }

    xmlSAXHandler handler = make_sax_handler();
    xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt(
        &handler, NULL, chars, res, NULL
    );
    if (ctxt == NULL) {
        return 1;
    }

    while ((res = fread(chars, 1, sizeof(chars), f)) > 0) {
        if (xmlParseChunk(ctxt, chars, res, 0)) {
            xmlParserError(ctxt, "xmlParseChunk");
            status = 1;
            break;
        }
    }

    /* Terminate the document; errors here are real parse failures too. */
    if (status == 0 && xmlParseChunk(ctxt, chars, 0, 1)) {
        xmlParserError(ctxt, "xmlParseChunk");
        status = 1;
    }

    xmlFreeParserCtxt(ctxt);
    xmlCleanupParser();
    return status;
}
/*
 * Entry point: expects exactly one argument, the path of the XML file to
 * split, and writes one <ProteinEntry> per line to stdout.
 *
 * Diagnostics go to stderr so they do not end up in redirected output.
 * Returns -1 on a usage error (same status as the previous exit(-1)),
 * 1 if the file cannot be opened, and otherwise the status from
 * read_xmlfile().
 */
int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "Expected one argument 'filepath'\n");
        return -1;
    }

    FILE *f = fopen(argv[1], "rb");
    if (f == NULL) {
        /* Without this check a NULL stream would reach fread/fclose. */
        perror(argv[1]);
        return 1;
    }

    int status = read_xmlfile(f);
    fclose(f);
    return status;
}
require 'nokogiri'
# SAX handler that collects each <ProteinEntry> element (tags without
# attributes, plus character data) and prints it as a single line on stdout.
class Parser < Nokogiri::XML::SAX::Document
  # Name of the element that delimits one output record.
  ENTRY_TAG = 'ProteinEntry'

  def initialize
    @append = false
    @buffer = []
  end

  # Starts recording at an opening <ProteinEntry>; while recording, stores
  # the opening tag of every element.
  def start_element(name, attrs = [])
    @append = true if name == ENTRY_TAG
    @buffer << "<#{name}>" if @append
  end

  # Stores character data while recording.
  def characters(string)
    @buffer << string if @append
  end

  # While recording, stores the closing tag; a closing </ProteinEntry>
  # also emits the collected record.
  def end_element(name)
    if @append
      @buffer << "</#{name}>"
      flush_entry if name == ENTRY_TAG
    end
  end

  private

  # Stops recording, prints the collected fragments as one line, and starts
  # over with an empty buffer.
  def flush_entry
    @append = false
    puts @buffer.join('')
    @buffer = []
  end
end
# Stream the XML file named by the first command-line argument through the
# SAX handler. The block form of File.open closes the file handle once
# parsing finishes or raises.
abort "Expected one argument 'filepath'" if ARGV.empty?

File.open(ARGV[0]) do |xml_file|
  Nokogiri::XML::SAX::Parser.new(Parser.new).parse(xml_file)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment