Last active
September 22, 2016 08:13
C vs Ruby for SAX Parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The data file is the protein sequence database psd7003.xml (683 MB) from http://www.cs.washington.edu/research/xmldatasets/www/repository.html#pir
# Its structure is a list of ~260,000 <ProteinEntry> elements.
# The goal is to read the data file and write each <ProteinEntry> on its own line of output.
# This simulates filtering/splitting a large list of similar XML elements.
$ time ruby xmlsplit.rb test.xml > output_rb | |
real 2m35.965s | |
user 2m28.158s | |
sys 0m3.340s | |
$ time ./xmlsplit test.xml > output_c | |
real 0m18.755s | |
user 0m17.029s | |
sys 0m1.667s |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <libxml/parser.h> | |
/* Nonzero while the parser is inside a <ProteinEntry> element; the SAX
 * callbacks below only record output while this flag is set. */
int append = 0; | |
/* Accumulates the serialized text of the entry currently being captured.
 * Fixed capacity of 100000 bytes. */
char data[100000]; | |
/* Offset into `data` at which the next write lands; reset to 0 once a
 * complete entry has been captured. */
int idx = 0; | |
void OnStartElementNs( | |
void *ctx, | |
const xmlChar *localname, | |
const xmlChar *prefix, | |
const xmlChar *URI, | |
int nb_namespaces, | |
const xmlChar **namespaces, | |
int nb_attributes, | |
int nb_defaulted, | |
const xmlChar **attributes | |
) | |
{ | |
if (strcmp(localname, "ProteinEntry") == 0) { | |
append = 1; | |
} | |
if (!append) return; | |
idx += sprintf(data + idx, "<%s>", localname); | |
} | |
void OnEndElementNs(void* ctx, const xmlChar* localname, | |
const xmlChar* prefix, const xmlChar* URI) | |
{ | |
if (!append) return; | |
idx += sprintf(data + idx, "</%s>", localname); | |
if (strcmp(localname, "ProteinEntry") == 0) { | |
append = 0; | |
idx = 0; | |
printf("%s\n", data); | |
} | |
} | |
void OnCharacters(void *ctx, const xmlChar *ch, int len) | |
{ | |
if (!append || len == 1) return; | |
char *s = malloc(len + 1); | |
strncpy(s, (const char *)ch, len); | |
s[len] = (char) NULL; | |
idx += sprintf(data + idx, "%s", s); | |
} | |
xmlSAXHandler make_sax_handler() | |
{ | |
xmlSAXHandler handler; | |
memset(&handler, 0, sizeof(xmlSAXHandler)); | |
handler.initialized = XML_SAX2_MAGIC; | |
handler.startElementNs = OnStartElementNs; | |
handler.endElementNs = OnEndElementNs; | |
handler.characters = OnCharacters; | |
return handler; | |
} | |
int read_xmlfile(FILE *f) | |
{ | |
char chars[1024]; | |
int res = fread(chars, 1, 4, f); | |
if (res <= 0) { | |
return 1; | |
} | |
xmlSAXHandler handler = make_sax_handler(); | |
xmlParserCtxtPtr ctxt = xmlCreatePushParserCtxt( | |
&handler, NULL, chars, res, NULL | |
); | |
while ((res = fread(chars, 1, sizeof(chars), f)) > 0) { | |
if (xmlParseChunk(ctxt, chars, res, 0)) { | |
xmlParserError(ctxt, "xmlParseChunk"); | |
return 1; | |
} | |
} | |
xmlParseChunk(ctxt, chars, 0, 1); | |
xmlFreeParserCtxt(ctxt); | |
xmlCleanupParser(); | |
return 0; | |
} | |
/*
 * Entry point: expects exactly one argument, the path of the XML file.
 *
 * Exit status is -1 for a usage error (as before), 1 if the file cannot be
 * opened or read_xmlfile() reports a failure, and 0 on success.
 */
int main(int argc, char **argv)
{
    if (argc != 2) {
        /* stdout is normally redirected to the result file, so the usage
         * message goes to stderr. */
        fprintf(stderr, "Expected one argument 'filepath'\n");
        return -1;
    }

    /* Binary mode: the parser must see the bytes exactly as stored. */
    FILE *f = fopen(argv[1], "rb");
    if (f == NULL) {
        perror(argv[1]);
        return 1;
    }

    int status = read_xmlfile(f);
    fclose(f);
    return status;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
# SAX handler that re-serializes each <ProteinEntry> element onto a single
# line of standard output.
#
# Capture starts at the opening ProteinEntry tag and stops at its closing
# tag. While capturing, every start tag, end tag and text chunk is collected
# in order; attributes are not written out.
class Parser < Nokogiri::XML::SAX::Document
  def initialize
    @append = false
    @buffer = []
  end

  # Starts capturing on <ProteinEntry>; records "<name>" while capturing.
  def start_element(name, attrs = [])
    @append ||= (name == "ProteinEntry")
    @buffer.push("<#{name}>") if @append
  end

  # Records text content while capturing.
  def characters(string)
    @buffer.push(string) if @append
  end

  # Records "</name>" while capturing; on </ProteinEntry> prints the
  # collected entry as one line and starts over with an empty buffer.
  def end_element(name)
    return unless @append

    @buffer.push("</#{name}>")
    return unless name == "ProteinEntry"

    @append = false
    $stdout.puts(@buffer.join(''))
    @buffer = []
  end
end
# Stream the XML file named by the single command-line argument through the
# SAX handler. The block form of File.open closes the file once parsing
# finishes or raises.
abort "Expected one argument 'filepath'" unless ARGV.length == 1

File.open(ARGV[0]) do |io|
  Nokogiri::XML::SAX::Parser.new(Parser.new).parse(io)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment