Skip to content

Instantly share code, notes, and snippets.

@7shi
Created June 30, 2012 02:00
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 7shi/3021786 to your computer and use it in GitHub Desktop.
Save 7shi/3021786 to your computer and use it in GitHub Desktop.
Pythonの簡易XMLパーサとC++高速版
# public domain
from StringIO import *
def replaces(s, args):
    """Return *s* with each old substring in *args* replaced by its new one.

    *args* is either a mapping ``{old: new}`` or an iterable of
    ``(old, new)`` pairs.  Replacements are applied one after another in
    iteration order, so pass an ordered sequence of pairs when the result
    of one replacement could be matched by a later one.
    """
    # items() exists on both Python 2 and Python 3 mappings; plain
    # sequences of pairs are iterated directly.
    pairs = args.items() if hasattr(args, "items") else args
    for key, value in pairs:
        s = s.replace(key, value)
    return s
def from_entity(s):
    """Decode the character entities this parser understands in *s*.

    Handles ``&lt;``, ``&gt;``, ``&quot;``, ``&nbsp;`` (decoded to a plain
    space) and ``&amp;``.  Any other entity is left untouched.
    """
    # The order is fixed and "&amp;" is decoded last: decoding it earlier
    # would turn an escaped entity such as "&amp;lt;" into "&lt;" and then
    # wrongly into "<".
    for entity, char in (
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", '"'),
            ("&nbsp;", " "),
            ("&amp;", "&")):
        s = s.replace(entity, char)
    return s
class reader:
    """Minimal forward-only pull parser for XML/HTML-like markup.

    Each successful read() exposes one event through these attributes:

    * text   -- entity-decoded text found before the tag that was read
    * tag    -- lower-cased tag name ("/name" for a closing tag, "!--"
                for a comment, "" when only trailing text was read)
    * values -- attributes of the tag, keyed by lower-cased name; a
                comment stores its body under the key "comment"

    A self-closing element (``<br/>``, ``<br />``) produces two events:
    the tag itself and then a synthetic "/name" closing tag.
    """

    # Immutable defaults; the mutable attribute dict is created per
    # instance in __init__ so instances never share state.
    pos = 0          # index of the next character of src to consume
    cur = ""         # last character returned by read_char ("" at end)
    reserved = ""    # pending synthetic closing tag, "" when none
    text = ""
    tag = ""

    def __init__(self, src):
        """Create a parser over the string *src*."""
        self.src = src
        self.values = {}

    def __getitem__(self, key):
        """Return the value of attribute *key*; raises KeyError if absent."""
        return self.values[key]

    def __contains__(self, key):
        """Support ``key in reader`` as a spelling of has_key()."""
        return key in self.values

    def has_key(self, key):
        """Return True when the current tag has an attribute named *key*."""
        return key in self.values

    def check(self, tag, values):
        """Return True when the current tag is *tag* and carries every
        attribute/value pair given in the mapping *values*."""
        if tag != self.tag:
            return False
        for key, value in values.items():
            if key not in self.values or self.values[key] != value:
                return False
        return True

    def find(self, tag, values=None):
        """Advance until a tag matching check(tag, values) is read.

        Returns True when positioned on such a tag, False when the input
        was exhausted first.
        """
        values = values or {}
        while self.read():
            if self.check(tag, values):
                return True
        return False

    def each(self, tag="", values=None):
        """Iterate over the events inside the current element.

        Reads until the closing tag of the element that is current when
        the generator starts (or until the input ends).  Yields a running
        index for every event when *tag* is "", otherwise only for events
        that satisfy check(tag, values).
        """
        values = values or {}
        end = "/" + self.tag
        i = 0
        while self.tag != end and self.read():
            if tag == "" or self.check(tag, values):
                yield i
                i += 1

    def read(self):
        """Read the next event; return False when there is nothing left."""
        self.text = ""
        self.tag = ""
        self.values = {}
        if self.reserved != "":
            # Deliver the pending closing tag before testing for end of
            # input; otherwise a self-closing element at the very end of
            # the source would lose its closing event.
            self.tag = self.reserved
            self.reserved = ""
        elif self.pos >= len(self.src):
            return False
        else:
            self.read_text()
        return True

    def read_text(self):
        """Collect text up to the next "<" and then parse that tag."""
        p = self.src.find("<", self.pos)
        if p < 0:
            self.text = from_entity(self.src[self.pos:])
            self.pos = len(self.src)
        else:
            self.text = from_entity(self.src[self.pos:p])
            self.pos = p + 1
            self.read_tag()

    def read_char(self):
        """Consume and return one character, or "" at end of input."""
        if self.pos >= len(self.src):
            self.cur = ""
        else:
            self.cur = self.src[self.pos]
            self.pos += 1
        return self.cur

    def read_tag(self):
        """Parse a tag whose opening "<" has already been consumed."""
        name = []
        # Initialised so the checks below are well defined when the input
        # ends immediately after "<" and the loop body never runs.
        ch = ""
        while self.read_char() != "":
            ch = self.cur
            if ch == ">" or (ch == "/" and name):
                break
            elif ch > " ":
                name.append(ch)
                if name == ["!", "-", "-"]:
                    break
            elif name:
                # Whitespace after the name ends it; leading whitespace
                # is skipped.
                break
        self.tag = "".join(name).lower()
        if ch == "/":
            # "<name/": remember the synthetic closing tag.
            self.reserved = "/" + self.tag
            ch = self.read_char()
        if ch != ">":
            if self.tag == "!--":
                self.read_comment()
            else:
                while self.read_values():
                    pass

    def read_comment(self):
        """Store the comment body under values["comment"] and skip "-->"."""
        p = self.src.find("-->", self.pos)
        if p < 0:
            self.values["comment"] = self.src[self.pos:]
            self.pos = len(self.src)
        else:
            self.values["comment"] = self.src[self.pos:p]
            self.pos = p + 3

    def read_values(self):
        """Parse one attribute; return True while more may follow."""
        nm = self.read_value(True).lower()
        if self.cur == "/":
            self.reserved = "/" + self.tag
        if nm == "":
            # A "/" with no name in front of it (as in "<br />") marks a
            # self-closing tag; keep scanning so the following ">" is
            # consumed instead of leaking into the next text node.
            return self.cur == "/"
        if self.cur == "=":
            self.values[nm] = self.read_value(False)
        else:
            self.values[nm] = ""
        return self.cur != ">"

    def read_value(self, isleft):
        """Read an attribute name (*isleft* true) or value (*isleft* false).

        Leading whitespace is skipped.  A double-quoted run is taken
        verbatim up to the closing quote.  Names additionally stop at "="
        and "/"; both names and values stop at ">" and at whitespace that
        follows at least one collected character.
        """
        buf = []
        while self.read_char() != "":
            ch = self.cur
            if ch == ">" or (isleft and (ch == "=" or ch == "/")):
                break
            elif ch == '"':
                while self.read_char() != "":
                    if self.cur == '"':
                        break
                    buf.append(self.cur)
                break
            elif ch > " ":
                buf.append(ch)
            elif buf:
                break
        return "".join(buf)
// public domain
#include <cctype>
#include <cstdio>
#include <cwctype>
#include <map>
#include <string>
using namespace std;
// Parser state shared by every function in this file.
struct reader {
    int pos;                       // index of the next character of src to consume
    int cur;                       // last value stored by read_char (-1 at end of input)
    wstring src;                   // text being parsed
    wstring reserved;              // pending "/name" closing tag, empty when none
    wstring text;                  // text collected by the last read
    wstring tag;                   // tag name produced by the last read
    map<wstring, wstring> values;  // attributes of the current tag
};
// Public entry points, declared with C linkage so their names are exported
// unmangled.
extern "C" {
// Create a parser over a copy of the given text.
reader *alloc(const wchar_t *);
// Destroy a parser created by alloc.
void release(reader *);
// Text collected by the last read; the pointer refers to storage inside the reader.
const wchar_t *gettext(reader *);
// Tag name produced by the last read; the pointer refers to storage inside the reader.
const wchar_t *gettag(reader *);
// Value of the named attribute of the current tag, or NULL when absent.
const wchar_t *getitem(reader *, const wchar_t *);
// Whether the current tag has the named attribute.
bool has_key(reader *, const wchar_t *);
// Whether the current tag has the given name and all listed attributes.
// The last argument is a flat key, value, key, value, ... array ended by NULL.
bool check(reader *, const wchar_t *, const wchar_t **);
// Read forward until check() succeeds; false when the input runs out first.
bool find(reader *, const wchar_t *, const wchar_t **);
// Read the next event; false when nothing was read.
bool read(reader *);
}
// Forward declarations of the file-local parsing helpers defined below.
static void read_text(reader *);           // text up to the next '<', then the tag
static int read_char(reader *);            // next character, or -1 at end of input
static void read_tag(reader *);            // tag name and its attributes
static void read_comment(reader *);        // comment body up to "-->"
static bool read_values(reader *);         // one attribute; true while more may follow
static wstring read_value(reader *, bool); // attribute name (true) or value (false)
// Return a copy of src in which every occurrence of s1 is replaced by s2.
// Occurrences are found left to right and do not overlap; text inserted
// from s2 is never rescanned. An empty s1 leaves src unchanged.
static wstring replace(const wstring &src, const wstring &s1, const wstring &s2) {
    // An empty pattern matches at every position without advancing the
    // scan, which would never terminate.
    if (s1.empty())
        return src;
    wstring ret;
    wstring::size_type p = 0;
    for (;;) {
        // Compare against npos directly instead of narrowing the result
        // to int and testing for a negative value.
        const wstring::size_type pp = src.find(s1, p);
        if (pp == wstring::npos) {
            ret.append(src, p, wstring::npos);
            break;
        }
        ret.append(src, p, pp - p);
        ret += s2;
        p = pp + s1.size();
    }
    return ret;
}
// Return a lower-cased copy of src.
// Uses the wide-character conversion from <cwctype>; the narrow <cctype>
// functions are not defined for arbitrary wchar_t values. The original
// test was also inverted (it converted only characters that were already
// lower case), so nothing was ever changed.
static wstring lower(const wstring &src) {
    wstring ret;
    ret.reserve(src.size());
    for (auto it = src.begin(); it != src.end(); ++it)
        ret += static_cast<wchar_t>(towlower(static_cast<wint_t>(*it)));
    return ret;
}
// Decode the character entities this parser understands and return the
// result. The table is applied top to bottom, so "&amp;" is handled last.
static wstring from_entity(wstring s) {
    static const wchar_t *const entities[][2] = {
        { L"&lt;",   L"<"  },
        { L"&gt;",   L">"  },
        { L"&quot;", L"\"" },
        { L"&nbsp;", L" "  },
        { L"&amp;",  L"&"  },
    };
    for (const auto &entry : entities)
        s = replace(s, entry[0], entry[1]);
    return s;
}
// Create a parser positioned at the start of src. The text is copied, so
// the caller's buffer need not outlive the call. A NULL src is treated as
// empty input (constructing a wstring from a null pointer is undefined).
// The returned object must be destroyed with release().
reader *alloc(const wchar_t *src) {
    auto ret = new reader;
    ret->pos = 0;
    ret->cur = -1;
    ret->src = src ? src : L"";
    return ret;
}
// Destroy a parser obtained from alloc().
void release(reader *self) {
    reader *const doomed = self;
    delete doomed;
}
// Text collected by the last read. The pointer refers to storage owned by
// the reader's text member.
const wchar_t *gettext(reader *self) {
    const wstring &current = self->text;
    return current.c_str();
}
// Tag name produced by the last read. The pointer refers to storage owned
// by the reader's tag member.
const wchar_t *gettag(reader *self) {
    const wstring &current = self->tag;
    return current.c_str();
}
// Value of attribute `key` on the current tag, or NULL when the tag has no
// such attribute.
const wchar_t *getitem(reader *self, const wchar_t *key) {
    const map<wstring, wstring> &attrs = self->values;
    const auto found = attrs.find(key);
    if (found == attrs.end())
        return NULL;
    return found->second.c_str();
}
// True when the current tag has an attribute named `key`.
bool has_key(reader *self, const wchar_t *key) {
    return self->values.count(key) != 0;
}
// True when the current tag is named `tag` and carries every attribute
// listed in `values`.
// `values` is a flat array of key, value, key, value, ... terminated by a
// NULL key; NULL itself means "no attribute constraints". A NULL tag, or a
// key whose value slot is NULL, never matches (comparing a wstring with a
// null pointer is undefined).
bool check(reader *self, const wchar_t *tag, const wchar_t **values) {
    if (!tag || self->tag != tag) return false;
    if (!values) return true;
    for (auto it = values; it[0]; it += 2) {
        if (!it[1]) return false;
        auto it2 = self->values.find(it[0]);
        if (it2 == self->values.end() || it2->second != it[1])
            return false;
    }
    return true;
}
// Read forward until the current tag satisfies check(tag, values).
// Returns false when read() reports that nothing is left.
bool find(reader *self, const wchar_t *tag, const wchar_t **values) {
    for (;;) {
        if (!read(self))
            return false;
        if (check(self, tag, values))
            return true;
    }
}
// Read the next event into text/tag/values. Returns false when there is
// nothing left to deliver.
bool read(reader *self) {
    self->text.clear();
    self->tag.clear();
    self->values.clear();
    if (!self->reserved.empty()) {
        // Deliver the pending closing tag before testing for end of input;
        // otherwise a self-closing element at the very end of the source
        // would lose its closing event.
        self->tag = self->reserved;
        self->reserved.clear();
        return true;
    }
    if (self->pos < 0 ||
        static_cast<wstring::size_type>(self->pos) >= self->src.size())
        return false;
    read_text(self);
    return true;
}
// Store the entity-decoded text up to the next '<' in self->text. When a
// '<' is found, step past it and parse the tag that follows; otherwise the
// rest of the input is text and the position moves to the end.
static void read_text(reader *self) {
    // Compare against npos directly instead of narrowing the result to
    // int and testing for a negative value.
    const wstring::size_type p = self->src.find(L'<', self->pos);
    if (p == wstring::npos) {
        self->text = from_entity(self->src.substr(self->pos));
        self->pos = static_cast<int>(self->src.size());
    } else {
        self->text = from_entity(self->src.substr(self->pos, p - self->pos));
        self->pos = static_cast<int>(p) + 1;
        read_tag(self);
    }
}
// Consume one character: store it in self->cur and return it, or store and
// return -1 when the position is at or past the end of the input.
static int read_char(reader *self) {
    if (self->pos < self->src.size()) {
        self->cur = self->src[self->pos];
        ++self->pos;
        return self->cur;
    }
    self->cur = -1;
    return self->cur;
}
// Parse a tag whose opening '<' has already been consumed: the lower-cased
// name goes to self->tag, then either the comment body or the attributes
// are read. A '/' directly after the name records a pending "/name"
// closing tag in self->reserved.
static void read_tag(reader *self) {
    wstring t;
    // Initialised so the tests after the loop are well defined when the
    // input ends immediately after '<' and the loop body never runs.
    wchar_t ch = L'\0';
    while (read_char(self) != -1) {
        ch = self->cur;
        if (ch == '>' || (ch == '/' && !t.empty()))
            break;
        else if (ch > ' ') {
            t += ch;
            if (t == L"!--")
                break;
        } else if (!t.empty())
            break;  // whitespace after the name ends it
    }
    self->tag = lower(t);
    if (ch == '/') {
        self->reserved = L"/" + self->tag;
        ch = read_char(self);
    }
    if (ch != '>') {
        if (self->tag == L"!--")
            read_comment(self);
        else
            while (read_values(self));
    }
}
// Store the comment body under values["comment"] and move past the
// closing "-->". An unterminated comment takes the rest of the input.
static void read_comment(reader *self) {
    // Compare against npos directly instead of narrowing the result to
    // int and testing for a negative value.
    const wstring::size_type p = self->src.find(L"-->", self->pos);
    if (p == wstring::npos) {
        self->values[L"comment"] = self->src.substr(self->pos);
        self->pos = static_cast<int>(self->src.size());
    } else {
        self->values[L"comment"] = self->src.substr(self->pos, p - self->pos);
        self->pos = static_cast<int>(p) + 3;
    }
}
// Parse one attribute of the current tag into self->values (name is
// lower-cased; a name without '=' gets an empty value). Returns true while
// more attributes may follow.
static bool read_values(reader *self) {
    wstring nm = lower(read_value(self, true));
    if (self->cur == '/')
        self->reserved = L"/" + self->tag;
    if (nm.empty()) {
        // A '/' with no name in front of it (as in "<br />") marks a
        // self-closing tag; keep scanning so the following '>' is consumed
        // instead of leaking into the next text node.
        return self->cur == '/';
    }
    if (self->cur == '=')
        self->values[nm] = read_value(self, false);
    else
        self->values[nm] = L"";
    return self->cur != '>';
}
// Read an attribute name (isleft == true) or value (isleft == false).
// Leading whitespace is skipped. A double-quoted run is taken verbatim up
// to the closing quote. Names also stop at '=' and '/'; both kinds stop at
// '>' and at whitespace that follows at least one collected character.
static wstring read_value(reader *self, bool isleft) {
    wstring v;
    for (;;) {
        if (read_char(self) == -1)
            break;
        const wchar_t ch = self->cur;
        const bool delimiter =
            ch == '>' || (isleft && (ch == '=' || ch == '/'));
        if (delimiter)
            break;
        if (ch == '"') {
            // Quoted run: copy everything up to the closing quote.
            while (read_char(self) != -1 && self->cur != '"')
                v += self->cur;
            break;
        }
        if (ch > ' ') {
            v += ch;
            continue;
        }
        if (!v.empty())
            break;
    }
    return v;
}
# public domain
from ctypes import *
def getfunc(dll, res, name, arg):
    """Look up *name* in *dll* and declare its C signature.

    The object found via ``dll[name]`` gets *res* as its ``restype`` and
    *arg* as its ``argtypes`` and is then returned.
    """
    func = dll[name]
    func.restype, func.argtypes = res, arg
    return func
# Load the native parser library and bind each of its functions with an
# explicit return type and argument types. The reader handle is passed
# around as an opaque c_void_p.
dll = cdll.LoadLibrary("xml7shi2.dll")
alloc = getfunc(dll, c_void_p , "alloc" , [c_wchar_p])
release = getfunc(dll, None , "release", [c_void_p])
gettext = getfunc(dll, c_wchar_p, "gettext", [c_void_p])
gettag = getfunc(dll, c_wchar_p, "gettag" , [c_void_p])
getitem = getfunc(dll, c_wchar_p, "getitem", [c_void_p, c_wchar_p])
has_key = getfunc(dll, c_bool , "has_key", [c_void_p, c_wchar_p])
# check and find take the attribute constraints as an array of c_wchar_p.
check = getfunc(dll, c_bool , "check" , [c_void_p, c_wchar_p, POINTER(c_wchar_p)])
find = getfunc(dll, c_bool , "find" , [c_void_p, c_wchar_p, POINTER(c_wchar_p)])
read = getfunc(dll, c_bool , "read" , [c_void_p])
def convmap(src):
    """Flatten the mapping *src* into a NULL-terminated ``c_wchar_p`` array.

    The result is laid out as key, value, key, value, ... followed by one
    NULL entry, so it holds ``2 * len(src) + 1`` elements.
    """
    # Two slots per pair plus the terminator. ctypes zero-initialises the
    # array, so the final slot is already NULL. The previous size of
    # len(src) + 1 overflowed for two or more pairs and left no terminator
    # for exactly one pair.
    ret = (c_wchar_p * (len(src) * 2 + 1))()
    for i, (key, value) in enumerate(src.items()):
        ret[2 * i] = key
        ret[2 * i + 1] = value
    return ret
class reader:
    """Pull parser backed by the native library bound above.

    ``text`` and ``tag`` are read-only views of the current event.
    Indexing returns the value of an attribute of the current tag, or
    None when the attribute is absent.
    """
    text = property(lambda self: gettext(self.cself))
    tag = property(lambda self: gettag(self.cself))

    def __init__(self, src):
        """Create a native parser over the string *src*."""
        # Set first so __del__ is safe even when alloc() raises.
        self.cself = None
        self.cself = alloc(src)

    def __del__(self):
        """Free the native parser, at most once."""
        cself = getattr(self, "cself", None)
        if cself is not None:
            self.cself = None
            release(cself)

    def __getitem__(self, key):
        """Return the value of attribute *key*, or None when absent."""
        return getitem(self.cself, key)

    def __contains__(self, key):
        """Support ``key in reader`` as a spelling of has_key()."""
        return has_key(self.cself, key)

    def has_key(self, key):
        """Return True when the current tag has an attribute named *key*."""
        return has_key(self.cself, key)

    def find(self, tag, values=None):
        """Advance to the next tag named *tag* carrying the attributes in
        *values*; return False when the input is exhausted first."""
        return find(self.cself, tag, convmap(values or {}))

    def each(self, tag="", values=None):
        """Iterate over the events inside the current element.

        Reads until the closing tag of the element that is current when
        the generator starts (or until the input ends).  Yields a running
        index for every event when *tag* is "", otherwise only for events
        matching *tag* and *values*.
        """
        end = "/" + self.tag
        i = 0
        # Converted once and kept referenced for the whole iteration.
        vmap = convmap(values or {})
        while self.tag != end and self.read():
            if tag == "" or check(self.cself, tag, vmap):
                yield i
                i += 1

    def read(self):
        """Read the next event; return False when nothing was read."""
        return read(self.cself)
@7shi
Copy link
Author

7shi commented Aug 15, 2023

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment