-
-
Save airween/8c5bfd5be011359edd918e964ca86178 to your computer and use it in GitHub Desktop.
Using PCRE through CFFI - Python3 example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
"""Time PCRE (libpcre, 8-bit API) matching of one pattern through CFFI.

The pattern is read from the file given on the command line, compiled with
pcre_compile(), studied with pcre_study() (requesting JIT compilation), and
then executed ten times against a fixed subject of 128 KiB of "1"
characters. The wall time of each pcre_exec() call is printed.

Usage: script.py PATTERNFILE [-s /path/to/libpcre.so]
"""
import argparse
import datetime
import sys
import time

import cffi

aparser = argparse.ArgumentParser()
aparser.add_argument("patternfrom", type=str, help="Add filename to read the PATTERN")
aparser.add_argument("-s", "--sofile", type=str, help="Add path to shared object")
args = aparser.parse_args()

# default shared obj
sofile = "/lib/x86_64-linux-gnu/libpcre.so.3"
if args.sofile is not None:
    sofile = args.sofile

# used types
# Only pointers to these structs cross the FFI boundary, so the field layout
# declared here is never relied upon by this script.
header_content = """
typedef unsigned char uschar;
typedef struct real_pcre {
    unsigned long int magic_number;
    size_t size;
    const unsigned char *tables;
    unsigned long int options;
    unsigned short int top_bracket;
    unsigned short int top_backref;
    uschar first_char;
    uschar req_char;
    uschar code[1];
} real_pcre;
typedef struct real_pcre pcre;
typedef struct real_pcre_extra {
    uschar options;
    uschar start_bits[32];
} real_pcre_extra;
typedef struct real_pcre_extra pcre_extra;
pcre *pcre_compile(const char * pattern, int options,
                   const char ** errptr, int * erroroffset,
                   const unsigned char * tableptr);
int pcre_exec (const pcre *code, const pcre_extra *extra,
               const char *subject, int length, int startoffset,
               int options, int *ovector, int ovecsize);
pcre_extra * pcre_study(const pcre *code, int options, const char **errptr);
int printf(const char *format, ...);
#define PCRE_STUDY_JIT_COMPILE 0x0001
"""

# load ffi
ffi = cffi.FFI()
ffi.cdef(header_content)
lib = ffi.dlopen(sofile)
C = ffi.dlopen(None)

# used functions
pcre_compile_fn = lib.pcre_compile
pcre_exec_fn = lib.pcre_exec
pcre_study_fn = lib.pcre_study

# used variables
# ==============
OVECSIZE = 900  # number of ints in the output vector (a multiple of 3)
errptr = ffi.new("const char **")
erroroffset = ffi.new("int *")

# set up the pattern from file
with open(args.patternfrom, "r") as p:
    pattern_raw = p.read().strip().encode("ascii")
pattern = ffi.new("char []", pattern_raw)

# Pass NULL as the table pointer so PCRE uses its built-in character tables;
# a pointer to a freshly allocated single byte would be read as a full table.
compiled = pcre_compile_fn(pattern, 0, errptr, erroroffset, ffi.NULL)
if compiled == ffi.NULL:
    print("Invalid regex")
    sys.exit(-2)

# Study only after the compile is known to have succeeded. The result may be
# NULL, which pcre_exec() accepts as "no extra data".
re_extra = pcre_study_fn(compiled, lib.PCRE_STUDY_JIT_COMPILE, errptr)

subject_raw = b"1" * (128 * 1024)  # subject for search
subject = ffi.new("char []", subject_raw)
subject_len = len(subject_raw)  # computed once, outside the timed region
ovector = ffi.new("int[]", OVECSIZE)

print("Using pattern: %s" % (pattern_raw))
if len(subject_raw) > 30:
    print("Using subject: %s..." % subject_raw[0:30])
else:
    print("Using subject: %s" % subject_raw)

for _ in range(10):
    # perf_counter() is monotonic and high resolution, so the measurement is
    # not disturbed by wall-clock adjustments. The match result is ignored:
    # only the elapsed time matters here.
    tstart = time.perf_counter()
    pcre_exec_fn(compiled, re_extra, subject, subject_len, 0, 0, ovector, OVECSIZE)
    tend = time.perf_counter()
    print("Time elapsed: %f" % (tend - tstart))
I updated the gist; now it gives slightly better times for the "right" pattern.
Time elapsed: 0.004939
Time elapsed: 0.005070
Time elapsed: 0.004940
Time elapsed: 0.004769
Time elapsed: 0.004793
Time elapsed: 0.004731
Time elapsed: 0.004900
Time elapsed: 0.004723
Time elapsed: 0.004952
Time elapsed: 0.004797
=====
Time elapsed: 0.016973
Time elapsed: 0.016931
Time elapsed: 0.011371
Time elapsed: 0.011062
Time elapsed: 0.010838
Time elapsed: 0.011087
Time elapsed: 0.010877
Time elapsed: 0.010957
Time elapsed: 0.010830
Time elapsed: 0.010937
And the original pattern also gives better time :)
Time elapsed: 148.239001
Time elapsed: 143.499912
But we are not using the new re_extra
, instead we still call pcre_exec
with pextra = ffi.new("pcre_extra *", None)
Ah, thanks - updated again, but now I'm confused about why the result got better...
re = pcre_compile_fn(pattern, 0, errptr, erroroffset, tableptr)
re_extra = pcre_study_fn(re, 1, errptr)
instead of
re = pcre_compile_fn(pattern, 0, errptr, erroroffset, tableptr)
re_extra = pcre_study_fn(pattern, 1, errptr)
some working C++ code (all error handling removed)
m_pc = pcre_compile(pattern.c_str(), PCRE_DOTALL|PCRE_MULTILINE, &errptr, &erroffset, NULL);
m_pce = pcre_study(m_pc, PCRE_STUDY_JIT_COMPILE, &errptr);
...
pcre_exec(m_pc, m_pce, input.c_str(), input.size(), offset, 0, ovector, ovector_size);
I see - thanks again. Fixed. The code is familiar to me... looks like it's from the libmodsecurity :).
But now it's very fast. The original pattern too. I'm still confused.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ah, you're right - I think
python-pcre
uses JIT by default. I'm going to update the gist.