Last active
October 17, 2018 14:23
-
-
Save gteissier/b1fe0d1285fc89c5e04a7f564d641629 to your computer and use it in GitHub Desktop.
Binary grep: Python re, C Knuth-Pratt-Morris, C memmem, Ragel based. Indication of performance: Ragel is the winner, memmem is the worst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <unistd.h> | |
#include <fcntl.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <sys/mman.h> | |
#include <assert.h> | |
/* | |
to generate the partial match table of your needle, use kmp-compute.py: | |
$ ./kmp-compute.py 000000404142 | |
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42}; | |
static const int table[7] = {-1, -1, -1, 2, 0, 0, 0}; | |
and copy the table definition instead of the one below | |
*/ | |
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42}; | |
static const int table[7] = {-1, -1, -1, 2, 0, 0, 0}; | |
int main(int argc, char **argv) { | |
int fd; | |
int ret; | |
struct stat props; | |
const unsigned char *ptr; | |
int s, k; | |
fd = open(argv[1], O_RDONLY); | |
assert(fd != -1); | |
ret = fstat(fd, &props); | |
assert(ret == 0); | |
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0); | |
assert(ptr != MAP_FAILED); | |
s = 0; | |
k = 0; | |
while (s < props.st_size) { | |
while (k != -1 && (k == sizeof(needle) || needle[k] != ptr[s])) { | |
k = table[k]; | |
} | |
k += 1, s += 1; | |
if (k == sizeof(needle)) { | |
printf("offset 0x%lx\n", s-sizeof(needle)); | |
} | |
} | |
error: | |
munmap((void *) ptr, props.st_size); | |
close(fd); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <unistd.h> | |
#include <fcntl.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <sys/mman.h> | |
#include <assert.h> | |
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42}; | |
int main(int argc, char **argv) { | |
int fd; | |
int ret; | |
struct stat props; | |
const unsigned char *ptr; | |
const unsigned char *p, *pe; | |
fd = open(argv[1], O_RDONLY); | |
assert(fd != -1); | |
ret = fstat(fd, &props); | |
assert(ret == 0); | |
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0); | |
assert(ptr != MAP_FAILED); | |
p = ptr; | |
pe = ptr + props.st_size; | |
for (; p < pe; ) { | |
p = memmem(p, pe - p, needle, sizeof(needle)); | |
if (p == NULL) | |
break; | |
printf("offset 0x%lx\n", p-ptr); | |
p += 1; | |
} | |
error: | |
munmap((void *) ptr, props.st_size); | |
close(fd); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <unistd.h> | |
#include <fcntl.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <sys/mman.h> | |
#include <assert.h> | |
%%{ | |
machine binary; | |
alphtype unsigned char; | |
write data; | |
}%% | |
int main(int argc, char **argv) { | |
int fd; | |
int ret; | |
struct stat props; | |
const unsigned char *ptr; | |
/* ragel needs these */ | |
const unsigned char *p, *pe, *eof; | |
int cs = 0; | |
%%{ | |
action match { printf("offset 0x%lx\n", fpc-ptr - 6 + 1); } | |
needle = 0x00 0x00 0x00 0x40 0x41 0x42 @match; | |
main := (any* needle any*)*; | |
}%% | |
%%write init; | |
fd = open(argv[1], O_RDONLY); | |
assert(fd != -1); | |
ret = fstat(fd, &props); | |
assert(ret == 0); | |
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0); | |
assert(ptr != MAP_FAILED); | |
p = ptr; | |
pe = ptr + props.st_size; | |
eof = pe; | |
%%write exec; | |
if (cs == %%{write error;}%%) { | |
fprintf(stderr, "error in parser\n"); | |
goto error; | |
} | |
error: | |
munmap((void *) ptr, props.st_size); | |
close(fd); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
import mmap | |
import sys | |
import os | |
from binascii import unhexlify | |
pattern = re.escape(unhexlify(sys.argv[2])) | |
pattern = re.compile(pattern, re.S) | |
with open(sys.argv[1], 'rb') as f: | |
st = os.stat(sys.argv[1]) | |
size = st.st_size | |
data = mmap.mmap(f.fileno(), size, mmap.MAP_SHARED, mmap.PROT_READ) | |
for m in re.finditer(pattern, data): | |
print('offset 0x%x' % m.start()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
def build_table(W): | |
T = [0 for i in range(0, len(W)+1)] | |
T[0] = -1 | |
pos = 1 | |
cnd = 0 | |
while pos < len(W): | |
if W[pos] == W[cnd]: | |
T[pos] = T[cnd] | |
pos += 1 | |
cnd += 1 | |
else: | |
T[pos] = cnd | |
cnd = T[cnd] | |
while cnd >= 0 and W[pos] != W[cnd]: | |
cnd = T[cnd] | |
pos += 1 | |
cnd += 1 | |
T[pos] = cnd | |
return T | |
assert(build_table('ABCDABD') == [-1, 0, 0, 0, -1, 0, 2, 0]) | |
assert(build_table('ABACABABC') == [-1, 0, -1, 1, -1, 0, -1, 3, 2, 0]) | |
assert(build_table('PARTICIPATE IN PARACHUTE') == [-1, 0, 0, 0, 0, 0, 0, -1, 0, 2, 0, 0, 0, 0, 0, -1, 0, 0, 3, 0, 0, 0, 0, 0, 0]) | |
def to_c(table): | |
return 'static const int table[%d] = {%s};' % (len(table), ', '.join('%d' % x for x in table)) | |
import sys | |
from binascii import unhexlify | |
needle = unhexlify(sys.argv[1]) | |
print('static const unsigned char needle[%d] = {%s};' % (len(needle), ', '.join('0x%02x' % ord(x) for x in needle))) | |
print(to_c(build_table(needle))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CFLAGS=-g -O2 | |
all: bgrep-ragel.rl bgrep-kmp.c | |
ragel -C -G2 -o bgrep-ragel.c bgrep-ragel.rl | |
gcc -Wno-unused-const-variable $(CFLAGS) -c bgrep-ragel.c -o bgrep-ragel.o | |
gcc -o bgrep-ragel bgrep-ragel.o | |
gcc $(CFLAGS) -c bgrep-kmp.c -o bgrep-kmp.o | |
gcc -o bgrep-kmp bgrep-kmp.o | |
gcc $(CFLAGS) -c bgrep-memmem.c -o bgrep-memmem.o | |
gcc -o bgrep-memmem bgrep-memmem.o |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment