Skip to content

Instantly share code, notes, and snippets.

@gteissier
Last active October 17, 2018 14:23
Show Gist options
  • Save gteissier/b1fe0d1285fc89c5e04a7f564d641629 to your computer and use it in GitHub Desktop.
Save gteissier/b1fe0d1285fc89c5e04a7f564d641629 to your computer and use it in GitHub Desktop.
Binary grep: Python re, C Knuth-Pratt-Morris, C memmem, Ragel based. Indication of performance: Ragel is the winner, memmem is the worst
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <assert.h>
/*
to generate the partial match table of your needle, use kmp-compute.py:
$ ./kmp-compute.py 000000404142
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42};
static const int table[7] = {-1, -1, -1, 2, 0, 0, 0};
and copy the table definition instead of the one below
*/
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42};
static const int table[7] = {-1, -1, -1, 2, 0, 0, 0};
int main(int argc, char **argv) {
int fd;
int ret;
struct stat props;
const unsigned char *ptr;
int s, k;
fd = open(argv[1], O_RDONLY);
assert(fd != -1);
ret = fstat(fd, &props);
assert(ret == 0);
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0);
assert(ptr != MAP_FAILED);
s = 0;
k = 0;
while (s < props.st_size) {
while (k != -1 && (k == sizeof(needle) || needle[k] != ptr[s])) {
k = table[k];
}
k += 1, s += 1;
if (k == sizeof(needle)) {
printf("offset 0x%lx\n", s-sizeof(needle));
}
}
error:
munmap((void *) ptr, props.st_size);
close(fd);
return 0;
}
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <assert.h>
static const unsigned char needle[6] = {0x00, 0x00, 0x00, 0x40, 0x41, 0x42};
int main(int argc, char **argv) {
int fd;
int ret;
struct stat props;
const unsigned char *ptr;
const unsigned char *p, *pe;
fd = open(argv[1], O_RDONLY);
assert(fd != -1);
ret = fstat(fd, &props);
assert(ret == 0);
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0);
assert(ptr != MAP_FAILED);
p = ptr;
pe = ptr + props.st_size;
for (; p < pe; ) {
p = memmem(p, pe - p, needle, sizeof(needle));
if (p == NULL)
break;
printf("offset 0x%lx\n", p-ptr);
p += 1;
}
error:
munmap((void *) ptr, props.st_size);
close(fd);
return 0;
}
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <assert.h>
%%{
machine binary;
alphtype unsigned char;
write data;
}%%
int main(int argc, char **argv) {
int fd;
int ret;
struct stat props;
const unsigned char *ptr;
/* ragel needs these */
const unsigned char *p, *pe, *eof;
int cs = 0;
%%{
action match { printf("offset 0x%lx\n", fpc-ptr - 6 + 1); }
needle = 0x00 0x00 0x00 0x40 0x41 0x42 @match;
main := (any* needle any*)*;
}%%
%%write init;
fd = open(argv[1], O_RDONLY);
assert(fd != -1);
ret = fstat(fd, &props);
assert(ret == 0);
ptr = mmap(NULL, props.st_size, PROT_READ, MAP_SHARED, fd, 0);
assert(ptr != MAP_FAILED);
p = ptr;
pe = ptr + props.st_size;
eof = pe;
%%write exec;
if (cs == %%{write error;}%%) {
fprintf(stderr, "error in parser\n");
goto error;
}
error:
munmap((void *) ptr, props.st_size);
close(fd);
return 0;
}
#!/usr/bin/env python
import re
import mmap
import sys
import os
from binascii import unhexlify
pattern = re.escape(unhexlify(sys.argv[2]))
pattern = re.compile(pattern, re.S)
with open(sys.argv[1], 'rb') as f:
st = os.stat(sys.argv[1])
size = st.st_size
data = mmap.mmap(f.fileno(), size, mmap.MAP_SHARED, mmap.PROT_READ)
for m in re.finditer(pattern, data):
print('offset 0x%x' % m.start())
#!/usr/bin/env python
def build_table(W):
T = [0 for i in range(0, len(W)+1)]
T[0] = -1
pos = 1
cnd = 0
while pos < len(W):
if W[pos] == W[cnd]:
T[pos] = T[cnd]
pos += 1
cnd += 1
else:
T[pos] = cnd
cnd = T[cnd]
while cnd >= 0 and W[pos] != W[cnd]:
cnd = T[cnd]
pos += 1
cnd += 1
T[pos] = cnd
return T
assert(build_table('ABCDABD') == [-1, 0, 0, 0, -1, 0, 2, 0])
assert(build_table('ABACABABC') == [-1, 0, -1, 1, -1, 0, -1, 3, 2, 0])
assert(build_table('PARTICIPATE IN PARACHUTE') == [-1, 0, 0, 0, 0, 0, 0, -1, 0, 2, 0, 0, 0, 0, 0, -1, 0, 0, 3, 0, 0, 0, 0, 0, 0])
def to_c(table):
return 'static const int table[%d] = {%s};' % (len(table), ', '.join('%d' % x for x in table))
import sys
from binascii import unhexlify
needle = unhexlify(sys.argv[1])
print('static const unsigned char needle[%d] = {%s};' % (len(needle), ', '.join('0x%02x' % ord(x) for x in needle)))
print(to_c(build_table(needle)))
CFLAGS=-g -O2
all: bgrep-ragel.rl bgrep-kmp.c
ragel -C -G2 -o bgrep-ragel.c bgrep-ragel.rl
gcc -Wno-unused-const-variable $(CFLAGS) -c bgrep-ragel.c -o bgrep-ragel.o
gcc -o bgrep-ragel bgrep-ragel.o
gcc $(CFLAGS) -c bgrep-kmp.c -o bgrep-kmp.o
gcc -o bgrep-kmp bgrep-kmp.o
gcc $(CFLAGS) -c bgrep-memmem.c -o bgrep-memmem.o
gcc -o bgrep-memmem bgrep-memmem.o
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment