mildsunrise/searcher.c

## searcher.c
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>


#define LOOKAHEAD 8
#define START 0
#define BLOCK_SIZE 4096

static inline int search_match(const uint8_t *data) {
  /* Replace with your criteria */
  return data[3] == 0xCE
      && data[4] == 0xCF
      && data[5] == 0xCD
      && data[6] == 0xCD
      && data[7] == 0x2C
      && data[8] == 0x51

      && data[0] == 0x78
      && data[1] == 0x01
      && data[2] == 0x4B;
}


int main(int argc, char **argv) {
  off_t offset = START + LOOKAHEAD;
  uint8_t data [LOOKAHEAD + BLOCK_SIZE];
  uint8_t *buffer = data + LOOKAHEAD;
  const uint8_t *start, *end;

  /* Seek to the start, if necessary. */
  if (START) assert(lseek(0, START, SEEK_SET) != -1);
  /* Read the lookahead part, if necessary */
  if (LOOKAHEAD) assert(read(0, data, LOOKAHEAD) == LOOKAHEAD);

  while (1) {
    /* Read next chunk of data */
    end = data + read(0, buffer, BLOCK_SIZE);
    if (end == data) break;
    if (end < data) {
      fprintf(stderr, "I/O error when reading at offset %ld.\n", offset);
      return 1;
    }

    /* Search that chunk for occurences */
    for (start = data; start < end; start++) {
      if (!search_match(start)) continue;
      printf("%ld\n", offset + (off_t)(start - data));
      fflush(stdout);
    }

    /* Prepare for the next round */
    offset += (off_t)(start - data);
    if (start - data >= LOOKAHEAD) memcpy(data, start, LOOKAHEAD);
    else memmove(data, start, LOOKAHEAD);
  }

  fprintf(stderr, "Finished at offset %ld.\n", offset);
  return 0;
}

## searcher.md

      
    Raw
  

              searcher.md
            
          
    Template for a simple, fast searcher. It just reads from stdin
(optionally starting from a specified offset) and searches for
a hardcoded string (or more complex criteria) and prints out the
offsets of the occurences as a line to stdout. The program will
stop searching if an I/O error is found, though that's easy to
change.
This can work with huge files correctly. It's zero-dependency
and should build and run in any UNIX. Make sure to build with -O3!
Things you can adjust:


Search match function: this function should check for an
occurency at the beginning of data, and print the offset
if there is.


Lookahead: guarantee that at least N bytes are available
after the byte pointed by data. For example, if lookahead
is set to 2, then data[0], data[1] and data[2] point to
valid data. If it's set to 0, then only data[0] should be
accessed by the search match function. Don't set this higher
than what you need to read, or you'll miss occurences near
to the end of the file.


Start offset: Start searching from that offset in file.
This won't work if stdin is a pipe or any other non-seekable file;
in that case, it should be left as zero.


Block size: Size of each call to read. It should be a power
of two, such as 512, 1024, 2048, 4096. Only affects performance
and memory usage. The default should be fine.


Efficiency

For better performance, the search match function should start by
checking for uncommon patterns, and return as soon as it knows
there is no occurency in that offset.
Example: Git commits stored in repos typically start with
0x78 0x01 0x4B 0xCE 0xCF 0xCD 0xCD 0x2C 0x51

Our function could start by checking that data[0] is 0x78, but
that's an 'x' so it's a very common character to find in an HDD dump.
Instead, it's better to check that data[3] is 0xCE first, then
check data[4] is 0xCF, and the same with data[5]. If these tests
all succeed, the probability of having found a Git commit is high,
and we can now check the rest of the bytes to be sure.
Tweaks

Instead of constantly checking start < end, one could just append
an occurency to the end of the data. This alone would make the search
up to 67% faster.
The code could be modified to have the buffer aligned, and compare
shorts, or quads, instead of single bytes. This would, if done
correctly, give much better performance, since a lot less offsets
would now pass the first check.
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>
	#include <string.h>
	#include <assert.h>


	#define LOOKAHEAD 8
	#define START 0
	#define BLOCK_SIZE 4096

	static inline int search_match(const uint8_t *data) {
	/* Replace with your criteria */
	return data[3] == 0xCE
	&& data[4] == 0xCF
	&& data[5] == 0xCD
	&& data[6] == 0xCD
	&& data[7] == 0x2C
	&& data[8] == 0x51

	&& data[0] == 0x78
	&& data[1] == 0x01
	&& data[2] == 0x4B;
	}


	int main(int argc, char **argv) {
	off_t offset = START + LOOKAHEAD;
	uint8_t data [LOOKAHEAD + BLOCK_SIZE];
	uint8_t *buffer = data + LOOKAHEAD;
	const uint8_t start, end;

	/* Seek to the start, if necessary. */
	if (START) assert(lseek(0, START, SEEK_SET) != -1);
	/* Read the lookahead part, if necessary */
	if (LOOKAHEAD) assert(read(0, data, LOOKAHEAD) == LOOKAHEAD);

	while (1) {
	/* Read next chunk of data */
	end = data + read(0, buffer, BLOCK_SIZE);
	if (end == data) break;
	if (end < data) {
	fprintf(stderr, "I/O error when reading at offset %ld.\n", offset);
	return 1;
	}

	/* Search that chunk for occurences */
	for (start = data; start < end; start++) {
	if (!search_match(start)) continue;
	printf("%ld\n", offset + (off_t)(start - data));
	fflush(stdout);
	}

	/* Prepare for the next round */
	offset += (off_t)(start - data);
	if (start - data >= LOOKAHEAD) memcpy(data, start, LOOKAHEAD);
	else memmove(data, start, LOOKAHEAD);
	}

	fprintf(stderr, "Finished at offset %ld.\n", offset);
	return 0;
	}