Skip to content

Instantly share code, notes, and snippets.

@xigh
Last active January 23, 2019 18:06
Show Gist options
  • Save xigh/0a8f520010322a6bcd3a0a96732f087c to your computer and use it in GitHub Desktop.
Save xigh/0a8f520010322a6bcd3a0a96732f087c to your computer and use it in GitHub Desktop.
Retrieve openoffice documents from formatted harddrive
1- calls mmap on the disk device
2- looks for zip headers
3- retrieves the beginning of the zip file
4- checks that it contains a mimetype file with the expected mimetype
5- save the result to a file
#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdlib.h>
/* Raw octet type used for the mapped device contents. */
typedef unsigned char byte;

/*
 * Prints a hex + ASCII dump of ptr[off] .. ptr[len - 1] to stdout.
 *
 *   ptr    - base of the buffer
 *   off    - index of the first byte to dump (also printed as the row address)
 *   len    - index one past the last byte to dump (an end offset, not a count)
 *   lines  - maximum number of 16-byte rows to print; values below 1 still
 *            print one row
 *   indent - string printed in front of every row
 *
 * Each row is "<indent><16-digit hex offset>: " followed by 16 hex columns
 * (missing bytes of a short final row are shown as "--"), one extra space,
 * and the same bytes as characters, with non-printable bytes shown as '-'.
 * Nothing is printed when the range is empty.
 */
void dump(const byte *ptr, int64_t off, int64_t len, int lines, const char *indent)
{
    const int64_t count = len - off;
    int64_t row;
    int printed = 0;

    for (row = 0; row < count; row += 16) {
        const int64_t width = (count - row < 16) ? count - row : 16;
        int64_t k;

        /* Cast so the conversion specifier matches on every ABI. */
        printf("%s%016llx: ", indent, (unsigned long long) (off + row));

        for (k = 0; k < width; k++) {
            printf("%02x ", ptr[off + row + k]);
        }
        for (; k < 16; k++) {
            printf("-- ");
        }

        /*
         * The ASCII column is emitted for every row, including a final row
         * that is exactly 16 bytes wide.
         */
        printf(" ");
        for (k = 0; k < width; k++) {
            byte c = ptr[off + row + k];
            if (!isprint(c)) {
                c = '-';
            }
            printf("%c ", c);
        }
        printf("\n");

        printed += 1;
        if (printed >= lines) {
            return;
        }
    }
}
/* "PK\5\6": the 4-byte marker main() scans the device for (ZIP end-of-central-directory signature). */
byte pkzip[4] = { 0x50, 0x4b, 0x05, 0x06 };
/* "PK\3\4": the 4-byte marker savezip() expects at every local file header. */
byte local[4] = { 0x50, 0x4b, 0x03, 0x04 };

/* Expected contents of the "mimetype" member, with their lengths excluding the NUL. */
char odt[] = "application/vnd.oasis.opendocument.text";
int odts = (int) (sizeof odt - 1);
char ods[] = "application/vnd.oasis.opendocument.spreadsheet";
int odss = (int) (sizeof ods - 1);
char odp[] = "application/vnd.oasis.opendocument.presentation";
int odps = (int) (sizeof odp - 1);

/* Unit suffixes for the device size printed by main(), one per division by 1024. */
char *ext[5] = {
    "B",
    "kB",
    "MB",
    "GB",
    "TB",
};
/*
 * Reads an unsigned 16-bit little-endian value from data[off] and
 * data[off + 1] (low byte first). No bounds checking is performed.
 */
uint16_t getUint16(byte *data, int64_t off)
{
    const byte *p = data + off;

    return (uint16_t) (p[0] | (p[1] << 8));
}
/*
 * Reads an unsigned 32-bit little-endian value from data[off .. off + 3],
 * assembled from two getUint16() reads (low half first in memory).
 * No bounds checking is performed.
 */
uint32_t getUint32(byte *data, int64_t off)
{
    uint32_t value = getUint16(data, off + 2);

    value <<= 16;
    value += getUint16(data, off);
    return value;
}
/* Sizes in bytes of the fixed part of the ZIP records parsed by savezip(). */
enum {
    ZIP_EOCD_FIXED = 22,  /* end-of-central-directory record, before its comment */
    ZIP_CDIR_FIXED = 46,  /* central-directory record, before name/extra/comment */
    ZIP_LOCAL_FIXED = 30  /* local file header, before name/extra */
};

/* Returns 1 when the count bytes starting at pos all lie inside [0, limit). */
static int zipRangeOk(int64_t pos, int64_t count, int64_t limit)
{
    return pos >= 0 && count >= 0 && pos <= limit && count <= limit - pos;
}

/* Writes all count bytes of src to fd, resuming after short writes; 0 on success, -1 on failure. */
static int zipWriteAll(int fd, const byte *src, int64_t count)
{
    while (count > 0) {
        ssize_t done = write(fd, src, (size_t) count);
        if (done < 0 && errno == EINTR) {
            continue;
        }
        if (done <= 0) {
            return -1;
        }
        src += done;
        count -= done;
    }
    return 0;
}

/*
 * Tries to recover the ZIP archive whose end-of-central-directory (EOCD)
 * record starts at data[off], and writes it to a new file.
 *
 *   data - start of the mapped device
 *   off  - offset of the "PK\5\6" signature inside data
 *   len  - total number of valid bytes in data; every offset read from the
 *          device is checked against it because the contents are untrusted
 *   idx  - sequence number used in the output file name
 *
 * The central directory is walked backwards from the EOCD record: each record
 * must point at a local file header carrying the local signature and the same
 * name length. A stored (uncompressed) "mimetype" member whose content matches
 * one of the OpenDocument MIME types selects the output extension.
 *
 * Output goes to "tmp2/file_<idx>.<odt|ods|odp>" when both "mimetype" (with a
 * recognised type) and "content.xml" were seen, otherwise to
 * "tmp/file_<idx>.zip". The directories are not created here.
 *
 * Returns 0 only when an archive was validated and written completely;
 * returns non-zero for a bad header, an inconsistent directory, a record
 * count mismatch, or any failure while creating/writing the output file.
 */
int savezip(byte *data, int64_t off, int64_t len, int idx)
{
    const char *docExt = NULL;
    int hasMimetype = 0;
    int hasContent = 0;
    int err = 0;
    int recs = 0;  /* int, so a huge bogus directory cannot wrap the counter */
    uint16_t disk, start, nrecs;
    int64_t size, head, note, base, pos, end;
    char *name = NULL;
    int l, fd;

    /* The fixed EOCD fields must be inside the mapping before they are read. */
    if (!zipRangeOk(off, ZIP_EOCD_FIXED, len)) {
        printf(" ** bad header **\n\n");
        return 1;
    }

    disk = getUint16(data, off + 4);
    start = getUint16(data, off + 6);
    if (disk != 0 || start != 0) {
        printf(" ** bad header **\n\n");
        return 1;
    }

    nrecs = getUint16(data, off + 10);
    size = getUint32(data, off + 12);   /* central directory size */
    head = getUint32(data, off + 16);   /* central directory offset from archive start */
    note = getUint16(data, off + 20);   /* EOCD comment length */

    /* The directory ends where the EOCD begins, so the archive starts here. */
    base = off - size - head;
    if (base < 0) {
        printf(" ** bad header **\n\n");
        return 1;
    }

    pos = off - size;
    while (err == 0 && pos < off)
    {
        uint16_t method, nameLen, extraLen, noteLen;
        uint32_t csize, usize, localOff;
        int64_t lh;

        if (!zipRangeOk(pos, ZIP_CDIR_FIXED, off)) {
            /* The record header would run into the EOCD record. */
            printf("\t ** invalid head\n");
            err = 1;
            recs += 1;
            break;
        }

        method = getUint16(data, pos + 10);
        csize = getUint32(data, pos + 20);
        usize = getUint32(data, pos + 24);
        nameLen = getUint16(data, pos + 28);
        extraLen = getUint16(data, pos + 30);
        noteLen = getUint16(data, pos + 32);
        localOff = getUint32(data, pos + 42);
        lh = base + localOff;

        if (!zipRangeOk(lh, ZIP_LOCAL_FIXED, len)
            || memcmp(data + lh, local, sizeof local) != 0) {
            printf("\t ** invalid head\n");
            err = 1;
        }
        else {
            uint16_t n2 = getUint16(data, lh + 26);
            uint16_t m2 = getUint16(data, lh + 28);

            if (nameLen != n2 || !zipRangeOk(lh + ZIP_LOCAL_FIXED, n2, len)) {
                printf("\t ** invalid head [2]\n");
                err = 1;
            }
            else {
                char *path = strndup((const char *) (data + lh + ZIP_LOCAL_FIXED), n2);

                if (path != NULL) {
                    if (strcmp(path, "mimetype") == 0) {
                        int64_t body = lh + ZIP_LOCAL_FIXED + n2 + m2;

                        hasMimetype = 1;
                        /* Only a stored member can be compared byte for byte. */
                        if (csize == usize && method == 0 && zipRangeOk(body, csize, len)) {
                            const byte *ptr = data + body;

                            if (csize == (uint32_t) odts && memcmp(ptr, odt, odts) == 0)
                                docExt = "odt";
                            else if (csize == (uint32_t) odss && memcmp(ptr, ods, odss) == 0)
                                docExt = "ods";
                            else if (csize == (uint32_t) odps && memcmp(ptr, odp, odps) == 0)
                                docExt = "odp";
                        }
                    }
                    if (strcmp(path, "content.xml") == 0) {
                        hasContent = 1;
                    }
                    printf("\t%08x: %s [%s] (%u -> %u) [%d]\n",
                           (unsigned) localOff, path, docExt ? docExt : "",
                           (unsigned) csize, (unsigned) usize, method);
                    free(path);
                } else {
                    printf("\t ** could not allocate memory for path\n");
                    err = 1;
                }
            }
        }

        recs += 1;
        /* A record is followed by its name, extra field and file comment. */
        pos += ZIP_CDIR_FIXED + nameLen + extraLen + noteLen;
    }

    printf("\trecs=%d/%d, zip at %016llx\n", recs, nrecs, (unsigned long long) base);

    /* Nothing is saved on a count mismatch, so it must not count as success. */
    if (err == 0 && recs != nrecs) {
        err = 1;
    }
    if (err != 0) {
        printf("\n");
        return err;
    }

    /* The archive ends after the EOCD record and its comment, clamped to the mapping. */
    end = off + ZIP_EOCD_FIXED + note;
    if (end > len) {
        end = len;
    }

    if (hasMimetype && hasContent && docExt != NULL) {
        l = asprintf(&name, "tmp2/file_%06d.%s", idx, docExt);
    } else {
        l = asprintf(&name, "tmp/file_%06d.zip", idx);
    }
    if (l <= 0) {
        printf("## could not allocate memory for path\n");
        printf("\n");
        return 1;
    }

    fd = open(name, O_WRONLY|O_CREAT|O_TRUNC, 0664);
    if (fd < 0) {
        printf("## failed to open %s\n", name);
        err = 1;
    } else {
        if (zipWriteAll(fd, data + base, end - base) != 0) {
            printf("## failed to save %s\n", name);
            err = 1;
        }
        if (close(fd) != 0 && err == 0) {
            printf("## failed to save %s\n", name);
            err = 1;
        }
        if (err == 0) {
            printf(" => file saved to %s\n", name);
        }
    }
    free(name);

    printf("\n");
    return err;
}
/*
 * Entry point: scans the file or block device named by argv[1] for ZIP
 * end-of-central-directory signatures and hands every hit to savezip().
 *
 * The whole device is mapped read-only. The scan stops after 20000 signature
 * hits or once more than 2000 archives have been saved, whichever comes
 * first. Diagnostics are printed to stdout.
 *
 * Returns EXIT_FAILURE when the argument is missing or when
 * stat/open/ioctl/mmap/munmap fails, 0 otherwise.
 */
int main(int argc, char **argv)
{
    /* Highest valid index into the global unit table. */
    const int maxUnit = (int) (sizeof ext / sizeof ext[0]) - 1;
    struct stat st;
    byte *data, *pk;
    int64_t off, sz, total;
    int z, n, fd;

    if (argc < 2) {
        printf("usage: %s <device-or-image>\n", argc > 0 ? argv[0] : "recover");
        return EXIT_FAILURE;
    }

    if (stat(argv[1], &st) == -1) {
        printf("stat failed with: %s\n", strerror(errno));
        return EXIT_FAILURE;
    }
    fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        printf("open failed with: %s\n", strerror(errno));
        return EXIT_FAILURE;
    }

    total = st.st_size;
    if (total == 0) {
        /*
         * stat reported no size, so ask the block layer instead.
         * BLKGETSIZE64 stores a 64-bit value, so it gets a variable of
         * exactly that width rather than st.st_size.
         */
        uint64_t devBytes = 0;

        if (ioctl(fd, BLKGETSIZE64, &devBytes) == -1) {
            printf("ioctl failed with: %s\n", strerror(errno));
            close(fd);
            return EXIT_FAILURE;
        }
        total = (int64_t) devBytes;
    }

    /* Scale the size for display without running past the unit table. */
    n = 0;
    sz = total;
    while (sz > 1024 && n < maxUnit) {
        sz /= 1024;
        n += 1;
    }
    printf("device size: %lld (%lld%s)\n\n", (long long) total, (long long) sz, ext[n]);

    data = (byte *) mmap(NULL, (size_t) total, PROT_READ, MAP_PRIVATE, fd, 0);
    if (data == MAP_FAILED) {
        printf("mmap failed with: %s\n", strerror(errno));
        close(fd);
        return EXIT_FAILURE;
    }

    z = 0;
    pk = data;
    for (n = 0; n < 20000; n++) {
        off = pk - data;
        pk = memmem(pk, (size_t) (total - off), pkzip, sizeof(pkzip));
        if (pk == NULL) {
            break;
        }
        off = pk - data;
        printf("%d: found PK at offset: %lld (%.1f%%)\n", n + 1, (long long) off,
               (100.0 * (double) off) / (double) total);
        /* z doubles as the index used in the next output file name. */
        if (0 == savezip(data, off, total, z)) {
            z += 1;
        }
        if (z > 2000) {
            break;
        }
        /* Resume the search just past the signature that was handled. */
        pk += 4;
    }
    printf("found %d zip files\n", z);

    if (munmap(data, (size_t) total) != 0) {
        printf("munmap failed with: %s\n", strerror(errno));
        close(fd);
        return EXIT_FAILURE;
    }
    close(fd);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment