Skip to content

Instantly share code, notes, and snippets.

@icedraco
Created May 8, 2020 01:02
Show Gist options
  • Save icedraco/8417d28604a41b948af6cf2de9ff9726 to your computer and use it in GitHub Desktop.
Save icedraco/8417d28604a41b948af6cf2de9ff9726 to your computer and use it in GitHub Desktop.
ByteCompare: A means to find different blocks of bytes inside two otherwise very similar files
#pragma once
#include <exception>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "DiffBlock.hpp"
using namespace std;
const static int buffer_size = 8192;
class BinaryFileNotGood : public exception {
public:
const string& path;
BinaryFileNotGood(const string& name) : path(name) {}
~BinaryFileNotGood() {}
const char* what() const throw () {
return (string("cannot open ") + path).c_str();
}
};
class BinaryFile {
private:
ifstream _ifs;
public:
const string& path;
const ios::pos_type size;
BinaryFile(const string& path) :
_ifs(path, ios::binary | ios::ate),
path(path),
size(_ifs.tellg())
{
if (!_ifs.good()) {
throw BinaryFileNotGood(path);
}
rewind();
}
BinaryFile(const BinaryFile& other) = delete;
~BinaryFile() {
_ifs.close();
}
friend ostream& operator<<(ostream& os, const BinaryFile& bf) {
os << bf.path;
return os;
}
inline void rewind() {
_ifs.seekg(0);
}
vector<DiffBlock> compare(BinaryFile& other);
private:
inline ifstream& stream() {
return _ifs;
}
};
vector<DiffBlock>
BinaryFile::compare(BinaryFile& other) {
vector<DiffBlock> blocks;
auto it_this = istreambuf_iterator<char>(stream());
auto it_other = istreambuf_iterator<char>(other.stream());
auto eof = istreambuf_iterator<char>();
// get a ref to the shortest iterator between the two
istreambuf_iterator<char>& min_iter(size < other.size ? it_this : it_other);
size_t offset_counter = 0;
size_t largest_block_size = 0;
while (min_iter != eof) {
// find the first differing byte
while (min_iter != eof && *it_this == *it_other) {
++it_this;
++it_other;
++offset_counter;
}
// if we found one (i.e., we're not at end of file)
if (min_iter != eof) {
size_t offset = offset_counter;
size_t size = 0;
// find the next matching byte
while (min_iter != eof && *it_this != *it_other) {
++it_this;
++it_other;
++size;
}
offset_counter += size;
blocks.push_back(DiffBlock { offset, size });
if (size > largest_block_size) {
largest_block_size = size;
}
}
}
return blocks;
}
#pragma once
struct DiffBlock {
size_t offset;
size_t size;
// Returns the offset after this block
size_t next() const { return offset + size; }
// Returns a DiffBlock that consists of the merger between this block,
// the next block, and all that's in between
DiffBlock operator+(const DiffBlock& other) const {
return DiffBlock { offset, other.next() - offset };
}
// Returns a distance (in bytes) between this block and the next.
// A distance is amount of bytes between the last byte of this block and
// the first byte of the next block.
size_t distance(const DiffBlock& next) const {
return next.offset - this->next();
}
};
#include <algorithm>
#include <iostream>
#include <fstream>
#include <vector>
#include "BinaryFile.hpp"
#include "DiffBlock.hpp"
using namespace std;
const static size_t max_gap = 8; // for block consolidation
static void print_blocks(const vector<DiffBlock>& blocks) {
cout << "--- Differing Blocks --------------------------" << endl;
for (auto it = blocks.cbegin(); it != blocks.cend(); ++it) {
cout << " > 0x" << hex << it->offset << ": " << it->size << " bytes" << endl;
}
}
static void consolidate(vector<DiffBlock>& blocks, size_t max_distance) {
for (size_t i = 0; i + 1 < blocks.size();) {
DiffBlock& current = blocks[i];
// merge next blocks into this one as long as they're close
while (i + 1 < blocks.size()) {
DiffBlock& next = blocks[i+1];
if (current.distance(next) <= max_distance) {
current.size = next.next() - current.offset;
blocks.erase(blocks.begin() + i + 1);
} else {
i++;
break;
}
}
}
}
static int compare(string& left, string& right) {
BinaryFile left_file(left);
BinaryFile right_file(right);
vector<DiffBlock> diff_blocks = left_file.compare(right_file);
consolidate(diff_blocks, max_gap);
print_blocks(diff_blocks);
return 0;
}
static void print_syntax(char **argv) {
cout << "Syntax: " << argv[0] << " <left_file> <right_file>" << endl;
}
int main(int argc, char **argv) {
if (argc < 3) {
print_syntax(argv);
return 1;
}
string left(argv[1]);
string right(argv[2]);
return compare(left, right);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment