Skip to content

Instantly share code, notes, and snippets.

@vsrinivas
Created April 15, 2019 02:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsrinivas/1287d576369bacdcdd2fcc3726d5dcde to your computer and use it in GitHub Desktop.
Save vsrinivas/1287d576369bacdcdd2fcc3726d5dcde to your computer and use it in GitHub Desktop.
#define _XOPEN_SOURCE 500
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <string.h>
#include <unordered_set>
#include <leveldb/db.h>
/* dbtransactor [ref] [src] */
int main(int argc, char* argv[])
{
std::unordered_set<std::string> ref_hashes;
int i;
int nRefFiles = 0;
int nDuplicates = 0;
int nMisses = 0;
leveldb::DB* ref;
leveldb::Options ref_options;
ref_options.create_if_missing = false;
leveldb::Status status = leveldb::DB::Open(ref_options, std::string(argv[1]), &ref);
if (!status.ok()) {
printf("Error opening ref\n");
return -1;
}
leveldb::DB* candidate;
status = leveldb::DB::Open(ref_options, std::string(argv[2]), &candidate);
if (!status.ok()) {
printf("Error opening candidate\n");
return -1;
}
leveldb::ReadOptions r_options;
r_options.verify_checksums = true;
r_options.fill_cache = false;
leveldb::Iterator* it = ref->NewIterator(r_options);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
if (!it->status().ok()) {
printf("Iteration error!\n");
break;
}
auto unique = ref_hashes.insert(it->value().ToString());
if (!unique.second)
++nDuplicates;
++nRefFiles;
}
delete it;
delete ref;
it = candidate->NewIterator(r_options);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
if (!it->status().ok()) {
printf("Iteration error!\n");
break;
}
bool found = ref_hashes.find(it->value().ToString()) != ref_hashes.end();
if (!found) {
printf("Not found in ref: %s, %s\n",
it->key().ToString().c_str(),
it->value().ToString().c_str());
++nMisses;
}
}
delete it;
delete candidate;
printf("nRefFiles %d\n", nRefFiles);
printf("nDuplicates %d\n", nDuplicates);
printf("nRefUnique %d\n", ref_hashes.size()); // nDuplicates + this == nRefFiles
printf("nMisses %d\n", nMisses);
}
#define _XOPEN_SOURCE 500
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <fcntl.h>
#include <ftw.h>
#include <string.h>
#include <leveldb/db.h>
#include "md5.h"
static leveldb::DB* db;
static int nFiles;
int cb(const char *path, const struct stat *sb, int typeflag, struct FTW *) {
char* val;
int rc = 0;
switch (typeflag) {
case FTW_F: {
if ((sb->st_mode & S_IFMT) != S_IFREG)
break;
leveldb::Slice key(path, strlen(path) + 1);
val = md5sum(path);
if (!val) {
printf("md5sum error: %s\n", path);
rc = -1;
break;
}
printf("==> %s, %s\n", path, val);
leveldb::Slice value(val, 32 + 1);
++nFiles;
auto s = db->Put(leveldb::WriteOptions(), key, value);
if (!s.ok()) {
printf("leveldb error: %s\n", path);
rc = -1;
}
free(val);
break;
}
default:
break;
}
return rc;
}
/* ftwdb2_leveldb <path> <dbfile> */
/*
 * Build a LevelDB at <dbfile> that maps every regular file found under
 * <path> to its MD5 sum (32 ASCII hex characters). The database is created
 * if it does not exist. The tree is walked with FTW_PHYS, so symbolic links
 * are not followed.
 *
 * Returns 0 on success and -1 when the arguments are missing, the database
 * cannot be opened, or the walk stops early.
 */
int main(int argc, char *argv[])
{
    if (argc < 3) {
        printf("usage: %s <path> <dbfile>\n",
               argc > 0 ? argv[0] : "ftwdb2_leveldb");
        return -1;
    }

    leveldb::Options options;
    options.create_if_missing = true;
    leveldb::Status status = leveldb::DB::Open(options, argv[2], &db);
    if (!status.ok()) {
        printf("Error creating db\n");
        return -1;
    }

    /* nftw() yields -1 on its own failure, or whatever non-zero value the
     * callback returned to stop the walk. */
    const int walk_rc = nftw(argv[1], cb, 64, FTW_PHYS);
    if (walk_rc) printf("%d err %d\n", walk_rc, errno);

    delete db;
    db = nullptr;
    printf("%d files\n", nFiles);

    /* Do not report success for a walk that ended prematurely. */
    return walk_rc ? -1 : 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <limits.h>
#include <openssl/md5.h>
/*
 * MD5 a file.
 *
 * path - file to hash; opened in binary mode.
 *
 * Returns a newly allocated, NUL-terminated string of 32 lowercase hex
 * characters that the caller must free(). Returns NULL when the file cannot
 * be opened, when a read error occurs, or when strdup() fails.
 */
char *md5sum(const char *path) {
    FILE *fp;
    unsigned char hash_buf[MD5_DIGEST_LENGTH];
    MD5_CTX hash_context;
    size_t bytes;
    /* Read in large chunks; a one-byte buffer costs one fread() per byte. */
    unsigned char data[32 * 1024];
    /* Plain char so the buffer matches what snprintf()/strdup() expect. */
    char sum[2 * MD5_DIGEST_LENGTH + 1];
    int i;
    int error = 0;

    fp = fopen(path, "rb");
    if (!fp)
        return NULL;

    MD5_Init(&hash_context);
    for (;;) {
        /* Element size 1, so the return value is a byte count. */
        bytes = fread(data, 1, sizeof(data), fp);
        if (bytes > 0)
            MD5_Update(&hash_context, data, bytes);
        if (bytes < sizeof(data)) {
            /* A short read means end-of-file or an error; stop either way. */
            if (ferror(fp))
                error = 1;
            break;
        }
    }
    fclose(fp);

    if (error)
        return NULL;

    MD5_Final(hash_buf, &hash_context);
    for (i = 0; i < MD5_DIGEST_LENGTH; i++)
        snprintf(&sum[2 * i], 3, "%02x", hash_buf[i]);
    sum[2 * MD5_DIGEST_LENGTH] = '\0';
    return strdup(sum);
}
/*
 * MD5 an in-memory buffer.
 *
 * buf  - start of the data to hash
 * size - number of bytes of buf to hash
 *
 * Returns a newly allocated, NUL-terminated string of 32 lowercase hex
 * characters that the caller must free(), or NULL when strdup() fails.
 */
char *md5sumbuf(const char *buf, size_t size) {
    /* Plain char so the buffer matches what snprintf()/strdup() expect. */
    char sum[2 * MD5_DIGEST_LENGTH + 1];
    unsigned char hash_buf[MD5_DIGEST_LENGTH];
    int i;
    MD5_CTX hash_context;

    MD5_Init(&hash_context);
    MD5_Update(&hash_context, buf, size);
    MD5_Final(hash_buf, &hash_context);

    /* Two hex digits per digest byte; the size of 3 leaves room for the NUL. */
    for (i = 0; i < MD5_DIGEST_LENGTH; i++)
        snprintf(&sum[2 * i], 3, "%02x", hash_buf[i]);
    sum[2 * MD5_DIGEST_LENGTH] = '\0';
    return strdup(sum);
}
#ifndef _MD5_H_
#define _MD5_H_
#include <stddef.h>
/* MD5 a file; return the 128-bit hash as a NUL-terminated string of
 * 32 hex characters. */
/* Returns NULL if the file cannot be opened or a read error occurs. */
/* Caller is responsible for deallocating return value w/ ::free */
#ifdef __cplusplus
extern "C"
#endif
char *md5sum(const char *path);
/* MD5 the first `size` bytes of `buf`; return the hash as a NUL-terminated
 * string of 32 hex characters. */
/* Caller is responsible for deallocating return value w/ ::free */
#ifdef __cplusplus
extern "C"
#endif
char *md5sumbuf(const char *buf, size_t size);
#endif // _MD5_H_
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment