Skip to content

Instantly share code, notes, and snippets.

@longlongh4
Last active April 8, 2020 06:38
Show Gist options
  • Save longlongh4/4d8c53fbbc4f47a99d060ee39c34bb99 to your computer and use it in GitHub Desktop.
Save longlongh4/4d8c53fbbc4f47a99d060ee39c34bb99 to your computer and use it in GitHub Desktop.
A demo of using FAISS to build an index for Hamming-distance-based fingerprints.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
#include <faiss/AutoTune.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>
using namespace std;
const string hashPath = "/home/hailong/labs/ads/hash.txt";
/****
dataset example:
37526101-d42e-416f-956d-31744915e0e4:2702284295253482914,1961827062776793808,15338216169055532216,1277173557604361546,323801233997848268,6859946266805363904,5389844540866461070,5463698079089396960,12707568597411666992,322984150776704616,5156804278819165924,7653749133710948762,35570857859768012,12088489394757528714,8796776315060649254,18059698677919318550
60277e94-b9a1-4539-ba42-7a8431caef18:18393304965971490470,6748912653283961598,14160794321219562102,12355755556287909842,12810266938956409008
*******/
/**
 * Fingerprints of one video, parsed from a single dataset line of the form
 * "<id>:<hash>,<hash>,...", where every hash is an unsigned 64-bit decimal.
 */
class VideoHashes
{
public:
    /**
     * Parses one dataset line.
     *
     * @param line text of the line; everything up to and including the first
     *             ':' is discarded, the rest is read as comma-separated hashes.
     *             Parsing stops at the first token that is not a valid uint64.
     * @param id   numeric identifier stored in videoID.
     */
    VideoHashes(const std::string &line, std::int64_t id);
    // Don't use the UUID here, because it is too big as a payload for indexing.
    std::int64_t videoID;
    // Each uint64 hash is stored as 8 consecutive bytes, most significant byte
    // first, because FAISS takes the fingerprints as a flat array of uint8.
    std::vector<std::uint8_t> hashes;
};

VideoHashes::VideoHashes(const std::string &line, std::int64_t id)
    : videoID(id)
{
    std::istringstream iss(line);
    // Skip the "<id>:" prefix whatever its length; a line without ':' is
    // consumed entirely and yields no hashes.
    iss.ignore(std::numeric_limits<std::streamsize>::max(), ':');
    for (std::uint64_t hash; iss >> hash;)
    {
        // Zero hashes are not stored (presumably a "no fingerprint"
        // placeholder in the dataset -- TODO confirm).
        if (hash != 0)
        {
            for (int shift = 56; shift >= 0; shift -= 8)
            {
                hashes.push_back(static_cast<std::uint8_t>((hash >> shift) & 0xFF));
            }
        }
        // Step over the separator so the next extraction starts at a digit.
        if (iss.peek() == ',')
        {
            iss.ignore();
        }
    }
}
int main()
{
vector<VideoHashes> videoHashesArray;
ifstream input(hashPath);
int64_t videoIndex = 1;
for (string line; getline(input, line);)
{
videoHashesArray.push_back(VideoHashes(line, videoIndex++));
}
printf("parsed %lu videos\n", videoHashesArray.size());
vector<uint8_t> trainData;
for (VideoHashes video : videoHashesArray)
{
trainData.insert(end(trainData), begin(video.hashes), end(video.hashes));
}
// Dimension of the vectors
int d = 64;
// Initializing the quantizer.
faiss::IndexBinaryFlat quantizer(d);
// Number of clusters.
int nlist = 32;
// Initializing index.
faiss::IndexBinaryIVF index(&quantizer, d, nlist);
index.nprobe = 4; // Number of nearest clusters to be searched per query.
index.train(trainData.size() / 8, trainData.data());
for (VideoHashes video : videoHashesArray)
{
vector<faiss::Index::idx_t> labels(video.hashes.size() / 8, video.videoID);
index.add_with_ids(video.hashes.size() / 8, video.hashes.data(), labels.data());
}
cout << "total fingerprints:" << index.ntotal << endl;
// How many neighbours to return for each fingerprint
int k = 10;
// how many fingerprints in the query(we can query with more than one fingerprint)
faiss::Index::idx_t n = videoHashesArray[0].hashes.size() / 8;
// use this vector to get the distance result
vector<int32_t> distance(k * n);
// use this vector to get the labels result
vector<faiss::Index::idx_t> labels(k * n);
index.search(n, videoHashesArray[0].hashes.data(), k, distance.data(), labels.data());
for (int i = 0; i < n; i++)
{
cout << "frame index:" << i << endl;
for (int j = 0; j < k; j++)
{
cout << "distance:" << distance[i * 8 + j] << ',' << "labels:" << labels[i * 8 + j] << endl;
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment