Skip to content

Instantly share code, notes, and snippets.

View amallia's full-sized avatar
🏠
In Pisa

Antonio Mallia amallia

🏠
In Pisa
View GitHub Profile
syntax = "proto3";
// dense.proto
// This file defines the structure for a Dense Index in the Common Index File Format (CIFF).
// A Dense Index is used to store high-dimensional embeddings of documents,
// typically used in vector space models for information retrieval.
// An Embedding message represents a dense vector for a document.
// Each embedding is a high-dimensional vector, where each dimension is a float.
message Embedding {
# MonoBERT: a BertPreTrainedModel subclass that holds a 'bert-base-uncased'
# tokenizer and a BertForSequenceClassification model built from `config`.
# NOTE(review): this preview has lost its indentation and is cut off inside
# `forward` (no return statement is visible) — restore both from the full file.
class MonoBERT(BertPreTrainedModel):
# Build the tokenizer and the wrapped classification model, then call
# init_weights().
def __init__(self, config):
super(MonoBERT, self).__init__(config)
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
self.bert = BertForSequenceClassification(config)
self.init_weights()
# Run the wrapped model on the given inputs and take the first element of
# its output.
def forward(self, input_ids, attention_mask, token_type_ids):
# NOTE(review): __init__ assigns `self.bert`, but this line reads
# `self.model`, which is not assigned anywhere in the visible code —
# confirm whether this should be `self.bert`.
outputs = self.model(input_ids, attention_mask, token_type_ids)
# Presumably the classification logits — confirm against the model's output type.
logits = outputs[0]
# Write checkpoints to "<current working directory>/checkpoints" every 250
# training steps. The file name is built from the step number and the
# training loss; automatic metric-name insertion is turned off because the
# template already spells the names out.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(os.getcwd(), "checkpoints"),
    filename='step{step:02d}-train_loss{train_loss:.3f}',
    auto_insert_metric_name=False,
    verbose=True,
    every_n_train_steps=250,
)

# Build the trainer from the parsed command-line arguments, with gradient
# accumulation over 2 batches and a gradient clipping value of 2.
trainer = pl.Trainer.from_argparse_args(
    args,
    callbacks=[checkpoint_callback],
    accumulate_grad_batches=2,
    gradient_clip_val=2,
)

# Start training.
trainer.fit(model)
#include <iostream>
#include <optional>
#include <unordered_set>
#include "boost/algorithm/string/classification.hpp"
#include "boost/algorithm/string/split.hpp"
#include <boost/functional/hash.hpp>
#include "boost/algorithm/string/split.hpp"
#include "mio/mmap.hpp"
1d03d845d745bf9da5b91407ec9bc4dd ./NYT-Corpus/data/1991/08.tgz
c93f03e6e77b7d4ac288cfb3e4d488f4 ./NYT-Corpus/data/1990/02.tgz
9957d82c9943ea56eb041198c9d7e6b3 ./NYT-Corpus/data/1990/07.tgz
a6df9a82a892963b7eb1dae32aa81451 ./NYT-Corpus/data/1990/12.tgz
807396434c0569fb5619df27339491f1 ./NYT-Corpus/data/1990/01.tgz
0c1a2cd83e28bee2470820bb34c68426 ./NYT-Corpus/data/1990/03.tgz
c9b50b7284217c2e99e375aa8d778533 ./NYT-Corpus/data/1990/04.tgz
f18d5732e58f786b71c44464d4134277 ./NYT-Corpus/data/1990/10.tgz
06640752bc94bc23efdf3f27b1c3c93c ./NYT-Corpus/data/1991/06.tgz
fee662846305bacab24676a785a7cb39 ./NYT-Corpus/data/1991/10.tgz
@amallia
amallia / machine_learned_index.py
Created March 2, 2019 07:42 — forked from tokestermw/machine_learned_index.py
Using deep learning to approximate a B-Tree index from this paper: https://arxiv.org/abs/1712.01208 (The Case for Learned Index Structures)
import click
import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable
import os
import random
import math
#pragma once
template <typename WandType>
struct block_max_maxscore_query {
typedef bm25 scorer_type;
block_max_maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
template <typename Index>
@amallia
amallia / custom_iterator.cpp
Created September 29, 2018 00:30 — forked from jeetsukumaran/custom_iterator.cpp
Sample C++/STL custom iterator
// Sample custom iterator.
// By perfectly.insane (http://www.dreamincode.net/forums/index.php?showuser=76558)
// From: http://www.dreamincode.net/forums/index.php?showtopic=58468
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <cassert>
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
## Global matplotlib settings: they apply to every plot created below.
# Render text through LaTeX, loading the libertine package (the ACM default
# font, per the original note) together with fixltx2e, and use a sans-serif
# font family.
plt.rcParams.update({
    'text.usetex': True,
    'text.latex.preamble': r'\usepackage{libertine}\usepackage{fixltx2e}',
    'font.family': "sans-serif",
})
@amallia
amallia / search.cpp
Last active December 19, 2017 22:23
// search.cpp
// g++ -march=native -std=c++14 search.cpp -o search
#include <random>
#include <iostream>
#include <cstdlib>
#include <smmintrin.h>
#include <vector>
#include <algorithm>
#include <chrono>