Skip to content

Instantly share code, notes, and snippets.

View IlyaGusev's full-sized avatar

Ilya Gusev IlyaGusev

  • Booking.com
  • Amsterdam
View GitHub Profile
@IlyaGusev
IlyaGusev / config.json
Last active January 17, 2022 12:29
Clustering embedder train
{
"max_tokens": 150,
"min_agreement": 0.79,
"epochs": 3,
"eval_steps": 32,
"warmup_steps": 16,
"lr": 0.00003,
"seed": 42,
"batch_size": 16,
"grad_accum_steps": 4,
@IlyaGusev
IlyaGusev / train.py
Last active July 10, 2023 06:58
Generic Transformer classifier finetune
import argparse
import json
import random
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from tqdm import tqdm
@IlyaGusev
IlyaGusev / calc_labse.py
Last active December 16, 2021 19:31
LaBSE inference
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
# TF-Hub handle of the LaBSE v2 sentence encoder model.
DEFAULT_ENCODER_PATH = "https://tfhub.dev/google/LaBSE/2"
# TF-Hub handle of the matching multilingual text preprocessor
# (tokenizes raw strings into the encoder's expected input format).
DEFAULT_PREPROCESSOR_PATH = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
# RabbitMQ broker service. The scraped copy had its YAML nesting flattened
# (every key at column 0), which is invalid compose syntax — indentation
# restored here. NOTE(review): in a full docker-compose file this fragment
# presumably sits under a top-level `services:` key — confirm against source.
rmq:
  image: rabbitmq:3.8.7
  container_name: rabbitmq
  # Persist config, message store, and logs on the host.
  volumes:
    - /data/rabbitmq/etc/:/etc/rabbitmq/
    - /data/rabbitmq/data/:/var/lib/rabbitmq/
    - /data/rabbitmq/logs/:/var/log/rabbitmq/
  # 5672 = AMQP, 15672 = management UI.
  ports:
    - 5672:5672
    - 15672:15672
@IlyaGusev
IlyaGusev / test_mbart.py
Created May 26, 2020 14:27
test_mbart.py
import torch
from tqdm import tqdm
from transformers import BartForConditionalGeneration, MBartTokenizer
def chunks(lst, n):
    """Yield consecutive slices of *lst*, each holding at most *n* items.

    The final slice may be shorter than *n* when ``len(lst)`` is not a
    multiple of *n*. Works on any sliceable sequence (list, str, tuple).
    """
    yield from (lst[pos:pos + n] for pos in range(0, len(lst), n))
def generate_summaries(examples, model_name, batch_size):
import torch
from transformers import (
BartConfig,
BartForConditionalGeneration
)
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
@IlyaGusev
IlyaGusev / preprocess_bart.sh
Created April 28, 2020 10:31
BART dataset preprocessing
BART_PATH="mbart.cc25"
TASK="data"
rm -rf "${TASK}-bin/"
fairseq-preprocess \
--source-lang "source" \
--target-lang "target" \
--trainpref "${TASK}/train.bpe" \
--validpref "${TASK}/val.bpe" \
--testpref "${TASK}/test.bpe" \
--destdir "${TASK}-bin/" \
@IlyaGusev
IlyaGusev / setup.py
Created April 28, 2020 10:27
Fixed setup.py for APEX
import torch
from setuptools import setup, find_packages
import subprocess
import sys
import warnings
import os
# ninja build does not work unless include_dirs are abs path
# Absolute path of the directory containing this setup.py, used to build
# those absolute include_dirs.
this_dir = os.path.dirname(os.path.abspath(__file__))
@IlyaGusev
IlyaGusev / predict_bart.sh
Last active April 28, 2020 10:35
predict_bart.sh
# Generate test-set predictions from a fine-tuned mBART checkpoint with fairseq.
FAIRSEQ_PATH="fairseq"
CHECKPOINT_PATH="checkpoints/checkpoint_best.pt"
DATA_BIN_PATH="data-bin"
BART_PATH="mbart.cc25"
# Full mBART-cc25 language list, passed via --langs to the
# translation_from_pretrained_bart task below.
langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN
# Decode the "test" subset (source -> target) with the pretrained sentencepiece
# model, 32 sentences per batch, sacreBLEU scoring; write output to predicted.txt.
python3.7 "${FAIRSEQ_PATH}"/generate.py "${DATA_BIN_PATH}" --path "${CHECKPOINT_PATH}" \
--task translation_from_pretrained_bart --gen-subset test -t target -s source \
--bpe 'sentencepiece' --sentencepiece-vocab "${BART_PATH}/sentence.bpe.model" \
--sacrebleu --max-sentences 32 --langs $langs > predicted.txt
@IlyaGusev
IlyaGusev / setup.sh
Last active March 19, 2023 18:17
vast.ai setup
# Bootstrap a fresh vast.ai GPU instance for PyTorch/transformers work.
# Image: nvidia/cuda:11.7.0-cudnn8-devel-ubuntu20.04
# Disable vast.ai's automatic tmux session on login.
touch ~/.no_auto_tmux
apt-get update
# Core tooling: editor, terminal multiplexer, compiler, Python 3.8, git-lfs.
apt-get install -y vim screen wget g++ git python3.8 python3.8-dev python3-distutils git-lfs
# Personal vim/screen configs (Dropbox-hosted).
wget https://www.dropbox.com/s/cxws7jpt3nlxn2l/vimrc -O ~/.vimrc
wget https://www.dropbox.com/s/wbdlntxmujpat9o/screenrc -O ~/.screenrc
# Install pip for python3.8, then the ML stack.
wget https://bootstrap.pypa.io/get-pip.py && python3.8 get-pip.py
python3.8 -m pip install torch transformers