Skip to content

Instantly share code, notes, and snippets.

View macleginn's full-sized avatar

Dmitry Nikolayev macleginn

View GitHub Profile
import pickle
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
# Load the pickled object stored under sts_attributions/ into `shelf_approx`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('sts_attributions/shelf_approx_attr_l-9_N-100.pkl', 'rb') as inp:
    shelf_approx = pickle.load(inp)
@macleginn
macleginn / xsbert_worker_process.py
Last active September 8, 2023 12:50
XSBERT worker process
import os
import sys
import pickle
import requests
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling
from sentence_transformers import util
from xsbert import models
@macleginn
macleginn / xsbert_queue_server.py
Created September 8, 2023 11:44
XSBERT queue server
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
import pandas as pd
# Host name and port number used by this script
# (presumably the address the HTTP server listens on -- confirm below).
hostName, serverPort = "localhost", 20000
# Module-level list holding the queue elements; starts out empty.
queue = list()
@macleginn
macleginn / clusterise_domain.py
Created December 20, 2022 08:28
Clusterisation of fine-grained CMP domains based on SBERT sentence similarities
from collections import defaultdict
from itertools import combinations
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
def compute_kernel_bias(vecs, k=None):
"""
Code taken from: https://github.com/bojone/BERT-whitening
import os
import sys
import shutil
def copy_tree(src, dst):
'''
Copy a directory tree from src to dst ignoring dangling
symlinks, retrieving files symlinks point to, and
breaking the cycles, i.e. never copying the same
@macleginn
macleginn / predict_from_CLS.py
Last active June 3, 2022 13:54
Training and evaluation code for a simple model that predicts a token removed from a sentence
import json
from math import ceil
from random import shuffle
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_scheduler
@macleginn
macleginn / simulation_step.py
Last active February 11, 2022 12:01
A step in the simulation of random feature spread on a network guided by NPM
import numpy as np
# We're given an n by n distance matrix *D* with transfer
# probabilities for a given pair of nodes (for any feature),
# a feature matrix *M*, and a dropout probability p_d.
# We convert the transfer probabilities to no-transfer probabilities
# and take their logs. log1p(-D) is mathematically log(1 - D) but does
# not lose precision when the transfer probabilities are very small.
L = np.log1p(-D)
# Collect every Unicode punctuation character into one set.
import sys
from unicodedata import category
# One-shot iterator over all code points as single-character strings.
chrs = map(chr, range(sys.maxunicode + 1))
# Punctuation general categories all begin with "P" (Pc, Pd, Ps, Pe, ...).
punctuation = {ch for ch in chrs if category(ch)[0] == "P"}
# A hyphen can occur inside words, so it is not treated as punctuation.
punctuation.remove('-')
def tokenize(s, lower_case=False):
@macleginn
macleginn / get_roberta_word_embeddings.py
Created June 21, 2021 07:17
Code for extracting word embeddings from RoBERTa
def rm_whitespace(s):
    """Return *s* without a single leading 'Ġ' marker, if it has one.

    Only the first character is examined; a string that does not start
    with 'Ġ' (including the empty string) is returned unchanged.
    """
    return s[1:] if s.startswith('Ġ') else s
def get_tokens_with_ranges(input_string, tokenizer):
'''
RoBERTa prepends 'Ġ' to the beginning of what it
import pandas as pd
import matplotlib.pyplot as plt
# Read the sheet without a header row, so columns are labelled 0, 1, 2, ...
d = pd.read_excel('spectrograms-relative-20.xlsx', header=None)
# Build row labels of the form "<first column>-<second column>"
first_part, second_part = d.iloc[:, 0], d.iloc[:, 1]
index_col = [f'{a}-{b}' for a, b in zip(first_part, second_part)]
d.index = index_col
# The two source columns are now redundant; drop them in place
d.drop(columns=[0, 1], inplace=True)