Skip to content

Instantly share code, notes, and snippets.

View dchaplinsky's full-sized avatar

Dmitry Chaplinsky dchaplinsky

View GitHub Profile
import os
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
# Load pre-trained model for sentence embeddings
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
# Set up LSTM model
input_size = 768 # Size of the sentence embeddings
@dchaplinsky
dchaplinsky / export_ukr_news_dataset.py
Last active April 12, 2023 16:34
A set of scripts to export and deduplicate data from different Ukrainian corpora for GPT-x tuning
import json
import argparse
from typing import Dict
from pathlib import Path
import smart_open
import ftfy
from tqdm import tqdm
import html2text
from datasets import load_dataset
@dchaplinsky
dchaplinsky / embedder.py
Last active April 7, 2023 11:41
A script to embed sentences using different pooling strategies and RNN-like Flair embeddings
import argparse
from flair.data import Sentence
from flair.embeddings import (
DocumentEmbeddings,
FlairEmbeddings,
DocumentLMEmbeddings,
DocumentPoolEmbeddings,
)
from torch import Tensor
@dchaplinsky
dchaplinsky / instructions_retrieval.sh
Created April 6, 2023 14:30
A small Bash script that downloads 1.6 TB of extracted structured data from the Common Crawl and finds pages where HowTo/FAQ structured data is available.
#!/bin/bash
# You will need to run `apt-get install parallel pv` first to make it run
# download file containing urls
curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt
# create output file
touch output.txt
@dchaplinsky
dchaplinsky / wiki_parser.py
Created March 25, 2023 15:39
A custom fork of the gensim library's Wikipedia reader that is better suited to the Ukrainian Wikipedia dump
import bz2
import logging
import multiprocessing
import re
from pickle import PicklingError
# LXML isn't faster, so let's go with the built-in solution
from xml.etree.ElementTree import iterparse
@dchaplinsky
dchaplinsky / inflector.py
Last active May 7, 2023 19:06
A script for pymorphy3/pymorphy3-dicts-uk to generate inflections of n-grams
# pip install pymorphy3
# pip install pymorphy3-dicts-uk
import pymorphy3
from collections import defaultdict
from itertools import product
from typing import List, List
morph = pymorphy3.MorphAnalyzer(lang="uk")
import os.path
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
def train_flair_embeddings(
corpus_path="/data/ubertext/for_flair",
dictionary_path="/home/dima/Projects/flair_embeddings/flair_dictionary.pkl",
lm_file="./language_model_forward_no_amp_accum_grad_fixed",
@dchaplinsky
dchaplinsky / pwn_synset_cardinality.py
Created October 4, 2022 11:07
A simple way to calculate how many leaves a synset has and what its level is in the hypernym/hyponym tree
import wn
import csv
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
wn.download("pwn:3.1")
pwn = wn.Wordnet("pwn:3.1")
{
"lotNumberStr": "27059380",
"ln": 27059380,
"mkn": "DODGE",
"lm": "CHALLENGER",
"lcy": 2018,
"fv": "2C3CDZGGXJH289026",
"la": 24998,
"rc": 27890,
"obc": "A",
--- ../venv/lib/python3.6/site-packages/flask_lambda.py 2017-07-04 02:51:23.000000000 +0300
+++ lambdas/venues_finder/venues_finder/lambdify.py 2017-07-10 16:49:41.000000000 +0300
@@ -31,7 +31,7 @@
except ImportError:
from io import StringIO
-from werkzeug.wrappers import BaseRequest
+from werkzeug.wrappers import BaseRequest, Response