Skip to content

Instantly share code, notes, and snippets.

View dchaplinsky's full-sized avatar

Dmitry Chaplinsky dchaplinsky

View GitHub Profile
@dchaplinsky
dchaplinsky / inflector.py
Last active May 7, 2023 19:06
A script that uses pymorphy3/pymorphy3-dicts-uk to generate inflections of n-grams
# pip install pymorphy3
# pip install pymorphy3-dicts-uk
import pymorphy3
from collections import defaultdict
from itertools import product
from typing import List, List
morph = pymorphy3.MorphAnalyzer(lang="uk")
import os
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
# Load pre-trained model for sentence embeddings
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
# Set up LSTM model
input_size = 768 # Size of the sentence embeddings
@dchaplinsky
dchaplinsky / export_ukr_news_dataset.py
Last active April 12, 2023 16:34
A set of scripts to export and deduplicate data from different Ukrainian corpora for GPT-x tuning
import json
import argparse
from typing import Dict
from pathlib import Path
import smart_open
import ftfy
from tqdm import tqdm
import html2text
from datasets import load_dataset
@dchaplinsky
dchaplinsky / embedder.py
Last active April 7, 2023 11:41
A script to embed sentences using different pooling strategies and RNN-like Flair embeddings
import argparse
from flair.data import Sentence
from flair.embeddings import (
DocumentEmbeddings,
FlairEmbeddings,
DocumentLMEmbeddings,
DocumentPoolEmbeddings,
)
from torch import Tensor
@dchaplinsky
dchaplinsky / instructions_retrieval.sh
Created April 6, 2023 14:30
Small bash script which downloads 1.6TB of extracted structured data of the common crawl and finds pages where HowTo/FAQ structured data is available.
#!/bin/bash
# You will need to run `apt-get install parallel pv` to make it run
# download file containing urls
curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt
# create output file
touch output.txt
@dchaplinsky
dchaplinsky / wiki_parser.py
Created March 25, 2023 15:39
A custom fork of the gensim library's Wikipedia reader that is better suited to the dump of the Ukrainian Wikipedia
import bz2
import logging
import multiprocessing
import re
from pickle import PicklingError
# LXML isn't faster, so let's go with the built-in solution
from xml.etree.ElementTree import iterparse
import os.path
from flair.data import Dictionary
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
def train_flair_embeddings(
corpus_path="/data/ubertext/for_flair",
dictionary_path="/home/dima/Projects/flair_embeddings/flair_dictionary.pkl",
lm_file="./language_model_forward_no_amp_accum_grad_fixed",
@dchaplinsky
dchaplinsky / avatars.py
Created September 28, 2011 18:12
Get avatars from social networks (Facebook and Google) with django-social-auth
from social_auth.backends.facebook import FacebookBackend
from social_auth.backends import google
def social_extra_values(sender, user, response, details, **kwargs):
result = False
if "id" in response:
from apps.photo.models import Photo
from urllib2 import urlopen, HTTPError
from django.template.defaultfilters import slugify
@dchaplinsky
dchaplinsky / pwn_synset_cardinality.py
Created October 4, 2022 11:07
A simple way to calculate how many leaves a synset has and what its level is in the hypernym/hyponym tree
import wn
import csv
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
wn.download("pwn:3.1")
pwn = wn.Wordnet("pwn:3.1")
@dchaplinsky
dchaplinsky / Blackboard.tmTheme
Created January 3, 2013 21:42
Fixed Blackboard.tmTheme for Sublime Text 2 (now with proper coloring for Diff). Matches diff colouring from TextMate Blackboard theme.
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>name</key>
<string>Blackboard</string>
<key>author</key>
<string>Domenico Carbotta</string>
<key>settings</key>
<array>