Skip to content

Instantly share code, notes, and snippets.

@alexeyev
alexeyev / simple_text_classification_distilbert.py
Created July 30, 2023 07:50
Binary classification with DistilBERT, minimal example
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
@alexeyev
alexeyev / apertium_tokenizer.py
Last active July 26, 2023 17:41
Apertium-Kir-Based Tokenizer
# coding: utf-8
"""
Tokenization as it is done in Apertium; may not be blazing fast,
since a full-scale morphological analysis is carried out
"""
import apertium
import re
from typing import List, Tuple
from streamparser import LexicalUnit, reading_to_string
@alexeyev
alexeyev / LICENSE
Last active July 21, 2023 14:20
Converting Doccano NER task export (JSONL file) to a CONLL03-formatted file
MIT License
Copyright (c) 2023 Anton Alekseev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
# coding: utf-8
sentences = []
for line in open("test.txt", "r", encoding="utf-8").readlines()[1:]:
seq = line.strip().split(" ")
if len(seq) == 1:
sentences.append([])
@alexeyev
alexeyev / onnx2pytorch.py
Created September 30, 2021 23:18 — forked from qinjian623/onnx2pytorch.py
ONNX file to Pytorch model
import onnx
import struct
import torch
import torch.nn as nn
import torchvision as tv
import warnings
# enum DataType {
# UNDEFINED = 0;
@alexeyev
alexeyev / raspberry_pi_camera_telegram_bot.py
Last active August 31, 2021 18:13
Считаем ворон с помощью Raspberry Pi и Telegram API
# coding: utf-8
import configparser
import logging
import telebot
from time import sleep
from picamera import PiCamera
logger = logging.getLogger("counting-crops")
logger.setLevel(logging.DEBUG)
@alexeyev
alexeyev / hogweed_photos_collector_bot.py
Created August 31, 2021 12:12
Telegram bot saving (hogweed) photos on disk
# coding: utf-8
import configparser
import logging
import telebot
logger = logging.getLogger("hogweed-ground-level")
logger.setLevel(logging.DEBUG)
from functools import lru_cache
from nltk import TweetTokenizer, WordNetLemmatizer
from tqdm import tqdm
from gsdmm import MovieGroupProcess
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import pickle
import nltk
# coding: utf-8
from difflib import SequenceMatcher
t0 = open("text0.txt", "r+").read().strip().replace("\n", " ").replace(" ", " ")
t1 = open("text1.txt", "r+").read().strip().replace("\n", " ").replace(" ", " ")
matcher = SequenceMatcher(a=t0, b=t1)
ratio = matcher.ratio()
mbs = matcher.get_matching_blocks()
#!/usr/bin/env python3
"""
We do not recommend using this script for any purpose other than learning to use Selenium;
for batched machine translation via Google Translate, the 'document' translation feature
is arguably the most suitable option. For regular translations, one should use the Cloud API.
"""
import time
from selenium.common.exceptions import TimeoutException