@kzinmr
kzinmr / vecs.py
Last active December 31, 2023 17:16
Simplify the [supabase/vecs](https://github.com/supabase/vecs) library
import uuid
from collections.abc import Iterable
from dataclasses import dataclass
from enum import Enum
from typing import Any, Self
from typing import cast as typing_cast
from pgvector.sqlalchemy import Vector
from sqlalchemy import (
    BinaryExpression,
    # ... (the rest of the import list is truncated in the gist preview)
)
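As context for the description above, here is a minimal sketch of what a stripped-down, vecs-style collection over pgvector + SQLAlchemy could look like. The `MiniCollection` name, table layout, and method names are illustrative assumptions, not the gist's actual code.

```python
# Illustrative sketch of a pgvector-backed collection (not the gist's code).
import uuid

from pgvector.sqlalchemy import Vector
from sqlalchemy import Column, MetaData, String, Table, create_engine, insert, select


class MiniCollection:
    """A stripped-down, vecs-style collection: one table of (id, embedding) rows."""

    def __init__(self, url: str, name: str, dimension: int):
        self.engine = create_engine(url)
        self.table = Table(
            name,
            MetaData(),
            Column("id", String, primary_key=True),
            Column("embedding", Vector(dimension)),
        )
        self.table.metadata.create_all(self.engine)

    def add(self, records):
        # records: iterable of (id, embedding) pairs; a real upsert would use
        # Postgres ON CONFLICT instead of a plain INSERT.
        with self.engine.begin() as conn:
            for id_, emb in records:
                conn.execute(
                    insert(self.table).values(id=id_ or str(uuid.uuid4()), embedding=emb)
                )

    def query(self, embedding, limit: int = 5):
        # Order rows by cosine distance to the query embedding (pgvector's <=> operator).
        stmt = (
            select(self.table.c.id)
            .order_by(self.table.c.embedding.cosine_distance(embedding))
            .limit(limit)
        )
        with self.engine.connect() as conn:
            return [row.id for row in conn.execute(stmt)]
```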
import multiprocessing
import kenlm
import MeCab
lm_model = None
def to_punct_between_texts(
    text_l, text_r, l_model, tokenizer, threshold=0.0, punct="。", print_score=False,
):
    # (the function body is truncated in the gist preview; see the sketch below)
    ...
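The preview cuts off mid-signature, but the imports (kenlm, MeCab) suggest a language-model score comparison. A hedged sketch of that idea, with an assumed MeCab wakati tokenizer and an assumed score-difference criterion, is below; it is not the gist's actual logic.

```python
# Hedged sketch: decide whether to insert `punct` between two texts by comparing
# KenLM log scores of the joined string with and without the punctuation mark.
import kenlm
import MeCab


def punct_score(text_l: str, text_r: str, lm: kenlm.Model, tagger: MeCab.Tagger,
                punct: str = "。") -> float:
    def to_words(text: str) -> str:
        # KenLM expects a space-separated token string.
        return tagger.parse(text).strip()

    with_punct = lm.score(to_words(text_l + punct + text_r))
    without_punct = lm.score(to_words(text_l + text_r))
    return with_punct - without_punct  # > 0 suggests the punctuation helps


# Usage (paths are placeholders):
# lm = kenlm.Model("ja.arpa.bin")
# tagger = MeCab.Tagger("-Owakati")
# if punct_score("今日は晴れ", "明日は雨", lm, tagger) > 0.0:
#     ...
```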
import os
from dataclasses import dataclass
from typing import Optional
import fugashi
import unidic_lite
@dataclass
class Token:
    ...  # (the field definitions are truncated in the gist preview)
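The `Token` dataclass body is truncated, so its fields are unknown. A minimal sketch of how fugashi is commonly wired to the unidic-lite dictionary follows; the `SimpleToken` fields and the use of `GenericTagger` are assumptions, not the gist's definitions.

```python
# Hedged sketch: tokenize Japanese text with fugashi + unidic-lite.
from dataclasses import dataclass

import fugashi
import unidic_lite


@dataclass
class SimpleToken:
    surface: str
    pos: str


def tokenize(text: str) -> list[SimpleToken]:
    # Point MeCab at the bundled unidic-lite dictionary.
    tagger = fugashi.GenericTagger(f'-d "{unidic_lite.DICDIR}"')
    tokens = []
    for word in tagger(text):
        # With GenericTagger, word.feature is the raw CSV feature tuple;
        # for UniDic the first field is the coarse part of speech.
        tokens.append(SimpleToken(surface=word.surface, pos=word.feature[0]))
    return tokens


# print(tokenize("今日は良い天気です"))
```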
from textdistance import damerau_levenshtein
import mojimoji
import regex as re
def partial_ratio(s1, s2, levenshtein_ratio=damerau_levenshtein.normalized_similarity):
    def _preprocess(s):
        # Normalize: full-width ASCII/digits to half-width, lowercase, strip whitespace.
        s = mojimoji.zen_to_han(s, kana=False, ascii=True, digit=True)
        s = s.lower()
        s = re.sub(r'\s+', '', s)
        return s
    # (the rest of partial_ratio is truncated in the gist preview; see the sketch below)
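One common way to finish a partial-ratio function is the fuzzywuzzy-style sliding window: compare the shorter string against every equally long window of the longer string and keep the best similarity. The sketch below assumes that definition; it is not necessarily what the gist does.

```python
# Hedged sketch of a sliding-window partial ratio.
from textdistance import damerau_levenshtein


def partial_ratio_sketch(s1: str, s2: str,
                         ratio=damerau_levenshtein.normalized_similarity) -> float:
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    if not shorter:
        return 0.0
    best = 0.0
    for start in range(len(longer) - len(shorter) + 1):
        window = longer[start:start + len(shorter)]
        best = max(best, ratio(shorter, window))
    return best
```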
import math
from collections import Counter
from dataclasses import dataclass
from itertools import tee, zip_longest
from typing import List
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as re
# https://pypi.org/project/stop-words/
from stop_words import get_stop_words
from nltk.corpus import stopwords  # requires a one-time nltk.download("stopwords")

stop_words = list(get_stop_words('en'))  # about 900 stopwords
nltk_words = list(stopwords.words('english'))  # about 150 stopwords
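A small illustrative example (not part of the gist) of using the two lists together: merge them into one set and count the remaining content words.

```python
# Merge the two stopword lists and count non-stopword tokens in a text.
from collections import Counter

import regex as re
from nltk.corpus import stopwords
from stop_words import get_stop_words

all_stop_words = set(get_stop_words("en")) | set(stopwords.words("english"))


def content_word_counts(text: str) -> Counter:
    tokens = re.findall(r"\p{L}+", text.lower())  # Unicode letters only
    return Counter(t for t in tokens if t not in all_stop_words)


# content_word_counts("The quick brown fox jumps over the lazy dog")
# -> Counter({'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, 'lazy': 1, 'dog': 1})
```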
@kzinmr
kzinmr / kwdlc2camphr.py
Created February 12, 2021 10:02
Convert KWDLC to camphr NER format using https://github.com/ku-nlp/kyoto-reader
import os
import kyoto_reader  # KyotoReader, Document


def convert_camphr_dataset(doc: kyoto_reader.Document) -> list:
    mrphs = doc.mrph_list()
    named_entities = list(doc.named_entities)
    entities = []
    # Join the morpheme surface forms (midasi) to recover the document text.
    midasis = [mrph.midasi for mrph in mrphs]
    text = ''.join(midasis)
    # (the rest of the conversion is truncated in the gist preview; see the sketch below)
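The conversion body stops here in the preview. A hedged sketch of the remaining step, locating each entity mention in `text` and emitting spaCy/camphr-style `(start, end, label)` spans, follows; the `ent.midasi` and `ent.category` attribute names are assumptions about the kyoto-reader API, and the output format is the common spaCy-style tuple, not necessarily the gist's exact schema.

```python
# Hedged sketch: turn entity mentions into character-offset spans over `text`.
def mentions_to_spans(text: str, named_entities) -> list:
    spans = []
    cursor = 0
    for ent in named_entities:
        mention = ent.midasi       # assumed: surface string of the mention
        label = str(ent.category)  # assumed: NE category (e.g. ORGANIZATION)
        start = text.find(mention, cursor)
        if start < 0:
            continue  # mention not found; skip rather than guess
        end = start + len(mention)
        spans.append((start, end, label))
        cursor = end
    return spans


# convert_camphr_dataset(doc) could then return, for example:
# [text, {"entities": mentions_to_spans(text, named_entities)}]
```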
# Company name sampler
import os
import pickle
import random
from dataclasses import dataclass
from typing import List

import numpy as np


def seed_everything(seed: int = 1234):
    # Standard reproducibility helper: seed Python's RNG, hashing, and NumPy.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
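The sampler itself is not shown in the preview. A speculative sketch of what a sampler built on these imports might look like, with a hypothetical pickle file of company-name strings, is below; the class and field names are illustrative only.

```python
# Hedged sketch of a company-name sampler; the pickle layout is an assumption.
import pickle
import random
from dataclasses import dataclass
from typing import List


@dataclass
class CompanyName:
    name: str


class CompanyNameSampler:
    def __init__(self, pickle_path: str, seed: int = 1234):
        # Expects a pickled list of company-name strings (assumption).
        with open(pickle_path, "rb") as f:
            names: List[str] = pickle.load(f)
        self.names = [CompanyName(n) for n in names]
        self.rng = random.Random(seed)

    def sample(self, k: int = 1) -> List[CompanyName]:
        return self.rng.sample(self.names, k)
```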
import torch


class WordDropout(torch.nn.Module):
    """
    Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space.
    """

    def __init__(self, dropout_rate=0.05, inplace=False):
        super(WordDropout, self).__init__()
        self.dropout_rate = dropout_rate
        self.inplace = inplace
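The preview ends inside `__init__`. A plausible forward pass for this kind of module, in the spirit of flair's WordDropout (sample one Bernoulli mask per word position and zero the whole vector), could look like the sketch below; it is not necessarily the gist's implementation.

```python
# Hedged sketch of word-level dropout: one Bernoulli draw per word position,
# broadcast over the embedding dimension, applied only in training mode.
import torch


def word_dropout_forward(x: torch.Tensor, dropout_rate: float, training: bool) -> torch.Tensor:
    # x: (batch, seq_len, embedding_dim)
    if not training or dropout_rate <= 0.0:
        return x
    keep_prob = 1.0 - dropout_rate
    mask = torch.bernoulli(x.new_full((x.size(0), x.size(1), 1), keep_prob))
    return x * mask


# Inside WordDropout this would typically become:
#     def forward(self, x):
#         return word_dropout_forward(x, self.dropout_rate, self.training)
```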