This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import uuid | |
from collections.abc import Iterable | |
from dataclasses import dataclass | |
from enum import Enum | |
from typing import Any, Self | |
from typing import cast as typing_cast | |
from pgvector.sqlalchemy import Vector | |
from sqlalchemy import ( | |
BinaryExpression, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import multiprocessing | |
import kenlm | |
import MeCab | |
lm_model = None | |
def to_punct_between_texts( | |
text_l, text_r, l_model, tokenizer, threshold=0.0, punct="。", print_score=False, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from dataclasses import dataclass | |
from typing import Optional | |
import fugashi | |
import unidic_lite | |
@dataclass | |
class Token: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
find . -type d -empty -delete |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from textdistance import damerau_levenshtein | |
import mojimoji | |
import regex as re | |
def partial_ratio(s1, s2, levenshtein_ratio=damerau_levenshtein.normalized_similarity): | |
def _preprocess(s):
    """Normalize a string before similarity comparison.

    Steps, in order:
      1. Pass the text through ``mojimoji.zen_to_han`` with ``ascii=True``
         and ``digit=True`` but ``kana=False`` (full-width ASCII letters
         and digits become half-width; kana is left as-is).
      2. Lowercase the result.
      3. Remove every run of whitespace.

    Args:
        s: The input string.

    Returns:
        The normalized string.
    """
    s = mojimoji.zen_to_han(s, kana=False, ascii=True, digit=True)
    s = s.lower()
    # Raw string: a plain '\s' is an invalid escape sequence in a normal
    # string literal and triggers a SyntaxWarning on recent Python versions.
    s = re.sub(r'\s+', '', s)
    return s
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from collections import Counter | |
from dataclasses import dataclass | |
from itertools import tee, zip_longest | |
from typing import List | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import regex as re |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://pypi.org/project/stop-words/ | |
from stop_words import get_stop_words | |
from nltk.corpus import stopwords | |
stop_words = list(get_stop_words('en')) #About 900 stopwords | |
nltk_words = list(stopwords.words('english')) #About 150 stopwords |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import kyoto_reader #import KyotoReader, Document | |
def convert_camphr_dataset(doc: kyoto_reader.Document) -> list: | |
mrphs = doc.mrph_list() | |
named_entities = [ent for ent in doc.named_entities] | |
entities = [] | |
midasis = [mrph.midasi for mrph in mrphs] | |
text = ''.join(midasis) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Company name sampler | |
import os | |
import pickle | |
import random | |
from dataclasses import dataclass | |
from typing import List | |
import numpy as np | |
def seed_everything(seed: int = 1234): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
class WordDropout(torch.nn.Module): | |
""" | |
Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space. | |
""" | |
def __init__(self, dropout_rate=0.05, inplace=False): | |
super(WordDropout, self).__init__() | |
self.dropout_rate = dropout_rate |
NewerOlder