Consider this blog post model:
from bs4 import BeautifulSoup | |
from markdown import markdown | |
def markdown_to_text(markdown_string): | |
""" Converts a markdown string to plaintext """ | |
# md -> html -> text since BeautifulSoup can extract text cleanly | |
html = markdown(markdown_string) | |
# remove code snippets |
name: build | |
on: | |
push: | |
branches: | |
- 1.60 | |
jobs: | |
build: | |
runs-on: ubuntu-latest | |
steps: | |
- name: Checkout Source |
"""Script for fine-tuning Pegasus | |
Example usage: | |
# use XSum dataset as example, with first 1000 docs as training data | |
from datasets import load_dataset | |
dataset = load_dataset("xsum") | |
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000] | |
# use Pegasus Large model as base for fine-tuning | |
model_name = 'google/pegasus-large' | |
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels) |
from transformers import AutoTokenizer, AutoModel | |
def mean_pooling(model_output, attention_mask): | |
""" | |
Mean pooling to get sentence embeddings. See: | |
https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1 | |
""" | |
token_embeddings = model_output[0] | |
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() | |
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # Sum columns |
import torch | |
from torch.utils.data import Dataset, DataLoader | |
import numpy as np | |
class MyDataset(Dataset): | |
def __init__(self): | |
x = np.random.rand(1000, 3) # 1000 3-dim samples | |
self.x = [x[i].tolist() for i in range(1000)] | |
y = np.random.randint(low=0, high=2, size=(1000,)) |
certifi==2021.10.8 | |
charset-normalizer==2.0.9 | |
elasticsearch==7.15.2 | |
elasticsearch-dsl==7.4.0 | |
idna==3.3 | |
jieba==0.42.1 | |
python-dateutil==2.8.2 | |
requests==2.26.0 | |
six==1.16.0 | |
tkitJson==0.0.0.3 |
"""Hack to add per-session state to Streamlit. | |
Usage | |
----- | |
>>> import SessionState | |
>>> | |
>>> session_state = SessionState.get(user_name='', favorite_color='black') | |
>>> session_state.user_name | |
'' |
import faiss | |
import numpy as np | |
class FaissKMeans: | |
def __init__(self, n_clusters=8, n_init=10, max_iter=300): | |
self.n_clusters = n_clusters | |
self.n_init = n_init | |
self.max_iter = max_iter | |
self.kmeans = None |
#encoding=utf-8 | |
from transformers import ( | |
BartForConditionalGeneration, BartTokenizer, BartForCausalLM, | |
Seq2SeqTrainingArguments, Seq2SeqTrainer | |
) | |
import torch | |
from torch.utils.data import random_split |