
@napoler
napoler / markdown_to_text.py
Created October 16, 2022 19:56 — forked from lorey/markdown_to_text.py
Markdown to Plaintext in Python
import re

from bs4 import BeautifulSoup
from markdown import markdown

def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code>', ' ', html)

    # extract the plain text from the rendered HTML
    soup = BeautifulSoup(html, 'html.parser')
    return ''.join(soup.findAll(text=True))
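A quick usage check (illustrative; the sample string and the expected output are not part of the gist):

sample = '# Title\n\nSome *emphasized* text with a [link](https://example.com).'
print(markdown_to_text(sample))  # roughly: 'Title Some emphasized text with a link.'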
name: build
on:
  push:
    branches:
      - 1.60
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Source
        uses: actions/checkout@v2  # assumed step body; the preview is truncated here
@napoler
napoler / 0-howto-listfield-django-admin.rst
Last active August 18, 2022 05:56 — forked from jonashaag/0-howto-listfield-django-admin.rst
How to use ListFields in Django's admin

How to use ListFields in Django's admin

Problem

Consider this blog post model:

models.py:
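The preview is truncated before the model code. A hedged sketch of the kind of model the howto has in mind, assuming a ListField in the django-nonrel style (the import, field choice, and names below are assumptions, not the gist's code):

from django.db import models
from djangotoolbox.fields import ListField  # assumption: a django-nonrel style list field

class Post(models.Model):
    title = models.CharField(max_length=100)
    tags = ListField()  # the list-valued field the recipe is about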

@napoler
napoler / pegasus_fine_tune.py
Created June 24, 2022 15:41 — forked from jiahao87/pegasus_fine_tune.py
PyTorch script for fine-tuning the Pegasus Large model
"""Script for fine-tuning Pegasus
Example usage:
# use XSum dataset as example, with first 1000 docs as training data
from datasets import load_dataset
dataset = load_dataset("xsum")
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-large'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
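The preview stops mid-docstring. A hedged sketch of how the usage example presumably continues, assuming a prepare_fine_tuning helper defined later in the script (its signature here is inferred from the calls above):

trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)  # assumed helper from the gist
trainer.train()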
@napoler
napoler / sts_sentence_embedding.py
Created February 18, 2022 13:51 — forked from MathiasGruber/sts_sentence_embedding.py
Embedding questions using a sentence-transformer model
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # sum over the token axis
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # avoid division by zero
    return sum_embeddings / sum_mask  # average only over non-padding tokens
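A short usage sketch (not in the preview) that embeds two questions with the model named in the docstring link; it builds only on the imports and mean_pooling shown above:

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-distilroberta-base-v1')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-distilroberta-base-v1')

questions = ['How do I reset my password?', 'Where can I change my email address?']
encoded = tokenizer(questions, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    output = model(**encoded)
embeddings = mean_pooling(output, encoded['attention_mask'])  # shape: (2, hidden_size)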
@napoler
napoler / collate_fn_example.py
Created January 5, 2022 01:41 — forked from subhadarship/collate_fn_example.py
collate_fn for PyTorch DataLoader
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MyDataset(Dataset):
    def __init__(self):
        x = np.random.rand(1000, 3)  # 1000 3-dim samples
        self.x = [x[i].tolist() for i in range(1000)]
        y = np.random.randint(low=0, high=2, size=(1000,))
        self.y = y.tolist()  # assumed next line; the preview is truncated here
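The preview ends before the collate_fn the title promises. A minimal sketch of such a function, assuming __getitem__ returns (x, y) pairs (that method is not shown in the preview), plus the DataLoader hookup:

def collate_fn(batch):
    xs = torch.tensor([item[0] for item in batch], dtype=torch.float32)  # (batch, 3)
    ys = torch.tensor([item[1] for item in batch], dtype=torch.long)     # (batch,)
    return xs, ys

loader = DataLoader(MyDataset(), batch_size=4, collate_fn=collate_fn)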
certifi==2021.10.8
charset-normalizer==2.0.9
elasticsearch==7.15.2
elasticsearch-dsl==7.4.0
idna==3.3
jieba==0.42.1
python-dateutil==2.8.2
requests==2.26.0
six==1.16.0
tkitJson==0.0.0.3
@napoler
napoler / SessionState.py
Created November 20, 2021 12:37 — forked from tvst/SessionState.py
A possible design for doing per-session persistent state in Streamlit
"""Hack to add per-session state to Streamlit.
Usage
-----
>>> import SessionState
>>>
>>> session_state = SessionState.get(user_name='', favorite_color='black')
>>> session_state.user_name
''
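A hedged sketch of how this pattern is typically used inside a Streamlit app, building only on the get() API shown above (the counter example is illustrative, not from the gist):

import streamlit as st
import SessionState

state = SessionState.get(counter=0)
if st.button('Increment'):
    state.counter += 1
st.write('Counter:', state.counter)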
@napoler
napoler / kmeans_with_faiss.py
Created November 20, 2021 09:18 — forked from j-adamczyk/kmeans_with_faiss.py
K-Means clustering with the faiss library
import faiss
import numpy as np

class FaissKMeans:
    def __init__(self, n_clusters=8, n_init=10, max_iter=300):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.kmeans = None
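The preview stops after __init__. A sketch of how fit and predict are commonly filled in on top of faiss.Kmeans (an assumed continuation in the usual scikit-learn wrapper style, not the gist's verbatim code):

    def fit(self, X):
        self.kmeans = faiss.Kmeans(d=X.shape[1], k=self.n_clusters,
                                   niter=self.max_iter, nredo=self.n_init)
        self.kmeans.train(X.astype(np.float32))
        self.cluster_centers_ = self.kmeans.centroids  # (n_clusters, d)
        return self

    def predict(self, X):
        # nearest-centroid lookup against the index faiss builds during training
        _, labels = self.kmeans.index.search(X.astype(np.float32), 1)
        return labels.reshape(-1)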
@napoler
napoler / cliped.py
Created October 24, 2021 08:12
Created with Copy to Gist
#encoding=utf-8
from transformers import (
    BartForConditionalGeneration, BartTokenizer, BartForCausalLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import torch
from torch.utils.data import random_split
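The preview ends at the imports. A hedged sketch of the likely next step, loading a BART checkpoint for seq2seq fine-tuning (the checkpoint name is an assumption for illustration, not taken from the gist):

model_name = 'facebook/bart-base'  # assumed checkpoint
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)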