Skip to content

Instantly share code, notes, and snippets.

View pszemraj's full-sized avatar

Peter pszemraj

View GitHub Profile
@pszemraj
pszemraj / fuzzy_align.py
Created March 14, 2024 02:49
fuzzy string alignment of two lists
from rapidfuzz import process, fuzz
def fuzzy_align(masterlist, list2, cutoff=70):
# Dictionary to hold matches
matches = {}
# Track used indices to avoid duplicate matches in the masterlist
used_indices = set()
@pszemraj
pszemraj / parse_emails.py
Created March 13, 2024 01:53
parse directory of .eml files to a text dataframe, save to parquet
import logging
from email.parser import BytesParser
from pathlib import Path
import fire
import html2text
import pandas as pd
from tqdm import tqdm
# Setup logging
@pszemraj
pszemraj / datasets_split.py
Created March 12, 2024 07:03
hf datasets train_test_split with stratify_by_column for any type (by tricking it)
import os
import numpy as np
from datasets import ClassLabel, Dataset, DatasetDict
def split_dataset(
dataset: Dataset,
test_size=0.025,
@pszemraj
pszemraj / upload_folder.py
Created March 10, 2024 12:36
upload a folder to the Hugging Face Hub
"""
this script will upload a folder to Hugging Face Hub
python upload_folder.py --help
pip install fire huggingface-hub
"""
import logging
@pszemraj
pszemraj / local_pastebin.py
Last active March 13, 2024 00:20
local network pastebin server for copy/pasting betwixt 2+ computers
import json
import socket
import uuid
import yake
from flask import Flask, redirect, render_template_string, request, url_for
from markupsafe import escape
app = Flask(__name__)
@pszemraj
pszemraj / enable_tf32.py
Last active June 6, 2024 03:39
modern way to auto enable tf32
import torch
import logging
def check_ampere_gpu():
"""
Check if the GPU supports NVIDIA Ampere or later and enable TF32 in PyTorch if it does.
"""
# Check if CUDA is available
if not torch.cuda.is_available():
@pszemraj
pszemraj / test_textsumdir_ipex.py
Created February 21, 2024 21:40
textsum - run summarization on directory on CPU with IPEX optimization
"""
cli.py - Command line interface for textsum.
this edition: fast CPU inference with intel IPEX https://archive.ph/oY5b1
Usage:
textsum-dir --help
"""
import os
@pszemraj
pszemraj / run_gauntlet_vs_gpt4.py
Last active February 18, 2024 00:49
evaluate a text2text summarization model on CPU on 'the gauntlet' - ROUGE vs GPT4
import json
import logging
import re
from datetime import datetime
from pathlib import Path
import datasets
import evaluate
import fire
import intel_extension_for_pytorch as ipex
@pszemraj
pszemraj / run_classification.py
Created February 16, 2024 16:08
a less bad version of the hf run_classification script
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
"""Summary
"""
import logging
from pathlib import Path
import fire
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer