Skip to content

Instantly share code, notes, and snippets.

View wesslen's full-sized avatar

Ryan Wesslen wesslen

View GitHub Profile
@wesslen
wesslen / chat1.txt
Last active July 11, 2023 22:34
Python script that processes .txt transcript files and outputs .jsonl files for Prodigy annotation
AGENT hello this is steve can i ask what's your name
CUSTOMER my name is harry
AGENT thanks harry. how can i help you
@wesslen
wesslen / tokenizer_spacy_german.ipynb
Last active July 7, 2023 19:41
Custom (German) tokenizer in a spaCy pipeline
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@wesslen
wesslen / mapping.py
Last active December 24, 2023 16:28
Python function to map random user IDs (e.g., from a crowdsourcing platform) to a unique Prodigy session name
import random
import string
def generate_random_userids(num_ids):
    """Generate random alphanumeric user IDs.

    Args:
        num_ids: Number of IDs to generate. Zero (or a negative value)
            yields an empty list, because ``range`` is then empty.

    Returns:
        A list of ``num_ids`` strings, each 12 characters long and drawn
        (with replacement) from ASCII letters and digits.
    """
    # NOTE: the IDs come from the non-cryptographic ``random`` module and are
    # not checked for duplicates. That keeps runs reproducible under
    # ``random.seed``; if these values are ever used as secrets, switch to
    # the ``secrets`` module instead.
    alphabet = string.ascii_letters + string.digits
    return ["".join(random.choices(alphabet, k=12)) for _ in range(num_ids)]
@wesslen
wesslen / prodigy-hf-space.py
Created June 26, 2023 23:06
Create new Prodigy instance on HF Space
from huggingface_hub import HfApi
from huggingface_hub import duplicate_space
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
import os
# Prerequisite: create a local .env file that defines HF_TOKEN (your Hugging Face Hub token).
# load_dotenv() presumably copies the .env entries into the process environment
# (python-dotenv behavior; confirm against the installed version).
load_dotenv()
# os.environ.get returns None when HF_TOKEN is not set, so HF_TOKEN may be None here.
HF_TOKEN = os.environ.get("HF_TOKEN")
@wesslen
wesslen / custom.js
Last active July 5, 2023 19:37
Prodigy hierarchical text classification
function toggle(id) {
var x = document.getElementById(id);
if (id == "a"){
reset("b")
}else{
reset("a")
}
if (x.style.display === "none") {
x.style.display = "block";
} else {
@wesslen
wesslen / relations_validation.py
Last active June 5, 2023 17:21
Prodigy relations validation with validate_answer callback that checks that both relations are labeled entities
# Prodigy v1.11.x; some imports will change for v1.12+
import copy
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import srsly
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
from spacy.util import filter_spans
@wesslen
wesslen / jsonl-to-coco.py
Last active April 6, 2023 13:45
Convert Prodigy JSONL bounding-box annotations to COCO format
import json
from typing import List
import srsly
import typer
app = typer.Typer()
def convert_to_coco(input_file: str, output_file: str):
# Load the JSONL file using srsly
{"text":"How Silicon Valley Pushed Coding Into American Classrooms","meta":{"source":"The New York Times","i":0}}
{"text":"Women in Tech Speak Frankly on Culture of Harassment","meta":{"source":"The New York Times","i":1}}
{"text":"Silicon Valley Investors Flexed Their Muscles in Uber Fight","meta":{"source":"The New York Times","i":2}}
{"text":"Uber is a Creature of an Industry Struggling to Grow Up","meta":{"source":"The New York Times","i":3}}
{"text":"\u2018The Internet Is Broken\u2019: @ev Is Trying to Salvage It","meta":{"source":"The New York Times","i":4}}
{"text":"The South Park Commons Fills a Hole in the Tech Landscape","meta":{"source":"The New York Times","i":5}}
{"text":"The Closing of the Republican Mind","meta":{"source":"The New York Times","i":6}}
{"text":"Writers From the Right and Left on Trump Jr., the Future of the F.B.I., Health Care and More","meta":{"source":"The New York Times","i":7}}
{"text":"Daily Report: From Lean to Fat Start-Ups","meta":{"source":"The New York Times","i":8}}
{"
@wesslen
wesslen / overlapping.jsonl
Last active March 16, 2023 15:50
Textcat classification with pre-annotated overlapping spans, see https://support.prodi.gy/t/textcat-using-span-overlapping-view/6434
{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}
@wesslen
wesslen / ner_manual.py
Created March 1, 2023 13:11
Prodigy ner.manual recipe modifying the port number
from typing import List, Optional
import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.models.matcher import PatternMatcher
from prodigy.util import split_string
# Helper function for removing token information from examples