-
-
Save ines/fae139c445aa5826e639328aca808b5c to your computer and use it in GitHub Desktop.
Birthday corpus generator as Prodigy recipe: https://gist.github.com/wpm/feb9aea744870674d353ffd55344becb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: prodigy birthday-corpus -n 50 -d birthday_dataset -F birthday_corpus.py
import json | |
import re | |
import time | |
from random import choice, random | |
from typing import TextIO, Callable, Sequence, Tuple, Optional | |
from pathlib import Path | |
import prodigy | |
from prodigy import set_hashes | |
from prodigy.components.db import connect | |
# Names and formatted dates are both represented as plain strings.
NAME = str
DATE = str
# (start, end) character offsets into a text, usable as text[start:end].
SPAN_OFFSET = Tuple[int, int]
def generate(name_factory: "Callable[[], NAME]",
             lifespan_factory: "Callable[[], Tuple[DATE, DATE]]") \
        -> "Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]":
    """
    Generate one random sentence about a person and locate the dates in it.

    A name is drawn from ``name_factory`` and a (born, died) pair of date strings from ``lifespan_factory``.
    One of a fixed set of sentence templates is then chosen at random and filled in.

    Returns a ``(text, born_span, died_span)`` triple. Each span is a ``(start, end)`` pair such that
    ``text[start:end]`` is the corresponding date string, or ``None`` when the chosen sentence does not
    mention that date.
    """
    def find_span(date, search_from=0):
        start = text.find(date, search_from)
        return start, start + len(date)

    name = name_factory()
    born, died = lifespan_factory()
    # Each entry is (sentence, mentions birth date, mentions death date).
    # In every sentence that mentions both dates the birth date comes first.
    texts = [
        (f"{name} was born on {born}.", True, False),
        (f"{name} has a birthday on {born}.", True, False),
        (f"{name} was born on {born} and died {died}.", True, True),
        (f"On {born} {name} was born.", True, False),
        (f"On {died} {name} died.", False, True),
        (f"{name} died on {died}.", False, True),
        (f"RIP {name}: {born}-{died}.", True, True),
        (f"A skilled carpenter, {name} lived from {born} until {died}.", True, True),
        (f"{died} was the day {name} died.", False, True),
        (f"{born} was the day {name} was born.", True, False),
        (f"{name} is a skilled juggler.", False, False),
        (f"Where are you, {name}?", False, False),
    ]
    text, contains_born, contains_died = choice(texts)
    born_span = died_span = None
    search_from = 0
    if contains_born:
        born_span = find_span(born)
        # Look for the death date only after the birth date. Otherwise, when the two date strings are
        # identical (or one contains the other), the death span would point at the birth date again.
        search_from = born_span[1]
    if contains_died:
        died_span = find_span(died, search_from)
    return text, born_span, died_span
def name_generator(first_names: Sequence[str], last_names: Sequence[str],
                   full_name_probability: float = 0.5) -> "Callable[[], NAME]":
    """
    Create a factory that produces random person names.

    Each call of the returned factory yields either "<first> <last>" or just "<first>", with the parts
    picked at random from ``first_names`` and ``last_names``.

    ``full_name_probability`` is the chance that a generated name includes a last name. The default of 0.5
    gives an even split; a value of 0 or less never adds a last name and a value of 1 or more always does.
    """
    def factory() -> str:
        if random() < full_name_probability:
            return f"{choice(first_names)} {choice(last_names)}"
        return f"{choice(first_names)}"

    return factory
def lifespan_generator(start="1/1/1900", end="12/31/2010") -> "Callable[[], Tuple[DATE, DATE]]":
    """
    Create a factory that produces random (born, died) date strings.

    ``start`` and ``end`` are dates in month/day/year form ("%m/%d/%Y") that bound the generated lifespans.
    Each call of the returned factory picks a birth moment between ``start`` and ``end`` and a death moment
    between the birth and ``end``. Both are rendered in the same randomly chosen format, with leading
    zeroes removed from the numbers.
    """
    # datetime arithmetic is used instead of time.mktime()/time.localtime(): the epoch-based functions
    # reject dates before 1970 on some platforms and depend on the local timezone.
    from datetime import datetime

    earliest = datetime.strptime(start, "%m/%d/%Y")
    latest = datetime.strptime(end, "%m/%d/%Y")
    formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]

    def factory():
        def make_date(moment):
            date = moment.strftime(fmt)
            return re.sub(r'\b0(\d)', r'\1', date)  # Remove leading zeroes from numbers.

        born = earliest + (latest - earliest) * random()
        died = born + (latest - born) * random()
        # One format per lifespan, so that both dates of a person look alike.
        fmt = choice(formats)
        return make_date(born), make_date(died)

    return factory
def read_txt(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped."""
    source = Path(file_path)
    with source.open(mode='r', encoding='utf8') as stream:
        return list(map(str.strip, stream))
@prodigy.recipe('birthday-corpus',
                n_samples=("Number of samples to generate", "option", "n", int),
                first_names=("List of first names, one per line", "option", "fn", read_txt),
                last_names=("List of last names, one per line", "option", "ln", read_txt),
                dataset=("Optional name of dataset to add examples to", "option", "d", str))
def birthday_corpus2(n_samples=1000, first_names=None, last_names=None,
                     dataset=None):
    """
    Generate a corpus of texts describing birth and death dates for people.

    The texts refer to dates on which a person was born and/or died. Every date span is annotated with a
    BIRTHDAY label: birth dates carry the answer "accept" and death dates the answer "reject". This is used
    to create a training file that can be used by Prodigy.

    If the first names or last names file is not specified, a short default list of names is used. Blank
    lines in a names file are ignored; a names file without any names raises ValueError.

    If a dataset name is given, the examples are added to that dataset (which is created when it does not
    exist yet). Otherwise each example is printed as one line of JSON.

    See https://prodi.gy.
    """
    def annotation_span(text, span, answer):
        start, end = span
        return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "answer": answer}

    def prepare_names(names, defaults, kind):
        if names is None:
            return defaults
        # Skip blank lines so that no example is generated with an empty name.
        cleaned = [name.title().strip() for name in names if name.strip()]
        if not cleaned:
            raise ValueError(f"The list of {kind} names does not contain any names.")
        return cleaned

    first_names_list = prepare_names(first_names, ["Mary", "Sue", "John", "Roger"], "first")
    last_names_list = prepare_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"], "last")

    # The factories do not change between samples, so build them once rather than per iteration.
    make_name = name_generator(first_names_list, last_names_list)
    make_lifespan = lifespan_generator()

    examples = []
    for _ in range(n_samples):
        text, born_span, died_span = generate(make_name, make_lifespan)
        spans = []
        if born_span is not None:
            spans.append(annotation_span(text, born_span, "accept"))
        if died_span is not None:
            spans.append(annotation_span(text, died_span, "reject"))
        task = {"text": text, "spans": spans}
        if dataset:
            examples.append(task)
        else:
            print(json.dumps(task))

    if dataset:
        db = connect()
        if dataset not in db:
            db.add_dataset(dataset, {'description': 'via birthday_corpus.py'})
            print(f"Created dataset {dataset}")
        examples = [set_hashes(eg) for eg in examples]
        db.add_examples(examples, datasets=[dataset])
        print(f"Added {len(examples)} examples to dataset {dataset}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment