Skip to content

Instantly share code, notes, and snippets.

@ines
Last active January 3, 2018 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ines/fae139c445aa5826e639328aca808b5c to your computer and use it in GitHub Desktop.
Save ines/fae139c445aa5826e639328aca808b5c to your computer and use it in GitHub Desktop.
Birthday corpus generator as Prodigy recipe: https://gist.github.com/wpm/feb9aea744870674d353ffd55344becb
# Usage: prodigy birthday-corpus -n 50 -d birthday_dataset -F birthday_corpus.py
import json
import re
import time
from random import choice, random
from typing import TextIO, Callable, Sequence, Tuple, Optional
from pathlib import Path
import prodigy
from prodigy import set_hashes
from prodigy.components.db import connect
# Names and dates are passed around as plain strings; the aliases only make signatures easier to read.
NAME = str
DATE = str
# (start, end) character offsets of a date inside a generated text, usable as text[start:end].
SPAN_OFFSET = Tuple[int, int]
def generate(name_factory: Callable[[], NAME], lifespan_factory: Callable[[], Tuple[DATE, DATE]]) \
        -> Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]:
    """
    Build one random sentence about a person and locate the dates it mentions.

    A name and a (born, died) pair are drawn from the two factories, one of a fixed set of sentence templates is
    picked at random, and the character offsets of the dates inside the chosen sentence are looked up.

    :param name_factory: zero-argument callable returning a person's name
    :param lifespan_factory: zero-argument callable returning a (born, died) pair of date strings
    :return: the sentence, the (start, end) offsets of the birth date (None if the sentence does not mention it)
        and the (start, end) offsets of the death date (None if the sentence does not mention it)
    """
    def find_span(date: DATE, search_from: int = 0) -> SPAN_OFFSET:
        # Offsets are half-open, i.e. text[i:j] == date.
        i = text.find(date, search_from)
        j = i + len(date)
        return i, j

    name = name_factory()
    born, died = lifespan_factory()
    # Each entry: (sentence, mentions the birth date?, mentions the death date?)
    texts = [
        (f"{name} was born on {born}.", True, False),
        (f"{name} has a birthday on {born}.", True, False),
        (f"{name} was born on {born} and died {died}.", True, True),
        (f"On {born} {name} was born.", True, False),
        (f"On {died} {name} died.", False, True),
        (f"{name} died on {died}.", False, True),
        (f"RIP {name}: {born}-{died}.", True, True),
        (f"A skilled carpenter, {name} lived from {born} until {died}.", True, True),
        (f"{died} was the day {name} died.", False, True),
        (f"{born} was the day {name} was born.", True, False),
        (f"{name} is a skilled juggler.", False, False),
        (f"Where are you, {name}?", False, False)
    ]
    text, contains_born, contains_died = choice(texts)

    born_span = find_span(born) if contains_born else None
    died_span = None
    if contains_died:
        # Every template that mentions both dates puts the birth date first, so the death date is searched for
        # after the birth span. Searching from the start would return the birth date's position whenever the two
        # date strings are identical or the death date string also occurs inside the birth date string.
        died_span = find_span(died, born_span[1] if born_span is not None else 0)
    return text, born_span, died_span
def name_generator(first_names: Sequence[str], last_names: Sequence[str]) -> Callable[[], NAME]:
    """
    Create a factory of random person names.

    Each call of the returned factory draws a first name and, when the initial random draw is below 0.5, appends a
    randomly chosen last name separated by a single space.
    """
    def factory() -> str:
        draw = random()
        first = choice(first_names)
        if draw >= 0.5:
            return f"{first}"
        return f"{first} {choice(last_names)}"

    return factory
def lifespan_generator(start: str = "1/1/1900", end: str = "12/31/2010") -> Callable[[], Tuple[DATE, DATE]]:
    """
    Create a factory of random (born, died) date pairs.

    Both dates fall between start and end, and the death date is never earlier than the birth date. The two
    dates of a pair share one randomly chosen format and have leading zeroes removed from their numbers.

    :param start: earliest possible date, written as month/day/year
    :param end: latest possible date, written as month/day/year
    :return: zero-argument callable returning a (born, died) pair of formatted date strings
    """
    # Local import keeps this function self-contained.
    from datetime import datetime

    # datetime arithmetic is used instead of time.mktime()/time.localtime(): those go through the platform C
    # library, which on some platforms (notably Windows) rejects timestamps before 1970, and the default start
    # date is well before that.
    earliest = datetime.strptime(start, "%m/%d/%Y")
    latest = datetime.strptime(end, "%m/%d/%Y")
    formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]

    def factory() -> Tuple[DATE, DATE]:
        def make_date(moment):
            date = moment.strftime(fmt)
            return re.sub(r'\b0(\d)', r'\1', date)  # Remove leading zeroes from numbers.

        # Birth is uniform over [earliest, latest); death is uniform over [born, latest).
        born = earliest + (latest - earliest) * random()
        died = born + (latest - born) * random()
        fmt = choice(formats)
        return make_date(born), make_date(died)

    return factory
def read_txt(file_path):
    """
    Read a UTF-8 text file into a list of entries, one per non-blank line.

    Surrounding whitespace is stripped from every line. Lines that are empty after stripping are skipped, so that
    blank lines in a names file do not turn into empty names.

    :param file_path: path of the text file to read
    :return: list of stripped, non-empty lines in file order
    """
    with Path(file_path).open('r', encoding='utf8') as txt_file:
        stripped_lines = (line.strip() for line in txt_file)
        return [entry for entry in stripped_lines if entry]
@prodigy.recipe('birthday-corpus',
                n_samples=("Number of samples to generate", "option", "n", int),
                first_names=("List of first names, one per line", "option", "fn", read_txt),
                last_names=("List of last names, one per line", "option", "ln", read_txt),
                dataset=("Optional name of dataset to add examples to", "option", "d", str))
def birthday_corpus2(n_samples=1000, first_names=None, last_names=None,
                     dataset=None):
    """
    Generate a corpus of texts describing birth and death dates for people.

    The texts refer to dates on which a person was born and/or died. Every date span is annotated with a
    BIRTHDAY label: birth dates carry the answer "accept" and death dates the answer "reject". This is used to
    create a training file that can be used by Prodigy.

    If the first names or last names file is not specified, or contains no names, a short default list of names
    is used. Without a dataset name each task is printed as one line of JSON; with a dataset name the tasks are
    hashed and added to that dataset, which is created first if it does not exist yet.

    See https://prodi.gy.
    """
    def annotation_span(text, span, answer):
        start, end = span
        return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "answer": answer}

    def clean_names(names, default):
        # Title-case and strip the supplied names; drop empty entries and fall back to the defaults when nothing
        # usable is left, because random.choice() raises IndexError on an empty list.
        cleaned = [name.title().strip() for name in names or ()]
        return [name for name in cleaned if name] or default

    first_names_list = clean_names(first_names, ["Mary", "Sue", "John", "Roger"])
    last_names_list = clean_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"])

    # The factories keep no per-sample state, so they are built once rather than on every iteration.
    make_name = name_generator(first_names_list, last_names_list)
    make_lifespan = lifespan_generator()

    examples = []
    for _ in range(n_samples):
        text, born_span, died_span = generate(make_name, make_lifespan)
        spans = []
        if born_span is not None:
            spans.append(annotation_span(text, born_span, "accept"))
        if died_span is not None:
            spans.append(annotation_span(text, died_span, "reject"))
        task = {"text": text, "spans": spans}
        if dataset:
            examples.append(task)
        else:
            print(json.dumps(task))

    if dataset:
        db = connect()
        if dataset not in db:
            db.add_dataset(dataset, {'description': 'via birthday_corpus.py'})
            print("Created dataset {}".format(dataset))
        examples = [set_hashes(eg) for eg in examples]
        db.add_examples(examples, datasets=[dataset])
        print("Added {} examples to dataset {}".format(len(examples), dataset))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment