Skip to content

Instantly share code, notes, and snippets.

@wpm
Created January 3, 2018 20:37
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wpm/feb9aea744870674d353ffd55344becb to your computer and use it in GitHub Desktop.
Save wpm/feb9aea744870674d353ffd55344becb to your computer and use it in GitHub Desktop.
Generate a corpus of texts mentioning birthdays that can be used to train a Prodigy named entity recognizer.
import json
import re
import time
from random import choice, random
from typing import TextIO, Callable, Sequence, Tuple, Optional
import click
# Person names and formatted dates are both passed around as plain strings.
NAME = str
DATE = str

# (start, end) character offsets into a generated text; end is exclusive.
SPAN_OFFSET = Tuple[int, int]
def generate(
        name_factory: Callable[[], NAME],
        lifespan_factory: Callable[[], Tuple[DATE, DATE]],
) -> Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]:
    """
    Build one random sentence about a person and locate the dates it mentions.

    A name and a (born, died) pair are drawn from the two factories, one of a
    fixed set of sentence templates is picked at random, and the template is
    filled in.

    :param name_factory: zero-argument callable returning a person's name
    :param lifespan_factory: zero-argument callable returning (born, died) date strings
    :return: ``(text, born_span, died_span)``. Each span is a ``(start, end)``
        pair of character offsets into ``text`` (end exclusive), or ``None``
        when the chosen template does not mention that date.
    """
    templates = (
        "{name} was born on {born}.",
        "{name} has a birthday on {born}.",
        "{name} was born on {born} and died {died}.",
        "On {born} {name} was born.",
        "On {died} {name} died.",
        "{name} died on {died}.",
        "RIP {name}: {born}-{died}.",
        "A skilled carpenter, {name} lived from {born} until {died}.",
        "{died} was the day {name} died.",
        "{born} was the day {name} was born.",
        "{name} is a skilled juggler.",
        "Where are you, {name}?",
    )
    name = name_factory()
    born, died = lifespan_factory()
    values = {"{name}": name, "{born}": born, "{died}": died}

    # Offsets are recorded while the sentence is assembled rather than searched
    # for afterwards: a substring search returns the first match, which is the
    # wrong one when the born and died strings are identical.
    pieces = []
    spans = {}
    offset = 0
    # The capturing group makes re.split keep the placeholders as list items.
    for piece in re.split(r"(\{(?:name|born|died)\})", choice(templates)):
        rendered = values.get(piece, piece)
        if piece in ("{born}", "{died}"):
            spans[piece] = (offset, offset + len(rendered))
        pieces.append(rendered)
        offset += len(rendered)
    return "".join(pieces), spans.get("{born}"), spans.get("{died}")
def name_generator(first_names: Sequence[str], last_names: Sequence[str]) -> Callable[[], NAME]:
    """
    Create a factory that produces random person names.

    Each call to the returned factory yields either "First Last" or just
    "First", each with probability one half, drawing uniformly from the
    supplied sequences.

    :param first_names: candidate first names
    :param last_names: candidate last names
    :return: zero-argument callable returning a name
    """
    def factory() -> str:
        # Draw order is: coin flip, first name, then (if needed) last name.
        with_surname = random() < 0.5
        given = choice(first_names)
        if not with_surname:
            return f"{given}"
        surname = choice(last_names)
        return f"{given} {surname}"

    return factory
def lifespan_generator(start: str = "1/1/1900", end: str = "12/31/2010") -> Callable[[], Tuple[DATE, DATE]]:
    """
    Create a factory that produces random (born, died) date strings.

    The birth moment is drawn uniformly between ``start`` and ``end``; the
    death moment is drawn uniformly between the birth moment and ``end``, so it
    never precedes the birth. Both dates of a pair are rendered in the same
    randomly chosen format, with leading zeroes removed from numbers.

    :param start: earliest possible birth date, as month/day/year
    :param end: latest possible date, as month/day/year
    :return: zero-argument callable returning a ``(born, died)`` pair
    :raises ValueError: if ``start`` or ``end`` is not in month/day/year form
    """
    # Local import: datetime arithmetic replaces time.mktime/time.localtime,
    # whose support for dates before 1970 is platform-dependent.
    from datetime import datetime

    bounds_format = "%m/%d/%Y"
    earliest = datetime.strptime(start, bounds_format)
    latest = datetime.strptime(end, bounds_format)
    formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]

    def make_date(moment, fmt) -> DATE:
        rendered = moment.strftime(fmt)
        return re.sub(r'\b0(\d)', r'\1', rendered)  # Remove leading zeroes from numbers.

    def factory() -> Tuple[DATE, DATE]:
        # Draw order is: birth offset, death offset, then output format.
        born = earliest + (latest - earliest) * random()
        died = born + (latest - born) * random()
        fmt = choice(formats)
        return make_date(born, fmt), make_date(died, fmt)

    return factory
def _load_names(names_file: Optional[TextIO], defaults: Sequence[str], option: str) -> Sequence[str]:
    """Return the title-cased names listed one per line in names_file, or defaults when no file is given."""
    if names_file is None:
        return defaults
    # Skip blank lines (e.g. a trailing newline) so they do not become empty names.
    names = [line.strip().title() for line in names_file if line.strip()]
    if not names:
        raise click.UsageError(f"the file given to {option} contains no names")
    return names


@click.command()
@click.option("--n", default=10000, help="number of samples to generate")
@click.option("--first-names", type=click.File(), help="list of first names, one per line")
@click.option("--last-names", type=click.File(), help="list of last names, one per line")
def birthday_corpus(n: int, first_names: Optional[TextIO], last_names: Optional[TextIO]):
    """
    Generate a corpus of texts describing birth and death dates for people.

    The texts refer to dates on which a person was born and/or died. The appropriate date spans are annotated with a
    BIRTHDAY label. This is used to create a training file that can be used by Prodigy.

    If the first names or last names file is not specified, a short default list of names is used.

    See https://prodi.gy.
    """
    def annotation_span(text, span, accept):
        start, end = span
        return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "accept": accept}

    first_names_list = _load_names(first_names, ["Mary", "Sue", "John", "Roger"], "--first-names")
    last_names_list = _load_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"], "--last-names")

    # Build the factories once; they do not change between samples.
    name_factory = name_generator(first_names_list, last_names_list)
    lifespan_factory = lifespan_generator()

    for _ in range(n):
        text, born_span, died_span = generate(name_factory, lifespan_factory)
        spans = []
        if born_span is not None:
            spans.append(annotation_span(text, born_span, True))
        if died_span is not None:
            # Death dates carry the same label but are marked as rejected examples.
            spans.append(annotation_span(text, died_span, False))
        # One JSON object per line.
        click.echo(json.dumps({"text": text, "spans": spans}))
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    birthday_corpus()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment