Skip to content

Instantly share code, notes, and snippets.

@morganmcg1
Created April 27, 2021 10:52
Show Gist options
  • Save morganmcg1/a43842b847e2ff7dc78d2c3e5990bb96 to your computer and use it in GitHub Desktop.
Save morganmcg1/a43842b847e2ff7dc78d2c3e5990bb96 to your computer and use it in GitHub Desktop.
Download and convert corpora to .spacy for the spaCy GoEmotions tutorial
# This will download the config file and corpora needed for the spaCy GoEmotions tutorial:
# https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions
# Get CNN Config
os.makedirs(os.path.join(spacy_dir/'training', 'cnn'), exist_ok=True)
cnn_cfg_url = "https://raw.githubusercontent.com/explosion/projects/v3/tutorials/textcat_goemotions/configs/cnn.cfg"
cnn_cfg = spacy_dir/'cnn.cfg'
!wget -q -O $cnn_cfg $cnn_cfg_url
# Get Categories File and Corpora
spacy_dir = Path("spacy_demo")
assets_dir = spacy_dir/"assets"
corpus_dir = spacy_dir/"corpus"
os.makedirs(assets_dir, exist_ok=True)
os.makedirs(corpus_dir, exist_ok=True)
cats_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt"
train_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv"
dev_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv"
test_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv"
cats_file = assets_dir/'categories.txt'
train_file = assets_dir/'train.tsv'
dev_file = assets_dir/'dev.tsv'
test_file = assets_dir/'test.tsv'
!wget -q -O $cats_file $cats_url
!wget -q -O $train_file $train_url
!wget -q -O $dev_file $dev_url
!wget -q -O $test_file $test_url
# Define Convert function
# Taken from https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions/scripts/convert_corpus.py
from pathlib import Path
import typer
from spacy.tokens import DocBin
import spacy
def read_categories(path: Path):
    """Return the category names stored one per line in the file at *path*.

    Leading and trailing whitespace of the whole file is stripped before
    splitting on newlines, so a final newline does not yield an empty entry.

    Args:
        path: Location of the categories text file.

    Returns:
        A list of the lines of the file, in file order.
    """
    # The context manager closes the handle promptly; the explicit encoding
    # matches how the .tsv files are opened elsewhere in this file.
    with path.open(encoding="utf8") as file_:
        return file_.read().strip().split("\n")
def read_tsv(file_):
    """Yield one record dict per non-blank line of a tab-separated file.

    Each non-blank line must contain exactly three tab-separated fields: the
    text, a comma-separated list of integer label indices, and a third field
    that is stored under the "annotator" key.

    Args:
        file_: An iterable of text lines, e.g. an open file handle.

    Yields:
        Dicts with the keys "text" (str), "labels" (list of int) and
        "annotator" (str).

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            fields, or a label is not an integer.
    """
    for line in file_:
        # Drop the line terminator so it does not leak into the last field.
        line = line.rstrip("\r\n")
        # Blank lines (e.g. a trailing empty line) carry no record.
        if not line.strip():
            continue
        text, labels, annotator = line.split("\t")
        yield {
            "text": text,
            "labels": [int(label) for label in labels.split(",")],
            "annotator": annotator,
        }
def convert_record(nlp, record, categories):
    """Turn one tsv record into a spaCy Doc carrying its category labels.

    The Doc is created from ``record["text"]``; its ``cats`` mapping has one
    entry per name in *categories*, set to 1 for the categories whose indices
    appear in ``record["labels"]`` and 0 for all others.
    """
    labelled_doc = nlp.make_doc(record["text"])
    # Start with every category switched off ...
    labelled_doc.cats = dict.fromkeys(categories, 0)
    # ... then switch on the ones this record is labelled with.
    for index in record["labels"]:
        category_name = categories[index]
        labelled_doc.cats[category_name] = 1
    return labelled_doc
def convert_corpus(assets_dir: Path = assets_dir, corpus_dir: Path = corpus_dir, lang: str = "en"):
    """Convert the GoEmotion corpus's tsv files to spaCy's binary format.

    Every file directly inside *assets_dir* whose name ends in ``.tsv`` is
    read, converted to spaCy Docs and written to *corpus_dir* under the same
    name with a ``.spacy`` suffix. A line is printed for each converted file.

    Args:
        assets_dir: Directory holding ``categories.txt`` and the tsv files.
        corpus_dir: Output directory; created if it does not exist yet.
        lang: Language code passed to ``spacy.blank``.
    """
    categories = read_categories(assets_dir / "categories.txt")
    nlp = spacy.blank(lang)
    # Writing the output below would fail if the target directory is missing.
    corpus_dir.mkdir(parents=True, exist_ok=True)
    for tsv_file in assets_dir.iterdir():
        # Skip anything that is not a tsv file (e.g. categories.txt).
        if not tsv_file.name.endswith(".tsv"):
            continue
        # read_tsv is a lazy generator, so the docs must be built while the
        # file is still open; the with-block then closes the handle.
        with tsv_file.open(encoding="utf8") as tsv_handle:
            docs = [
                convert_record(nlp, record, categories)
                for record in read_tsv(tsv_handle)
            ]
        out_file = corpus_dir / tsv_file.with_suffix(".spacy").name
        out_data = DocBin(docs=docs).to_bytes()
        with out_file.open("wb") as file_:
            file_.write(out_data)
        print(f'{tsv_file} converted')
# Convert Files: turn every downloaded .tsv in assets_dir into a .spacy file in corpus_dir
convert_corpus(
    assets_dir=assets_dir,
    corpus_dir=corpus_dir,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment