morganmcg1/convert_corpora.py

## convert_corpora.py
# This will download the config file and corpora needed for the spaCy GoEmotions tutorial:
# https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions

# Get CNN Config
os.makedirs(os.path.join(spacy_dir/'training', 'cnn'), exist_ok=True)
cnn_cfg_url = "https://raw.githubusercontent.com/explosion/projects/v3/tutorials/textcat_goemotions/configs/cnn.cfg"
cnn_cfg = spacy_dir/'cnn.cfg'
!wget -q -O $cnn_cfg $cnn_cfg_url


# Get Categories File and Corpora
spacy_dir = Path("spacy_demo")
assets_dir = spacy_dir/"assets"
corpus_dir = spacy_dir/"corpus"
os.makedirs(assets_dir, exist_ok=True)
os.makedirs(corpus_dir, exist_ok=True)

cats_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt"
train_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv"
dev_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv"
test_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv"

cats_file = assets_dir/'categories.txt'
train_file = assets_dir/'train.tsv'
dev_file = assets_dir/'dev.tsv'
test_file = assets_dir/'test.tsv'

!wget -q -O $cats_file $cats_url
!wget -q -O $train_file $train_url
!wget -q -O $dev_file $dev_url
!wget -q -O $test_file $test_url


# Define Convert function
# Taken from https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions/scripts/convert_corpus.py
from pathlib import Path
import typer
from spacy.tokens import DocBin
import spacy

def read_categories(path: Path):
    """Return the category names stored one per line in ``path``.

    Whitespace around the file content as a whole is stripped before
    splitting, so a trailing newline does not produce an empty last entry.
    An empty file yields ``[""]``.

    Args:
        path: Text file holding one category name per line.

    Returns:
        List of category names in file order.
    """
    # read_text() opens and closes the file itself; the previous bare
    # path.open().read() never closed the handle. utf8 matches the encoding
    # used when the tsv files are opened in convert_corpus().
    return path.read_text(encoding="utf8").strip().split("\n")

def read_tsv(file_):
    """Yield one record dict per non-blank line of a tsv file.

    Each non-blank line must contain exactly three tab-separated fields:
    the text, a comma-separated list of integer labels, and the annotator.

    Args:
        file_: Iterable of text lines, e.g. an open text file.

    Yields:
        Dicts with the keys ``"text"`` (str), ``"labels"`` (list of int)
        and ``"annotator"`` (str).

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            fields, or a label is not an integer.
    """
    for line in file_:
        # Remove the line terminator first; otherwise it ends up inside the
        # last field ("annotator").
        line = line.rstrip("\r\n")
        # Blank lines (e.g. a trailing empty line) carry no record.
        if not line.strip():
            continue
        text, labels, annotator = line.split("\t")
        yield {
            "text": text,
            "labels": [int(label) for label in labels.split(",")],
            "annotator": annotator,
        }

def convert_record(nlp, record, categories):
    """Build a spaCy Doc for one tsv record and attach its category scores.

    Args:
        nlp: Object providing ``make_doc(text)``.
        record: Dict with a ``"text"`` string and a ``"labels"`` list of
            integer indices into ``categories``.
        categories: Sequence of category names.

    Returns:
        The Doc, with ``doc.cats`` mapping every category to 0 except the
        record's labels, which map to 1.
    """
    doc = nlp.make_doc(record["text"])
    # Start with every category switched off ...
    scores = dict.fromkeys(categories, 0)
    # ... then switch on the ones the record names by index.
    for index in record["labels"]:
        scores[categories[index]] = 1
    doc.cats = scores
    return doc

def convert_corpus(assets_dir: Path=assets_dir, corpus_dir: Path=corpus_dir, lang: str="en"):
    """Convert the GoEmotion corpus's tsv files to spaCy's binary format.

    Every entry of ``assets_dir`` whose name ends in ``.tsv`` is parsed with
    ``read_tsv``, each record is turned into a Doc by ``convert_record``, and
    the docs are serialized through ``DocBin`` into
    ``corpus_dir/<name>.spacy``. A progress line is printed per file.

    Args:
        assets_dir: Directory holding ``categories.txt`` and the tsv files.
            The default is the module-level ``assets_dir`` as it was when
            this function was defined.
        corpus_dir: Directory the ``.spacy`` files are written to; created
            if missing. The default is bound the same way.
        lang: Language code passed to ``spacy.blank``.
    """
    categories = read_categories(assets_dir / "categories.txt")
    nlp = spacy.blank(lang)
    # Make sure the output directory exists before the first write.
    corpus_dir.mkdir(parents=True, exist_ok=True)
    for tsv_file in assets_dir.iterdir():
        if not tsv_file.name.endswith(".tsv"):
            continue
        # read_tsv is a lazy generator, so the docs are built inside the
        # `with`: the file is fully consumed and then closed (the handle was
        # previously left open).
        with tsv_file.open(encoding="utf8") as file_:
            docs = [convert_record(nlp, record, categories) for record in read_tsv(file_)]
        out_file = corpus_dir / tsv_file.with_suffix(".spacy").name
        out_file.write_bytes(DocBin(docs=docs).to_bytes())
        print(f'{tsv_file} converted')

# Convert Files
# Runs the conversion on the directories set up above: every .tsv file in
# assets_dir is written as a .spacy file into corpus_dir (language left at
# convert_corpus's default, "en").
convert_corpus(assets_dir=assets_dir, corpus_dir=corpus_dir)
	# This will download the config file and corpora needed for the spaCy GoEmotions tutorial:
	# https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions

	# Get CNN Config
	os.makedirs(os.path.join(spacy_dir/'training', 'cnn'), exist_ok=True)
	cnn_cfg_url = "https://raw.githubusercontent.com/explosion/projects/v3/tutorials/textcat_goemotions/configs/cnn.cfg"
	cnn_cfg = spacy_dir/'cnn.cfg'
	!wget -q -O $cnn_cfg $cnn_cfg_url


	# Get Categories File and Corpora
	spacy_dir = Path("spacy_demo")
	assets_dir = spacy_dir/"assets"
	corpus_dir = spacy_dir/"corpus"
	os.makedirs(assets_dir, exist_ok=True)
	os.makedirs(corpus_dir, exist_ok=True)

	cats_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/emotions.txt"
	train_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv"
	dev_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv"
	test_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv"

	cats_file = assets_dir/'categories.txt'
	train_file = assets_dir/'train.tsv'
	dev_file = assets_dir/'dev.tsv'
	test_file = assets_dir/'test.tsv'

	!wget -q -O $cats_file $cats_url
	!wget -q -O $train_file $train_url
	!wget -q -O $dev_file $dev_url
	!wget -q -O $test_file $test_url


	# Define Convert function
	# Taken from https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions/scripts/convert_corpus.py
	from pathlib import Path
	import typer
	from spacy.tokens import DocBin
	import spacy

	def read_categories(path: Path):
	    """Return the category names stored one per line in ``path``.

	    Whitespace around the file content as a whole is stripped before
	    splitting, so a trailing newline does not produce an empty last entry.
	    An empty file yields ``[""]``.

	    Args:
	        path: Text file holding one category name per line.

	    Returns:
	        List of category names in file order.
	    """
	    # The body is indented under the def again (it had been flattened to
	    # the def's own column). read_text() opens and closes the file itself,
	    # so no handle is left open; utf8 matches how the tsv files are opened.
	    return path.read_text(encoding="utf8").strip().split("\n")

	def read_tsv(file_):
	    """Yield one record dict per non-blank line of a tsv file.

	    Each non-blank line must contain exactly three tab-separated fields:
	    the text, a comma-separated list of integer labels, and the annotator.

	    Args:
	        file_: Iterable of text lines, e.g. an open text file.

	    Yields:
	        Dicts with the keys ``"text"`` (str), ``"labels"`` (list of int)
	        and ``"annotator"`` (str).

	    Raises:
	        ValueError: If a non-blank line does not split into exactly three
	            fields, or a label is not an integer.
	    """
	    for line in file_:
	        # Remove the line terminator first; otherwise it ends up inside
	        # the last field ("annotator").
	        line = line.rstrip("\r\n")
	        # Blank lines (e.g. a trailing empty line) carry no record.
	        if not line.strip():
	            continue
	        text, labels, annotator = line.split("\t")
	        yield {
	            "text": text,
	            "labels": [int(label) for label in labels.split(",")],
	            "annotator": annotator,
	        }

	def convert_record(nlp, record, categories):
	    """Convert a record from the tsv into a spaCy Doc object.

	    Args:
	        nlp: Object providing ``make_doc(text)``.
	        record: Dict with a ``"text"`` string and a ``"labels"`` list of
	            integer indices into ``categories``.
	        categories: Sequence of category names.

	    Returns:
	        The Doc, with ``doc.cats`` mapping every category to 0 except the
	        record's labels, which map to 1.
	    """
	    doc = nlp.make_doc(record["text"])
	    # All categories other than the true ones get value 0
	    doc.cats = {category: 0 for category in categories}
	    # True labels get value 1
	    for label in record["labels"]:
	        doc.cats[categories[label]] = 1
	    return doc

	def convert_corpus(assets_dir: Path=assets_dir, corpus_dir: Path=corpus_dir, lang: str="en"):
	    """Convert the GoEmotion corpus's tsv files to spaCy's binary format.

	    Every entry of ``assets_dir`` whose name ends in ``.tsv`` is parsed
	    with ``read_tsv``, each record is turned into a Doc by
	    ``convert_record``, and the docs are serialized through ``DocBin``
	    into ``corpus_dir/<name>.spacy``. A progress line is printed per file.

	    Args:
	        assets_dir: Directory holding ``categories.txt`` and the tsv
	            files. The default is the module-level ``assets_dir`` as it
	            was when this function was defined.
	        corpus_dir: Directory the ``.spacy`` files are written to;
	            created if missing. The default is bound the same way.
	        lang: Language code passed to ``spacy.blank``.
	    """
	    categories = read_categories(assets_dir / "categories.txt")
	    nlp = spacy.blank(lang)
	    # Make sure the output directory exists before the first write.
	    corpus_dir.mkdir(parents=True, exist_ok=True)
	    for tsv_file in assets_dir.iterdir():
	        if not tsv_file.name.endswith(".tsv"):
	            continue
	        # read_tsv is a lazy generator, so the docs are built inside the
	        # `with`: the file is fully consumed and then closed.
	        with tsv_file.open(encoding="utf8") as file_:
	            docs = [convert_record(nlp, record, categories) for record in read_tsv(file_)]
	        out_file = corpus_dir / tsv_file.with_suffix(".spacy").name
	        out_file.write_bytes(DocBin(docs=docs).to_bytes())
	        print(f'{tsv_file} converted')

	# Convert Files
	# Runs the conversion on assets_dir and corpus_dir; the language is left at
	# convert_corpus's default, "en".
	convert_corpus(assets_dir=assets_dir, corpus_dir=corpus_dir)