# Gist by @vanatteveldt (created August 11, 2018):
# train a spaCy 'textcat' text classifier on manually coded documents and save the model.
import spacy
from pathlib import Path
from spacy.util import minibatch, compounding
from tools import progress_iter
import pandas as pd
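# NOTE: progress_iter comes from a local tools.py that is not included in this gist.
# A minimal stand-in (an assumption, not the original helper) could look like:
#
# def progress_iter(items, prefix="", finish=True, clear=False):
#     items = list(items)
#     for n, item in enumerate(items, 1):
#         print("\r{} {}/{}".format(prefix, n, len(items)), end="", flush=True)
#         yield item
#     print()  # the real helper presumably uses finish/clear to control this final newline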
# Load the gold-standard labels and merge them with the coded articles
labels = pd.read_csv("data/labels.csv").drop("label", axis=1).rename({'label2': 'label'}, axis='columns')
data = pd.read_csv("coded.csv").merge(labels)

N_ITER = 10
TEST = False

if TEST:
    # Hold out 20% of the data for evaluation
    train = data.sample(frac=0.8, random_state=12345)
    test = data.drop(train.index)
else:
    train = data
print("TEST: {}, #train: {}, #test: {}".format(TEST, len(train), TEST and len(test)))
# Set up the spaCy pipeline: load the Dutch model and add a text classifier
nlp = spacy.load('nl_core_news_sm')
pretrained = nlp.pipe_names  # remember the pre-trained pipes so we can freeze them during training
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)

labels = list(data.label3.unique())
for label in labels:
    textcat.add_label(label)
# Train only the textcat component; the pre-trained pipes stay frozen
with nlp.disable_pipes(*pretrained):
    optimizer = nlp.begin_training()
    for i in range(N_ITER):
        losses = {}
        batches = [train.loc[b] for b in minibatch(train.index, size=compounding(4., 32., 1.001))]
        for batch in progress_iter(batches, prefix="[{i:3}] Train".format(**locals()), finish=False):
            # One 'cats' dict per document: True for the gold label, False for all others
            annotations = [{'cats': {l: l == label for l in labels}} for label in batch.label3]
            nlp.update(list(batch.fulltext), annotations, sgd=optimizer, drop=0.2, losses=losses)
        if TEST:
            # Evaluate on the held-out set with the averaged parameters
            with textcat.model.use_params(optimizer.averages):
                ix = progress_iter(test.index, prefix="[{i:3}] Test".format(**locals()), finish=False, clear=True)
                out = textcat.pipe(nlp.tokenizer(test.at[j, "fulltext"]) for j in ix)
                predictions = list(max(doc.cats, key=doc.cats.get) for doc in out)
                acc = sum(test.label3 == predictions) / len(test.label3)
            print("[{i:3}] acc: {acc:1.3f}, Losses: {loss:1.3f}"
                  .format(loss=losses['textcat'], **locals()))
        else:
            print("[{i:3}] Losses: {loss:1.3f}".format(loss=losses['textcat'], **locals()))
        # Optionally save an intermediate model every 5 iterations:
        # if i > 0 and i % 5 == 0:
        #     output_dir = Path('/tmp/model_{i}'.format(**locals()))
        #     nlp.to_disk(output_dir)
        #     print("Saved model to", output_dir)
# After training on the full data set, save the final model
if not TEST:
    print("Saving...")
    output_dir = Path('initial_model_topics')
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)