Created
August 11, 2018 14:51
-
-
Save vanatteveldt/cf5d776b17dc84b6e4c8a6fe3c785d88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
from pathlib import Path | |
from spacy.util import minibatch, compounding | |
from tools import progress_iter | |
import pandas as pd | |
# Load the gold-standard label mapping and the hand-coded articles.
# labels.csv: drop the original 'label' column and promote 'label2' to 'label'.
# NOTE(review): the rest of the script classifies on a 'label3' column, so the
# renamed 'label' column appears unused here -- confirm against the CSV schema.
labels = pd.read_csv("data/labels.csv").drop("label", axis=1).rename({'label2': 'label'}, axis='columns')
# Join the coded documents with their relabelling; merge() without `on=` joins
# on whatever columns the two frames share.
data = pd.read_csv("coded.csv").merge(labels)

N_ITER = 10   # number of training epochs
TEST = False  # if True, hold out 20% of the data for per-epoch evaluation

if TEST:
    # 80/20 split with a fixed seed so the split is reproducible across runs.
    train = data.sample(frac=0.8, random_state=12345)
    test = data.drop(train.index)
else:
    # Train on everything; `test` is deliberately left undefined in this case.
    train = data

# `TEST and len(test)` short-circuits to False when TEST is off, so the
# undefined `test` variable is never touched.
print("TEST: {}, #train: {}, #test: {}".format(TEST, len(train), TEST and len(test)))
# Set up spacy pipeline
# Load the small Dutch model and remember its existing component names
# (tagger/parser/ner) so they can be frozen during textcat training below.
nlp = spacy.load('nl_core_news_sm')
pretrained = nlp.pipe_names  # components that shipped with the model
# Append a fresh text-classification component at the end of the pipeline.
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)
# NOTE(review): this rebinds `labels` from the DataFrame loaded above to a
# plain list of category names, and takes its categories from `label3` rather
# than the renamed 'label' column -- confirm which column is the real target.
labels = list(data.label3.unique())
for label in labels:
    textcat.add_label(label)
# Train only the textcat component: freeze the model's pre-trained pipes so
# their weights are untouched while the classifier learns.
with nlp.disable_pipes(*pretrained):
    optimizer = nlp.begin_training()
    for i in range(N_ITER):
        losses = {}
        # Re-batch every epoch; compounding() grows batch size from 4 to 32.
        batches = [train.loc[b] for b in minibatch(train.index, size=compounding(4., 32., 1.001))]
        for batch in progress_iter(batches, prefix="[{i:3}] Train".format(**locals()), finish=False):
            # One-hot 'cats' dict per document: gold category True, rest False.
            annotations = [{'cats': {l: l == label for l in labels}} for label in batch.label3]
            # (Removed leftover debug code that printed one annotation and then
            # called sys.exit(), which aborted the script before any training.)
            nlp.update(batch.fulltext, annotations, sgd=optimizer, drop=0.2, losses=losses)
        if TEST:
            # Evaluate with the parameter averages -- the spaCy v2 idiom for a
            # more stable evaluation of a model that is still mid-training.
            with textcat.model.use_params(optimizer.averages):
                ix = progress_iter(test.index, prefix="[{i:3}] Test".format(**locals()), finish=False, clear=True)
                out = textcat.pipe(nlp.tokenizer(test.at[j, "fulltext"]) for j in ix)
                # Predicted category = highest-scoring label for each doc.
                predictions = list(max(doc.cats, key=doc.cats.get) for doc in out)
                acc = sum(test.label3 == predictions) / len(test.label3)
            print("[{i:3}] acc: {acc:1.3f}, Losses: {loss:1.3f}"
                  .format(loss=losses['textcat'], **locals()))
        else:
            # No held-out set: report the training loss only. (The accuracy
            # print used to reference `acc`, which does not exist here.)
            print("[{i:3}] Losses: {loss:1.3f}".format(loss=losses['textcat'], **locals()))
# Final model: when training ran on the full data set (TEST == False),
# persist the whole pipeline to disk for later use with spacy.load().
if not TEST:
    print("Saving...")
    output_dir = Path('initial_model_topics')
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment