Skip to content

Instantly share code, notes, and snippets.

@landonstewart
Created March 7, 2021 07:12
Show Gist options
  • Save landonstewart/f1e1e85baf692fea6711e3d7118100e3 to your computer and use it in GitHub Desktop.
Save landonstewart/f1e1e85baf692fea6711e3d7118100e3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Convert to spacy binary format."""
import random
import pandas as pd
import spacy
from spacy.training import Example
from spacy.util import minibatch
from tqdm import tqdm
# Locations and training settings used when this file is run as a script.
SOURCE_JSON = 'query_result.json'
DOCBIN_PATH = 'train.spacy'
MODEL_DIR = 'nlpmodel'
EPOCHS = 20
BATCH_SIZE = 8

# Expected shape of query_result.json (one object per document; the code
# below reads the "category" and "content" fields of every record):
# [
#   {"category": "Phishing",
#    "content": "You are hosting a phishing site at https://somebadurl.com/phishingpage.html"},
#   ...etc
# ]


def category_flags(category, all_categories):
    """Return the ``cats`` annotation dict for one record.

    Args:
        category: The category label assigned to the record.
        all_categories: Iterable of every known category label.

    Returns:
        A dict of the form ``{"cats": {label: bool, ...}}`` in which only
        the entry equal to ``category`` is ``True``, e.g.::

            {"cats": {"Phishing": False, "Malware": False, "Spam": True}}
    """
    return {'cats': {label: label == category for label in all_categories}}


def build_examples(nlp, df):
    """Create one ``Example`` per row of ``df``.

    Args:
        nlp: The loaded spaCy pipeline; only its tokenizer is used here
            (via ``nlp.make_doc``).
        df: DataFrame with a ``category`` and a ``content`` column.

    Returns:
        A list of ``Example`` objects whose reference docs carry the
        category flags produced by ``category_flags``.
    """
    # Unique list of all categories present in the data.
    categories = df.category.unique()
    progress = tqdm(df.itertuples(),
                    total=len(df.index),
                    desc="Examples created",
                    unit=" documents")
    return [
        Example.from_dict(nlp.make_doc(record.content),
                          category_flags(record.category, categories))
        for record in progress
    ]


def save_docbin(examples, path):
    """Write the reference docs of ``examples`` to a ``.spacy`` file.

    This is the step that actually produces spaCy's binary training format;
    ``nlp.to_disk`` only serializes the pipeline itself, never the examples.

    Args:
        examples: Iterable of ``Example`` objects.
        path: Destination file path (conventionally ending in ``.spacy``).
    """
    # Imported locally so this block does not depend on an edit to the
    # file-level import list.
    from spacy.tokens import DocBin

    doc_bin = DocBin()
    for example in examples:
        # The reference doc is the one holding the gold-standard ``cats``.
        doc_bin.add(example.reference)
    doc_bin.to_disk(path)


def main():
    """Convert the JSON export to ``.spacy`` and run the update loop."""
    nlp = spacy.load("en_core_web_sm")
    df = pd.read_json(SOURCE_JSON)
    examples = build_examples(nlp, df)

    # Write the binary training data promised by the module docstring.
    save_docbin(examples, DOCBIN_PATH)

    # NOTE(review): the stock "en_core_web_sm" pipeline does not appear to
    # include a text-categorizer component, so nothing below would learn
    # from the "cats" labels, and ``nlp.initialize`` is documented for
    # training from scratch -- confirm this training step is still wanted
    # now that the .spacy file is produced above.
    nlp.initialize(lambda: examples)
    for _ in tqdm(range(EPOCHS), desc="Batches updated.", unit=" batches"):
        random.shuffle(examples)
        for batch in minibatch(examples, size=BATCH_SIZE):
            nlp.update(batch)

    # Saves the pipeline only; the examples live in DOCBIN_PATH.
    nlp.to_disk(MODEL_DIR)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment