Skip to content

Instantly share code, notes, and snippets.

@alelom
Created June 15, 2021 07:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alelom/6776fe79eee4989a6e193ffcc4c9761b to your computer and use it in GitHub Desktop.
Save alelom/6776fe79eee4989a6e193ffcc4c9761b to your computer and use it in GitHub Desktop.
from collections import Counter, defaultdict
from datetime import datetime, timezone

import pandas as pd
import spacy
import srsly
# Count, per calendar month, how often each entity labelled ENTITY_LABEL is
# mentioned in the texts of DATA_FILE, and write the table to OUTPUT_FILE.
# NOTE: SPACY_MODEL, DATA_FILE, N_PROCESSES, ENTITY_LABEL and OUTPUT_FILE are
# not defined in this snippet; they must be defined before this code runs.

# Maps lowercased entity text -> Counter of "YYYY-MM" -> number of mentions.
counts = defaultdict(Counter)
nlp = spacy.load(SPACY_MODEL)
data = srsly.read_jsonl(DATA_FILE)
# Pair each text with its full record so the record's metadata is still
# available next to the processed Doc; the generator expression keeps this lazy.
data_tuples = ((eg["text"], eg) for eg in data)
for doc, eg in nlp.pipe(data_tuples, as_tuples=True, n_process=N_PROCESSES):
    # eg["meta"]["utc"] is coerced to int and interpreted as a Unix timestamp.
    timestamp = int(eg["meta"]["utc"])
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
    # the formatted "YYYY-MM" bucket is the same.
    year_month = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m")
    for ent in doc.ents:
        if ent.label_ == ENTITY_LABEL:
            counts[ent.lower_][year_month] += 1
# counts is keyed by entity first, so the frame is built with one column per
# entity and then transposed to get one row per entity, one column per month.
df = pd.DataFrame(data=counts).transpose()
df.to_csv(OUTPUT_FILE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment