Skip to content

Instantly share code, notes, and snippets.

@isoboroff
Created December 15, 2019 16:35
Show Gist options
  • Save isoboroff/03b8f11cc2315a2ec6a8a97a53977789 to your computer and use it in GitHub Desktop.
Save isoboroff/03b8f11cc2315a2ec6a8a97a53977789 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
if __name__ == "__main__":
import json
import argparse
import spacy
import dateparser
import signal
from contextlib import contextmanager
from tqdm import tqdm
# Command-line interface: one positional argument naming the input file.
parser = argparse.ArgumentParser(description='Preprocess web news text with Spacy')
# The input is opened as plain text and parsed one JSON document per line,
# so describe it as such (it is not read as a zip archive).
parser.add_argument('bundle', help='Bundle to preprocess (JSON lines file, one document per line)')
args = parser.parse_args()

# Load the spaCy pipeline once up front; process() reuses it for every document.
nlp = spacy.load('en_core_web_lg')
# Cool trick from https://www.jujens.eu/posts/en/2018/Jun/02/python-timeout-function/
# Use a context manager to timeout functions by wrapping them in with.. clauses.
@contextmanager
def timeout(time):
    """Abandon the body of a ``with`` block after ``time`` seconds.

    A SIGALRM is scheduled on entry.  When it fires, ``raise_timeout`` raises
    ``TimeoutError`` inside the block; if that exception reaches this context
    manager it is swallowed, so execution simply continues after the ``with``
    statement.

    On exit (normal, timed out, or any other exception) the pending alarm is
    cancelled and the SIGALRM handler that was installed before entry is put
    back, so the process is left the way it was found.

    Args:
        time: Whole number of seconds before the alarm fires.  ``0`` schedules
            no alarm at all.

    Note:
        Built on ``signal.alarm``: Unix only, and usable only from the main
        thread.  An alarm that was already pending on entry is replaced and
        is not rescheduled afterwards.
    """
    # signal.signal() returns the handler it replaces; keep it for restoration.
    previous = signal.signal(signal.SIGALRM, raise_timeout)
    signal.alarm(time)
    try:
        yield
    except TimeoutError:
        pass
    finally:
        # Cancel the alarm *before* restoring the handler: a late alarm
        # delivered to the default disposition would terminate the process.
        signal.alarm(0)
        # ``previous`` is None when the old handler was not installed from
        # Python; it cannot be passed back, so fall back to the default.
        signal.signal(
            signal.SIGALRM,
            signal.SIG_DFL if previous is None else previous,
        )


def raise_timeout(signum, frame):
    """SIGALRM handler used by ``timeout``: raise ``TimeoutError``.

    Args:
        signum: Signal number supplied by the interpreter (unused).
        frame: Interrupted stack frame supplied by the interpreter (unused).

    Raises:
        TimeoutError: Always.
    """
    raise TimeoutError
# The file is JSON lines...
def process(docstring):
    """Annotate one JSON-lines record with the named entities found in its text.

    The record's ``'text'`` field is run through the module-level ``nlp``
    pipeline.  For every entity, its text is stored under a key equal to the
    entity label; repeated labels are concatenated with a single space.

    There is no proper metadata to rely on, so the first DATE entity that
    ``dateparser`` manages to parse is also recorded as ``'first_date'``
    (ISO format) and ``'first_stamp'`` (integer timestamp, as a string).
    A ``'first_date'`` already present in the record is left alone.

    Args:
        docstring: One line of the input file, holding a JSON object.

    Returns:
        The decoded object with the extra entity and date keys added.
    """
    record = json.loads(docstring)
    for entity in nlp(record['text']).ents:
        label = entity.label_
        text = entity.text

        # Only keep trying dates until one of them parses successfully.
        if label == 'DATE' and 'first_date' not in record:
            parsed = dateparser.parse(text)
            if parsed is not None:
                record['first_date'] = parsed.isoformat()
                record['first_stamp'] = str(int(parsed.timestamp()))

        # Accumulate every mention under its label, space separated.
        if label in record:
            record[label] += " " + text
        else:
            record[label] = text
    return record
# First pass: count the lines so tqdm can show progress against a real total.
with open(args.bundle, 'r') as bundle:
    linecount = sum(1 for _ in bundle)

# Second pass: annotate each document, giving each one at most 10 seconds.
with open(args.bundle, 'r') as bundle:
    for line in tqdm(bundle, total=linecount):
        with timeout(10):
            try:
                print(json.dumps(process(line)))
            except Exception:
                # Best effort: on any processing failure (including the
                # TimeoutError raised by the alarm handler) emit the raw
                # line as a JSON string instead of dropping the document.
                # KeyboardInterrupt/SystemExit are deliberately not caught,
                # so the run can still be interrupted.
                print(json.dumps(line))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment