Skip to content

Instantly share code, notes, and snippets.

@cneud
Last active October 26, 2023 11:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cneud/a7b3593943f51b060440828b09923359 to your computer and use it in GitHub Desktop.
Save cneud/a7b3593943f51b060440828b09923359 to your computer and use it in GitHub Desktop.
blm_extract.py
import click
import json
import requests
import os
from tqdm import tqdm
def get_text(obj, sub_entries=None):
    """Flatten a record entry into a single comma-separated string.

    The entry's own values are joined with ``","``. Keys listed in
    *sub_entries* are treated as nested collections of further entries; each
    nested entry is flattened recursively and appended after the entry's own
    values. The ``'imdasid'`` key is always excluded from the output.

    Args:
        obj: Mapping whose values (other than those under *sub_entries*)
            are strings. It is copied first, so the caller's mapping is
            not modified.
        sub_entries: Optional list of keys in *obj* whose values are
            iterables of nested entry mappings.

    Returns:
        The flattened text; an empty string if there is nothing to join.

    Raises:
        TypeError: If a value that should be joined is not a string.
    """
    if sub_entries is None:
        sub_entries = []

    # Work on a shallow copy so removing keys does not affect the caller.
    obj = obj.copy()
    obj.pop('imdasid', None)

    # Pull the nested collections out so only plain values remain in obj.
    extensions = [obj.pop(se) for se in sub_entries if se in obj]

    # Fail with a descriptive error instead of dropping into a debugger.
    for key, value in obj.items():
        if not isinstance(value, str):
            raise TypeError(
                "get_text: value for key {!r} must be str, got {}".format(
                    key, type(value).__name__))

    ret = ",".join(obj.values())
    ext_text = ",".join(
        get_text(ext_entry) for ext in extensions for ext_entry in ext)

    # Separate the entry's own values from the nested ones; without the
    # separator the last own value and the first nested value would fuse.
    return ",".join(part for part in (ret, ext_text) if part)
def _join_nonempty(parts, sep):
    """Join the non-empty strings in *parts* with *sep*."""
    return sep.join(part for part in parts if part)


def _write_results(output_file, results):
    """Write *results* to *output_file* as indented UTF-8 JSON."""
    with open(output_file, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(results, indent=2, ensure_ascii=False))


@click.command()
@click.argument('input-file', type=click.Path(exists=True))
@click.argument('output-file', type=click.Path(exists=False))
@click.option('--ner-endpoint', type=str, default=None, help="")
@click.option('--ned-endpoint', type=str, default=None, help="")
def cli(input_file, output_file, ner_endpoint, ned_endpoint):
    """Extract person/location text from INPUT_FILE records into OUTPUT_FILE.

    INPUT_FILE is a JSON document with a top-level 'records' list. For each
    record the 'person' entries (with their 'beruf' sub-entries) and the
    'ort' entries (with their 'zusatz' sub-entries) are flattened with
    get_text, joined with ';' per group, and the two groups are joined with
    a single space into the 'text' field of the result.

    If --ner-endpoint is given, the text is POSTed to it and the decoded
    JSON response is processed further. If --ned-endpoint is given as well,
    that response is POSTed to <ned-endpoint>/parse and the parse result to
    <ned-endpoint>/ned; the final response is stored under 'ned'. The value
    stored under 'ner' is the /parse result when NED is enabled, otherwise
    the raw NER response. --ned-endpoint has no effect without
    --ner-endpoint because it needs the NER output as input.

    Results are written to OUTPUT_FILE every 100 records and once more at
    the end.
    """
    # Disable proxy use for every host so the endpoint requests go direct.
    os.environ['no_proxy'] = '*'

    with open(input_file, mode='r', encoding="utf-8") as f:
        data = json.load(f)

    records = data['records']
    results = []

    for rec in tqdm(records, total=len(records)):
        persons = rec.get('person', [])
        locations = rec.get('ort', [])

        # Build each group with an explicit join. The earlier
        # `a + ";" if cond else "" + b` form bound the conditional around
        # the whole expression and dropped every entry after the first.
        per_text = _join_nonempty(
            [get_text(per, ['beruf']) for per in persons], ";")
        loc_text = _join_nonempty(
            [get_text(loc, ['zusatz']) for loc in locations], ";")

        result = {'text': _join_nonempty([per_text, loc_text], " ")}

        if ner_endpoint is not None:
            resp = requests.post(url=ner_endpoint, json={'text': result['text']})
            ner = json.loads(resp.content)

            if ned_endpoint is not None:
                # /parse consumes the NER output; its response replaces
                # `ner` and is what gets sent on to /ned.
                resp = requests.post(url=ned_endpoint + "/parse", json=ner)
                ner = json.loads(resp.content)

                resp = requests.post(url=ned_endpoint + "/ned", json=ner)
                result['ned'] = json.loads(resp.content)

            result['ner'] = ner

        results.append(result)

        # Periodic checkpoint so a long run keeps partial output on failure.
        if len(results) % 100 == 0:
            _write_results(output_file, results)

    _write_results(output_file, results)
# Script entry point: run the click command only when executed directly.
if __name__ == "__main__":
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment