Skip to content

Instantly share code, notes, and snippets.

@tsh-code
Created March 4, 2024 08:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsh-code/f9c71917a99950406231793663fb6ba0 to your computer and use it in GitHub Desktop.
Save tsh-code/f9c71917a99950406231793663fb6ba0 to your computer and use it in GitHub Desktop.
final solution
import functools
import json

import spacy
from datasets import Dataset, load_dataset
from flask import Flask, request
from nltk.tokenize import sent_tokenize, word_tokenize
from span_marker import SpanMarkerModel
# Flask application object; the route decorator further down registers on it.
app = Flask(__name__)

# spaCy pipeline created once at import time: the "en_core_web_trf" package
# is loaded first, then a "span_marker" component is appended, configured
# with the model identifier below.
nlp = spacy.load("en_core_web_trf")
nlp.add_pipe(
    "span_marker",
    config={
        "model": "lxyuan/span-marker-bert-base-multilingual-cased-multinerd",
    },
)
# Identifier of the SpanMarker checkpoint used for person extraction.
_SPAN_MARKER_MODEL_ID = "lxyuan/span-marker-bert-base-multilingual-cased-multinerd"


@functools.lru_cache(maxsize=1)
def _load_span_marker_model():
    """Load the SpanMarker model on first use and reuse it afterwards."""
    return SpanMarkerModel.from_pretrained(_SPAN_MARKER_MODEL_ID)


@app.route("/people", methods=["POST"])
def people():
    """Handle ``POST /people``: extract person names from the posted text.

    Expects a JSON object with a string ``content`` field and responds with
    ``{"entities": [...]}``, where the list is whatever ``extract_people``
    returns for that text.

    Returns:
        A dict that Flask serialises to JSON, or an ``(error dict, 400)``
        pair when the body is not a JSON object carrying a string
        ``content`` value.
    """
    payload = request.get_json()

    # Reject malformed payloads up front instead of letting a missing or
    # non-string "content" reach the tokenizer and surface as a 500.
    content = payload.get("content") if isinstance(payload, dict) else None
    if not isinstance(content, str):
        return {"error": "JSON body must contain a string 'content' field"}, 400

    # Pass a loaded model instance; the bare SpanMarkerModel class cannot
    # be used for prediction.
    entities = extract_people(text=content, model=_load_span_marker_model())
    return {"entities": entities}
# Entity labels treated as "person".
_PERSON_LABELS = frozenset({"PER", "PERSON"})


def extract_people(text: str, model) -> list:
    """Return the names of the people mentioned in *text*.

    The text is split into sentences and word tokens with NLTK. The
    tokenised sentences are wrapped in a ``Dataset`` in which every row has
    ``document_id`` 0 and a running ``sentence_id``, and that dataset is
    passed to ``model.predict``.

    Args:
        text: Raw text to scan. ``None``, empty or whitespace-only text
            yields an empty list without calling the model.
        model: Object exposing ``predict(dataset)``. Assumed to be a loaded
            SpanMarker model whose ``predict`` returns, per sentence, a list
            of dicts with ``"label"`` and ``"span"`` keys - TODO confirm
            against the installed span_marker version.

    Returns:
        A list of name strings in prediction order. Duplicates are kept.
    """
    if not text or not text.strip():
        return []

    sentences = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
    if not sentences:
        return []

    dataset = Dataset.from_dict(
        {
            "tokens": sentences,
            # All sentences come from the single input text.
            "document_id": [0] * len(sentences),
            "sentence_id": list(range(len(sentences))),
        }
    )

    # Use the model handed in by the caller rather than a module global.
    predictions = model.predict(dataset)

    names = []
    for sentence_entities in predictions:
        for entity in sentence_entities:
            if entity["label"] not in _PERSON_LABELS:
                continue
            span = entity["span"]
            # A span may be a token list or an already-joined string; joining
            # a plain string would insert a space between every character.
            names.append(span if isinstance(span, str) else " ".join(span))
    return names
def main():
    """Start the Flask application by calling ``app.run()`` with no arguments."""
    app.run()


# Only start the server when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment