Skip to content

Instantly share code, notes, and snippets.

View pafonta's full-sized avatar

Pierre-Alexandre Fonta pafonta

View GitHub Profile
# For an example of use, see https://gist.github.com/pafonta/21f3db4d9c31f6a1c2f7ede8cbf3406b#gistcomment-3970844.
"""Entity Linking - Link mentions from texts to terms in ontologies.
Use character-based embedding to handle plurals, misspellings, partial matches, ...
"""
import pickle
import faiss
# For an example of use, see https://gist.github.com/pafonta/37762f56e8c1879569bca64901d0a000#gistcomment-3968062.
"""Collect statistics on PubMed articles."""
from __future__ import annotations
from pathlib import Path
from defusedxml import ElementTree
from tqdm import tqdm
# For an example of use, see https://gist.github.com/pafonta/d33a0d5d849932f8ceab8b711d995497#gistcomment-3965575.
"""Find MeSH terms in the MeSH tree simply (i.e. without using a graph)."""
from __future__ import annotations
import json
from collections.abc import Iterator
from xml.etree.ElementTree import Element # nosec
@pafonta
pafonta / nlm_mesh.py
Last active November 19, 2021 15:27
# For an example of use, see https://gist.github.com/pafonta/162c1b9ec0380e95a017297a707a4d66#gistcomment-3935739.
"""Find & Rank MeSH terms associated with an author."""
from __future__ import annotations
import json
from collections import Counter
from collections.abc import Iterator
from xml.etree.ElementTree import Element # nosec
@pafonta
pafonta / cv_tf-idf.py
Created September 29, 2020 13:50
Embed sentences into counts and TF-IDF spaces
import pandas as pd
# Corpus
filename = 'sentences_cord19_v47.parquet'
sentences = pd.read_parquet(filename)
# This contains all the sentences as a list.
corpus = sentences.text.tolist()
# Count
@pafonta
pafonta / tf_idf_2.py
Created September 4, 2020 15:50
stand-up 07.09.2020
import datetime
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
def p(text: str) -> None:
now = datetime.datetime.now()
time = now.strftime("%Y-%m-%d %H:%M:%S")
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
nlp2 = load_model(MODEL)
nlp2.add_pipe(nlp2.create_pipe("merge_entities"))
nlp2.add_pipe(nlp2.create_pipe("merge_noun_chunks"))
def extract_relations_umls(text, umls):
ents = {x[0].lower(): x[2] for x in umls}
rels = []
for x in nlp2(text):
if x.dep_ != "ROOT":
if x.ent_type_ == "ENTITY":
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.