@DrDub, created December 29, 2022 11:37
Python UIMA-CPP Concept code
# this is a concept file showcasing what a deep Python-UIMACPP binding could enable
from uima import AnalysisEngine, AnalysisEngineType
from uima.framework import buildPipeline, TypeMapper, SetFeature, Remote
from uima.index import Index, AnnotationIndex
from uima.typesystem.fs import (
    TOP,
    Annotation,
    FSFloatArray,
    FSString,
    FSBoolean,
    FSFloat,
)
from nltk.uima import PunktTokenizer, NEChunkParser
from uima.wrappers import SpacyAnnotator, BertAnnotator
import numpy as np

# note: 'nltk', 'spacy' and 'bert' are also used below as the type namespaces
# the corresponding wrappers would expose; like the rest of this file, they
# are part of the concept, not an existing API
# custom type system
class MyToken(Annotation):
    pass


class MySentence(Annotation):
    Embedding = FSFloatArray
    Score = FSFloat


class MyNER(Annotation):
    Source = FSString
    Selected = FSBoolean
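

# A minimal sketch (hypothetical usage, assumed for illustration) of what the
# declarative classes above would buy us: feature structures become plain
# Python objects, with features read and written by indexing on the declared
# class attributes, e.g.:
#
#   sent = MySentence(cas, begin=0, end=27)
#   sent[MySentence.Score] = 0.0
#   cas.addToIndexes(sent)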
# a hack: a plain Python global carrying the query embedding between process()
# calls; it would be nice for a deep binding to allow this kind of hacking
MAIN_EMBEDDING = []
@AnalysisEngineType(
    input=[MySentence],
    output=[MySentence.Score, MyNER.Selected],
    indexes=[
        Index(
            "SelectedIndex",
            type_="sorted",
            fs=MyNER,
            key=MyNER.Selected,
            comparator="inverted",
        ),
        Index(
            "ScoredIndex",
            type_="sorted",
            fs=MySentence,
            key=MySentence.Score,
            comparator="inverted",
        ),
    ],
)
class MyAE(AnalysisEngine):
    def __init__(self, top_sentences):
        super().__init__()
        self.top_sentences = top_sentences

    def process(self, cas):
        global MAIN_EMBEDDING
        if MAIN_EMBEDDING:
            # second call: score every sentence against the query embedding
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                sentence[MySentence.Score] = np.dot(
                    sentence[MySentence.Embedding], MAIN_EMBEDDING
                )
                cas.addToIndexes(sentence)  # re-index so ScoredIndex sees the score
            # mark the NERs covered by the top-scoring sentences
            for sentence in cas.indices["ScoredIndex"][: self.top_sentences]:
                for ner in sentence.subiterator(cas.indices[AnnotationIndex(MyNER)]):
                    ner[MyNER.Selected] = True
        else:
            # first call: remember the query embedding (copied out of the CAS)
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                MAIN_EMBEDDING = list(sentence[MySentence.Embedding])
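

# Hedged note on the index semantics assumed above: with comparator="inverted"
# the sorted indexes iterate from the highest key down, so "ScoredIndex" yields
# the best-scored sentences first and "SelectedIndex" yields the Selected=True
# NERs before the Selected=False ones, e.g.:
#
#   best = next(iter(cas.indices["ScoredIndex"]))  # highest-scoring sentence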
pipeline = buildPipeline(
    [
        # NLTK tokens
        TypeMapper(output={nltk.Token: MyToken}).wrap(PunktTokenizer()),
        # spaCy sentence boundaries and NER
        SetFeature({MyNER.Source: "spaCy"}).wrap(
            TypeMapper(output={spacy.Sentence: MySentence, spacy.NER: MyNER}).wrap(
                SpacyAnnotator({"load": "en"})
            )
        ),
        # NLTK NERs over spaCy sentences
        SetFeature({MyNER.Source: "NLTK"}).wrap(
            TypeMapper(
                input={MyToken: nltk.Token, MySentence: nltk.Sentence},
                output={nltk.NamedEntity: MyNER},
            ).wrap(NEChunkParser())
        ),
        # BERT embeddings over spaCy sentences; only the text and sentences go
        # over the wire and only embeddings come back (see the server-side
        # sketch after the pipeline)
        TypeMapper(
            input={bert.Text: MySentence},
            output={bert.Text.FullEmbedding: MySentence.Embedding},
        ).wrap(
            Remote(server="http://localhost:8000", protocol="zmq").wrap(
                BertAnnotator({"model": "uncased_L-12_H-768_A-12"})
            )
        ),
        # compute embedding similarity to a global query and select NERs inside
        MyAE(2),
    ]
)
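
# Hedged sketch of the other end of the Remote wrapper above (a hypothetical
# 'serve' helper, assumed here for illustration): the same annotator object
# could be hosted out of process, so the heavyweight model lives next to a GPU
# while the pipeline stays local:
#
#   Remote.serve(
#       BertAnnotator({"model": "uncased_L-12_H-768_A-12"}),
#       port=8000,
#       protocol="zmq",
#   )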
aCas = pipeline.newCAS()
aCas.setDocumentText("query sentence") # (internally resets the cas)
pipeline.process(aCas) # compute embedding for query
aCas.setDocumentText("very long text with NERs")
pipeline.process(aCas) # compute selected NERs and embeddings
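
# Since setDocumentText() resets the CAS, the same CAS and pipeline could be
# reused to score a whole corpus against the query embedding, e.g.:
#
#   for doc in corpus:  # 'corpus' is a placeholder iterable of strings
#       aCas.setDocumentText(doc)
#       pipeline.process(aCas)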
# get the tokens of the selected NERs out for further use
selected = []
for ner in aCas.indices["SelectedIndex"]:
    # the index is sorted with Selected=True first, so stop at the first False
    if not ner[MyNER.Selected]:
        break
    # something could be done with the 'Source' feature over here...
    selected.append(
        [
            token.coveredText()
            for token in ner.subiterator(aCas.indices[AnnotationIndex(MyToken)])
        ]
    )
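
# e.g. join each NER's tokens back into a surface string for display
for tokens in selected:
    print(" ".join(tokens))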