@DrDub, created December 29, 2022 11:37
Python UIMA-CPP Concept code
# this is a concept file showcasing what a deep Python-UIMACPP binding could enable
from uima import AnalysisEngine, AnalysisEngineType
from uima.framework import buildPipeline, TypeMapper, SetFeature, Remote
from uima.index import Index, AnnotationIndex
from uima.typesystem.fs import (
    TOP,
    Annotation,
    FSFloatArray,
    FSString,
    FSBoolean,
    FSFloat,
)
from nltk.uima import PunktTokenizer, NEChunkParser
from uima.wrappers import SpacyAnnotator, BertAnnotator
import numpy as np

# note: 'nltk', 'spacy' and 'bert' are also used below as the type namespaces
# the corresponding wrappers would expose; like the rest of this file, they
# are part of the concept, not an existing API
# custom type system
class MyToken(Annotation):
    pass


class MySentence(Annotation):
    Embedding = FSFloatArray
    Score = FSFloat


class MyNER(Annotation):
    Source = FSString
    Selected = FSBoolean
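

# A minimal sketch (hypothetical usage, assumed for illustration) of what the
# declarative classes above would buy us: feature structures become plain
# Python objects, with features read and written by indexing on the declared
# class attributes, e.g.:
#
#   sent = MySentence(cas, begin=0, end=27)
#   sent[MySentence.Score] = 0.0
#   cas.addToIndexes(sent)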
# a hack: a plain Python global carrying the query embedding between process()
# calls; it would be nice for a deep binding to allow this kind of hacking
MAIN_EMBEDDING = []
@AnalysisEngineType(
    input=[MySentence],
    output=[MySentence.Score, MyNER.Selected],
    indexes=[
        Index(
            "SelectedIndex",
            type_="sorted",
            fs=MyNER,
            key=MyNER.Selected,
            comparator="inverted",
        ),
        Index(
            "ScoredIndex",
            type_="sorted",
            fs=MySentence,
            key=MySentence.Score,
            comparator="inverted",
        ),
    ],
)
class MyAE(AnalysisEngine):
    def __init__(self, top_sentences):
        super().__init__()
        self.top_sentences = top_sentences

    def process(self, cas):
        global MAIN_EMBEDDING
        if MAIN_EMBEDDING:
            # second call: score every sentence against the query embedding
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                sentence[MySentence.Score] = np.dot(
                    sentence[MySentence.Embedding], MAIN_EMBEDDING
                )
                cas.addToIndexes(sentence)  # re-index so ScoredIndex sees the score
            # mark the NERs covered by the top-scoring sentences
            for sentence in cas.indices["ScoredIndex"][: self.top_sentences]:
                for ner in sentence.subiterator(cas.indices[AnnotationIndex(MyNER)]):
                    ner[MyNER.Selected] = True
        else:
            # first call: remember the query embedding (copied out of the CAS)
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                MAIN_EMBEDDING = list(sentence[MySentence.Embedding])
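

# Hedged note on the index semantics assumed above: with comparator="inverted"
# the sorted indexes iterate from the highest key down, so "ScoredIndex" yields
# the best-scored sentences first and "SelectedIndex" yields the Selected=True
# NERs before the Selected=False ones, e.g.:
#
#   best = next(iter(cas.indices["ScoredIndex"]))  # highest-scoring sentence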
pipeline = buildPipeline(
    [
        # NLTK tokens
        TypeMapper(output={nltk.Token: MyToken}).wrap(PunktTokenizer()),
        # spaCy sentence boundaries and NER
        SetFeature({MyNER.Source: "spaCy"}).wrap(
            TypeMapper(output={spacy.Sentence: MySentence, spacy.NER: MyNER}).wrap(
                SpacyAnnotator({"load": "en"})
            )
        ),
        # NLTK NERs over spaCy sentences
        SetFeature({MyNER.Source: "NLTK"}).wrap(
            TypeMapper(
                input={MyToken: nltk.Token, MySentence: nltk.Sentence},
                output={nltk.NamedEntity: MyNER},
            ).wrap(NEChunkParser())
        ),
        # BERT embeddings over spaCy sentences; only the text and sentences go
        # over the wire and only embeddings come back (see the server-side
        # sketch after the pipeline)
        TypeMapper(
            input={bert.Text: MySentence},
            output={bert.Text.FullEmbedding: MySentence.Embedding},
        ).wrap(
            Remote(server="http://localhost:8000", protocol="zmq").wrap(
                BertAnnotator({"model": "uncased_L-12_H-768_A-12"})
            )
        ),
        # compute embedding similarity to a global query and select NERs inside
        MyAE(2),
    ]
)
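
# Hedged sketch of the other end of the Remote wrapper above (a hypothetical
# 'serve' helper, assumed here for illustration): the same annotator object
# could be hosted out of process, so the heavyweight model lives next to a GPU
# while the pipeline stays local:
#
#   Remote.serve(
#       BertAnnotator({"model": "uncased_L-12_H-768_A-12"}),
#       port=8000,
#       protocol="zmq",
#   )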
aCas = pipeline.newCAS()
aCas.setDocumentText("query sentence") # (internally resets the cas)
pipeline.process(aCas) # compute embedding for query
aCas.setDocumentText("very long text with NERs")
pipeline.process(aCas) # compute selected NERs and embeddings
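
# Since setDocumentText() resets the CAS, the same CAS and pipeline could be
# reused to score a whole corpus against the query embedding, e.g.:
#
#   for doc in corpus:  # 'corpus' is a placeholder iterable of strings
#       aCas.setDocumentText(doc)
#       pipeline.process(aCas)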
# get the tokens of the selected NERs out for further use
selected = []
for ner in aCas.indices["SelectedIndex"]:
    # the index is sorted with Selected=True first, so stop at the first False
    if not ner[MyNER.Selected]:
        break
    # something could be done with the 'Source' feature over here...
    selected.append(
        [
            token.coveredText()
            for token in ner.subiterator(aCas.indices[AnnotationIndex(MyToken)])
        ]
    )
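
# e.g. join each NER's tokens back into a surface string for display
for tokens in selected:
    print(" ".join(tokens))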