Skip to content

Instantly share code, notes, and snippets.

@alexcg1
Last active March 30, 2023 13:24
Show Gist options
  • Save alexcg1/99c722d80129ce29286f8cf8516feda9 to your computer and use it in GitHub Desktop.
Save alexcg1/99c722d80129ce29286f8cf8516feda9 to your computer and use it in GitHub Desktop.
from docarray import BaseDoc, DocArray
from docarray.documents import TextDoc
from docarray.typing import AnyUrl
from jina import Deployment, Executor, requests
class TextChunk(TextDoc):
    """A ``TextDoc`` that additionally carries a free-form ``tags`` mapping."""

    # Arbitrary per-chunk metadata; an empty dict unless the caller passes one.
    tags: dict = {}
class PDFDocument(BaseDoc):
    """Document made of a collection of ``TextChunk`` items plus a source path."""

    # Chunks attached to this document; defaults to an empty DocArray.
    texts: DocArray[TextChunk] = DocArray()
    # Location of the source file, or None.
    path: AnyUrl | None
class Extractor(Executor):
    """Executor exposing ``/extract``, which appends text chunks to each document."""

    def __init__(
        self,
        content_types: list | None = None,
        **kwargs,
    ):
        """Store the requested content types and initialise the base Executor.

        Args:
            content_types: Content types to keep on the instance. When omitted,
                a fresh ``['text', 'table', 'image', 'metadata']`` list is used.
                The value is only stored here; nothing in this class reads it.
            **kwargs: Forwarded unchanged to ``Executor.__init__``.
        """
        super().__init__(**kwargs)
        # Build the default per instance: a list literal in the signature would
        # be a single object shared (and mutable) across every Extractor.
        if content_types is None:
            content_types = ['text', 'table', 'image', 'metadata']
        self.content_types = content_types

    @requests(on='/extract')
    def add_chunks(
        self, docs: DocArray[PDFDocument], **kwargs
    ) -> DocArray[PDFDocument]:
        """Run text extraction on every document of the request.

        The return annotation declares the response schema of the endpoint, so
        the ``texts`` added here are part of what is sent back to the client.

        Args:
            docs: Documents received on ``/extract``; modified in place.
            **kwargs: Extra request arguments; unused.

        Returns:
            The same ``docs`` collection, after chunks have been appended.
        """
        for doc in docs:
            self._extract_text(doc)
        return docs

    def _extract_text(self, doc: PDFDocument, **kwargs):
        """Append 90 placeholder ``TextChunk(text='foo')`` items to ``doc.texts``."""
        for _ in range(90):
            doc.texts.append(TextChunk(text='foo'))
# Keep the launch code under the main guard so that importing this module
# (e.g. by a worker process that re-imports it) does not start a Deployment.
if __name__ == '__main__':
    dep = Deployment(
        uses=Extractor,
        uses_with={'content_types': ['text']},
    )

    docs = DocArray([PDFDocument(path='foo.png')])

    print(f'input: {docs[0]}')
    # above returns PDFDocument(id='5dede5812eb41a28953075b3bc52934c', texts=<DocArray (length=0)>, path=AnyUrl('foo.png', host_type='domain'))

    with dep:
        # Without return_type the response came back as a bare
        # AnyDoc(id=...), i.e. the texts added by the Executor were not visible.
        # Passing the expected type asks the client to cast the response.
        output = dep.post(
            inputs=docs,
            on='/extract',
            return_type=DocArray[PDFDocument],
        )

    print(f'output: {output[0]}')
    # previously printed AnyDoc(id='5dede5812eb41a28953075b3bc52934c');
    # with return_type set this is expected to print a PDFDocument instead
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment