Last active
January 29, 2024 14:51
-
-
Save apjanco/aa9368f019acffdc39f1a6cfa8a98060 to your computer and use it in GitHub Desktop.
project.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Human-readable project title.
title: "Demo Project Workflow. From images to research data in Obsidian"
# Long-form project description. A literal block scalar (`|`) is used so the
# URL and each bulleted step keep their own line; a folded scalar (`>`) would
# join lines written at the same indentation into a single paragraph.
description: |
  This project offers a workflow to process historical documents from the Circuit Court of Istmina, Chocó, Colombia.
  https://eap.bl.uk/project/EAP1477
  In this project, we will:
  - Fetch the IIIF Images and metadata from the British Library
  - Segment the images with Kraken
  - Transcribe the images using Google Vision
  - Upload the images to eScriptorium where the transcriptions can be corrected
  - Download the transcriptions from eScriptorium
  - Use named entity recognition and entity linking to identify people, places, dates, events and organizations
  - Use LLMs to generate case summaries and collection summaries
  - Assess the quality of extracted metadata
  - Use the results to create a package of text and media that can be loaded into Obsidian for research.
# Project variables; command scripts reference them as ${vars.<key>}.
vars:
  name: "EAP1477/1: Archivo Histórico del Juzgado del Circuito de Istmina (1860-1930)"
  # Language code handed to the transcription script.
  language: "es"
  # Text direction handed to the segmentation script. One of:
  # horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
  text_direction: "horizontal-lr"
  # Quoted so the version stays a string rather than being typed by the parser.
  version: "0.0.0"
  # File read by the fetch_iiif command.
  iiif_collections: "collections.txt"
  escriptorium_url: "https://escriptorium.pennds.org"
  escriptorium_project_name: "EAP1477/1"
  # Passed as the last argument to scripts/download.py.
  transcription: "vision"
# Values referenced in command scripts as ${env.<key>}.
# NOTE(review): presumably each value names the process environment variable
# to read (spaCy/Weasel project convention) — confirm against the runner docs.
# No secret values are stored in this file.
env:
  ESCRIPTORIUM_USERNAME: ESCRIPTORIUM_USERNAME
  ESCRIPTORIUM_PASSWORD: ESCRIPTORIUM_PASSWORD
  GOOGLE_VISION_KEY: GOOGLE_VISION_KEY
# Directories the project layout relies on.
# NOTE(review): presumably created by the project runner when missing — confirm.
directories: ["assets", "configs", "scripts", "pipelines", "packages"]
# Named workflows. Each entry is the `name` of a command defined under
# `commands`, listed in the order it should run.
workflows:
  # Fetch the source images, then segment and transcribe them.
  prepare:
    - fetch_iiif
    - segment
    - transcribe
  # Round-trip through eScriptorium.
  # NOTE(review): upload and download are listed back to back; presumably the
  # manual correction step happens between separate runs — confirm.
  evaluate:
    - upload
    - download
  process:
    - download
  # Planned publishing targets (not enabled yet):
  # publish:
  #   - obsidian
  #   - huggingface-datasets
  #   - eleventy-site
# Individual commands. Each has a `name` (referenced from `workflows`), a
# `help` string, the `script` lines to execute, and optionally the `outputs`
# it produces.
#
# NOTE(review): `transcribe` and `upload` interpolate credentials
# (${env.GOOGLE_VISION_KEY}, ${env.ESCRIPTORIUM_USERNAME},
# ${env.ESCRIPTORIUM_PASSWORD}) into the command line, where they are visible
# in the process list and may be echoed by the runner. Consider having the
# scripts read these environment variables directly instead.
commands:
  - name: fetch_iiif
    help: "Reads the collections file, loads the manifests and downloads images and metadata."
    script:
      - "python scripts/fetch_iiif.py ${vars.iiif_collections} assets/"
    outputs:
      - assets/

  - name: segment
    help: "Segment the images with Kraken."
    script:
      - "python scripts/segment.py assets/ ${vars.text_direction}"
    outputs:
      - assets/

  - name: transcribe
    help: "Transcribe the images using Google Vision."
    script:
      - "python scripts/transcribe.py assets/ ${env.GOOGLE_VISION_KEY} ${vars.language}"
    outputs:
      - assets/

  # No `outputs` are declared for this command.
  # NOTE(review): unlike `download`, this command is not given
  # ${vars.escriptorium_project_name} — confirm upload.py determines the target
  # project some other way.
  - name: upload
    help: "Upload the images and ALTO to eScriptorium where the transcriptions can be evaluated and corrected."
    script:
      - "python scripts/upload.py ${vars.escriptorium_url} ${env.ESCRIPTORIUM_USERNAME} ${env.ESCRIPTORIUM_PASSWORD} assets/images assets/transcriptions"

  - name: download
    help: "Download the corrected transcriptions from eScriptorium."
    script:
      - "python scripts/download.py ${vars.escriptorium_url} ${vars.escriptorium_project_name} assets/ ${vars.transcription}"
    outputs:
      - assets/transcriptions
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment