apjanco/gist:aa9368f019acffdc39f1a6cfa8a98060

## gistfile1.yml
# Human-readable project title.
title: "Demo Project Workflow. From images to research data in Obsidian"
# Literal block scalar (`|`): every line break below is kept, so each bullet
# stays on its own line. A folded scalar (`>`) would join the equally
# indented bullet lines into a single paragraph.
description: |
  This project offers a workflow to process historical documents from the Circuit Court of Istmina, Chocó, Colombia.
  https://eap.bl.uk/project/EAP1477

  In this project, we will:
  - Fetch the IIIF Images and metadata from the British Library
  - Segment the images with Kraken
  - Transcribe the images using Google Vision
  - Upload the images to eScriptorium where the transcriptions can be corrected
  - Download the transcriptions from eScriptorium
  - Use named entity recognition and entity linking to identify people, places, dates, events and organizations
  - Use LLMs to generate case summaries and collection summaries
  - Assess the quality of extracted metadata
  - Use the results to create a package of text and media that can be loaded into Obsidian for research.

# Project-wide variables; commands reference them as ${vars.<key>}.
vars:
  name: 'EAP1477/1: Archivo Histórico del Juzgado del Circuito de Istmina (1860-1930)'
  # Passed to the transcribe command.
  language: 'es'
  # Passed to the segment command.
  # Options: horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
  text_direction: 'horizontal-lr'
  version: '0.0.0'
  # Passed to the fetch_iiif command.
  iiif_collections: 'collections.txt'
  # Passed to the upload and download commands.
  escriptorium_url: 'https://escriptorium.pennds.org'
  # Passed to the download command.
  escriptorium_project_name: 'EAP1477/1'
  # Passed to the download command.
  transcription: 'vision'

# Names that commands reference as ${env.<key>}; each key is paired with an
# identically named value.
# NOTE(review): presumably each value names the OS environment variable that
# supplies the secret at run time — confirm against the project runner's docs.
env:
  ESCRIPTORIUM_USERNAME: 'ESCRIPTORIUM_USERNAME'
  ESCRIPTORIUM_PASSWORD: 'ESCRIPTORIUM_PASSWORD'
  GOOGLE_VISION_KEY: 'GOOGLE_VISION_KEY'
# Directories that belong to the project layout.
# NOTE(review): presumably created by the project runner when missing — confirm.
directories:
  - 'assets'
  - 'configs'
  - 'scripts'
  - 'pipelines'
  - 'packages'

# Named workflows; each one lists command names from `commands`, in order.
workflows:
  # Fetch the images, segment them, then transcribe them.
  prepare:
    - fetch_iiif
    - segment
    - transcribe
  # Upload to eScriptorium, then download the transcriptions again.
  evaluate:
    - upload
    - download
  # Download step only.
  process:
    - download
  # Disabled workflow, kept for reference:
  # publish:
  #   - obsidian
  #   - huggingface-datasets
  #   - eleventy-site


# Command definitions; the workflows refer to these by `name`.
commands:
  # Runs scripts/fetch_iiif.py with the collections file and the assets/
  # directory as arguments.
  - name: fetch_iiif
    help: 'Reads the collections file, loads the manifests and downloads images and metadata.'
    script:
      - 'python scripts/fetch_iiif.py ${vars.iiif_collections} assets/'
    outputs:
      - 'assets/'

  # Runs scripts/segment.py on assets/ with the configured text direction.
  - name: segment
    help: 'Segment the images with Kraken.'
    script:
      - 'python scripts/segment.py assets/ ${vars.text_direction}'
    outputs:
      - 'assets/'

  # Runs scripts/transcribe.py on assets/ with the Google Vision key and the
  # configured language.
  # NOTE(review): the API key is interpolated into the command line, so it may
  # be visible in process listings or runner logs — consider having the script
  # read GOOGLE_VISION_KEY from the environment instead.
  - name: transcribe
    help: "Transcribe the images using Google Vision."
    script:
      - "python scripts/transcribe.py assets/ ${env.GOOGLE_VISION_KEY} ${vars.language}"
    outputs:
      - assets/

  # Runs scripts/upload.py with the eScriptorium URL, the credentials and the
  # image and transcription directories. Declares no outputs.
  # NOTE(review): the username and password are interpolated into the command
  # line and may be visible in process listings or runner logs — confirm this
  # is acceptable.
  - name: upload
    help: 'Upload the images and ALTO to eScriptorium where the transcriptions can be evaluated and corrected.'
    script:
      # Folded scalar (`>-`): the lines below are joined with single spaces
      # into one command string, with no trailing newline.
      - >-
        python scripts/upload.py ${vars.escriptorium_url}
        ${env.ESCRIPTORIUM_USERNAME} ${env.ESCRIPTORIUM_PASSWORD}
        assets/images assets/transcriptions

  # Runs scripts/download.py with the eScriptorium URL, the project name, the
  # assets/ directory and the configured transcription name.
  - name: download
    help: 'Download the corrected transcriptions from eScriptorium.'
    script:
      # Folded scalar (`>-`): the lines below are joined with single spaces
      # into one command string, with no trailing newline.
      - >-
        python scripts/download.py ${vars.escriptorium_url}
        ${vars.escriptorium_project_name} assets/
        ${vars.transcription}
    outputs:
      - 'assets/transcriptions'
	title: "Demo Project Workflow. From images to research data in Obsidian"
	description: >
	This project offers a workflow to process historical documents from the Circuit Court of Istmina, Chocó, Colombia.
	https://eap.bl.uk/project/EAP1477

	In this project, we will:
	- Fetch the IIIF Images and metadata from the British Library
	- Segment the images with Kraken
	- Transcribe the images using Google Vision
	- Upload the images to eScriptorium where the transcriptions can be corrected
	- Download the transcriptions from eScriptorium
	- Use named entity recognition and entity linking to identify people, places, dates, events and organizations
	- Use LLMs to generate case summaries and collection summaries
	- Assesses the quality of extracted metadata
	- Use the results to create a package of text and media that can be loaded into Obsidian for research.

	vars:
	name: "EAP1477/1: Archivo Histórico del Juzgado del Circuito de Istmina (1860-1930)"
	language: "es"
	text_direction: 'horizontal-lr' #['horizontal-lr', 'horizontal-rl', 'vertical-lr', 'vertical-rl']
	version: "0.0.0"
	iiif_collections: "collections.txt"
	escriptorium_url: "https://escriptorium.pennds.org"
	escriptorium_project_name: "EAP1477/1"
	transcription: 'vision'

	env:
	ESCRIPTORIUM_USERNAME: ESCRIPTORIUM_USERNAME
	ESCRIPTORIUM_PASSWORD: ESCRIPTORIUM_PASSWORD
	GOOGLE_VISION_KEY: GOOGLE_VISION_KEY

	directories: ["assets", "configs", "scripts", "pipelines", "packages"]

	workflows:
	prepare:
	- fetch_iiif
	- segment
	- transcribe
	evaluate:
	- upload
	- download
	process:
	- download
	#publish:
	#- obsidian
	#- huggingface-datasets
	#- eleventy-site


	commands:
	- name: fetch_iiif
	help: "Reads the collections file, loads the manifests and downloads images and metadata."
	script:
	- "python scripts/fetch_iiif.py ${vars.iiif_collections} assets/"
	outputs:
	- assets/

	- name: segment
	help: "Segment the images with Kraken."
	script:
	- "python scripts/segment.py assets/ ${vars.text_direction}"
	outputs:
	- assets/

	- name: transcribe
	help: "Transcribe the images using Google Vision ."
	script:
	- "python scripts/transcribe.py assets/ ${env.GOOGLE_VISION_KEY} ${vars.language}"
	outputs:
	- assets/

	- name: upload
	help: "Upload the images and ALTO to eScriptorium where the transcriptions can be evaluated and corrected."
	script:
	- "python scripts/upload.py ${vars.escriptorium_url} ${env.ESCRIPTORIUM_USERNAME} ${env.ESCRIPTORIUM_PASSWORD} assets/images assets/transcriptions"

	- name: download
	help: "Download the corrected transcriptions from eScriptorium."
	script:
	- "python scripts/download.py ${vars.escriptorium_url} ${vars.escriptorium_project_name} assets/ ${vars.transcription}"
	outputs:
	- assets/transcriptions