asehmi/nougat_em.sh

## nougat_em.sh
#!/bin/bash

# pip install nougat-ocr
# see https://github.com/facebookresearch/nougat for details and license

# Batch size handed to nougat when --batchsize is not supplied on the command line.
DEFAULT_BATCHSIZE=4

#######################################
# Print the command-line synopsis and terminate the script.
# Arguments: none (the program name is taken from $0).
# Outputs:   the usage line on stderr, so it never mixes with captured stdout.
# Exits:     always, with status 1.
#######################################
usage() {
    printf '%s\n' "Usage: $0 <path_to_directory> [--batchsize BATCHSIZE]" >&2
    exit 1
}

#######################################
# Parse the command line into the globals DIR_PATH and BATCHSIZE.
# Arguments: the script's positional parameters ("$@"):
#              <path_to_directory>     exactly one directory (required)
#              --batchsize BATCHSIZE   optional positive integer
#              -h | --help             print usage
# Globals:   DIR_PATH (written), BATCHSIZE (written; a value already set by
#            the caller is kept as the fallback when --batchsize is absent),
#            DEFAULT_BATCHSIZE (read).
# Exits:     through usage (status 1) on missing or malformed arguments.
#######################################
parse_args() {
    # Check minimum arguments
    [[ "$#" -ge 1 ]] || usage

    # Start clean: a DIR_PATH inherited from the environment must not be
    # mistaken for a directory that was already given on the command line.
    DIR_PATH=""

    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            -h|--help)
                usage
                ;;
            --batchsize)
                # The flag needs a value; a trailing --batchsize used to be
                # silently ignored.
                if [[ "$#" -lt 2 ]]; then
                    echo "Error: --batchsize requires a value." >&2
                    usage
                fi
                BATCHSIZE="$2"
                shift
                ;;
            *)
                # Only one directory is accepted; a second positional is an error.
                if [[ -n "$DIR_PATH" ]]; then
                    usage
                fi
                DIR_PATH="$1"
                ;;
        esac
        shift
    done

    BATCHSIZE="${BATCHSIZE:-$DEFAULT_BATCHSIZE}"

    # Fail fast on a bad batch size instead of letting every nougat call fail.
    # 10# forces base 10 so a value such as "08" is not read as octal.
    if ! [[ "$BATCHSIZE" =~ ^[0-9]+$ ]] || (( 10#$BATCHSIZE < 1 )); then
        echo "Error: batch size must be a positive integer, got '$BATCHSIZE'." >&2
        usage
    fi
}

parse_args "$@"

# Check if the given path is a directory
if [[ ! -d "$DIR_PATH" ]]; then
    echo "Error: $DIR_PATH is not a directory." >&2
    exit 1
fi

# Results are written to a "nougat-out" directory next to the input directory.
PARENT_DIR="$(dirname -- "$DIR_PATH")"
OUTPUT_DIR="${PARENT_DIR}/nougat-out"

# Without an output directory every nougat call would fail, so stop here.
if ! mkdir -p -- "$OUTPUT_DIR"; then
    echo "Error: could not create output directory $OUTPUT_DIR." >&2
    exit 1
fi

pdf_total=0
pdf_failed=0

# Process PDFs
# The NUL-delimited file list is read on descriptor 3 rather than through a
# pipe on stdin: the loop then runs in the current shell (so the counters
# survive it) and a child command cannot consume the list by reading stdin.
while IFS= read -r -d '' -u 3 pdf; do
    pdf_total=$((pdf_total + 1))
    echo "Processing: $pdf"
    if ! nougat --out "$OUTPUT_DIR" "$pdf" --markdown --batchsize "$BATCHSIZE"; then
        # A failed document is reported and skipped; the run continues.
        echo "Failed to process $pdf" >&2
        pdf_failed=$((pdf_failed + 1))
    fi

    # create an archive of outputs thus far (relevant for google colab - this is optional)
    # Best effort: the archive is rebuilt in the current working directory after
    # every document, and a zip failure does not stop the run.
    zip -r -9 -q latest_textFromPDF_nougat_mainRT.zip "$OUTPUT_DIR"

done 3< <(find "$DIR_PATH" -type f -iname "*.pdf" -print0)

echo "All PDFs processed and saved to $OUTPUT_DIR"

# Surface partial failures; the exit status stays 0 as before.
if (( pdf_failed > 0 )); then
    echo "Warning: $pdf_failed of $pdf_total PDFs failed to process." >&2
fi
	#!/bin/bash

	# pip install nougat-ocr
	# see https://github.com/facebookresearch/nougat for details and license

	# Batch size used when --batchsize is not given on the command line.
	DEFAULT_BATCHSIZE=4

	# Print the command-line synopsis and terminate with status 1.
	usage() {
	    echo "Usage: $0 <path_to_directory> [--batchsize BATCHSIZE]"
	    exit 1
	}

	# Check minimum arguments
	[[ "$#" -lt 1 ]] && usage

	# Walk the arguments: --batchsize consumes the following word, and the
	# first other word becomes DIR_PATH (a further one triggers usage).
	while [[ "$#" -gt 0 ]]; do
	    case $1 in
	        --batchsize) BATCHSIZE="$2"; shift ;;
	        *)
	            [[ -z "$DIR_PATH" ]] && DIR_PATH="$1" || usage
	            ;;
	    esac
	    shift
	done

	# Fall back to the default when no batch size was supplied.
	BATCHSIZE="${BATCHSIZE:-$DEFAULT_BATCHSIZE}"

	# Check if the given path is a directory
	[[ ! -d "$DIR_PATH" ]] && echo "Error: $DIR_PATH is not a directory." && exit 1

	# Results go to a "nougat-out" directory next to the input directory.
	PARENT_DIR="$(dirname "$DIR_PATH")"
	OUTPUT_DIR="${PARENT_DIR}/nougat-out"

	[[ ! -d "$OUTPUT_DIR" ]] && mkdir -p "$OUTPUT_DIR"

	# Process PDFs
	# find emits NUL-terminated paths so names with spaces or newlines survive.
	find "$DIR_PATH" -type f -iname "*.pdf" -print0 | while IFS= read -r -d '' pdf; do
	    echo "Processing: $pdf"
	    # A failed document is reported and the loop moves on to the next one.
	    nougat --out "$OUTPUT_DIR" "$pdf" --markdown --batchsize "$BATCHSIZE" || echo "Failed to process $pdf"

	    # create an archive of outputs thus far (relevant for google colab - this is optional)
	    zip -r -9 -q latest_textFromPDF_nougat_mainRT.zip "$OUTPUT_DIR"

	done

	echo "All PDFs processed and saved to $OUTPUT_DIR"