AlisterH/joinpdf

## joinpdf
#! /bin/bash
# joinpdf: script for Puppy Linux to recursively find and combine pdf files.
# Required: basename and qpdf.
# Also:     which, realpath, file and gawk (any awk would work)
# Recommended: natsort
# Command line only version.  See gjoinpdf for gui version.
# Version 9 by disciple, 14/12/2022.
# Switched back to processing each file with qpdf in a separate background process before merging - generally faster on a multiprocessor machine, but will be slower with some files than just using pdfunite.
# Sejda may be better, but will generate larger files in some cases, and requires java.
# http://www.murga-linux.com/puppy/viewtopic.php?p=149208#149208
# Currently has NO ERROR HANDLING for the actual join operation.
# You may experience errors if you have pdfs that are corrupted.

# Set defaults
# The default output name carries a timestamp (to the minute) and this shell's
# process ID ($$), so simultaneous instances do not collide on the same name.
printf -v OUTPUTFILE 'combined-%s-PID%s.pdf' "$(date '+%Y%m%d-%H%M')" "$$"

# Choose the sort command, best first: natsort, then "sort -V", then plain sort.
# `msort -l -w -c h` also gives good results but isn't designed for piping
if hash natsort 2>/dev/null
then
 SORT="natsort"
else
 printf '%s\n' \
  "joinpdf: using sort" \
  "for better results install natsort from https://github.com/jjk-jacky/natsort"
 # Probe sort with a one-line dummy input; the exit status tells us whether
 # this sort accepts -V.
 if sort -V <<<1 >/dev/null 2>&1
 then
  SORT="sort -V"
 else
  SORT="sort"
  printf '%s\n' \
   "joinpdf: sort does not support -V" \
   "I hope you were careful with any file numbering"
 fi
fi

# Initialise filecount
# The counter is incremented once per input pdf and used as the name of that
# pdf's temporary file.  Starting at 1000 keeps the names four digits long, so
# a larger start value is needed before the count can pass 9999.
# The maximum number of files per input directory has not been tested; a
# directory containing 960 files performed well on the author's system.
# Note that if there are too many our pipes will break!
FILECOUNT=1000

# print usage instructions if run without input arguments.
# "$#" is the number of positional parameters.  Checking it directly avoids
# evaluating the arguments themselves as a `test` expression, which can succeed
# by accident (for example with the arguments `-a foo`) and then wrongly show
# the usage text even though inputs were given.
# The bare `exit` keeps the exit status of the echo, as before.
# TODO: better usage message.
if [ "$#" -eq 0 ]
then
 echo "
joinpdf: at least one input pdf or directory is required

Usage: joinpdf [input files and/or folders]

Joins pdf inputs and pdfs in any folder inputs (found recursively in any subfolders and sorted in normal alphanumeric order) in the order specified.
Use gjoinpdf for gui version.
"
 exit
fi

# Just combine the pdfs if run with input arguments.

# Create temporary directory
# The template ends in six X's because some mktemp implementations require
# exactly that; it also makes the generated name harder to guess.
# Stop straight away if the directory cannot be created: with an empty
# TEMPFOLDER every later path would point at the filesystem root.
TEMPFOLDER="$(mktemp -dt joinpdf-XXXXXX)" || {
 echo "joinpdf: could not create temporary directory" >&2
 exit 1
}
# Remove the working directory however the script ends (including an early
# exit), so an aborted run does not leave temporary files behind.
trap 'rm -rf -- "$TEMPFOLDER"' EXIT

# Find all files!
# Every regular file under each input is listed here; non-pdfs are weeded out
# by mime type in the next step rather than by file extension.
# Inputs are handled one at a time so that they stay in the order given on the
# command line; only the files found under a single input are sorted.
for i in "$@"
do
 # A leading "-" would be read by find as an option, so anchor such names to
 # the current directory.
 case "$i" in
  -*) i="./$i" ;;
 esac
 # realpath resolves symlinks so the mime check below sees the real file
 # (readlink would be a more portable alternative - would it be slower?)
 # $SORT is left unquoted on purpose: it may hold a command plus an option.
 # With very many files find may run realpath more than once, but all of the
 # output still passes through the single sort on the right of the pipe.
 find -L "$i" -type f -exec realpath {} + | $SORT >> "$TEMPFOLDER"/files0.txt
done

# get input files; detect pdfs by mime-type now
# this is more 'nixy and will pick up pdfs without a file extension, and not files with a .pdf extension that aren't actually pdfs
# unfortunately (in terms of performance) we can't use the file command via find -exec because it doesn't get passed the realpath
# see https://unix.stackexchange.com/questions/79222/how-can-i-efficiently-dereference-all-symlinks-in-find-output-filenames

# make sure we create file so there is no error in the next while
touch "$TEMPFOLDER"/files.txt
while IFS='' read -r line
do
 # -b makes file print only the mime type, so the path never has to be parsed
 # back out of its output and may safely contain tabs.
 if [ "$(file -b --mime-type -- "$line")" = "application/pdf" ]
 then
  printf '%s\n' "$line" >> "$TEMPFOLDER"/files.txt
 fi
done < "$TEMPFOLDER"/files0.txt

# Make sure output file has an extension
# ensure_pdf_extension NAME
#   Prints NAME with one trailing ".pdf" or ".PDF" removed (if present) and
#   ".pdf" appended.  Uses shell pattern matching only, so NAME is never
#   word-split or glob-expanded and no external program is needed.
ensure_pdf_extension() {
 case "$1" in
  *.pdf|*.PDF) printf '%s.pdf\n' "${1%.???}" ;;
  *) printf '%s.pdf\n' "$1" ;;
 esac
}
OUTPUTFILE="$(ensure_pdf_extension "$OUTPUTFILE")"

# Rewrite each input pdf into its own numbered file in the temporary folder,
# ready to be joined.
# Every qpdf runs as a background job so several inputs are processed at once;
# `wait` blocks until all of them have finished.
# A qpdf that fails may leave no output file; that input is then simply missing
# from the join (there is no error handling here).
# IFS='' and -r keep leading/trailing whitespace and backslashes in paths intact.
while IFS='' read -r line
do
 FILECOUNT=$(($FILECOUNT+1))
 # Zero-pad the name to a fixed width so that glob (alphabetical) order is the
 # same as numeric order however large the counter gets.
 printf -v PARTNAME '%010d' "$FILECOUNT"
 (qpdf --empty --remove-page-labels --pages "$line" -- "$TEMPFOLDER/$PARTNAME") &
done < "$TEMPFOLDER"/files.txt
wait

# Remove lists
# so that only the per-input pdfs are left in the temporary folder
rm -f "$TEMPFOLDER"/files0.txt "$TEMPFOLDER"/files.txt

# STATUS becomes the script's exit status: 0 only when qpdf reported success
# for the join, 1 when there was nothing to join, otherwise qpdf's own status.
STATUS=0
if [ -z "$(ls -A "$TEMPFOLDER")" ]
then
 echo "error: no input pdfs found"
 STATUS=1
elif qpdf --empty --stream-data=compress --object-streams=generate --pages "$TEMPFOLDER"/* -- "$OUTPUTFILE"
then
 echo "created: $OUTPUTFILE"
else
 # $? still holds the exit status of the qpdf call tested above
 STATUS=$?
fi

#remove temporary directory
rm -rf "$TEMPFOLDER"
exit "$STATUS"
	#! /bin/bash
	# joinpdf: script for Puppy Linux to recursively find and combine pdf files.
	# Required: basename and qpdf.
	# Also: which, realpath, file and gawk (any awk would work)
	# Recommended: natsort
	# Command line only version. See gjoinpdf for gui version.
	# Version 9 by disciple, 14/12/2022.
	# Switched back to processing each file with qpdf in a separate background process before merging - generally faster on a multiprocessor machine, but will be slower with some files than just using pdfunite.
	# Sejda may be better, but will generate larger files in some cases, and requires java.
	# http://www.murga-linux.com/puppy/viewtopic.php?p=149208#149208
	# Currently has NO ERROR HANDLING for the actual join operation.
	# You may experience errors if you have pdfs that are corrupted.

	# Set defaults
	# The default output name carries a timestamp (to the minute) and this shell's
	# process ID ($$), so simultaneous instances do not collide on the same name.
	printf -v OUTPUTFILE 'combined-%s-PID%s.pdf' "$(date '+%Y%m%d-%H%M')" "$$"

	# Choose the sort command, best first: natsort, then "sort -V", then plain sort.
	# `msort -l -w -c h` also gives good results but isn't designed for piping
	if hash natsort 2>/dev/null
	then
	 SORT="natsort"
	else
	 printf '%s\n' \
	  "joinpdf: using sort" \
	  "for better results install natsort from https://github.com/jjk-jacky/natsort"
	 # Probe sort with a one-line dummy input; the exit status tells us whether
	 # this sort accepts -V.
	 if sort -V <<<1 >/dev/null 2>&1
	 then
	  SORT="sort -V"
	 else
	  SORT="sort"
	  printf '%s\n' \
	   "joinpdf: sort does not support -V" \
	   "I hope you were careful with any file numbering"
	 fi
	fi

	# Initialise filecount
	# The counter is incremented once per input pdf and used as the name of that
	# pdf's temporary file.  Starting at 1000 keeps the names four digits long, so
	# a larger start value is needed before the count can pass 9999.
	# The maximum number of files per input directory has not been tested; a
	# directory containing 960 files performed well on the author's system.
	# Note that if there are too many our pipes will break!
	FILECOUNT=1000

	# print usage instructions if run without input arguments.
	# "$#" is the number of positional parameters.  Checking it directly avoids
	# evaluating the arguments themselves as a `test` expression, which can succeed
	# by accident (for example with the arguments `-a foo`) and then wrongly show
	# the usage text even though inputs were given.
	# The bare `exit` keeps the exit status of the echo, as before.
	# TODO: better usage message.
	if [ "$#" -eq 0 ]
	then
	 echo "
	joinpdf: at least one input pdf or directory is required

	Usage: joinpdf [input files and/or folders]

	Joins pdf inputs and pdfs in any folder inputs (found recursively in any subfolders and sorted in normal alphanumeric order) in the order specified.
	Use gjoinpdf for gui version.
	"
	 exit
	fi

	# Just combine the pdfs if run with input arguments.

	# Create temporary directory
	# The template ends in six X's because some mktemp implementations require
	# exactly that; it also makes the generated name harder to guess.
	# Stop straight away if the directory cannot be created: with an empty
	# TEMPFOLDER every later path would point at the filesystem root.
	TEMPFOLDER="$(mktemp -dt joinpdf-XXXXXX)" || {
	 echo "joinpdf: could not create temporary directory" >&2
	 exit 1
	}
	# Remove the working directory however the script ends (including an early
	# exit), so an aborted run does not leave temporary files behind.
	trap 'rm -rf -- "$TEMPFOLDER"' EXIT

	# Find all files!
	# Every regular file under each input is listed here; non-pdfs are weeded out
	# by mime type in the next step rather than by file extension.
	# Inputs are handled one at a time so that they stay in the order given on the
	# command line; only the files found under a single input are sorted.
	for i in "$@"
	do
	 # A leading "-" would be read by find as an option, so anchor such names to
	 # the current directory.
	 case "$i" in
	  -*) i="./$i" ;;
	 esac
	 # realpath resolves symlinks so the mime check below sees the real file
	 # (readlink would be a more portable alternative - would it be slower?)
	 # $SORT is left unquoted on purpose: it may hold a command plus an option.
	 # The pipe must be a real, unescaped "|"; an escaped one is handed to find
	 # as a literal argument and nothing gets sorted.
	 find -L "$i" -type f -exec realpath {} + | $SORT >> "$TEMPFOLDER"/files0.txt
	done

	# get input files; detect pdfs by mime-type now
	# this is more 'nixy and will pick up pdfs without a file extension, and not files with a .pdf extension that aren't actually pdfs
	# unfortunately (in terms of performance) we can't use the file command via find -exec because it doesn't get passed the realpath
	# see https://unix.stackexchange.com/questions/79222/how-can-i-efficiently-dereference-all-symlinks-in-find-output-filenames

	# make sure we create file so there is no error in the next while
	touch "$TEMPFOLDER"/files.txt
	while IFS='' read -r line
	do
	 # -b makes file print only the mime type, so the path never has to be parsed
	 # back out of its output and may safely contain tabs.
	 if [ "$(file -b --mime-type -- "$line")" = "application/pdf" ]
	 then
	  printf '%s\n' "$line" >> "$TEMPFOLDER"/files.txt
	 fi
	done < "$TEMPFOLDER"/files0.txt

	# Make sure output file has an extension
	# ensure_pdf_extension NAME
	#   Prints NAME with one trailing ".pdf" or ".PDF" removed (if present) and
	#   ".pdf" appended.  Uses shell pattern matching only, so NAME is never
	#   word-split or glob-expanded and no external program is needed.
	ensure_pdf_extension() {
	 case "$1" in
	  *.pdf|*.PDF) printf '%s.pdf\n' "${1%.???}" ;;
	  *) printf '%s.pdf\n' "$1" ;;
	 esac
	}
	OUTPUTFILE="$(ensure_pdf_extension "$OUTPUTFILE")"

	# Rewrite each input pdf into its own numbered file in the temporary folder,
	# ready to be joined.
	# Every qpdf runs as a background job so several inputs are processed at once;
	# `wait` blocks until all of them have finished.
	# A qpdf that fails may leave no output file; that input is then simply missing
	# from the join (there is no error handling here).
	# IFS='' and -r keep leading/trailing whitespace and backslashes in paths intact.
	while IFS='' read -r line
	do
	 FILECOUNT=$(($FILECOUNT+1))
	 # Zero-pad the name to a fixed width so that glob (alphabetical) order is the
	 # same as numeric order however large the counter gets.
	 printf -v PARTNAME '%010d' "$FILECOUNT"
	 (qpdf --empty --remove-page-labels --pages "$line" -- "$TEMPFOLDER/$PARTNAME") &
	done < "$TEMPFOLDER"/files.txt
	wait

	# Remove lists
	# so that only the per-input pdfs are left in the temporary folder
	rm -f "$TEMPFOLDER"/files0.txt "$TEMPFOLDER"/files.txt

	# STATUS becomes the script's exit status: 0 only when qpdf reported success
	# for the join, 1 when there was nothing to join, otherwise qpdf's own status.
	STATUS=0
	if [ -z "$(ls -A "$TEMPFOLDER")" ]
	then
	 echo "error: no input pdfs found"
	 STATUS=1
	elif qpdf --empty --stream-data=compress --object-streams=generate --pages "$TEMPFOLDER"/* -- "$OUTPUTFILE"
	then
	 echo "created: $OUTPUTFILE"
	else
	 # $? still holds the exit status of the qpdf call tested above
	 STATUS=$?
	fi

	#remove temporary directory
	rm -rf "$TEMPFOLDER"
	exit "$STATUS"