Skip to content

Instantly share code, notes, and snippets.

@alecjacobson
Last active March 16, 2022 21:16
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alecjacobson/69402c750787efc56b3a2a4cac590a82 to your computer and use it in GitHub Desktop.
Save alecjacobson/69402c750787efc56b3a2a4cac590a82 to your computer and use it in GitHub Desktop.
This script attempts to create a copy of the input TeX directory whose total size is below arXiv's 10,000 KB (i.e., 10 MB) limit.
#!/bin/bash
# Called without an input directory: show the help text and stop successfully.
if [[ -z "$1" ]]; then
  # Quoted delimiter: the text is emitted literally, with no expansion. The
  # blank line before the terminator is part of the output.
  cat <<'USAGE_TEXT'
USAGE:
procrusteanarxiv path/to/input/dir/containing/tex/files/
This script tested with dependencies:
gs Ghostscript version 9.27 (9.21 is buggy)
latexmk version 4.52c
rsync version 2.6.9
imagemagick version 7.0.8-12
This script will attempt to create a copy of the input tex directory whose total
size is less than the 10000 KBs (i.e., 10MBs) limit of ArXiv. It first tries:
1. deleting all unnecessary files.
If this doesn't work, it tries to
2. repack all .pdf's using high-quality jpeg compression without downsampling.
If that is still not enough, it will
3. run a binary search on the downsampling resolution to apply to all .pdf's.

USAGE_TEXT
  exit 0
fi
# gs version 9.21 has a bug that makes downsampled images ugly. Appears to be
# fixed by at least 9.27
#
# Pick the Ghostscript executable, in order of preference:
#   1. the GS environment variable, when set and non-empty;
#   2. the original author's local 9.27 build, when it exists on this machine;
#   3. whatever `gs` is found on the PATH.
if [[ -n "${GS:-}" ]]; then
  MY_GS="${GS}"
elif [[ -x "/Users/ajx/Downloads/ghostscript-9.27/bin/gs" ]]; then
  MY_GS="/Users/ajx/Downloads/ghostscript-9.27/bin/gs"
else
  MY_GS="gs"
fi
# Print the total size, in bytes, of every non-directory entry below a
# directory.
# Arguments: $1 - directory to measure
# Outputs:   one integer on stdout (0 when the directory holds no files)
get_dir_size_in_b()
{
  local -a size_cmd
  # GNU stat spells "size in bytes" as -c '%s'; BSD/macOS stat uses -f '%z'.
  # BSD stat rejects -c, so probing with it tells the two flavours apart.
  if stat -c '%s' / >/dev/null 2>&1; then
    size_cmd=(stat -c '%s')
  else
    size_cmd=(stat -f '%z')
  fi
  # `-exec ... {} +` never runs stat with an empty file list, and "%.0f"
  # keeps awk from switching to exponent notation on very large totals (and
  # prints 0 rather than an empty string when nothing was summed).
  find "$1" ! -type d -exec "${size_cmd[@]}" {} + |
    awk '{sum += $1} END{printf "%.0f\n", sum}'
}
# Print the size of a directory in KB, i.e. the byte total reported by
# get_dir_size_in_b divided by 1000 and rounded down.
# Arguments: $1 - directory to measure
# Outputs:   one integer on stdout
get_dir_size_in_kb()
{
  # Earlier du-based measurement, kept for reference:
  #du -k -d0 "$1" | awk '{print $1;}'
  local bytes
  bytes=$(get_dir_size_in_b "$1")
  # An empty byte count is reported as 0 instead of making the division fail.
  echo $(( ${bytes:-0} / 1000 ))
}
# Report the disk usage of one path in kilobytes, as counted by `du -k`.
# Arguments: $1 - path to measure
# Outputs:   the first tab-separated column of du's output
get_file_size_in_kb()
{
  local target="$1"
  du -k "$target" | awk -F '\t' '{ print $1 }'
}
# Rewrite a PDF in place through Ghostscript's pdfwrite device using one of
# its named -dPDFSETTINGS presets.
# Globals:   MY_GS (read) - Ghostscript executable
# Arguments: $1 - PDF to rewrite in place
#            $2 - preset name without the leading slash (e.g. default)
# Returns:   0 on success; non-zero, leaving $1 untouched, when the scratch
#            directory cannot be created, Ghostscript fails, or it produces
#            an empty file
gs_preset()
{
  local gs_file="$1"
  local preset="$2"
  local scratch_dir gstmp
  local status=1
  # A private scratch directory instead of one fixed path: concurrent runs
  # cannot collide, and a stale file from an earlier run is never picked up.
  scratch_dir=$(mktemp -d "${TMPDIR:-/tmp}/gstmp.XXXXXX") || return 1
  gstmp="$scratch_dir/gstmp.pdf"
  # Only replace the input once Ghostscript has actually produced a result.
  if "$MY_GS" -sOutputFile="$gstmp" \
      -dQUIET \
      -dNOPAUSE -dBATCH \
      -sDEVICE=pdfwrite \
      "-dPDFSETTINGS=/$preset" \
      -f "$gs_file" \
    && [[ -s "$gstmp" ]] \
    && mv "$gstmp" "$gs_file"; then
    status=0
  fi
  rm -rf -- "$scratch_dir"
  return "$status"
}
# Re-distill a PDF with Ghostscript's pdfwrite device using an explicit set of
# distiller parameters, downsampling and JPEG-encoding colour images.
# Globals:   MY_GS (read; expanded unquoted, so it is subject to word
#            splitting); res and qfactor (written: they are not declared
#            local)
# Arguments: $1 - input PDF (handed to gs via -f)
#            $2 - output PDF path (-sOutputFile)
#            $3 - value substituted for /ColorImageResolution
#            $4 - value substituted for /QFactor in /ColorACSImageDict
# Returns:   the exit status of the gs invocation (the function's last command)
gs_compress()
{
# Only colour images use the caller-supplied resolution; the gray and mono
# resolutions are fixed below at 300 and 1200.
res="$3"
qfactor="$4"
# The parameters are passed as two -c PostScript fragments, each ending in
# setdistillerparams. Both are double-quoted shell strings, so $res and
# $qfactor are expanded by the shell and every backslash-newline is removed
# before gs sees the text.
# NOTE(review): .setpdfwrite may not be available in Ghostscript releases
# newer than the 9.27 this script was tested with -- confirm before upgrading.
$MY_GS -sOutputFile="$2" \
-dQUIET \
-dNOPAUSE -dBATCH \
-sDEVICE=pdfwrite \
-c ".setpdfwrite << \
/AlwaysEmbed [] \
/AntiAliasColorImages //false \
/AntiAliasGrayImages //false \
/AntiAliasMonoImages //false \
/ASCII85EncodePages //false \
/AutoFilterColorImages //true \
/AutoFilterGrayImages //true \
/AutoPositionEPSFiles //true \
/Binding /Left \
/CalCMYKProfile (None) \
/CalGrayProfile (None) \
/CalRGBProfile (None) \
/ColorImageDepth -1 \
/ColorImageDict .defaultImageDict \
/ColorImageDownsampleThreshold 1.5 \
/ColorImageFilter /DCTEncode \
/CompressPages //true \
/ConvertImagesToIndexed //true \
/DefaultRenderingIntent /Default \
/DetectBlends //true \
/DownsampleColorImages //true \
/DownsampleGrayImages //true \
/DownsampleMonoImages //true \
/EmitDSCWarnings //false \
/EncodeColorImages //true \
/EncodeGrayImages //true \
/EncodeMonoImages //true \
/EndPage -1 \
/GrayImageDepth -1 \
/GrayImageDict .defaultImageDict \
/GrayImageDownsampleThreshold 1.5 \
/GrayImageFilter /DCTEncode \
/ImageMemory 524288 \
/LockDistillerParams //false \
/MaxSubsetPct 100 \
>> setdistillerparams" \
-c ".setpdfwrite << \
/MonoImageDepth -1 \
/MonoImageDict mark \
/K -1 \
.dicttomark readonly \
/MonoImageDownsampleThreshold 1.5 \
/MonoImageFilter /CCITTFaxEncode \
/OffOptimizations 0 \
/OPM 1 \
/Optimize //true \
/ParseDSCComments //true \
/ParseDSCCommentsForDocInfo //true \
/PDFXTrimBoxToMediaBoxOffset [0 0 0 0] \
/PDFXSetBleedBoxToMediaBox //true \
/PDFXBleedBoxToTrimBoxOffset [0 0 0 0] \
/PreserveCopyPage //true \
/PreserveHalftoneInfo //false \
/sRGBProfile (None) \
/StartPage 1 \
/SubsetFonts //true \
/TransferFunctionInfo /Preserve \
/UseFlateCompression //true \
/UsePrologue //false \
/PassThroughJPEGImages //true \
/AutoRotatePages /None \
/CannotEmbedFontPolicy /Error \
/ColorACSImageDict << /QFactor $qfactor /Blend 1 /ColorTransform 1 /HSamples [2 1 1 2] /VSamples [2 1 1 2] >> \
/ColorConversionStrategy /LeaveColorUnchanged \
/ColorImageDownsampleType /Subsample \
/ColorImageResolution $res \
/CompatibilityLevel 1.7 \
/CreateJobTicket //true \
/DoThumbnails //true \
/EmbedAllFonts //true \
/GrayACSImageDict .prepressACSImageDict \
/GrayImageDownsampleType /Bicubic \
/GrayImageResolution 300 \
/MonoImageDownsampleType /Subsample \
/MonoImageResolution 1200 \
/NeverEmbed [] \
/PreserveEPSInfo //true \
/PreserveOPIComments //true \
/PreserveOverprintSettings //true \
/UCRandBGInfo /Preserve \
>> setdistillerparams" \
-f "$1"
}
# https://stackoverflow.com/a/10453202/148668
#
# Run gs_compress on a PDF and replace the input with the result.
# Usage:     gs_compress_inplace input.pdf res qfactor
# Example:   gs_compress_inplace input.pdf 72 0.01
# Arguments: $1 - PDF to rewrite in place
#            $2 - colour image resolution forwarded to gs_compress
#            $3 - JPEG QFactor forwarded to gs_compress
# Returns:   0 on success; non-zero, leaving $1 untouched, when the scratch
#            directory cannot be created, gs_compress fails, or it produces
#            an empty file
gs_compress_inplace()
{
  local scratch_dir gstmp
  local status=1
  # A private scratch directory instead of one fixed path: concurrent runs
  # cannot collide, and a stale file from an earlier run is never picked up.
  scratch_dir=$(mktemp -d "${TMPDIR:-/tmp}/gstmp.XXXXXX") || return 1
  gstmp="$scratch_dir/gstmp.pdf"
  # Only replace the input once a compressed file has actually been written.
  if gs_compress "$1" "$gstmp" "$2" "$3" \
    && [[ -s "$gstmp" ]] \
    && mv "$gstmp" "$1"; then
    status=0
  fi
  rm -rf -- "$scratch_dir"
  return "$status"
}
# Convert one image with ImageMagick's `convert`: flatten it onto a white
# background, switch the alpha channel off, resize with the geometry
# "2048x2048>" and write the result at quality 90.
# Arguments: $1 - source image
#            $2 - destination path
# Outputs:   a " [debug]" line naming both paths on stdout
png2jpg()
{
  local src="$1"
  local dst="$2"
  # Option order is significant to convert, so the list is kept as-is.
  local -a convert_opts=(
    -background white
    -flatten
    -alpha off
    -resize '2048x2048>'
    -quality 90
  )
  printf '%s\n' " [debug] $src $dst"
  convert "$src" "${convert_opts[@]}" "$dst"
}
# Run png2jpg on an image and replace the input with the result, keeping the
# original file name.
# Arguments: $1 - image to convert in place
# Returns:   0 on success; non-zero, leaving $1 untouched, when the scratch
#            directory cannot be created, png2jpg fails, or it produces an
#            empty file
png2jpg_inplace()
{
  local scratch_dir png2jpgtmp
  local status=1
  # A private scratch directory instead of one fixed path: concurrent runs
  # cannot collide, and a stale file from an earlier run is never picked up.
  scratch_dir=$(mktemp -d "${TMPDIR:-/tmp}/png2jpgtmp.XXXXXX") || return 1
  # The scratch file keeps the .jpg name used by the original fixed path.
  png2jpgtmp="$scratch_dir/png2jpgtmp.jpg"
  # Only replace the input once a converted file has actually been written.
  if png2jpg "$1" "$png2jpgtmp" \
    && [[ -s "$png2jpgtmp" ]] \
    && mv "$png2jpgtmp" "$1"; then
    status=0
  fi
  rm -rf -- "$scratch_dir"
  return "$status"
}
# Run png2jpg_inplace on every *.png file below the current directory, then
# on every *.jpg file below it.
png2jpg_all()
{
  local pattern image
  # The .png files are rewritten in place and keep their names (latex doesn't
  # seem to care about the extension, and this avoids editing the .tex files
  # to point at renamed images). The .jpg files get the same treatment.
  for pattern in "*.png" "*.jpg"; do
    find . -type f -name "$pattern" -print0 |
      while IFS= read -r -d '' image; do
        png2jpg_inplace "$image"
      done
  done
}
# Binary search for the largest integer i in [min, max] for which
# `command j` succeeds (exits with a zero status), where j = 10*i. The result
# is only meaningful when success is monotone: once `command` fails for some
# i it is expected to fail for every larger i as well.
# Arguments: $1 - lower bound of i (returned unchanged if nothing succeeds)
#            $2 - upper bound of i
#            $3 - command to probe; it is called with 10*i as last argument
# Outputs:   the found i on stdout; the probed command's own stdout is
#            redirected to stderr so it cannot pollute the result
dichotomic_search_ten()
{
  local lo="$1"
  local hi="$2"
  local probe="$3"
  local mid
  while (( lo < hi )); do
    # Midpoint rounded up, so the loop always makes progress when lo is kept.
    mid=$(( (lo + hi + 1) / 2 ))
    # $probe is deliberately unquoted: it may carry leading arguments.
    # shellcheck disable=SC2086
    if $probe "$(( mid * 10 ))" 1>&2; then
      lo=$mid
    else
      hi=$(( mid - 1 ))
    fi
  done
  echo "$lo"
}
# Input directory, normalised to end in exactly one added "/": rsync copies
# the *contents* of a source directory only when its path ends in a slash;
# without it the directory itself would be nested inside the copy and no .tex
# file would be found below.
input_dir="${1:?missing input directory}"
input_dir="${input_dir%/}/"
if [[ ! -d "$input_dir" ]]; then
  echo "not a directory: $input_dir" >&2
  exit 1
fi
input_base="$(basename "$input_dir")"
# Working directories (removed at the end) and the final output directory,
# all created relative to the current directory.
stripped_dir="procrustean-arxiv-stripped"
compress="procrustean-arxiv-compress"
under="$input_base-procrustean-arxiv"
##############################################################################
# Create a local copy
##############################################################################
if ! rsync -r --exclude=.git "$input_dir" "$stripped_dir" --delete; then
  echo "could not copy $input_dir to $stripped_dir" >&2
  exit 1
fi
size=$(get_dir_size_in_kb "$stripped_dir" )
echo "Original size: $size KBs"
cd "$stripped_dir" || exit 1
# determine main tex file: the first top-level .tex file with a \documentclass
# that is not preceded by a "%" on its line (head guards against several
# files matching, which would otherwise yield a multi-line name)
main_tex=$(grep -l -m 1 '^[^\%]*\\documentclass' -- *.tex | head -n 1)
if [ -z "$main_tex" ]; then
  echo "could not find main file." >&2
  exit 1
fi
##############################################################################
# Use latexmk to build the document as a pdf and extract file list
##############################################################################
# determine basename (e.g., of .pdf or .fls)
main_base="${main_tex%.*}"
# Generating a latexmk file (this overwrites any latexmkrc in the copy)
echo "\$pdf_mode = 1;
\$dvi_mode = \$postscript_mode = 0;
\$pdflatex = 'pdflatex --shell-escape -synctex=1 %O %S';
@default_files = ( '$main_tex' );" > latexmkrc
if ! latexmk -silent 2>/dev/null ; then
  echo "latexmk failed" >&2
  exit 1
fi
# Delete every file that is not mentioned in the .fls file list written by the
# build. Runs in the working copy only; the input directory is untouched.
find . -type f -print0 | while IFS= read -r -d '' file; do
  # strip "./" off the front so the name can be looked up in the .fls
  file=${file:2}
  case "$file" in
    "$main_base.fls" | latexmkrc)
      # keep: the file list itself is still needed for the lookups below
      ;;
    "$main_base.pdf")
      # just immediately remove the main pdf (it doesn't actually matter)
      rm -- "$file"
      ;;
    *)
      # Case-insensitive fixed-string lookup. "--" is required because, with
      # "./" stripped, a name starting with "-" would be read as an option.
      if ! grep -iqF -- "$file" "$main_base.fls"; then
        rm -- "$file"
      fi
      ;;
  esac
done
# clean up latexmk junk
latexmk -C -silent 2>/dev/null
# pop up
cd ../ || exit 1
size=$(get_dir_size_in_kb "$stripped_dir" )
echo "Removing unused files: $size KBs"
# compress_and_measure RES
# Rebuild the compress directory from the stripped copy, recompress every
# image, downsample every .pdf to RES dpi, and measure the result.
# Globals:   stripped_dir, compress, under (read)
# Arguments: $1 - colour image resolution handed to gs_compress_inplace
# Returns:   0 (after copying the result to "$under") when the size is below
#            10000 KBs, 1 otherwise
# NOTE(review): this uses -lt while the two earlier stages accept sizes with
# -le; confirm which comparison matches the actual limit.
compress_and_measure()
{
  local res="$1"
  local size
  echo " [debug] res=$1"
  rsync -r --exclude=.git "$stripped_dir/" "$compress" --delete
  cd "$compress" || return 1
  png2jpg_all
  # compress and downsample each .pdf file
  find . -type f -name "*.pdf" -print0 | while IFS= read -r -d '' file; do
    echo " [debug] $file ..."
    gs_compress_inplace "$file" "$res" 0.1
  done
  cd .. || return 1
  size=$(get_dir_size_in_kb "$compress" )
  echo " [debug] size=$size"
  if [ "$size" -lt "10000" ]; then
    rsync -r --exclude=.git "$compress/" "$under" --delete
    return 0
  else
    return 1
  fi
}
if [ "$size" -le "10000" ]; then
  # Stage 1 was enough: the stripped copy already fits.
  rsync -r --exclude=.git "$stripped_dir/" "$under" --delete
else
  ############################################################################
  # Try simply converting to jpeg without downsampling
  ############################################################################
  rsync -r --exclude=.git "$stripped_dir/" "$compress" --delete
  cd "$compress" || exit 1
  png2jpg_all
  find . -type f -name "*.pdf" -print0 | while IFS= read -r -d '' file; do
    # compress each .pdf file in place
    echo " [debug] $file ..."
    gs_preset "$file" default
  done
  cd .. || exit 1
  size=$(get_dir_size_in_kb "$compress" )
  echo "JPEG compression without downsampling: $size KBs"
  if [ "$size" -le "10000" ]; then
    rsync -r --exclude=.git "$compress/" "$under" --delete
  else
    ##########################################################################
    # Binary search over downsampling resolutions
    ##########################################################################
    # The search returns an index i; the resolution actually tried is 10*i,
    # i.e. 10..1000 dpi in steps of 10.
    ideal_res=$(dichotomic_search_ten 0 100 compress_and_measure)
    if [[ "$ideal_res" == "0" ]]; then
      # Index 0 is the untested lower bound: even 10 dpi was too large, so
      # this run produced no output directory.
      echo "could not get below 10000 KBs even at 10 dpi" >&2
      rm -rf -- "$compress" "$stripped_dir"
      exit 1
    fi
    size=$(get_dir_size_in_kb "$under" )
    echo "JPEG compression with downsampling to $(( ideal_res * 10 )) dpi: $size KBs"
  fi
fi
# Report the final output directory and its size.
size=$(get_dir_size_in_kb "$under" )
echo "---------------------------------------------------------------"
# A backslash before "/" inside double quotes is kept literally, so the path
# is written without escapes to print as "./name/".
echo "Output \"./$under/\": $size KBs"
# clean up temp directories
rm -rf -- "$compress"
rm -rf -- "$stripped_dir"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment