Skip to content

Instantly share code, notes, and snippets.

@markasoftware
Last active May 29, 2020 12:32
Show Gist options
  • Save markasoftware/12a0b08b2d68b90ca4b40bacae5d79b7 to your computer and use it in GitHub Desktop.
Save markasoftware/12a0b08b2d68b90ca4b40bacae5d79b7 to your computer and use it in GitHub Desktop.
ACM Scraper
#!/bin/bash
# This file is released under the GNU General Public License v3
# ACM scraper during coronavirus
# Will skip existing PDFs to speed up a resumed download
# Get the link to the first issue in the journal/SIG/etc, then the scraper will use "next" links to traverse
# Usage: ./acm.bash first_issue_link output_dir
# Eg, ./acm.bash https://dl.acm.org/toc/siggraph/1969/3/3 /media/mass/siggraph to download all SIGGRAPH PDFs
# Note: ACM does BLOCK IPs after a few hundred PDFs! Uncommenting the sleep statement below to slow things down may help, but I haven't tested it.
# --- Preconditions -----------------------------------------------------------

# pup (command-line HTML parser) is required to extract links from the pages.
if ! command -v pup >/dev/null; then
  echo 'Install PUP: https://github.com/ericchiang/pup/releases' >&2
  exit 1
fi

if (( $# < 2 )); then
  echo 'Usage: ./acm.bash first_issue_link output_dir' >&2
  exit 1
fi

# $1: URL of the first issue's table of contents (must contain "toc").
# $2: existing directory that receives one sub-directory per issue.
issue_link=$1
output=${2%/} # drop a single trailing slash so joined paths have no "//"

if ! [[ -d "$output" ]]; then
  echo "Output directory does not exist." >&2
  exit 1
fi

# --- Session setup -----------------------------------------------------------

cookiejar=$(mktemp) || exit 1
# Remove the cookie jar on every exit path, not only after a complete run.
trap 'rm -f -- "$cookiejar"' EXIT
# Turn an interrupt/termination into a normal exit so the EXIT trap runs.
trap 'exit 130' INT
trap 'exit 143' TERM

# Visit the front page once so the site's cookies are stored in the jar; the
# same jar is sent with every later request.
curl -sLc "$cookiejar" "https://dl.acm.org" >/dev/null

# --- Walk the issues via their "next" links ----------------------------------

# The loop ends as soon as the current link no longer contains "toc", which is
# what happens when a page offers no "next" link (the link degenerates to the
# bare site root).
while [[ $issue_link == *'toc'* ]]; do
  echo "$issue_link" >&2
  issue_html=$(curl -sb "$cookiejar" "$issue_link")

  # Everything after "toc/<name>/" becomes the per-issue sub-directory,
  # e.g. ".../toc/siggraph/1969/3/3" -> "1969/3/3".
  issue_slashes=${issue_link#*toc/*/}
  issue_dir="$output/$issue_slashes"
  mkdir -p "$issue_dir"

  # pup prints each matching <a> element over several lines; joining the lines
  # and cutting at the next "<" yields one `<a href="..."> Title` record per
  # article.
  printf '%s' "$issue_html" \
    | pup '.issue-item__title > a' \
    | tr -d \\n \
    | grep -o '<a[^<]*' \
    | while IFS= read -r link; do
      title=${link##*\"> } # text after the opening tag
      doi=${link#*\"}      # strip up to the first quote ...
      doi=${doi%%\"*}      # ... and from the closing quote: the href value
      doi=${doi/abs/pdf}   # abstract URL -> PDF URL
      echo " $title ($doi)" >&2

      # A "/" in the title would be interpreted as a directory separator and
      # make the output path invalid, so replace it in the file name only.
      pdf="$issue_dir/${title//\//-}.pdf"

      if [[ -e "$pdf" ]]; then
        echo ' (skipping, already exists)' >&2
      else
        # Download to a ".part" file and rename only on success. Otherwise an
        # interrupted or failed transfer would leave a truncated file that the
        # "already exists" check above skips on the next run. -f makes curl
        # fail on HTTP errors instead of saving the error page as a PDF.
        if curl -sfb "$cookiejar" -o "$pdf.part" "https://dl.acm.org$doi"; then
          mv -f -- "$pdf.part" "$pdf"
        else
          rm -f -- "$pdf.part"
          echo ' (download failed)' >&2
        fi
        # Uncomment to decrease the risk of ACM blocking your IP (untested).
        # sleep 30
      fi
    done

  # Follow the "next issue" button. Only the first href is used so that a page
  # with more than one matching button cannot produce a multi-line URL.
  next_href=$(printf '%s' "$issue_html" \
    | pup '.content-navigation__btn--next' \
    | grep -o 'href="[^"]*' \
    | head -n 1)
  next_href=${next_href#href=\"}
  next_link="https://dl.acm.org$next_href"

  # A "next" link that points back at the current issue would loop forever.
  [[ $next_link != "$issue_link" ]] || break
  issue_link=$next_link
done

echo "Done!" >&2
# The cookie jar is removed by the EXIT trap installed above.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment