Created
May 30, 2021 23:01
-
-
Save marco-schmidt/d9c6e485509bd34cbdd58991583d8132 to your computer and use it in GitHub Desktop.
Shell script to download IMDb data files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Purpose:  bash script to retrieve IMDb tsv files described at https://www.imdb.com/interfaces/
# Created:  2018-10-03
# Requires: bash, date, mkdir, pushd, wget, ls, du, gzip, popd;
#           write permission and enough space (~ 650 MB as of 2018) in the argument directory
# Usage:    <script> <directory>
#           Files are stored in '<directory>/YYYY/YYYY-MM-DD' (today's date).
# Exit status: 0 if all files were downloaded and passed the gzip integrity
#           test; non-zero on usage errors, directory errors, download
#           failures or integrity test failures.
# Note:     All diagnostics (usage and error messages) are written to stderr.

# make sure the script got a non-empty first argument, otherwise exit with usage instructions
# (additional arguments are ignored)
if [[ -z "${1:-}" ]]; then
  {
    echo "Usage: '$0 <directory>' to download IMDb tsv files to 'directory/YYYY/YYYY-MM-DD'."
    echo "Note: <directory> must exist already, subdirectories will be created."
    echo "Example: '$0 ~/imdb' will download to '~/imdb/2018/2018-10-03' if script got called on October 3rd, 2018."
  } >&2
  exit 1
fi

# make sure argument is an existing directory
DEST_DIR="$1"
if [[ ! -d "$DEST_DIR" ]]; then
  printf "Error: Argument directory '%s' does not exist.\n" "$DEST_DIR" >&2
  exit 1
fi

# store today's date in variable: year + slash + full date
DATE=$(date '+%Y/%Y-%m-%d')

# create subdirectories in destination directory
# ('--' protects against directory names starting with a dash)
DEST_DIR="${DEST_DIR}/${DATE}"
mkdir -p -- "$DEST_DIR"
ERROR_CODE=$?
if [[ "$ERROR_CODE" -ne 0 ]]; then
  echo "Error: Could not create directory '$DEST_DIR' (error code ${ERROR_CODE})." >&2
  exit 1
fi

# go to today's directory
pushd "$DEST_DIR" || exit

# retrieve files: follow links one level deep from the index page, keep only
# *.gz files, do not recreate the remote directory structure (-nd), never
# ascend to the parent (-np) and skip files that already exist locally (-nc)
wget -e robots=off --no-verbose -np -nd -nc -r -l 1 -A gz https://datasets.imdbws.com
ERROR_CODE=$?
if [[ "$ERROR_CODE" -ne 0 ]]; then
  echo "Error: Failed to retrieve files to directory '$DEST_DIR' (error code ${ERROR_CODE})." >&2
  popd || exit
  exit 1
fi

# print information on downloaded files
ls -al
du -h

# test integrity of files; a failure (including the case that no *.gz file
# was downloaded at all) must be reflected in the script's exit status
echo "Success. Testing files:"
if ! gzip -t -v ./*.gz; then
  echo "Error: Integrity test failed for one or more files in '$DEST_DIR'." >&2
  popd || exit
  exit 1
fi

# go back to original directory
popd || exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment