Skip to content

Instantly share code, notes, and snippets.

@lukebiggerstaff
Last active March 8, 2021 08:49
Show Gist options
  • Save lukebiggerstaff/9029066281d38fdc8055f5d7cb19a70b to your computer and use it in GitHub Desktop.
Save lukebiggerstaff/9029066281d38fdc8055f5d7cb19a70b to your computer and use it in GitHub Desktop.
Bash script to download the NCDC weather data used by Hadoop: The Definitive Guide.
#!/bin/bash
# Downloads weather data from NCDC for use with
# "Hadoop: The Definitive Guide" by Tom White.
# For each requested year, all files are downloaded and combined
# into one gzipped file that is saved in the DATA_FOLDER directory.

# FTP server and remote path holding the data
FTP_HOSTNAME='ftp.ncei.noaa.gov'
FTP_PATH='/pub/data/noaa/'

# Local folder in which the downloaded files are stored
DATA_FOLDER='all'
#######################################
# Ensure the download folder exists, creating it when necessary.
# Globals:   DATA_FOLDER (read)
# Arguments: none
# Outputs:   status message on stdout; error message on stderr
# Returns:   0 if the folder exists or was created, 1 if creation failed
#######################################
function check_folder {
  # Note the space after '!': "!-d" is parsed as a single word and makes
  # the [[ ]] expression a syntax error.
  if [[ ! -d "$DATA_FOLDER" ]]; then
    # -p also creates any missing parent directories.
    if ! mkdir -pv -- "$DATA_FOLDER"; then
      echo "could not create $DATA_FOLDER folder" >&2
      return 1
    fi
    echo "$DATA_FOLDER folder created"
  else
    echo "$DATA_FOLDER folder exists"
  fi
}
#######################################
# Download the data of every year in an inclusive range, one file per year.
# Years whose output file already exists are skipped.
# Globals:   DATA_FOLDER, FTP_HOSTNAME, FTP_PATH (read)
# Arguments: $1 - first year
#            $2 - last year (optional, defaults to $1)
# Outputs:   progress messages on stdout; failures on stderr
# Returns:   0 if every year is present afterwards, 1 if any download
#            failed, 2 if a year argument is not a non-negative integer
#######################################
function download_year_data {
  local start_year="$1"
  local end_year="${2:-$start_year}"
  local year target partial
  local status=0

  # Validate before the values reach arithmetic evaluation below.
  if [[ ! "$start_year" =~ ^[0-9]+$ || ! "$end_year" =~ ^[0-9]+$ ]]; then
    echo "download_year_data: years must be non-negative integers" >&2
    return 2
  fi

  # 10# forces base 10 so a leading zero is not interpreted as octal.
  for (( year = 10#$start_year; year <= 10#$end_year; year++ )); do
    target="$DATA_FOLDER/$year.gz"
    if [[ -f "$target" ]]; then
      echo "$target already exists"
      continue
    fi

    # Download to a temporary name first: a failed or interrupted transfer
    # must not leave a file that later runs would treat as already complete.
    partial="$target.part"
    printf '%s download ..' "$year"
    if wget -q -r "ftp://${FTP_HOSTNAME}${FTP_PATH}${year}" -O "$partial" \
      && mv -f -- "$partial" "$target"; then
      echo ".. completed"
    else
      echo ".. failed" >&2
      rm -f -- "$partial"
      status=1
    fi
  done

  return "$status"
}
# Delete the directory named after the FTP host from the current
# working directory, if one is present, and report the removal.
function cleanup {
  # Nothing to remove: leave quietly.
  [[ -d "$FTP_HOSTNAME" ]] || return 0

  rm -rf ./"$FTP_HOSTNAME"
  echo "$FTP_HOSTNAME" directory removed
}
#######################################
# Entry point: validate the argument count, prepare the data folder,
# download the requested years, then run cleanup.
# Arguments: $1 - start year
#            $2 - end year (optional)
# Outputs:   usage message on stderr when the argument count is wrong
# Returns:   0 on success, 2 on a usage error, otherwise the failure
#            status of the folder check or the download step
#######################################
function main {
  local status=0

  if (( $# == 1 || $# == 2 )); then
    if check_folder; then
      download_year_data "$@" || status=$?
    else
      status=1
    fi
  else
    # A usage error is a failure: report on stderr and exit non-zero.
    echo "Usage: $0 <start_year> [<end_year>]" >&2
    status=2
  fi

  # cleanup runs on every path, as before.
  cleanup
  return "$status"
}
# Script entry point: pass every command-line argument through to main.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment