Created
April 14, 2017 18:42
-
-
Save khufkens/b458cabed47b57f30607c3b3ca4f93eb to your computer and use it in GitHub Desktop.
NCDC Batch Download Tool (command line) - downloads station data from the National Climatic Data Center
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Bash script to download all NCDC weather records for
# a list of stations (Station number (AWS/WMO/DATSAV3 number))
# (it's easy to adjust the script to take any selection
# of years, just replace the ncftpls query for the years
# with a years file of your own liking or a range in the
# for loop e.g. years 2001 - 2010 = {2001..2010..1} )
#
# raw data is downloaded, extracted from gz files
# and processed so all data is appended into one big
# file readable by for example R
#
# original NCDC data is stored in the raw_input folder
# created in the root where you initially downloaded the data
#
# use: ./NCDC_BDTcl.sh stations.txt
# requires: ncftpls, wget
#
# Coded by Koen Hufkens at Boston University 2010
# --- Configuration -----------------------------------------------------------
# Sets the three globals used by the rest of the script:
#   server   - root URL of the NCDC GSOD FTP tree (one sub-directory per year)
#   stations - whitespace separated station numbers read from the file in $1
#   years    - whitespace separated list of years to download

# set NCDC server
server="ftp://ftp.ncdc.noaa.gov/pub/data/gsod"

# read stations
# The stations file is mandatory: with no argument a bare `cat` would sit
# waiting on stdin, so fail early with a usage message instead.
if [ "$#" -lt 1 ] || [ ! -r "$1" ]; then
  echo "usage: $0 stations.txt" >&2
  exit 1
fi
stations=$(cat -- "$1")

# YEAR SELECTION
# Active selection: the fixed range 1968 - 2015.
years=$(seq 1968 2015)

# ALTERNATIVE YEAR SELECTION METHODS (enable exactly one instead of the range)
# - every year directory present on the server
#   (the grep "^d" keeps only directory entries of the long listing):
#years=$(ncftpls -l "$server" | grep "^d" | awk '{print $NF}')
# - a list of years of your own, one per line:
#years=$(cat myyearsfile.txt)
# --- Download ----------------------------------------------------------------
# For every year in $years, list the files available on $server and download
# each one whose name matches a station from $stations into the current
# directory.
# Reads globals: server, years, stations.

# Print the entries of a directory listing that match a station, one per line.
# Arguments: $1 - newline separated file names
#            $2 - station number (used as a grep pattern, as before)
# Outputs:   matching file names on stdout (nothing when there is no match)
find_station_files() {
  printf '%s\n' "$1" | grep -- "$2"
}

for i in $years; do # loop through all selected years
  echo "looking up available stations for year: $i"

  # get a list of all the files for the given year (last column of the listing)
  astations=$(ncftpls -l "$server/$i" | awk '{print $NF}')

  for j in $stations; do # loop through all stations in the stations file
    # A station can match more than one file in a year; fetch each match with
    # its own URL instead of passing a multi-line value to a single wget call.
    while IFS= read -r file; do
      [ -n "$file" ] || continue # no data for this station/year, skip download
      echo "Downloading: $file"
      wget -q "$server/$i/$file" ||
        echo "warning: download failed: $server/$i/$file" >&2
    done < <(find_station_files "$astations" "$j")
  done
done
# --- Post-processing ---------------------------------------------------------
# Unpack the downloaded archives and merge the yearly files of each station
# (named <station>-<wban>-<year>.op) into one <station>.txt table.

# Merge all yearly .op files of one station found in the current directory
# into <station>.txt, ordered by year.
# Arguments: $1 - station number (first dash separated field of the file name)
# Outputs:   writes <station>.txt; does nothing if the station has no files
merge_station_files() {
  local station=$1
  local year file
  local -a ordered=()

  # Build "year<TAB>file" pairs and sort them so the files are appended in
  # chronological order even when a station has several WBAN numbers.
  while IFS=$'\t' read -r _ file; do
    ordered+=("$file")
  done < <(
    for file in "$station"-*-*.op; do
      [ -e "$file" ] || continue # pattern did not match anything
      year=${file##*-}
      printf '%s\t%s\n' "${year%%.*}" "$file"
    done | LC_ALL=C sort
  )

  [ "${#ordered[@]}" -gt 0 ] || return 0

  {
    # first line is the header, note the '-' for double value columns in the
    # original file
    echo "STN--- WBAN YEARMODA TEMP - DEWP - SLP - STP - VISIB - WDSP - MXSPD GUST MAX MIN PRCP SNDP FRSHTT"
    # drop the per-file headers and the '*' flags, turn tabs into spaces and
    # squeeze runs of spaces
    cat -- "${ordered[@]}" | sed -e '/^STN/d' -e 's/\*//g' | tr -s '\t' ' '
  } > "$station.txt"
}

# Unzip every downloaded archive, then merge the data of each station present.
# Only files matching *-*-*.op are treated as station data, so unrelated files
# in the directory (stations list, this script, earlier outputs) are ignored.
process_downloads() {
  local station data_file
  local -a archives=(*.gz)

  # unzip all data files (skip when the pattern matched nothing)
  if [ -e "${archives[0]}" ]; then
    gzip -d -- "${archives[@]}"
  fi

  # get unique stations in the current download dir and merge each of them
  while IFS= read -r station; do
    merge_station_files "$station"
  done < <(
    for data_file in *-*-*.op; do
      [ -e "$data_file" ] || continue
      printf '%s\n' "${data_file%%-*}"
    done | LC_ALL=C sort -u
  )
}

process_downloads
# --- Archive raw input -------------------------------------------------------
# make dir for the raw original input files *.op
# (-p: do not fail when the directory is left over from a previous run)
mkdir -p raw_input

# move files to the raw_input directory; skip the move when there are no .op
# files so mv is never handed the unexpanded '*.op' pattern
op_files=(*.op)
if [ -e "${op_files[0]}" ]; then
  mv -- "${op_files[@]}" raw_input/
fi
exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment