#!/bin/bash
# khufkens/NCDC_BDTcl.sh
#
## NCDC_BDTcl.sh

# Bash script to download all NCDC weather records for
# a list of stations (Station number (AWS/WMO/DATSAV3 number))
# (it's easy to adjust the script to take any selection
# of years, just replace the ncftpls query for the years
# with a years file of your own liking or a range in the
# for loop e.g. years 2001 - 2010 = {2001..2010..1} )
#
# raw data is downloaded, extracted from gz files
# and processed so all data is appended into one big
# file readable by for example R
#
# original NCDC data is stored in the raw_input folder
# created in the root where you initially downloaded the data
#
# use: ./NCDC_BDTcl.sh stations.txt
# requires: ncftpls, wget, gzip
#
# Coded by Koen Hufkens at Boston University 2010

# set NCDC server
server="ftp://ftp.ncdc.noaa.gov/pub/data/gsod"

# read stations
# ($1 has to be a readable file holding the station numbers; checking this
# up front avoids `cat` silently waiting on stdin when no argument is given)
if [ $# -lt 1 ] || [ ! -r "$1" ]; then
	echo "use: $0 stations.txt" >&2
	exit 1
fi
stations=$(cat -- "$1")

# YEAR SELECTION
# default: a fixed range of years, 1968 up to and including 2015
# (change the two numbers to use another range, e.g. 2001 2010)
years=$(seq 1968 2015)

# ALTERNATIVE YEAR SELECTION METHODS (uncomment one to override the range)
# all years listed on the server; the grep "^d" keeps only the listing
# lines whose first character is 'd'
#years=$(ncftpls -l "$server" | grep "^d" | awk '{print $NF}')
# a list of years of your own; replace the filename
#years=`cat myyearsfile.txt`

# Download, year by year, every file on the server whose name matches one of
# the requested stations.
# Reads:  $server, $years and $stations (whitespace separated lists)
# Writes: the downloaded files into the current directory (through wget)
for i in $years # loop through all selected years
do
	echo "looking up available stations for year: $i"

	# get a list of all the files for the given year (the name is the last
	# field of every listing line); kept in a variable so no scratch file
	# ends up between the downloads
	astations=$(ncftpls -l "$server/$i" | awk '{print $NF}')

	if [ -z "$astations" ]; then # listing failed or year is empty
		echo "no file listing for year $i, skipping" >&2
		continue
	fi

	for j in $stations # loop through all stations in the stations file
	do
		# one station pattern can match several file names in a year, so
		# every match is fetched with its own complete URL; a pattern
		# without a match simply produces no iteration
		echo "$astations" | grep -- "$j" | while IFS= read -r file
		do
			echo "Downloading: $file"
			wget -q "$server/$i/$file" ||
				echo "download failed: $server/$i/$file" >&2
		done
	done
done

# unzip all data files
# Decompresses every *.gz file in the current directory. When there is no
# such file the pattern stays a literal "*.gz"; that case is skipped so gzip
# is never asked to open a file that does not exist.
unpack_archives() {
	local gz
	for gz in *.gz
	do
		[ -e "$gz" ] || continue
		gzip -d -- "$gz"
	done
}
unpack_archives

# Merge the yearly data files of every station into one <station>.txt file.
#
# Works on the current directory. Only files named <station>-<x>-<year>.op
# are looked at, so other files that live in the same directory (the stations
# list, this script, <station>.txt files of an earlier run) are neither
# mistaken for stations nor appended to the output.
# For each station the files are ordered by the year in their name, header
# lines (starting with STN) and '*' characters are removed, tabs become
# spaces and runs of spaces are squeezed into one. The result is written
# below a single header line.
merge_station_files() {
	# first line of every output file (note the '-' for double value
	# columns in the original file)
	local header="STN--- WBAN YEARMODA TEMP - DEWP - SLP - STP - VISIB - WDSP - MXSPD GUST MAX MIN PRCP SNDP FRSHTT"
	local station_ids i f
	local -a files

	# get unique stations in the current download dir: the text in front
	# of the first '-' of every data file name
	station_ids=$(
		for f in *-*-*.op
		do
			[ -e "$f" ] || continue # pattern did not match anything
			printf '%s\n' "${f%%-*}"
		done | sort -u
	)

	while IFS= read -r i
	do
		[ -n "$i" ] || continue # no stations at all

		# files of this station, sorted on the third '-' separated part
		# of the name (the year)
		files=()
		while IFS= read -r f
		do
			files+=("$f")
		done < <(printf '%s\n' "$i"-*-*.op | sort -t- -k3,3)

		# create final output file, header first
		echo "$header" > "$i.txt"

		# preprocess data and append all years
		cat -- "${files[@]}" |
			sed -e '/^STN/d' -e 's/\*//g' |
			tr -s '\t' ' ' >> "$i.txt"
	done <<< "$station_ids"
}
merge_station_files

# Move the raw original input files (*.op) of the current directory into
# the raw_input directory.
# The directory is created when missing and reused when it already exists,
# and the move is skipped when there is no *.op file, so running the script
# a second time does not end in mkdir/mv errors.
archive_raw_input() {
	local f
	local -a raw=()

	# make dir for the raw original input files *.op
	mkdir -p raw_input || return 1

	for f in *.op
	do
		[ -e "$f" ] || continue # pattern did not match anything
		raw+=("$f")
	done

	# move files to the raw_input directory
	[ "${#raw[@]}" -eq 0 ] || mv -- "${raw[@]}" raw_input/
}
archive_raw_input

# end of the script: the shell stops here, so nothing that follows this
# line is ever executed
exit
	#!/bin/bash

	# Bash script to download all NCDC weather records for
	# a list of stations (Station number (AWS/WMO/DATSAV3 number))
	# (it's easy to adjust the script to take any selection
	# of years, just replace the ncftpls query for the years
	# with a years file of your own liking or a range in the
	# for loop e.g. years 2001 - 2010 = {2001..2010..1} )
	#
	# raw data is downloaded, extracted from gz files
	# and processed so all data is appended into one big
	# file readable by for example R
	#
	# original NCDC data is stored in the raw_input folder
	# created in the root where you initially downloaded the data
	#
	# use: ./NCDC_BDTcl.sh stations.txt
	# requires: ncftpls, wget
	#
	# Coded by Koen Hufkens at Boston University 2010

	# set NCDC server
	server="ftp://ftp.ncdc.noaa.gov/pub/data/gsod"

	# read stations
	# ($1 has to be a readable file holding the station numbers; checking
	# this up front avoids `cat` waiting on stdin when no argument is given)
	if [ $# -lt 1 ] || [ ! -r "$1" ]; then
		echo "use: $0 stations.txt" >&2
		exit 1
	fi
	stations=$(cat -- "$1")

	# YEAR SELECTION
	# default: a fixed range of years, 1968 up to and including 2015
	years=$(seq 1968 2015)

	# ALTERNATIVE YEAR SELECTION METHODS (uncomment one to override the range)
	# all years listed on the server; the grep "^d" keeps only the listing
	# lines whose first character is 'd'
	#years=$(ncftpls -l "$server" | grep "^d" | awk '{print $NF}')
	# a list of years of your own; replace the filename
	#years=`cat myyearsfile.txt`

	for i in $years # loop through all selected years
	do
		echo "looking up available stations for year: $i"

		# get a list of all the files for the given year (last field of
		# every listing line)
		astations=$(ncftpls -l "$server/$i" | awk '{print $NF}')

		if [ -z "$astations" ]; then # listing failed or year is empty
			echo "no file listing for year $i, skipping" >&2
			continue
		fi

		for j in $stations # loop through all stations in the stations file
		do
			# one station pattern can match several file names in a
			# year; fetch every match with its own complete URL
			echo "$astations" | grep -- "$j" | while IFS= read -r file
			do
				echo "Downloading: $file"
				wget -q "$server/$i/$file" ||
					echo "download failed: $server/$i/$file" >&2
			done
		done
	done

	# unzip all data files (skipped when the pattern matches nothing)
	for gz in *.gz
	do
		[ -e "$gz" ] || continue
		gzip -d -- "$gz"
	done

	# get unique stations in the current download dir: the text in front of
	# the first '-' of every <station>-<x>-<year>.op file name
	stations=$(
		for f in *-*-*.op
		do
			[ -e "$f" ] || continue
			printf '%s\n' "${f%%-*}"
		done | sort -u
	)

	for i in $stations
	do
		# files of the station under evaluation, sorted on the third '-'
		# separated part of the name (the year)
		files=()
		while IFS= read -r f
		do
			files+=("$f")
		done < <(printf '%s\n' "$i"-*-*.op | sort -t- -k3,3)

		# create final output file (first line is the header, note the '-' for double value columns in original file)
		echo "STN--- WBAN YEARMODA TEMP - DEWP - SLP - STP - VISIB - WDSP - MXSPD GUST MAX MIN PRCP SNDP FRSHTT" > "$i.txt"

		# preprocess data (remove * and headers, replace tabs by spaces)
		# and append all years
		cat -- "${files[@]}" |
			sed -e '/^STN/d' -e 's/\*//g' |
			tr -s '\t' ' ' >> "$i.txt"
	done

	# make dir for the raw original input files *.op (reused if present)
	mkdir -p raw_input

	# move files to the raw_input directory
	for f in *.op
	do
		[ -e "$f" ] || continue
		mv -- "$f" raw_input/
	done

	exit