Skip to content

Instantly share code, notes, and snippets.

@derekpowell
Last active May 5, 2021 21:43
Show Gist options
  • Save derekpowell/436407bda5fe9c2d62c632ed00146b2e to your computer and use it in GitHub Desktop.
Save derekpowell/436407bda5fe9c2d62c632ed00146b2e to your computer and use it in GitHub Desktop.
Create sqlite database of NOAA GHCN daily weather data
# script to create sqlite database of daily GHCN weather station data
# adding a table of US zipcodes for my own purposes (easily commented out)
# cribs heavily from https://github.com/dylburger/noaa-ghcn-weather-data
# Settings
import sqlalchemy
import os
import urllib.request
import pandas as pd
years = list(range(2010, 2021)) # range of years to download data
db_file = 'ghcn-daily-us.db' # database filename
folder = "ghcn/" # folder to save .csv.gz files in. set to "" to use local directory
## ------ script
def get_ghcd_daily_files(years, prefix=None, silent=False):
# years = str, int, or list of years (str or int)
# folder = sub folder name to save files in
if type(years) != list:
years = [years]
url_base = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/XXXX.csv.gz"
for year in years:
fname = prefix + str(year) + ".csv.gz"
urllib.request.urlretrieve(url_base.replace(
"XXXX", str(year)), filename=fname)
if not silent:
print(year, "data downloaded")
# create db connection
conn = sqlalchemy.create_engine("sqlite:///" + db_file)
# add stations table
station_metadata = pd.read_csv('https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt',
sep='\s+', # Fields are separated by one or more spaces
# Grab only the first 4 columns
usecols=[0, 1, 2, 3],
# Missing elevation is noted as -999.9
na_values=[-999.9],
header=None,
names=['station_id', 'latitude', 'longitude', 'elevation'])
station_metadata.to_sql('stations', conn, if_exists="replace")
# add zipcodes table
zip_data = pd.read_csv(
"https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_zcta_national.zip",
sep='\s+',
header=0,
names=["zip", "area_land", "area_water", "area_land_sqmi",
"area_water_sqmi", "latitude", "longitude"]
)
zip_data.to_sql("zipcodes", conn, index=False)
# download csv files and append to weather table
for year in years:
get_ghcd_daily_files(year, folder)
weather_data = pd.read_csv(folder + str(year) + ".csv.gz",
header=None,
index_col=False,
names=['station_id',
'measurement_date',
'measurement_type',
'measurement_flag',
'quality_flag',
'source_flag',
'observation_time'],
parse_dates=['measurement_date'])
# select only US stations
weather_data = weather_data[weather_data.station_id.str.startswith("US")]
weather_data_subset = weather_data[weather_data.measurement_type.isin(['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN'])][[
'station_id', 'measurement_date', 'measurement_type', 'measurement_flag']]
weather_data_subset.to_sql(
'weather', conn, chunksize=100000, index_label='id', if_exists="append")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment