Skip to content

Instantly share code, notes, and snippets.

@PizzaRules668
Created December 2, 2023 22:34
Show Gist options
  • Save PizzaRules668/2b3786196f05ca16be504fd1c074f905 to your computer and use it in GitHub Desktop.
Save PizzaRules668/2b3786196f05ca16be504fd1c074f905 to your computer and use it in GitHub Desktop.
Iowa State Environmental Mesonet Data downloader
import pandas as pd
# NOTE(review): pandas has no module-level `low_memory` switch — this line just
# attaches an unused attribute to the module and has no effect; the per-call
# low_memory=False arguments passed to pd.read_csv below are what matter.
pd.low_memory=False
from tqdm import tqdm
import numpy as np
import requests
import time
import math
import os
import threading
import queue
import argparse
"""
updater.py
This file will get all of the data and process the data
example usage:
python updater.py --network WI_ASOS --startingYear 2023 --startingMonth 11 --startingDay 1 --endingYear 2023 --endingMonth 12 --endingDay 1 --processThreads 2 --downloadThreads 2
URL:
https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station={station}&data=all&year1={StartingYear}&month1={StartingMonth}&day1={StartingDay}&year2={EndingYear}&month2={EndingMonth}&day2={EndingDay}&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=empty&trace=empty&direct=no&report_type=3&report_type=4
"""
# Command-line interface: network/station selection, date range, thread counts.
parser = argparse.ArgumentParser()
parser.add_argument("--network", type=str, default="WI_ASOS", help="The network to get the data from")
parser.add_argument("--stations", type=str, default=None, help="The stations to get the data from")
# The remaining options are all integers; declare them data-driven.
for _flag, _default, _help in (
    ("--startingYear", 1940, "The starting year to get the data from"),
    ("--startingMonth", 1, "The starting month to get the data from"),
    ("--startingDay", 1, "The starting day to get the data from"),
    ("--endingYear", 2030, "The ending year to get the data from"),
    ("--endingMonth", 1, "The ending month to get the data from"),
    ("--endingDay", 1, "The ending day to get the data from"),
    ("--downloadThreads", 2, "The amount of threads to use to download the data"),
    ("--processThreads", 2, "The amount of threads to use to process the data"),
):
    parser.add_argument(_flag, type=int, default=_default, help=_help)

# Shared state filled in by the __main__ block before the workers start.
network = None
stations = None
startingYear = startingMonth = startingDay = None
endingYear = endingMonth = endingDay = None

# Work queues connecting the download workers to the processing workers.
dataToProcess = queue.Queue()
dataToGet = queue.Queue()

# Progress counters and tqdm bars (initialised in __main__).
downloadDone = 0
downloadBar = None
processDone = 0
processBar = None
# Get Data downloads the data from the website and saves it to a file
def getData():
    """Worker: download the raw ASOS CSV for each station queued in dataToGet.

    Writes each response to data/download/<station>.csv and forwards the
    station id to dataToProcess for the processing workers. Returns when the
    download queue is exhausted.
    """
    global dataToGet, downloadDone, downloadBar
    while True:
        # get_nowait()/Empty replaces the original empty()-then-get() pair,
        # which could block a worker forever when another thread consumed the
        # last item between the two calls.
        try:
            station = dataToGet.get_nowait()
        except queue.Empty:
            return
        # NOTE(review): the module usage string advertises tz=Etc/UTC and
        # report_type=3&report_type=4, but this request uses America/Chicago
        # and report_type=3 only — confirm which is intended.
        url = (
            "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py"
            f"?station={station}&data=all&year1={startingYear}&month1={startingMonth}&day1={startingDay}"
            f"&year2={endingYear}&month2={endingMonth}&day2={endingDay}"
            "&tz=America%2FChicago&format=onlycomma&latlon=no&elev=no&missing=empty&trace=empty&direct=no&report_type=3"
        )
        r = requests.get(url)
        # Context manager closes the handle; the original open(...).write(...)
        # leaked one file descriptor per station.
        with open(f"data/download/{station}.csv", "w") as f:
            f.write(r.content.decode())
        # Progress bookkeeping. NOTE: += on a shared int is not atomic, so the
        # counter may undercount with several workers; tqdm's update() itself
        # takes an internal lock.
        downloadDone += 1
        downloadBar.update()
        dataToProcess.put(station)
        dataToGet.task_done()
# Process Data will process the data and turn it into a usable format
def processData():
    """Worker: turn raw downloaded CSVs into feature-augmented CSVs.

    Pulls station ids from dataToProcess, reads data/download/<station>.csv,
    renames the `valid` column to `time`, adds cyclical (sin/cos) time
    features and a sin-encoded wind direction, and writes WeatherData.csv and
    TrimmedWeatherData.csv under data/<network>/<station>/.
    """
    global dataToProcess, dataToGet, processDone, processBar
    # Keep running while downloads may still produce work.
    while not dataToProcess.empty() or not dataToGet.empty():
        # A timeout avoids blocking forever when the last item is taken
        # between the emptiness check and this get() (the original blocking
        # get() could deadlock the worker).
        try:
            station = dataToProcess.get(timeout=1)
        except queue.Empty:
            continue
        outDir = f"data/{network}/{station}/"
        # Create the output directory BEFORE writing into it; the original
        # wrote WeatherData.csv first and crashed on brand-new stations.
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        # Read the downloaded data
        df = pd.read_csv(f"data/download/{station}.csv", low_memory=False)
        df = df.rename(columns={"valid": "time"})
        # Pick up data left over from a previous run *before* overwriting
        # WeatherData.csv. The original read the file back immediately after
        # writing it, so the concat below duplicated every row.
        try:
            previous = pd.read_csv(outDir + "WeatherData.csv", low_memory=False)
        except FileNotFoundError:
            previous = None
        df.to_csv(outDir + "WeatherData.csv")
        # Index by timestamp so the cyclical features can be derived from it.
        df.set_index('time', inplace=True)
        df.index = pd.to_datetime(df.index)
        # Calculate the sin and cos of the time components.
        df['month_sin'] = df.index.month.map(lambda x: math.sin(x * (2. * math.pi / 12)))
        df['month_cos'] = df.index.month.map(lambda x: math.cos(x * (2. * math.pi / 12)))
        df['day_sin'] = df.index.day.map(lambda x: math.sin(x * (2. * math.pi / 31)))
        df['day_cos'] = df.index.day.map(lambda x: math.cos(x * (2. * math.pi / 31)))
        df['hour_sin'] = df.index.hour.map(lambda x: math.sin(x * (2. * math.pi / 24)))
        df['hour_cos'] = df.index.hour.map(lambda x: math.cos(x * (2. * math.pi / 24)))
        df['year'] = df.index.year
        df['month'] = df.index.month
        df['day'] = df.index.day
        df['hour'] = df.index.hour
        df['dayofyear'] = df.index.dayofyear
        # Wind direction (drct, presumably degrees) as a sine; missing → 0.
        df['drct_sin'] = df['drct'].map(lambda x: 0 if np.isnan(x) else math.sin(x * (2. * math.pi / 360)))
        # Append data from a previous run, if any existed.
        if previous is not None:
            df = pd.concat([previous, df])
        # Save the file
        df.to_csv(outDir + "TrimmedWeatherData.csv")
        # Progress bookkeeping. NOTE: += on a shared int is not atomic, so the
        # counter may undercount with several workers.
        processDone += 1
        processBar.update()
        dataToProcess.task_done()
def getStations(network):
    """Fetch the station list for an IEM network and return its station ids.

    Downloads the network metadata CSV from the Iowa Environmental Mesonet,
    caches it at data/<network>.csv, and returns the values of the `stid`
    column.
    """
    # https://mesonet.agron.iastate.edu/sites/networks.php?network={network}&format=csv&nohtml=on
    r = requests.get(f"https://mesonet.agron.iastate.edu/sites/networks.php?network={network}&format=csv&nohtml=on")
    # Context manager closes the handle; the original open(...).write(...)
    # leaked the file descriptor.
    with open(f"data/{network}.csv", "w") as f:
        f.write(r.content.decode())
    df = pd.read_csv(f"data/{network}.csv", low_memory=False)
    return df['stid'].values
if __name__ == "__main__":
    # Create the shared download cache directory if it doesn't exist.
    if not os.path.exists("data/download/"):
        os.makedirs("data/download/")
    args = parser.parse_args()
    network = args.network
    # Bug fix: an explicit --stations list was previously clobbered by an
    # unconditional getStations() call; only fall back to the full network
    # listing when no stations were given.
    if args.stations is not None:
        stations = args.stations.split(",")
    else:
        stations = getStations(args.network)
    startingYear = args.startingYear
    startingMonth = args.startingMonth
    startingDay = args.startingDay
    endingYear = args.endingYear
    endingMonth = args.endingMonth
    endingDay = args.endingDay
    downloadBar = tqdm(total=len(stations), desc="Amount of Stations Downloaded", postfix=downloadDone)
    processBar = tqdm(total=len(stations), desc="Amount of Stations Processed", postfix=processDone)
    # Queue every station for download. (The original called
    # os.makedirs(f"data/download/{station}.csv") here, creating a *directory*
    # named <station>.csv that then broke the downloader's open(..., "w");
    # data/download/ itself was already created above.)
    for station in stations:
        dataToGet.put(station)
    # Start the download worker threads.
    dataGetterThreads = []
    for _ in range(args.downloadThreads):
        t = threading.Thread(target=getData, daemon=True)
        t.start()
        dataGetterThreads.append(t)
    # Start the processing worker threads.
    dataProcessorThreads = []
    for _ in range(args.processThreads):
        t = threading.Thread(target=processData, daemon=True)
        t.start()
        dataProcessorThreads.append(t)
    # Wait for all of the threads to finish.
    for t in dataGetterThreads:
        t.join()
    for t in dataProcessorThreads:
        t.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment