Skip to content

Instantly share code, notes, and snippets.

@PizzaRules668
Created December 2, 2023 22:34
Show Gist options
  • Save PizzaRules668/2b3786196f05ca16be504fd1c074f905 to your computer and use it in GitHub Desktop.
Save PizzaRules668/2b3786196f05ca16be504fd1c074f905 to your computer and use it in GitHub Desktop.
Iowa State Environmental Mesonet Data downloader
import pandas as pd
# NOTE(review): pandas has no module-level `low_memory` switch — this line just
# attaches an unused attribute to the module and has no effect; the per-call
# low_memory=False arguments passed to pd.read_csv below are what matter.
pd.low_memory=False
from tqdm import tqdm
import numpy as np
import requests
import time
import math
import os
import threading
import queue
import argparse
"""
updater.py
This file will get all of the data and process the data
example usage:
python updater.py --network WI_ASOS --startingYear 2023 --startingMonth 11 --startingDay 1 --endingYear 2023 --endingMonth 12 --endingDay 1 --processThreads 2 --downloadThreads 2
URL:
https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station={station}&data=all&year1={StartingYear}&month1={StartingMonth}&day1={StartingDay}&year2={EndingYear}&month2={EndingMonth}&day2={EndingDay}&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=empty&trace=empty&direct=no&report_type=3&report_type=4
"""
# Command-line interface: network/station selection, date range, thread counts.
parser = argparse.ArgumentParser()
parser.add_argument("--network", type=str, default="WI_ASOS", help="The network to get the data from")
parser.add_argument("--stations", type=str, default=None, help="The stations to get the data from")
# The remaining options are all integers; declare them data-driven.
for _flag, _default, _help in (
    ("--startingYear", 1940, "The starting year to get the data from"),
    ("--startingMonth", 1, "The starting month to get the data from"),
    ("--startingDay", 1, "The starting day to get the data from"),
    ("--endingYear", 2030, "The ending year to get the data from"),
    ("--endingMonth", 1, "The ending month to get the data from"),
    ("--endingDay", 1, "The ending day to get the data from"),
    ("--downloadThreads", 2, "The amount of threads to use to download the data"),
    ("--processThreads", 2, "The amount of threads to use to process the data"),
):
    parser.add_argument(_flag, type=int, default=_default, help=_help)

# Shared state filled in by the __main__ block before the workers start.
network = None
stations = None
startingYear = startingMonth = startingDay = None
endingYear = endingMonth = endingDay = None

# Work queues connecting the download workers to the processing workers.
dataToProcess = queue.Queue()
dataToGet = queue.Queue()

# Progress counters and tqdm bars (initialised in __main__).
downloadDone = 0
downloadBar = None
processDone = 0
processBar = None
# Get Data downloads the data from the website and saves it to a file
def getData():
    """Worker: download the raw ASOS CSV for each station queued in dataToGet.

    Writes each response to data/download/<station>.csv and forwards the
    station id to dataToProcess for the processing workers. Returns when the
    download queue is exhausted.
    """
    global dataToGet, downloadDone, downloadBar
    while True:
        # get_nowait()/Empty replaces the original empty()-then-get() pair,
        # which could block a worker forever when another thread consumed the
        # last item between the two calls.
        try:
            station = dataToGet.get_nowait()
        except queue.Empty:
            return
        # NOTE(review): the module usage string advertises tz=Etc/UTC and
        # report_type=3&report_type=4, but this request uses America/Chicago
        # and report_type=3 only — confirm which is intended.
        url = (
            "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py"
            f"?station={station}&data=all&year1={startingYear}&month1={startingMonth}&day1={startingDay}"
            f"&year2={endingYear}&month2={endingMonth}&day2={endingDay}"
            "&tz=America%2FChicago&format=onlycomma&latlon=no&elev=no&missing=empty&trace=empty&direct=no&report_type=3"
        )
        r = requests.get(url)
        # Context manager closes the handle; the original open(...).write(...)
        # leaked one file descriptor per station.
        with open(f"data/download/{station}.csv", "w") as f:
            f.write(r.content.decode())
        # Progress bookkeeping. NOTE: += on a shared int is not atomic, so the
        # counter may undercount with several workers; tqdm's update() itself
        # takes an internal lock.
        downloadDone += 1
        downloadBar.update()
        dataToProcess.put(station)
        dataToGet.task_done()
# Process Data will process the data and turn it into a usable format
def processData():
    """Worker: turn raw downloaded CSVs into feature-augmented CSVs.

    Pulls station ids from dataToProcess, reads data/download/<station>.csv,
    renames the `valid` column to `time`, adds cyclical (sin/cos) time
    features and a sin-encoded wind direction, and writes WeatherData.csv and
    TrimmedWeatherData.csv under data/<network>/<station>/.
    """
    global dataToProcess, dataToGet, processDone, processBar
    # Keep running while downloads may still produce work.
    while not dataToProcess.empty() or not dataToGet.empty():
        # A timeout avoids blocking forever when the last item is taken
        # between the emptiness check and this get() (the original blocking
        # get() could deadlock the worker).
        try:
            station = dataToProcess.get(timeout=1)
        except queue.Empty:
            continue
        outDir = f"data/{network}/{station}/"
        # Create the output directory BEFORE writing into it; the original
        # wrote WeatherData.csv first and crashed on brand-new stations.
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        # Read the downloaded data
        df = pd.read_csv(f"data/download/{station}.csv", low_memory=False)
        df = df.rename(columns={"valid": "time"})
        # Pick up data left over from a previous run *before* overwriting
        # WeatherData.csv. The original read the file back immediately after
        # writing it, so the concat below duplicated every row.
        try:
            previous = pd.read_csv(outDir + "WeatherData.csv", low_memory=False)
        except FileNotFoundError:
            previous = None
        df.to_csv(outDir + "WeatherData.csv")
        # Index by timestamp so the cyclical features can be derived from it.
        df.set_index('time', inplace=True)
        df.index = pd.to_datetime(df.index)
        # Calculate the sin and cos of the time components.
        df['month_sin'] = df.index.month.map(lambda x: math.sin(x * (2. * math.pi / 12)))
        df['month_cos'] = df.index.month.map(lambda x: math.cos(x * (2. * math.pi / 12)))
        df['day_sin'] = df.index.day.map(lambda x: math.sin(x * (2. * math.pi / 31)))
        df['day_cos'] = df.index.day.map(lambda x: math.cos(x * (2. * math.pi / 31)))
        df['hour_sin'] = df.index.hour.map(lambda x: math.sin(x * (2. * math.pi / 24)))
        df['hour_cos'] = df.index.hour.map(lambda x: math.cos(x * (2. * math.pi / 24)))
        df['year'] = df.index.year
        df['month'] = df.index.month
        df['day'] = df.index.day
        df['hour'] = df.index.hour
        df['dayofyear'] = df.index.dayofyear
        # Wind direction (drct, presumably degrees) as a sine; missing → 0.
        df['drct_sin'] = df['drct'].map(lambda x: 0 if np.isnan(x) else math.sin(x * (2. * math.pi / 360)))
        # Append data from a previous run, if any existed.
        if previous is not None:
            df = pd.concat([previous, df])
        # Save the file
        df.to_csv(outDir + "TrimmedWeatherData.csv")
        # Progress bookkeeping. NOTE: += on a shared int is not atomic, so the
        # counter may undercount with several workers.
        processDone += 1
        processBar.update()
        dataToProcess.task_done()
def getStations(network):
    """Fetch the station list for an IEM network and return its station ids.

    Downloads the network metadata CSV from the Iowa Environmental Mesonet,
    caches it at data/<network>.csv, and returns the values of the `stid`
    column.
    """
    # https://mesonet.agron.iastate.edu/sites/networks.php?network={network}&format=csv&nohtml=on
    r = requests.get(f"https://mesonet.agron.iastate.edu/sites/networks.php?network={network}&format=csv&nohtml=on")
    # Context manager closes the handle; the original open(...).write(...)
    # leaked the file descriptor.
    with open(f"data/{network}.csv", "w") as f:
        f.write(r.content.decode())
    df = pd.read_csv(f"data/{network}.csv", low_memory=False)
    return df['stid'].values
if __name__ == "__main__":
    # Create the shared download cache directory if it doesn't exist.
    if not os.path.exists("data/download/"):
        os.makedirs("data/download/")
    args = parser.parse_args()
    network = args.network
    # Bug fix: an explicit --stations list was previously clobbered by an
    # unconditional getStations() call; only fall back to the full network
    # listing when no stations were given.
    if args.stations is not None:
        stations = args.stations.split(",")
    else:
        stations = getStations(args.network)
    startingYear = args.startingYear
    startingMonth = args.startingMonth
    startingDay = args.startingDay
    endingYear = args.endingYear
    endingMonth = args.endingMonth
    endingDay = args.endingDay
    downloadBar = tqdm(total=len(stations), desc="Amount of Stations Downloaded", postfix=downloadDone)
    processBar = tqdm(total=len(stations), desc="Amount of Stations Processed", postfix=processDone)
    # Queue every station for download. (The original called
    # os.makedirs(f"data/download/{station}.csv") here, creating a *directory*
    # named <station>.csv that then broke the downloader's open(..., "w");
    # data/download/ itself was already created above.)
    for station in stations:
        dataToGet.put(station)
    # Start the download worker threads.
    dataGetterThreads = []
    for _ in range(args.downloadThreads):
        t = threading.Thread(target=getData, daemon=True)
        t.start()
        dataGetterThreads.append(t)
    # Start the processing worker threads.
    dataProcessorThreads = []
    for _ in range(args.processThreads):
        t = threading.Thread(target=processData, daemon=True)
        t.start()
        dataProcessorThreads.append(t)
    # Wait for all of the threads to finish.
    for t in dataGetterThreads:
        t.join()
    for t in dataProcessorThreads:
        t.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment