thedivtagguy/anomalies.py

## anomalies.py
# calculate long term averages for each month and also calculate week wise anomalies for 2022

from datetime import datetime, timedelta
import os
import numpy as np
import rasterio as rio
from multiprocessing import Pool, cpu_count

# version changed from V06B to V06C on May 8, 2022
dt_version_shift = datetime.strptime("20220508", '%Y%m%d')

def sum_rasters(dates):
  total = np.zeros((1800, 3600))
  transform = None

  for date in dates:
    dt = datetime.strptime(date, '%Y%m%d')
    version = 'V06C' if dt >= dt_version_shift else 'V06B'
    with rio.open("../imerg-raw-data/" + date + "-imerg/" + f"3B-DAY-L.GIS.IMERG.{date}.{version}.liquid" + ".tif") as src:
      transform = src.transform
      total += src.read(1)
      src.close()

  return (total, transform)


def lta(dates, key):
  print(f"Getting: {key}")

  lta, transform = sum_rasters(dates)

  lta /= len(dates)


  if not os.path.exists("../lta/" + key):
    os.mkdir("../lta/" + key)

  with rio.open("../lta/" + key + "/border-crop-lta.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform) as dst:
    dst.write(lta, 1)
    dst.close()


if __name__ == '__main__':
    pool = Pool(processes=cpu_count())
    dates = [x for x in os.listdir("../imerg-raw-data") if not x.endswith("Store") and not x.endswith("-imerg")]
    dates_map = {}
    dates_2022 = {}

    for date in dates:
      year, month, day = (date[0:4], date[4:6], date[6:8])
      key = month + "-" + day
      if key not in dates_map:
        dates_map[key] = []

      if key not in dates_2022:
        dates_2022[key] = []

      # don't use 2022 data for long term average
      if year != "2022":
        dates_map[key].append(date)

      else:
        dates_2022[key].append(date)

    # ucomment below two lines to calculate daily long term averages
    # pool_data = [(dates_map[key], key) for key in dates_map]
    # pool.starmap(lta, pool_data)

    # calculate weekly anomalies for March, April, May, and June 2022
    # Week 10: March 6 - March 12
    # Week 11: March 13 - March 19
    # Week 12: March 20 - March 26
    # Week 13: March 27 - April 2
    # Week 14: April 3 - April 9
    # Week 15: April 10 - April 16
    # Week 16: April 17 - April 23
    # Week 17: April 24 - April 30
    # Week 18: May 1 - May 7
    # Week 19: May 8 - May 14
    # Week 20: May 15 - May 21
    # Week 21: May 22 - May 28
    # Week 22: May 29 - June 4
    # Week 23: June 5 - June 11
    # Week 24: June 12 - June 18
    # Week 25: June 19 - June 25
    # Week 26: June 26 - July 2
    # Week 27: July 3 - July 9
    # Week 28: July 10 - July 16
    # Week 29: July 17 - July 23
    # Week 30: July 24 - July 30
    # Week 31: August 1 - August 7
    # Week 32: August 8 - August 14
    # Week 33: August 15 - August 21
    # Week 34: August 22 - August 28
    # Week 35: August 29 - September 4


    week_dates = {}

    # generate a mapping of week number to dates
    for week in range(30, 31):
      start_date = datetime(2022, 3, 5) + timedelta(days=7*(week-10))
      end_date = datetime(2022, 3, 6) + timedelta(days=7*(week-9))
      key = start_date.strftime("%m-%d")
      while start_date <= end_date:
        if week not in week_dates:
          week_dates[week] = []

        week_dates[week].append(start_date.strftime("%Y%m%d"))
        start_date += timedelta(days=1)

    # now calculate lta for each week
    # go through the dates of each week, remove the year part from the date
    # search those dates in the main dates list above
    # sum all those dates
    # average by the number of weeks
    # write to file
    for week_num, week_dates in week_dates.items():
      print(f"Calculating week: {week_num}")
      week_lta = np.zeros((1800, 3600))
      week_total_2022 = np.zeros((1800, 3600))
      # remove year + turn dates into keys to look up in the main dates map
      dates_no_year = [x[4:6] + "-" + x[6:] for x in week_dates]
      start_date = dates_no_year[0]
      end_date = dates_no_year[-1]
      # sum all the data for each of the above dates
      total_days_count = 0
      total_2022_days_count = 0
      for key in dates_no_year:
        if key in dates_map:
          print(dates_map[key])
          (total_lt, transform_lt) = sum_rasters(dates_map[key])
          week_lta += total_lt
          total_days_count += len(dates_map[key])

        if key in dates_2022:
          (total_2022, transform_2022) = sum_rasters(dates_2022[key])
          week_total_2022 += total_2022

      # print(total_days_count, total_days_count / 7)
      total_weeks = total_days_count / 7
      # average the data
      week_lta /= total_weeks
      # average for 2022

      # anomaly
      # daily imerg data has a 0.1mm unit. so if actual rainfall was 10mm, it is 100 in the data.
      week_2022_anomaly = (week_total_2022 - week_lta) * 0.1

      # make directtory
      # if "../lta/week-" + str(week_num) does not exist, make it
      if not os.path.exists("../lta/week-" + str(week_num)):
        os.mkdir("../lta/week-" + str(week_num))

      # if not os.path.exists("../2022-anomalies/week-" + str(week_num)):
      #   os.mkdir("../2022-anomalies/week-" + str(week_num))

      # write the data
      with rio.open("../lta/week-" + str(week_num) + f"/border-crop-lta-{start_date}-{end_date}.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform_lt) as dst:
        dst.update_tags(IMPORTANT_UNIT = "0.1mm. So if actual rainfall was 10mm, it is 100 in the data.")
        dst.write(week_lta, 1)
        dst.close()

      with rio.open(f"../2022-anomalies/border-crop-anomaly-{start_date}-{end_date}.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform_2022) as dst:
        dst.update_tags(IMPORTANT_UNIT = "1mm. So if actual rainfall was 10mm, it is 10 in the data.")
        dst.write(week_2022_anomaly, 1)
        dst.close()

## get_imerg.py
import subprocess
import os
import datetime
from multiprocessing import Pool, cpu_count

# version changed from V06B to V06C on May 8, 2022
dt_version_shift = datetime.datetime.strptime("20220508", '%Y%m%d')
start_month = 7
end_month = 8

start_year = 2000
end_year = 2022

def getYear(year):
  for month in range(start_month, end_month, 1):
    for day in range(31, 0, -1):
      date = str(year) + str(month).zfill(2) + str(day).zfill(2)
      # doesn't exist for some reason. can check here: https://jsimpsonhttps.pps.eosdis.nasa.gov/imerg/gis/2019/06/
      if date == "20190630":
        continue
      try:
        dt = datetime.datetime.strptime(date, '%Y%m%d')
        # if more than day before yesterday's date, skip
        if dt > datetime.datetime.now() - datetime.timedelta(days=2):
          continue
      except ValueError:
        continue

      # if folder exists continue
      if os.path.exists('../imerg-raw-data/' + date):
        continue
      # Else, make it
      else:
        os.makedirs('../imerg-raw-data/' + date)

      print(date)
      # version changed from V06B to V06C on May 8, 2022
      version = 'V06C' if dt >= dt_version_shift else 'V06B'
      file_name = f"3B-DAY-L.GIS.IMERG.{date}.{version}"
      file_link = f"https://jsimpsonhttps.pps.eosdis.nasa.gov/imerg/gis/{year}/{str(month).zfill(2)}/{file_name}.zip"
      # download the imerg zip
      command = f"curl -o ../imerg-raw-data/{date}.zip -O --ssl-reqd -u manas.sharma@thomsonreuters.com:manas.sharma@thomsonreuters.com {file_link}"
      subprocess.call(command, shell=True)
      # unzip and delete zip
      command = f"unzip ../imerg-raw-data/{date}.zip -d ../imerg-raw-data/{date}-imerg && rm ../imerg-raw-data/{date}.zip"
      subprocess.call(command, shell=True)

      # sample code for clipping to a geojson
      # # make directory
      # command = f"mkdir ../imerg-clipped/{date}"
      # subprocess.call(command, shell=True)
      # command = f"/usr/local/bin/gdalwarp -cutline ../maps/border-crop.geojson ../imerg-raw-data/{date}-  imerg/{file_name}.liquid.tif ../imerg-clipped/{date}/border-crop-{date}.tif"
      # subprocess.call(command, shell=True)

      # uncomment to delete raw data
      # command = f"rm -rf ../imerg-raw-data/{date}-imerg"
      # print(command)
      # subprocess.call(command, shell=True)


if __name__ == '__main__':
    pool = Pool(processes=cpu_count())
    pool.map(getYear, range(end_year, start_year, -1))
	# calculate long term averages for each month and also calculate week wise anomalies for 2022

	from datetime import datetime, timedelta
	import os
	import numpy as np
	import rasterio as rio
	from multiprocessing import Pool, cpu_count

	# version changed from V06B to V06C on May 8, 2022
	dt_version_shift = datetime.strptime("20220508", '%Y%m%d')

	def sum_rasters(dates):
	total = np.zeros((1800, 3600))
	transform = None

	for date in dates:
	dt = datetime.strptime(date, '%Y%m%d')
	version = 'V06C' if dt >= dt_version_shift else 'V06B'
	with rio.open("../imerg-raw-data/" + date + "-imerg/" + f"3B-DAY-L.GIS.IMERG.{date}.{version}.liquid" + ".tif") as src:
	transform = src.transform
	total += src.read(1)
	src.close()

	return (total, transform)


	def lta(dates, key):
	print(f"Getting: {key}")

	lta, transform = sum_rasters(dates)

	lta /= len(dates)


	if not os.path.exists("../lta/" + key):
	os.mkdir("../lta/" + key)

	with rio.open("../lta/" + key + "/border-crop-lta.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform) as dst:
	dst.write(lta, 1)
	dst.close()




	if __name__ == '__main__':
	pool = Pool(processes=cpu_count())
	dates = [x for x in os.listdir("../imerg-raw-data") if not x.endswith("Store") and not x.endswith("-imerg")]
	dates_map = {}
	dates_2022 = {}

	for date in dates:
	year, month, day = (date[0:4], date[4:6], date[6:8])
	key = month + "-" + day
	if key not in dates_map:
	dates_map[key] = []

	if key not in dates_2022:
	dates_2022[key] = []

	# don't use 2022 data for long term average
	if year != "2022":
	dates_map[key].append(date)

	else:
	dates_2022[key].append(date)

	# ucomment below two lines to calculate daily long term averages
	# pool_data = [(dates_map[key], key) for key in dates_map]
	# pool.starmap(lta, pool_data)

	# calculate weekly anomalies for March, April, May, and June 2022
	# Week 10: March 6 - March 12
	# Week 11: March 13 - March 19
	# Week 12: March 20 - March 26
	# Week 13: March 27 - April 2
	# Week 14: April 3 - April 9
	# Week 15: April 10 - April 16
	# Week 16: April 17 - April 23
	# Week 17: April 24 - April 30
	# Week 18: May 1 - May 7
	# Week 19: May 8 - May 14
	# Week 20: May 15 - May 21
	# Week 21: May 22 - May 28
	# Week 22: May 29 - June 4
	# Week 23: June 5 - June 11
	# Week 24: June 12 - June 18
	# Week 25: June 19 - June 25
	# Week 26: June 26 - July 2
	# Week 27: July 3 - July 9
	# Week 28: July 10 - July 16
	# Week 29: July 17 - July 23
	# Week 30: July 24 - July 30
	# Week 31: August 1 - August 7
	# Week 32: August 8 - August 14
	# Week 33: August 15 - August 21
	# Week 34: August 22 - August 28
	# Week 35: August 29 - September 4




	week_dates = {}

	# generate a mapping of week number to dates
	for week in range(30, 31):
	start_date = datetime(2022, 3, 5) + timedelta(days=7*(week-10))
	end_date = datetime(2022, 3, 6) + timedelta(days=7*(week-9))
	key = start_date.strftime("%m-%d")
	while start_date <= end_date:
	if week not in week_dates:
	week_dates[week] = []

	week_dates[week].append(start_date.strftime("%Y%m%d"))
	start_date += timedelta(days=1)

	# now calculate lta for each week
	# go through the dates of each week, remove the year part from the date
	# search those dates in the main dates list above
	# sum all those dates
	# average by the number of weeks
	# write to file
	for week_num, week_dates in week_dates.items():
	print(f"Calculating week: {week_num}")
	week_lta = np.zeros((1800, 3600))
	week_total_2022 = np.zeros((1800, 3600))
	# remove year + turn dates into keys to look up in the main dates map
	dates_no_year = [x[4:6] + "-" + x[6:] for x in week_dates]
	start_date = dates_no_year[0]
	end_date = dates_no_year[-1]
	# sum all the data for each of the above dates
	total_days_count = 0
	total_2022_days_count = 0
	for key in dates_no_year:
	if key in dates_map:
	print(dates_map[key])
	(total_lt, transform_lt) = sum_rasters(dates_map[key])
	week_lta += total_lt
	total_days_count += len(dates_map[key])

	if key in dates_2022:
	(total_2022, transform_2022) = sum_rasters(dates_2022[key])
	week_total_2022 += total_2022

	# print(total_days_count, total_days_count / 7)
	total_weeks = total_days_count / 7
	# average the data
	week_lta /= total_weeks
	# average for 2022

	# anomaly
	# daily imerg data has a 0.1mm unit. so if actual rainfall was 10mm, it is 100 in the data.
	week_2022_anomaly = (week_total_2022 - week_lta) * 0.1

	# make directtory
	# if "../lta/week-" + str(week_num) does not exist, make it
	if not os.path.exists("../lta/week-" + str(week_num)):
	os.mkdir("../lta/week-" + str(week_num))

	# if not os.path.exists("../2022-anomalies/week-" + str(week_num)):
	# os.mkdir("../2022-anomalies/week-" + str(week_num))

	# write the data
	with rio.open("../lta/week-" + str(week_num) + f"/border-crop-lta-{start_date}-{end_date}.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform_lt) as dst:
	dst.update_tags(IMPORTANT_UNIT = "0.1mm. So if actual rainfall was 10mm, it is 100 in the data.")
	dst.write(week_lta, 1)
	dst.close()

	with rio.open(f"../2022-anomalies/border-crop-anomaly-{start_date}-{end_date}.tif", "w", driver="GTiff", width=3600, height=1800, count=1, dtype=np.float32, transform=transform_2022) as dst:
	dst.update_tags(IMPORTANT_UNIT = "1mm. So if actual rainfall was 10mm, it is 10 in the data.")
	dst.write(week_2022_anomaly, 1)
	dst.close()
	import subprocess
	import os
	import datetime
	from multiprocessing import Pool, cpu_count

	# version changed from V06B to V06C on May 8, 2022
	dt_version_shift = datetime.datetime.strptime("20220508", '%Y%m%d')
	start_month = 7
	end_month = 8

	start_year = 2000
	end_year = 2022

	def getYear(year):
	for month in range(start_month, end_month, 1):
	for day in range(31, 0, -1):
	date = str(year) + str(month).zfill(2) + str(day).zfill(2)
	# doesn't exist for some reason. can check here: https://jsimpsonhttps.pps.eosdis.nasa.gov/imerg/gis/2019/06/
	if date == "20190630":
	continue
	try:
	dt = datetime.datetime.strptime(date, '%Y%m%d')
	# if more than day before yesterday's date, skip
	if dt > datetime.datetime.now() - datetime.timedelta(days=2):
	continue
	except ValueError:
	continue

	# if folder exists continue
	if os.path.exists('../imerg-raw-data/' + date):
	continue
	# Else, make it
	else:
	os.makedirs('../imerg-raw-data/' + date)

	print(date)
	# version changed from V06B to V06C on May 8, 2022
	version = 'V06C' if dt >= dt_version_shift else 'V06B'
	file_name = f"3B-DAY-L.GIS.IMERG.{date}.{version}"
	file_link = f"https://jsimpsonhttps.pps.eosdis.nasa.gov/imerg/gis/{year}/{str(month).zfill(2)}/{file_name}.zip"
	# download the imerg zip
	command = f"curl -o ../imerg-raw-data/{date}.zip -O --ssl-reqd -u manas.sharma@thomsonreuters.com:manas.sharma@thomsonreuters.com {file_link}"
	subprocess.call(command, shell=True)
	# unzip and delete zip
	command = f"unzip ../imerg-raw-data/{date}.zip -d ../imerg-raw-data/{date}-imerg && rm ../imerg-raw-data/{date}.zip"
	subprocess.call(command, shell=True)

	# sample code for clipping to a geojson
	# # make directory
	# command = f"mkdir ../imerg-clipped/{date}"
	# subprocess.call(command, shell=True)
	# command = f"/usr/local/bin/gdalwarp -cutline ../maps/border-crop.geojson ../imerg-raw-data/{date}- imerg/{file_name}.liquid.tif ../imerg-clipped/{date}/border-crop-{date}.tif"
	# subprocess.call(command, shell=True)

	# uncomment to delete raw data
	# command = f"rm -rf ../imerg-raw-data/{date}-imerg"
	# print(command)
	# subprocess.call(command, shell=True)



	if __name__ == '__main__':
	pool = Pool(processes=cpu_count())
	pool.map(getYear, range(end_year, start_year, -1))