Skip to content

Instantly share code, notes, and snippets.

@robertvunabandi
Created April 7, 2020 14:52
Show Gist options
  • Save robertvunabandi/767675136c3dfa8600aa6310ee253008 to your computer and use it in GitHub Desktop.
Save robertvunabandi/767675136c3dfa8600aa6310ee253008 to your computer and use it in GitHub Desktop.
A script to convert the grapes data from https://www.ncdc.noaa.gov/paleo-search/study/13194 into just one table with fields year, location, abbreviation, latitude, longitude, value (which is harvest_days_after_august_31st)
from typing import Dict, Generic, List, Tuple, TypeVar
# Type parameters for the first and second slot of a Pair.
T = TypeVar("T")
S = TypeVar("S")  # a TypeVar's name must match the variable it is bound to


class Pair(Generic[T, S]):
    """Marker type so annotations can spell ``Pair[First, Second]``.

    The class defines no members and carries no data or behaviour of its own.
    """
# Filenames. The first three are names we picked ourselves, so the files on
# disk must be named exactly like this (or the constants below must be changed
# to whatever names are actually used).
F_LOC = "locations.csv"  # extracted locations into its own csv
F_ABBR = "abbreviations.csv"  # extracted abbreviations into its own csv
F_DATA = "data.csv"  # extracted the data into its own csv
F_OUT = "clean_data.csv"  # the final output csv
# fields
class Field:
    """Column names used in the output table."""

    YEAR = "year"
    LOC = "location"
    ABBR = "abbreviation"
    LAT = "latitude"
    LON = "longitude"
    VAL = "value"


# Output columns, in the order they appear in each row.
NEW_HEADERS = (
    Field.YEAR,
    Field.LOC,
    Field.ABBR,
    Field.LAT,
    Field.LON,
    Field.VAL,
)
def create_parsed() -> None:
    """Combine the abbreviation, location and data tables and write F_OUT.

    The output starts with the NEW_HEADERS row, followed by one row for every
    (year, location) combination: years in the order returned by get_data(),
    and within each year the locations in the order returned by
    get_locations().

    Raises:
        KeyError: if a location from the locations table is missing from the
            abbreviations table or from a year's values.
    """
    import csv  # function-scope import keeps this block self-contained

    # first, get the abbreviations and locations
    abbrs = get_abbreviations()
    locs = get_locations()
    # now, the data
    data = get_data()
    # Build every row before touching the output file, so a lookup failure
    # cannot truncate or half-write an existing F_OUT.
    new_table = [list(NEW_HEADERS)]
    new_table.extend(
        [year, loc, abbrs[loc], lat, lon, value_map[loc]]
        for year, value_map in data
        for loc, (lat, lon) in locs.items()
    )
    # store the data in the output file. csv.writer quotes any field that
    # contains a comma (a plain ",".join would silently shift the columns);
    # "\n" keeps the same line ending the rows were written with before.
    with open(F_OUT, "w", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(new_table)
def get_abbreviations() -> Dict[str, str]:
    """Read F_ABBR and return Dict[Location, LocationAbbreviation].

    The first non-blank row of the file supplies the location names and the
    second supplies the abbreviation for each of them, column by column. Any
    further rows are ignored. Cells are stripped of surrounding whitespace.

    Raises:
        IndexError: if the file has fewer than two non-blank rows.
    """
    import csv  # function-scope import keeps this block self-contained

    with open(F_ABBR, newline="") as f:
        stripped = [[cell.strip() for cell in record] for record in csv.reader(f)]
    # Blank lines would otherwise be mistaken for the names/abbreviations row.
    rows = [record for record in stripped if any(record)]
    locations, abbreviations = rows[0], rows[1]
    return dict(zip(locations, abbreviations))
def get_locations() -> Dict[str, Tuple[str, str]]:
    """Read F_LOC and return Dict[Location, Tuple[Latitude, Longitude]].

    The first row of the file is treated as a header and skipped. Every other
    non-blank row must have exactly three columns: location, latitude and
    longitude. Each coordinate is parsed with ``float`` and stored back as
    ``str(float(...))``, so both values in the tuple are strings.

    Raises:
        ValueError: if a row does not have exactly three columns or a
            coordinate is not a valid float.
    """
    import csv  # function-scope import keeps this block self-contained

    with open(F_LOC, newline="") as f:
        stripped = [[cell.strip() for cell in record] for record in csv.reader(f)]
    # Skip the header, then drop blank lines so they don't break the 3-way
    # unpacking below.
    rows = [record for record in stripped[1:] if any(record)]
    return {
        loc: (str(float(lat)), str(float(lon)))
        for loc, lat, lon in rows
    }
def get_data() -> List[Pair[str, Dict[str, str]]]:
    """Read F_DATA and return List[Pair[Year, Dict[Location, Value]]].

    The first non-blank row is the header: its first column is the year column
    and every remaining column is a location name. Each later row becomes a
    two-element list ``[year, values]`` where ``year`` is the integer year
    rendered as a string and ``values`` maps each location to ``try_float`` of
    its cell (so a cell that is not a number becomes "").

    Raises:
        IndexError: if the file has no non-blank rows.
        ValueError: if a year cell is not an integer.
        KeyError: if a row has more value cells than the header has locations.
    """
    import csv  # function-scope import keeps this block self-contained

    with open(F_DATA, newline="") as f:
        # Drop blank lines up front; they would otherwise fail int() below.
        rows = [
            record
            for record in csv.reader(f)
            if any(cell.strip() for cell in record)
        ]
    header, rows = rows[0], rows[1:]
    year_idx = 0
    # Position within row[1:] -> location name from the header.
    idx_to_location = {idx: loc.strip() for idx, loc in enumerate(header[1:])}
    return [
        [
            str(int(row[year_idx])),
            {
                idx_to_location[idx]: try_float(value)
                for idx, value in enumerate(row[1:])
            },
        ]
        for row in rows
    ]
def try_float(s: str) -> str:
    """Return ``str(float(s))``, or "" when ``s`` is not a valid float.

    Despite the name, the result is always a string: either the text of the
    parsed float or the empty string.
    """
    try:
        return str(float(s))
    except ValueError:
        return ""
# Only build the output file when this module is executed as a script.
if __name__ == "__main__":
    create_parsed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment