|
#!/usr/bin/env python2 |
|
""" python this.py [year=2015 to=2016] |
|
read_epa_download no2 ozone TEMP WIND .zip or .csv |
|
in download_dirs from epa_download |
|
write e.g. tmp/2016-epa-no2-ozone-temp-wind.csv 79k rows, 7 columns |
|
city state date no2 ozone temp wind |
|
""" |
|
|
|
from __future__ import division |
|
import numpy as np |
|
import pandas as pd |
|
|
|
from pr import pr |
|
import zutil as nu |
|
|
|
__version__ = "2017-07-29 july denis-bz-py t-online de" |
|
|
|
# read these columns only, and rename them --
# keys are the raw EPA "daily" file column headers, values our short names
epa_columns_dict = {
    "City Name": "city",
    "State Name": "state",
    "Date Local": "date",
    "Arithmetic Mean": "av",  # hour / 8 hour
    # "1st Max Value": "max1",
    "Parameter Name": "param",  # drop -- kept only to filter WIND rows, see read_epa_download
}

# base url of the EPA AQS "airdata" daily files (downloaded separately)
epa_url = "http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata"
# (download with requests ? mttiw)

# space-separated search path for the downloaded files; "$epa" is expanded
# by nu.findfile -- presumably from the environment, TODO confirm
download_dirs = "epa/download $epa"  # <-- where you run epa_download
|
|
|
#............................................................................... |
|
def read_epa_download( csv_or_zip, columns=epa_columns_dict, nrows=1e8, dropneg=False ):
    """ Read one EPA "daily" file, e.g. daily_TEMP_2015.csv or .zip.

    csv_or_zip: filename or file-like object for pd.read_csv
    columns: dict {raw EPA column name: short name} -- only these columns
        are read, then renamed to the short names
    nrows: cap on rows read (float default kept for backward compatibility)
    dropneg: drop rows with a negative "av" (bad sensor readings)
    returns: DataFrame city state date av (param filtered out), sorted by
        city then date, with a fresh 0..n-1 index

    WIND files carry both speed and direction; only speed is kept.
    City and state names are dashed for awk: "New York" -> "New-York".
    """
    # units: NO2 ppb, ozone ppm, temp F, wind knots = .51444 m/s
    # NO2 1 ppb = 1.91 ug/m3, Ozone 1 ppb = 2.0 ug/m3
    # read_csv e.g. daily_no2_2015.zip with a single csv works

    df = pd.read_csv( csv_or_zip, usecols=list(columns),  # list(): py3 dict views
            parse_dates=["Date Local"],
            index_col=False, nrows=int(nrows) )
    df.rename( columns=columns, inplace=True )
    df = df[ df.city != "Not in a city" ]
    if "param" in df.columns:
        df = df[ df.param != "Wind Direction - Resultant" ]  # wind speed only
        df = df.drop( "param", axis=1 )
    if dropneg and "av" in df.columns:
        # drop no2 < 0 in Grapevine Midlothian Tyler TX ?
        # bugfix: the old df.drop( df.av < 0 ) passed a boolean mask as row
        # *labels*, dropping rows indexed 0/1 (False/True equal ints 0/1)
        # instead of the negative readings. Filter with the mask instead;
        # ~(av < 0) keeps NaN rows, drops only real negatives.
        df = df[ ~(df.av < 0) ]
    df.city = df.city.str.replace( r" \(.*", "", regex=True )  # Dentsville (Dents) -> Dentsville
    # bugfix: "." must be literal -- under old pandas (default regex=True) it
    # matched *every* character and blanked the whole name; St. Louis -> St Louis
    df.city = df.city.str.replace( ".", "", regex=False )
    df.city = df.city.str.replace( " ", "-", regex=False )  # New-York, for awk
    df.state = df.state.str.replace( " ", "-", regex=False )
    df = df.sort_values( by=["city", "date"] )
    df.index = np.arange( len(df) )
    nr, nc = df.shape
    print( "\nread_epa_csv %s: %d rows, %d columns: %s" % (
            csv_or_zip, nr, nc, " ".join( df.columns )))
    if "av" in df.columns:
        q = np.r_[0, 10, 25, 50, 75, 90, 100]
        print( "Daily averages, percentiles %s: %s" % (q, np.percentile( df.av, q=q )) )

    return df
|
|
|
|
|
def epa_zipfilename( what, year, suffix=".zip", dirs=download_dirs ):
    """ Locate daily_<what>_<year><suffix> somewhere under `dirs`.

    Delegates the search to nu.findfile, which raises IOError if missing.
    """
    name = "daily_%s_%s%s" % (what, year, suffix)
    return nu.findfile( name, dirs=dirs )
|
|
|
|
|
def write_csv( out, df, csvheader="", **kw ):
    """ Write `df` to file `out` as tab-separated csv, preceded by an
    optional csvheader string; extra keywords go straight to df.to_csv.
    """
    nrows, ncols = df.shape
    print( "\nsaving to %s %d rows, %d columns" % (out, nrows, ncols) )
    nu.mkdirpart( out )  # make the output directory if needed
    with open( out, "w" ) as outfile:
        if csvheader:
            outfile.write( csvheader )  # grr not gist.github
        df.to_csv( outfile, sep="\t", float_format="%.3g", na_rep="NaN",
                index=None, **kw )
|
|
|
#............................................................................... |
|
def merge_epa_csvs( year=2015, what="no2 ozone TEMP WIND", mindays=350, maxnan=1,
        nrows=1e8, verbose=1 ):
    """ for no2 ozone ...:
            read_epa_download( findfile )
        -> merged df with columns: city state date no2 ozone temp wind

        year: which daily_*_<year> files to read
        what: whitespace-separated names, with the case used in the
            downloaded filenames (daily_no2_..., daily_TEMP_...)
        mindays: drop cities having fewer than this many days of data
            (temp is exempt -- see below)
        maxnan: after merging, drop rows with more than this many NaNs
        returns: (merged DataFrame, list of per-measure DataFrames)
    """
    print "\n-- merge_epa_csvs mindays %d maxnan %d --" % (mindays, maxnan)
    whats = what.split()
    avdf_list = []

    for what in whats:
        print "\n", 80 * "-"
        csv = epa_zipfilename( what, year )  # IOError if not downloaded
        what = what.lower()  # filename case -> column name, TEMP -> temp
        df = read_epa_download( csv, nrows=nrows, dropneg=(what != "temp") )

        # city, state, day -> daily average across a city's monitors
        avdf = df.groupby( ["city", "state", "date"] ).mean()

        # drop cities with < 350 days
        if what != "temp":  # except Chicago, no temp
            gr = avdf.groupby( ["city", "state"] )
            nbefore = len(gr)
            # x.size == row count here: avdf has the single column "av"
            avdf = gr.filter( lambda x: x.size >= mindays )
            nafter = len( avdf.groupby( ["city", "state"] ))
            print "dropped %d of %d cities with < %d days %s" % (
                    nbefore - nafter, nbefore, mindays, what )

        avdf = avdf.rename( columns={"av" : what} )  # "av" -> no2 / ozone / ...
        avdf_list.append( avdf )
        if verbose:
            pr( "-- %s avdf --" % what, avdf )

    # align the single-column frames on their common (city, state, date) index
    merge = pd.concat( avdf_list, axis=1 )

    # drop rows with > maxnan NaNs --
    nbefore = len(merge)
    merge = merge.dropna( thresh = len(whats) - maxnan )
    nafter = len(merge)
    print "dropped %d of %d rows with > %d NaNs " % (
            nbefore - nafter, nbefore, maxnan )
    merge = merge.reset_index()  # (city, state, date) index back to columns

    return merge, avdf_list
|
|
|
|
|
def counts_avs( df ):
    """ Per-city day counts and year averages.

    df: columns city state date no2 ozone temp wind (as from merge_epa_csvs)
    -> df: n_no2 .. n_wind, av_no2 .. av_wind, city, state
    """
    bycity = df.drop( "date", axis=1 ).groupby( ["city", "state"] )
    ndays = bycity.count()  # non-NaN days per measure
    ndays.columns = ["n_no2", "n_ozone", "n_temp", "n_wind"]
    yearavs = bycity.mean()
    yearavs.columns = ["av_no2", "av_ozone", "av_temp", "av_wind"]
    both = pd.concat( [ndays, yearavs], axis=1 ).reset_index()
    # put city, state last: ndays avs city state
    return both.iloc[ :, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1] ]
|
|
|
|
|
#............................................................................... |
|
if __name__ == "__main__":
    import sys

    # console display formats for the numpy / pandas summaries printed below
    np.set_printoptions( threshold=20, edgeitems=10, linewidth=140,
            formatter = dict( float=nu.numstr ))
    pd.set_option( "display.width", 140, "display.max_rows", 6,
            "display.precision", 3 )

    #...........................................................................
    # defaults, overridable from the command line via the exec loop below --
    year = 2016
    to = 0  # 0: process `year` only, else the range year .. to inclusive
    mindays = 350  # drop cities in no2 ozone temp wind with < __ days
    maxnan = 1  # drop rows with > __ NaN s
    nrows = 1e8  # read_csv row cap, effectively unlimited
    save = "tmp"  # output directory; falsy: don't write files
    verbose = 1

    # to change these params, run this.py a=1 b=None 'c = "str"' in sh or ipython
    # NOTE(review): exec of raw argv -- acceptable for a personal script,
    # never for untrusted input
    for arg in sys.argv[1:]:
        exec( arg )

    print "\n", 80 * "="
    print " ".join(sys.argv)
    if to == 0:
        to = year
    print "year: %d to %d" % (year, to)

    # provenance comment lines prepended to each output file
    from_ = "# %s\n#\n" % nu.From( __file__ )  # pwd, time
    csvheader = \
"""# NO2 ppb, ozone ppm, temp F, wind knots
# (NO2 1 ppb = 1.91 ug/m3, Ozone 1 ppb = 2.0 ug/m3, 1 knot = .51444 m/s)
""" + from_

    #...........................................................................
    # read, merge, write_csv --
    for year in xrange( year, to+1 ):
        merge, avdfs = merge_epa_csvs( year=year,
                mindays=mindays, maxnan=maxnan, nrows=nrows, verbose=verbose )
        ndays_yearavs = counts_avs( merge )  # per-city day counts + year averages
        if verbose:
            pr( "-- merge_epa_csvs --", merge )
            pr( "-- ndays_yearavs --", ndays_yearavs )
        if save:
            # e.g. tmp/2016-epa-no2-ozone-temp-wind.csv, see module docstring
            out = "%s/%d-epa-no2-ozone-temp-wind.csv" % (save, year)
            write_csv( out, merge, csvheader=csvheader )

            out = "%s/%d-epa-ndays-yearavs.tsv" % (save, year)
            write_csv( out, ndays_yearavs, csvheader=from_ )  # sort-ndays
|
|