
EPA air quality: NO2 and ozone in 3 US cities in 2016

The Environmental Protection Agency collects data on NO2, ozone, temperature, wind and more for hundreds of US cities, and publishes it in files such as daily_TEMP_2015.csv on the website http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata .
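For a first look at one of these files without any of the scripts below, a minimal sketch (assuming pandas, and daily_TEMP_2016.zip already downloaded to the current directory; read_csv can open the .zip directly, since each archive holds a single csv):

import pandas as pd

temp = pd.read_csv( "daily_TEMP_2016.zip",
    usecols=["City Name", "State Name", "Date Local", "Arithmetic Mean"],
    parse_dates=["Date Local"] )
print( temp.head() )
print( "%d cities" % temp["City Name"].nunique() )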

Plots

[plots: 2016-epa-no2-3cities and 2016-epa-ozone-3cities, daily NO2 and ozone for the three cities in 2016]

A leading question: how can one set reasonable limits for pollutants that vary so much ?

The files below:

epa_download  -- download daily_TEMP_2016.zip ...
epa_air_smallcsv.py  -- big EPA files -> small csv
2016-epa-ndays-yearavs.tsv  -- city -> nr. days and yearly averages
pr.py  -- for debugging
zutil.py

First look at epa_download -help. This downloads the .zip files from the URL above to the current directory with curl; it also links daily_42602* to daily_no2* and daily_44201* to daily_ozone* (42602 and 44201 are the EPA parameter codes for NO2 and ozone). A rough Python equivalent is sketched below.
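The sketch, in Python 2 to match the rest of the gist (an untested alternative only; epa_download with curl is the tested path):

import os, urllib

url = "http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata"
year = 2016
for code, name in [("42602", "no2"), ("44201", "ozone"), ("TEMP", ""), ("WIND", "")]:
    zipname = "daily_%s_%d.zip" % (code, year)
    if not os.path.isfile( zipname ):
        urllib.urlretrieve( "%s/%s" % (url, zipname), zipname )       # cf. curl -O in epa_download
    if name and not os.path.exists( "daily_%s_%d.zip" % (name, year) ):
        os.link( zipname, "daily_%s_%d.zip" % (name, year) )          # cf. ln in epa_download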

epa_air_smallcsv.py reads the big .zip files and merges them, e.g.
in: daily_no2_2016.zip daily_ozone_2016.zip daily_TEMP_2016.zip daily_WIND_2016.zip
out: 2016-epa-no2-ozone-temp-wind.csv, ~ 80k rows and 7 columns:

city        state       date        no2     ozone   temp    wind
Albuquerque New-Mexico  2016-01-01  10.9    0.0224  30.3    2.59
Albuquerque New-Mexico  2016-01-02  15.4    0.0208  31.3    2.62
...
Yukon       Oklahoma    2016-12-31  5       NaN     43.4    7.3

Here "NaN", not-a-number, marks missing data for a given city and day. I drop cities with < 350 days, and rows with more than one NaN, i.e. 2 or more of NO2 ozone TEMP WIND missing.

(There's more than one way to skin a cat, i.e. to split up the data; this way is adequate for my purpose, which is just to plot the data and see its day-to-day variability. Why so variable ? No idea; comments welcome.)

Missing days

It looks as though many days are missing in the EPA data for no2, ozone, TEMP and WIND, in many cities. It's entirely possible that, as a novice with pandas, I've made mistakes. But simple checks like

egrep -w Chicago daily_TEMP_2016.csv | less

show data for "Chicago-Naperville-Elgin", but not "Chicago".
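The same check in pandas, as a sketch (again assuming daily_TEMP_2016.zip is at hand): list the distinct City Name values that contain "Chicago".

import pandas as pd

temp = pd.read_csv( "daily_TEMP_2016.zip", usecols=["City Name"] )
cities = temp["City Name"].drop_duplicates()
print( cities[ cities.str.contains( "Chicago" ) ] )   # is plain "Chicago" in there at all ?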

The file 2016-epa-ndays-yearavs.tsv below gives the numbers of days that have data, and the yearly averages, for the cities in the output .csv:

n_no2 n_ozone n_temp n_wind av_no2 av_ozone av_temp av_wind city state
366 366 366 366  19.6 0.0306 75.5 2.8  Phoenix Arizona 
...
362 362   0 362  15.2 0.0278 NaN 4.77  Chicago Illinois
...
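To poke at this table yourself, a sketch of reading it back (write_csv writes it tab-separated, with # comment lines at the top):

import pandas as pd

navs = pd.read_csv( "2016-epa-ndays-yearavs.tsv", sep="\t", comment="#" )
print( navs.sort_values( "n_temp" ).head( 10 ) )   # e.g. the cities with the fewest temp days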

Notes

Interpolate ? One could fill in missing data from nearby days and nearby cities, e.g. with scipy KDTree and inverse-distance weighting. But be warned: interpolation can affect later analysis. If you do interpolate, measure how good it is: hold out some cities or days with known values, interpolate there, and compare the interpolated values to the real ones.
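For the record, a sketch of inverse-distance weighting with scipy's cKDTree, assuming you have city coordinates as plain (lon, lat) arrays, lonlat_known for cities with a value on a given day and lonlat_missing for the rest, and the known values in vals_known (treating lon/lat as flat x/y, which is crude but fine for a first try):

import numpy as np
from scipy.spatial import cKDTree

def idw_fill( lonlat_known, vals_known, lonlat_missing, k=5, eps=1e-6 ):
    """ inverse-distance-weighted estimates at the missing locations """
    tree = cKDTree( lonlat_known )
    dist, ix = tree.query( lonlat_missing, k=k )     # k nearest cities with data
    w = 1.0 / (dist + eps)                           # nearer cities weigh more
    w /= w.sum( axis=1, keepdims=True )
    return (w * vals_known[ix]).sum( axis=1 )

To measure how good it is, as above: drop some cities that do have values, idw_fill them, and compare.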

Fwiw, Weibull plots of NO2 and ozone don't look that linear; nor, surprisingly, does wind. Would a mixture model, normal + heavy tail, make sense physically ?
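A sketch of such a Weibull plot, assuming matplotlib and a 1d array or Series x of daily values with NaNs dropped: if x is Weibull-distributed, ln(-ln(1 - F)) against ln(x) is a straight line.

import numpy as np
import matplotlib.pyplot as plt

def weibull_plot( x, label="" ):
    """ empirical ln(-ln(1 - F)) vs ln(x): straight <=> Weibull """
    x = np.sort( np.asarray( x, dtype=float ) )
    x = x[ np.isfinite( x ) & (x > 0) ]              # log needs finite, positive values
    F = (np.arange( 1, len(x) + 1 ) - 0.5) / len(x)  # plotting positions in (0, 1)
    plt.plot( np.log( x ), np.log( -np.log( 1 - F )), ".", label=label )
    plt.xlabel( "ln( daily value )" )
    plt.ylabel( "ln( -ln( 1 - F ))" )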

Comments are welcome, test cases most welcome.

cheers
-- denis

Last change: 29 July 2017

#!/usr/bin/env python2
""" python this.py [year=2015 to=2016]
read_epa_download no2 ozone TEMP WIND .zip or .csv
in download_dirs from epa_download
write e.g. tmp/2016-epa-no2-ozone-temp-wind.csv 79k rows, 7 columns
city state date no2 ozone temp wind
"""
from __future__ import division
import numpy as np
import pandas as pd
from pr import pr
import zutil as nu
__version__ = "2017-07-29 july denis-bz-py t-online de"
# read these columns only, and rename them --
epa_columns_dict = {
    "City Name": "city",
    "State Name": "state",
    "Date Local": "date",
    "Arithmetic Mean": "av",  # hour / 8 hour
    # "1st Max Value": "max1",
    "Parameter Name": "param",  # drop
}
epa_url = "http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata"
# (download with requests ? mttiw)
download_dirs = "epa/download $epa" # <-- where you run epa_download
#...............................................................................
def read_epa_download( csv_or_zip, columns=epa_columns_dict, nrows=1e8, dropneg=False ):
    """ read e.g. daily_TEMP_2015.csv or .zip
        the given columns only, and rename them
        WIND: speed only, drop direction
        city and state names with "-", New-York
    """
    # units: NO2 ppb, ozone ppm, temp F, wind knots = .51444 m/s
    # NO2 1 ppb = 1.91 ug/m3, Ozone 1 ppb = 2.0 ug/m3
    # read_csv e.g. daily_no2_2015.zip with a single csv works
    df = pd.read_csv( csv_or_zip, usecols=columns.keys(),
        parse_dates=["Date Local"], infer_datetime_format=True,
        index_col=False, nrows=int(nrows) )  # nrows must be an int
    df.rename( columns=columns, inplace=True )
    df = df[ df.city != "Not in a city" ]
    if "param" in df.columns:
        df = df[ df.param != "Wind Direction - Resultant" ]  # wind speed only
        df = df.drop( "param", axis=1 )
    if dropneg and "av" in df.columns:
        # drop no2 < 0 in Grapevine Midlothian Tyler TX ?
        df = df[ ~(df.av < 0) ]  # keep NaN, drop negatives
    df.city = df.city.str.replace( r" \(.*", "" )  # Dentsville (Dents) -> Dentsville
    df.city = df.city.str.replace( r"\.", "" )  # St. Louis -> St Louis, escape the regex dot
    df.city = df.city.str.replace( " ", "-" )  # New-York, for awk
    df.state = df.state.str.replace( " ", "-" )
    df = df.sort_values( by=["city", "date"] )
    df.index = np.arange( len(df) )
    nr, nc = df.shape
    print "\nread_epa_csv %s: %d rows, %d columns: %s" % (
        csv_or_zip, nr, nc, " ".join( df.columns ))
    if "av" in df.columns:
        q = np.r_[0, 10, 25, 50, 75, 90, 100]
        print "Daily averages, percentiles %s: %s" % (q, np.percentile( df.av, q=q ))
    return df

def epa_zipfilename( what, year, suffix=".zip", dirs=download_dirs ):
    basename = "daily_%s_%s%s" % (what, year, suffix)
    return nu.findfile( basename, dirs=dirs )  # else IOError

def write_csv( out, df, csvheader="", **kw ):
    """ pandas df.to_csv with csvheader """
    nr, nc = df.shape
    print "\nsaving to %s %d rows, %d columns" % (out, nr, nc)
    nu.mkdirpart( out )
    with open( out, "w" ) as f:
        if csvheader:
            f.write( csvheader )  # grr not gist.github
        df.to_csv( f, sep="\t", float_format="%.3g", na_rep="NaN", index=None, **kw )
#...............................................................................
def merge_epa_csvs( year=2015, what="no2 ozone TEMP WIND", mindays=350, maxnan=1,
        nrows=1e8, verbose=1 ):
    """ for no2 ozone ...:
            read_epa_download( findfile )
        -> merged df with columns: city state date no2 ozone temp wind
    """
    print "\n-- merge_epa_csvs mindays %d maxnan %d --" % (mindays, maxnan)
    whats = what.split()
    avdf_list = []
    for what in whats:
        print "\n", 80 * "-"
        csv = epa_zipfilename( what, year )
        what = what.lower()
        df = read_epa_download( csv, nrows=nrows, dropneg=(what != "temp") )
        # city, state, day -> daily average
        avdf = df.groupby( ["city", "state", "date"] ).mean()
        # drop cities with < 350 days
        if what != "temp":  # except Chicago, no temp
            gr = avdf.groupby( ["city", "state"] )
            nbefore = len(gr)
            avdf = gr.filter( lambda x: x.size >= mindays )
            nafter = len( avdf.groupby( ["city", "state"] ))
            print "dropped %d of %d cities with < %d days %s" % (
                nbefore - nafter, nbefore, mindays, what )
        avdf = avdf.rename( columns={"av" : what} )
        avdf_list.append( avdf )
        if verbose:
            pr( "-- %s avdf --" % what, avdf )
    merge = pd.concat( avdf_list, axis=1 )
    # drop rows with > maxnan NaNs --
    nbefore = len(merge)
    merge = merge.dropna( thresh = len(whats) - maxnan )
    nafter = len(merge)
    print "dropped %d of %d rows with > %d NaNs " % (
        nbefore - nafter, nbefore, maxnan )
    merge = merge.reset_index()
    return merge, avdf_list

def counts_avs( df ):
    """ -> df: ndays, year avs no2 ozone temp wind """
    gr = df.drop( "date", axis=1 ) .groupby([ "city", "state" ])
    counts = gr.count()
    avs = gr.mean()
    counts.columns = "n_no2 n_ozone n_temp n_wind ".split()
    avs.columns = "av_no2 av_ozone av_temp av_wind ".split()
    navs = pd.concat( [counts, avs], axis=1 ) .reset_index() \
        .iloc[ :, [2,3,4,5,6,7,8,9,0,1] ]  # ndays avs city state
    return navs
#...............................................................................
if __name__ == "__main__":
    import sys
    np.set_printoptions( threshold=20, edgeitems=10, linewidth=140,
        formatter = dict( float=nu.numstr ))
    pd.set_option( "display.width", 140, "display.max_rows", 6,
        "display.precision", 3 )

    #...........................................................................
    year = 2016
    to = 0
    mindays = 350  # drop cities in no2 ozone temp wind with < __ days
    maxnan = 1  # drop rows with > __ NaN s
    nrows = 1e8
    save = "tmp"
    verbose = 1
    # to change these params, run this.py a=1 b=None 'c = "str"' in sh or ipython
    for arg in sys.argv[1:]:
        exec( arg )

    print "\n", 80 * "="
    print " ".join(sys.argv)
    if to == 0:
        to = year
    print "year: %d to %d" % (year, to)
    from_ = "# %s\n#\n" % nu.From( __file__ )  # pwd, time
    csvheader = \
"""# NO2 ppb, ozone ppm, temp F, wind knots
# (NO2 1 ppb = 1.91 ug/m3, Ozone 1 ppb = 2.0 ug/m3, 1 knot = .51444 m/s)
""" + from_

    #...........................................................................
    # read, merge, write_csv --
    for year in xrange( year, to+1 ):
        merge, avdfs = merge_epa_csvs( year=year,
            mindays=mindays, maxnan=maxnan, nrows=nrows, verbose=verbose )
        ndays_yearavs = counts_avs( merge )
        if verbose:
            pr( "-- merge_epa_csvs --", merge )
            pr( "-- ndays_yearavs --", ndays_yearavs )
        if save:
            out = "%s/%d-epa-no2-ozone-temp-wind.csv" % (save, year)
            write_csv( out, merge, csvheader=csvheader )
            out = "%s/%d-epa-ndays-yearavs.tsv" % (save, year)
            write_csv( out, ndays_yearavs, csvheader=from_ )  # sort-ndays
n_no2 n_ozone n_temp n_wind av_no2 av_ozone av_temp av_wind city state
366 366 366 366 19.6 0.0306 75.5 2.8 Phoenix Arizona
366 366 366 366 17.2 0.0369 69.9 4.16 Las-Vegas Nevada
366 366 366 366 16.6 0.0265 56.7 4.31 New-York New-York
366 366 366 366 13.5 0.0271 59.4 5.39 Washington District-Of-Columbia
366 366 366 366 13.4 0.0279 57.9 1.65 Philadelphia Pennsylvania
366 366 366 366 13 0.0312 64.8 3.83 San-Diego California
366 366 366 366 12.8 0.0337 54.8 2.36 Reno Nevada
366 366 366 366 12.3 0.0337 65.9 2.09 Fresno California
366 366 366 366 11.9 0.0276 50.6 5.22 Milwaukee Wisconsin
366 366 366 366 11.8 0.0331 67.9 2.78 Bakersfield California
366 366 366 366 11.8 0.0313 68.1 5.34 El-Paso Texas
366 366 366 366 11.6 0.0288 59.6 3.59 Louisville Kentucky
366 366 366 366 11.5 0.0336 63.7 4.89 Oklahoma-City Oklahoma
366 366 366 366 11.2 0.0269 60.4 4.07 St-Louis Missouri
366 366 366 366 10.4 0.0246 72.4 4.12 Houston Texas
366 366 366 366 10.3 0.0385 59.2 4.09 Albuquerque New-Mexico
366 366 366 366 10.2 0.0256 71.7 5.39 Baytown Texas
366 366 366 366 9.88 0.0268 63.9 3.43 Sacramento California
366 366 366 366 9.1 0.0292 64.3 3.04 Chula-Vista California
366 366 366 366 7.85 0.0316 71.2 4.71 Tucson Arizona
366 366 366 366 7.6 0.0289 68.9 4.75 Dallas Texas
366 366 366 366 6.25 0.0273 67.9 6.52 Arlington Texas
366 366 366 366 6.15 0.0276 70.7 4.8 San-Antonio Texas
366 366 366 366 4.23 0.0256 70.2 3.67 Tomball Texas
366 366 366 366 3.87 0.0261 58.2 1.9 Atascadero California
366 366 366 366 3.7 0.029 67.4 4.06 Longview Texas
366 366 366 366 3.28 0.0296 69.9 4.53 Conroe Texas
366 366 366 366 3.15 0.0287 68.1 6.39 Waco Texas
366 366 366 366 1.97 0.0309 58.7 3.36 Lompoc California
366 366 366 366 1.34 0.0352 61.6 4.56 Capitan California
366 366 366 0 16.6 0.0325 67.6 NaN Azusa California
366 366 366 0 16.6 0.0352 66.5 NaN Upland California
366 366 366 0 16.1 0.0246 55.9 NaN Bayonne New-Jersey
366 366 366 0 15.5 0.0297 66.1 NaN Los-Angeles California
366 366 366 0 9.89 0.0296 54 NaN Bloomfield New-Mexico
366 366 366 0 5.45 0.0433 66.3 NaN Banning California
366 366 366 0 3.81 0.0348 52.2 NaN Milton Massachusetts
366 366 365 366 7.91 0.0289 63.9 3.81 North-Little-Rock Arkansas
366 366 364 365 15.2 0.0303 58 3.25 Cincinnati Ohio
366 366 361 366 6.14 0.0263 58.8 3.71 Santa-Maria California
366 366 291 366 5.98 0.0278 67.1 2.37 Chico California
366 366 0 366 6.91 0.0278 NaN 3.59 Buckeye Arizona
366 365 366 366 9.54 0.0302 62.8 2.34 Charlotte North-Carolina
366 365 366 366 8.86 0.031 70.2 4.59 Austin Texas
366 365 366 366 6.62 0.0314 53.6 3.45 East-Providence Rhode-Island
366 365 366 366 4.35 0.0243 70.3 3.58 West-Orange Texas
366 365 366 366 3.91 0.0439 64.5 2.4 Alpine California
366 365 366 366 1.58 0.0321 67.4 5.65 Tyler Texas
366 365 366 366 0.769 0.0332 61 5.88 Los-Padres-National-Forest California
366 364 366 366 3.93 0.0276 71.2 5.01 Manvel Texas
366 364 366 366 2.73 0.0355 72.5 8.66 Galveston Texas
366 363 366 366 14.4 0.025 54.4 3.37 Boston Massachusetts
366 363 366 366 8.18 0.0338 64.5 4.09 Simi-Valley California
366 362 366 366 4.38 0.028 72.3 5.18 Seabrook Texas
366 360 366 366 1.19 0.0263 59.6 2.24 Carpinteria California
366 359 366 365 10.2 0.0297 63.3 4.88 Memphis Tennessee
366 357 366 366 26.1 0.0289 55.4 3.35 Denver Colorado
366 0 366 366 16.2 NaN 54.1 3.31 Seattle Washington
366 0 366 366 14.6 NaN 53.7 5.23 Providence Rhode-Island
366 0 366 366 13 NaN 53.2 4.94 Detroit Michigan
366 0 366 366 12.3 NaN 66.4 1.69 Birmingham Alabama
366 0 366 366 12 NaN 59.1 3.23 Kansas-City Missouri
366 0 366 366 11.9 NaN 60.1 4.36 Bridgeton Missouri
366 0 366 366 5.03 NaN 76.8 3.68 El-Centro California
366 0 366 366 4.88 NaN 60 1.24 Ashland Kentucky
365 365 365 0 21.2 0.0232 64.3 NaN Long-Beach California
365 365 365 0 15.5 0.026 65.1 NaN Compton California
365 365 365 0 8.02 0.0425 65.3 NaN Lancaster California
365 365 0 365 6.01 0.0259 NaN 3.54 Blaine Minnesota
365 364 366 366 4.84 0.0334 49.6 7.82 Brookhurst Wyoming
365 363 365 365 13.6 0.0235 61.3 5.9 San-Jose California
365 0 365 365 1.92 NaN 48.6 4.05 Grantsville Maryland
364 366 366 366 8.25 0.0292 67.7 6.25 Fort-Worth Texas
364 366 360 366 11.2 0.0284 54.1 5.08 Cleveland Ohio
364 365 366 366 4.76 0.035 64.1 6.23 Tracy California
364 362 365 365 1.71 0.027 71.3 4.22 Lake-Jackson Texas
364 360 365 365 3.47 0.0282 70.8 4.71 Nederland Texas
363 366 366 366 7.86 0.0349 64.8 4.24 Otay-Mesa California
363 366 366 366 3.76 0.0284 66.2 4.04 Greenville Texas
363 366 366 366 2.37 0.0281 62.2 2.39 Goleta California
363 363 363 0 16.1 0.0304 71.4 NaN North-Las-Vegas Nevada
363 363 0 363 21.5 0.028 NaN 2.21 Tempe Arizona
363 363 0 363 11.5 0.0324 NaN 3.53 Calexico California
363 361 365 365 1.18 0.0319 62 3.3 Sanford North-Carolina
363 361 364 364 1.32 0.0383 48.5 5.29 Rangely Colorado
363 360 366 366 5.67 0.0305 67.9 5.18 Grapevine Texas
362 366 366 366 5.97 0.0277 53.4 3.39 Davenport Iowa
362 364 358 366 2.75 0.0332 66.7 3.16 Folsom California
362 362 362 0 13.6 0.033 67 NaN Mira-Loma California
362 362 362 0 12.1 0.0291 57.8 NaN Camden New-Jersey
362 362 362 0 9.56 0.034 65.7 NaN Shafter California
362 362 0 362 15.2 0.0278 NaN 4.77 Chicago Illinois
362 0 362 362 8.64 NaN 52.8 4.1 South-Bend Indiana
361 366 366 366 7.67 0.028 61.9 2.71 Santa-Barbara California
361 361 366 366 2.05 0.0305 68.1 7.08 Corsicana Texas
361 361 361 0 14.2 0.037 60.7 NaN Barstow California
361 361 361 0 10.1 0.0354 67.4 NaN Santa-Clarita California
361 0 361 361 17.4 NaN 56.6 2.44 North-Laurel Maryland
361 0 361 361 4.87 NaN 61.6 6.71 Yukon Oklahoma
360 363 366 366 4.5 0.03 66.5 6.44 Denton Texas
360 363 365 365 8.7 0.0344 65.6 3.42 Hanford California
360 360 360 0 15.4 0.0246 55.9 NaN Newark New-Jersey
360 360 360 0 5.21 0.032 52.3 NaN Lynn Massachusetts
360 360 360 0 4.09 0.0396 67.9 NaN Searles-Valley California
360 0 360 360 8.93 NaN 54.1 2.34 East-Hartford Connecticut
359 359 359 0 12.3 0.034 49.5 NaN Worcester Massachusetts
358 365 366 366 9.27 0.0249 71.7 4.76 Channelview Texas
358 364 366 366 2.42 0.0315 59 4.89 Gaviota California
358 358 358 0 20.1 0.0234 66.2 NaN Pico-Rivera California
358 358 358 0 11.2 0.0207 51.9 NaN Knowlton New-Jersey
358 358 358 0 6.09 0.0257 72.1 NaN Deer-Park Texas
358 358 0 358 17.1 0.0218 NaN 6.69 Schiller-Park Illinois
358 354 360 360 16 0.0276 53.7 3.72 Welby Colorado
357 357 357 0 14.7 0.0334 67.1 NaN Rubidoux California
356 358 358 358 6.35 0.0369 63.5 4.28 Camp-Pendleton-South California
356 356 356 0 18.2 0.0302 67.8 NaN Fontana California
356 356 356 0 10 0.0394 64.1 NaN Victorville California
356 356 356 0 2.72 0.0322 50.5 NaN Newburyport Massachusetts
356 0 356 356 6.88 NaN 54.4 3.89 Des-Moines Iowa
355 366 366 366 12.5 0.0259 56 3.84 Indianapolis Indiana
355 366 366 366 10.6 0.0261 70.3 3.3 Baton-Rouge Louisiana
355 355 355 0 16.6 0.0352 67.1 NaN San-Bernardino California
355 355 355 0 2.79 0.0349 49.9 NaN Ware Massachusetts
355 355 354 355 12.1 0.0233 64.1 3 Stockton California
355 349 355 355 5.67 0.0312 62.2 2.66 Raleigh North-Carolina
355 0 355 355 16.7 NaN 53.2 4.94 Tacoma Washington
354 366 366 366 5.51 0.0324 63.9 3.72 Madera California
354 366 366 366 2.06 0.0303 56.8 4.5 Nipomo California
354 365 366 366 2.47 0.03 67.5 4.62 Kaufman Texas
354 354 354 0 19 0.0276 66.7 NaN Anaheim California
354 354 354 0 6.39 0.0302 51.6 NaN Chicopee Massachusetts
354 0 354 354 9.47 NaN 52.8 5.55 Livonia Michigan
353 365 366 359 9.74 0.029 53.7 2.75 Rochester New-York
353 353 355 355 2.28 0.0287 67.3 6.17 Italy Texas
352 365 365 365 10.5 0.0309 57.4 3.18 Essex Maryland
352 360 361 362 4.98 0.0259 70.9 4.94 Beaumont Texas
350 366 366 366 6.85 0.0297 63.3 3.4 Tulsa Oklahoma
350 365 366 366 13.6 0.0278 54.2 5.11 New-Haven Connecticut
350 350 350 0 10.2 0.0344 64.7 NaN El-Cajon California
349 349 349 0 4.44 0.0365 49.6 NaN Casper Wyoming
348 0 348 348 15.8 NaN 57.3 3.61 Hartford Connecticut
311 311 311 0 4.87 0.0291 66.1 NaN Midlothian Texas
288 288 288 0 7 0.0402 72.2 NaN Palm-Springs California
272 272 272 0 11.4 0.0369 66.4 NaN Glendora California
269 269 269 0 3.86 0.0377 44.9 NaN Roosevelt Utah
255 0 255 255 8.28 NaN 60.2 5.49 Gary Indiana
194 194 194 0 4.19 0.0284 52.3 NaN Sioux-Falls South-Dakota
91 91 91 0 13.5 0.0147 46.8 NaN Tualatin Oregon
91 91 91 0 9.55 0.0204 46.3 NaN Portland Oregon
72 72 72 0 20.3 0.0153 62.7 NaN La-Habra California
0 366 366 366 NaN 0.0279 53.9 4.44 Allen-Park Michigan
0 366 366 366 NaN 0.0278 74.1 7.76 Corpus-Christi Texas
0 366 366 366 NaN 0.0303 59.8 3.39 El-Paso-de-Robles California
0 366 366 366 NaN 0.0334 51.8 2.3 Fort-Collins Colorado
0 366 366 366 NaN 0.0324 67.6 5.73 Frisco Texas
0 366 366 366 NaN 0.0444 53.1 5.11 Grand-Canyon-Village Arizona
0 366 366 366 NaN 0.0282 58.8 1.87 Jackson California
0 366 366 366 NaN 0.046 71.2 6.29 Joshua-Tree-National-Monument California
0 366 366 366 NaN 0.0274 62.3 7.5 Kansas-City Kansas
0 366 366 366 NaN 0.0258 48.8 2.67 Keene New-Hampshire
0 366 366 366 NaN 0.0335 71.5 4.48 Marana Arizona
0 366 366 366 NaN 0.0327 64 6.36 Piru California
0 366 366 366 NaN 0.0275 71 5.25 Port-Arthur Texas
0 366 366 366 NaN 0.0288 52 4.17 Portsmouth New-Hampshire
0 366 366 366 NaN 0.0333 59.4 2.51 San-Andreas California
0 366 366 366 NaN 0.0287 58.6 4.96 San-Luis-Obispo California
0 366 366 366 NaN 0.0304 68.4 5.4 Temple Texas
0 366 366 366 NaN 0.0417 63.3 7.29 Virgin Utah
0 366 366 366 NaN 0.034 66.1 6.67 Weatherford Texas
0 365 365 365 NaN 0.0347 65.3 3.54 Clovis California
0 365 365 365 NaN 0.0247 76.3 6.76 Harlingen Texas
0 365 365 365 NaN 0.031 49.6 5.95 Horicon Wisconsin
0 365 365 365 NaN 0.0214 75.8 5.52 Laredo Texas
0 365 365 365 NaN 0.0328 63.4 3.94 Lawton Oklahoma
0 365 365 365 NaN 0.03 68.3 4.48 Leander Texas
0 365 365 365 NaN 0.0236 76.2 5.58 Mission Texas
0 365 365 365 NaN 0.0277 55.2 5.27 Omaha Nebraska
0 365 365 365 NaN 0.0383 54.2 6.46 Palisade Colorado
0 365 365 365 NaN 0.0379 53.8 5.19 Southglenn Colorado
0 364 364 364 NaN 0.0306 67.7 5.1 Granbury Texas
0 364 364 364 NaN 0.0371 68.7 4.94 Green-Valley Arizona
0 364 364 364 NaN 0.0443 67.2 5.08 Maricopa California
0 363 363 363 NaN 0.0318 63.3 2.86 Colusa California
0 363 363 363 NaN 0.0317 62.6 5.91 Goldsby Oklahoma
0 363 363 363 NaN 0.0297 51.5 4.79 Grand-Rapids Michigan
0 363 363 363 NaN 0.0296 61.8 2.43 Ojai California
0 363 363 363 NaN 0.0326 63 3.98 Thousand-Oaks California
0 363 363 363 NaN 0.028 57.6 4.72 Wilmington Delaware
0 362 362 362 NaN 0.0349 66.3 7.6 Cleburne Texas
0 362 362 362 NaN 0.0344 48.2 5.59 Dunkirk New-York
0 362 362 362 NaN 0.0399 66 6.74 Jean Nevada
0 362 362 362 NaN 0.0362 47 7.81 Peterborough New-Hampshire
0 362 362 362 NaN 0.0434 73.7 6.38 Queen-Valley Arizona
0 362 362 362 NaN 0.0309 67.4 4.62 Rockwall Texas
0 361 361 361 NaN 0.0365 64.7 2.81 Arvin California
0 361 361 361 NaN 0.0276 56.1 2.07 Beltsville Maryland
0 361 361 361 NaN 0.0321 51.5 4.45 Greeley Colorado
0 360 360 360 NaN 0.0309 71.7 3.9 Sunrise-Manor Nevada
0 360 360 360 NaN 0.0304 65.3 2.25 Visalia California
0 360 360 360 NaN 0.0257 46.9 4.44 Williston North-Dakota
0 359 359 359 NaN 0.0326 61.4 4.81 Cherry-Tree Oklahoma
0 359 359 359 NaN 0.0346 68.8 3.8 Spring-Valley Nevada
0 358 358 358 NaN 0.0271 51.9 3.53 Concord New-Hampshire
0 358 358 358 NaN 0.0281 54.3 4.03 Pittsburgh Pennsylvania
0 357 357 357 NaN 0.0295 66.9 7.47 Eagle-Mountain Texas
0 355 355 355 NaN 0.0373 54 5 Lakewood Colorado
0 355 355 355 NaN 0.0321 51.8 5.11 Lansing Michigan
0 355 355 355 NaN 0.0252 46.6 3.93 Lebanon New-Hampshire
0 354 354 354 NaN 0.0412 44.1 4.05 Aspen-Park Colorado
0 354 354 354 NaN 0.0249 63.1 2.58 Yuba-City California
0 353 353 353 NaN 0.0318 68 6.36 Killeen Texas
0 353 353 353 NaN 0.0327 51.2 5.53 Torrington Wyoming
0 352 352 352 NaN 0.0294 51.2 5.33 East-Syracuse New-York
0 352 352 352 NaN 0.0342 63.8 3.44 Parlier California
0 351 351 351 NaN 0.0338 63.7 3.51 Choctaw Oklahoma
0 349 349 349 NaN 0.0314 65.6 6.03 Pilot-Point Texas
0 348 348 348 NaN 0.0278 72.4 5.42 Victoria Texas
0 341 341 341 NaN 0.0364 68.3 2.74 Oildale California
0 329 329 329 NaN 0.0275 70.3 3.9 Jackson Mississippi
0 306 306 306 NaN 0.047 63.1 3.93 Placerville California
0 255 255 255 NaN 0.0421 63.4 1.4 Sonora California
0 209 209 209 NaN 0.031 67.4 3.01 Modesto California
0 206 206 206 NaN 0.0331 67.2 3.8 Roseville California
#!/bin/bash
url=http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
case $1 in "" | -h* | --h* )
    cat <<!
epa_download 2015 2016:
    curl daily_no2_2015.zip ... daily_WIND_2016.zip
        from $url
    link daily_42602_2015.zip to daily_no2_2015.zip, and 44201 to ozone
Notes:
    Many days are missing, see EPA-air-quality.md ff.
    pandas.read_csv( daily_no2_2015.zip with a single csv ) works.
!
    exit 1
esac
year=$1
yearto=${2-$year}
for year in `seq $year $yearto`
do
    for zip in daily_{42602,44201,TEMP,WIND}_$year.zip
    do
        if [[ ! -f $zip ]]; then
            curl --remote-time --show-error -O $url/$zip
        fi
    done
    ln daily_{44201,ozone}_$year.zip 2> /dev/null  # not mv, test -f
    ln daily_{42602,no2}_$year.zip 2> /dev/null
done
#!/usr/bin/python
""" pr( ... x= y= ... ), probj() for testing """
# must be thousands such
# use with https://docs.python.org/2.7/library/logging.html ?
from __future__ import division
import numpy as np
__version__ = "2016-03-06 mar denis"
#...............................................................................
def pr( *args, **kwargs ):
    """ pr( "stage 42:", ntop=ntop, df=df ... )
        probj each arg / kwarg, summary line / ndarray
        apply liberally
    """
    print ""
    for arg in args:
        probj( arg )
    for key, val in sorted( kwargs.items() ):
        # sorted, else random not caller's order
        probj( val, key )
    # print ""
#...............................................................................
def probj( x, nm="" ):
    """ print any ? obj:
        string scalar list tuple dict: 1 summary line "nm: list len 42"
        pandas Series DataFrame with pd.set_option
        numpy ndarray with user's np.set_printoptions
    """
    # sure to be buggy
    if nm:
        print "-- %s:" % nm ,
    if x is None or _isstr( x ):
        print x
        return
    if np.isscalar( x ) \
            or (hasattr( x, "ndim" ) and x.ndim == 0):  # np.array( 3 )
        print "%.6g" % x
        return
    if hasattr( x, "name" ) and x.name is not None:  # pandas
        print x.name ,
    # type / classname, shape / len --
    t = getattr( x, "__class__", type(x) )
    print t.__name__ ,
    if hasattr( x, "shape" ):
        print x.shape
        n = x.shape[0]
    else:
        try:
            n = len(x)
            print "len %d" % n
        except (TypeError, AttributeError):  # len() of unsized object ?
            n = np.NaN
            print ""
    if isinstance( x, (tuple, list) ):
        return
    if isinstance( x, dict ):
        print " keys:", sorted( x.keys() )[:10]  # sort first, then take 10
        return
    # pandas DataFrame, Series etc.
    # (to print as ndarray, pr( df.values ))
    if hasattr( x, "head" ):
        print x  # with user's pd.set_option( max_rows max_cols ... )
        return
    if hasattr( x, "values" ):
        x = x.values
    if hasattr( x, "dtype" ):  # np array kind O ?
        print x  # with user's np.set_printoptions
    print ""

def _isstr( x ):
    """ basestring or np.array( "str" ) """
    return isinstance( x, basestring ) \
        or np.issubdtype( getattr( x, "dtype", "i4"), np.string_ )  # ?
#...............................................................................
if __name__ == "__main__":
    from collections import namedtuple
    np.set_printoptions( threshold=100, edgeitems=10, linewidth=140,
        formatter = dict( float = lambda x: "%.2g" % x ))  # float arrays %.2g
    # pd.set_option( "display.width", 140, "display.precision", 2 )

    class C:
        pass
    c = C()
    Namedtuple = namedtuple( "Namedtuple", "x y" )

    pr(
        adict = { 1:2, 3:4 },
        alist = [1, 2],
        array0d = np.array( 3 ),
        none = None,
        arraynone = [None],
        arraystr = np.array("string"),
        array2 = np.array([ "string", 3 ]),
        eye = np.eye( 3 ) * np.pi,
        pi = np.pi,
        arraypi = np.array( np.pi ),
        s = "string",
        C=C,
        c=c,
        Namedtuple = Namedtuple,
        anamedtuple = Namedtuple( 1, 2 ),
    )
# also ~/py/etc/pandas/etc/z-groupby.py
#!/usr/bin/env python2
""" zutil.py: Bag asum findfile ints numstr vmap """
from __future__ import division
import numpy as np
class Bag( dict ):
    """ a dict with d.key short for d["key"], aka Dotdict
        d = Bag( k=v ... / **dict / dict.items() / [(k,v) ...] ) just like dict
    """
    # np.savez( **bag )
    # bag = Bag( np.load( .npz ).items() )
    def __init__(self, *args, **kwargs):
        dict.__init__( self, *args, **kwargs )
        self.__dict__ = self
    def __getnewargs__(self):  # for cPickle.dump( d, file, protocol=-1)
        return tuple(self)
Dotdict = Bag
#...............................................................................
def asum( X ):
    """ array summary: "shape dtype min av max [density]" """
    if not hasattr( X, "dtype" ) or X.dtype.kind not in "fiu":  # float int uint
        return str(X)
    if hasattr( X, "todense" ):  # issparse
        sparsetype = type(X).__name__ + " "  # csr_matrix etc.
        density = " of the %.3g %% non-0" % (
            100. * X.nnz / np.prod( X.shape ))
    else:
        sparsetype = density = ""
    dtype = str(X.dtype) .replace( "float64", "" )
    return "%s %s%s min av max %.3g %.3g %.3g %s" % (
        X.shape, sparsetype, dtype, X.min(), X.mean(), X.max(), density )

def bincount( y ):
    return np.bincount( (y - y.min()) .astype(int) )

def copy( x ):
    return x if x is None or np.isscalar(x) \
        else np.copy( x )

def findfile( filename, dirs="data ../data $SCIKIT_LEARN_DATA $webdata " ):
    """ -> first dir/file found in a list of dirs
        IOError if none
    """
    from os.path import expanduser, expandvars, isfile, join
    if isinstance( dirs, basestring ):
        dirs = dirs.split()
    for dir in [""] + dirs:  # "": /abspath ~/name name
        dirfile = expanduser( expandvars( join( dir, filename )))
        if isfile( dirfile ):
            return dirfile
    raise IOError( "file \"%s\" not found in folders %s" % (filename, dirs) )

def ints( X ):
    return np.round(X).astype(int)  # NaN Inf -> - maxint

def maxnorm( X, axis=None ):
    return np.linalg.norm( X, ord=np.inf, axis=axis )

def minavmax( X ):
    return "%.3g %.3g %.3g" % (
        X.min(), X.mean(), X.max() )

def mkdirpart( filename ):
    """ tmp/file: mkdir tmp """
    import os
    dir = os.path.dirname( filename )
    if dir and not os.path.isdir( dir ):
        os.makedirs( dir )

def numstr( x, fmt="%.2g" ):
    """ %.2g but 123 not 1.2e+02 """
    # np.set_printoptions( ... formatter = dict( float=ut.numstr ))
    if isinstance( x, basestring ) or x is None:
        return x
    if np.isscalar(x):
        if abs(x) < 1e-10:  return "0"
        if abs(x) < 1e-6:  return "%.1g" % x
        if 100 <= abs(x) < 1e5:  return "%.0f" % x
        return fmt % x
    x = np.asanyarray(x)
    assert x.ndim == 1, x.ndim  # NotImplemented
    return " ".join([ numstr( _, fmt ) for _ in x ])

def numstr3( x ):
    return numstr( x, fmt="%.3g" )
_pvec = 3
_pvecscale = 0 # > 0: ints( scale * x )

def pvec( x ):
    """ -> str( short vecs ), smax( long ) """
    if x is None or isinstance( x, basestring ):
        return x
    if np.isscalar(x):
        return "%.3g" % x
    if len(x) <= _pvec:
        if _pvecscale > 0:
            x = ints( _pvecscale * np.asarray(x) )
        return str( x )  # np.printoptions
    else:
        return "%.3g" % smax( x )

def smax( x ):
    """ signed max, to see oscillation """
    return x if np.isscalar(x) or x.ndim == 0 \
        else x[ np.fabs(x) .argmax() ]

def vmap( f, X ):
    return np.vstack( map( f, X )) .squeeze()
#...............................................................................
def From( pyfile ):
    """ -> "callersfile pwd time" for tagging output files """
    import os
    pyfile = pyfile.split("/") [-1]
    pwd = os.getenv("PWD") .replace( "/Users/", "~", 1 )  # mac -L not -P
    return "from %s run in %s %s" % (pyfile, pwd, isoday())

def isoday():
    import time
    return time.strftime( "%Y-%m-%d %b %H:%M" )  # 2011-11-15 Nov 12:06, %b is portable