Skip to content

Instantly share code, notes, and snippets.

@statcompute
Created November 24, 2018 06:38
Show Gist options
  • Save statcompute/f25492bb1287e0febfedfb04b299b90e to your computer and use it in GitHub Desktop.
Save statcompute/f25492bb1287e0febfedfb04b299b90e to your computer and use it in GitHub Desktop.
Data wrangling with astropy
from astropy.io import ascii
from astropy.table import Table, join
from numpy import nanmean, nanmedian, array, sort
tbl1 = ascii.read("Downloads/nycflights.csv", format = "csv")
### SUBSETTING
sel_cols = ["origin", "dest", "distance", "dep_delay", "carrier"]
tbl2 = tbl1[sel_cols][range(10)]
tbl2.info
# name dtype
#--------- -----
# origin str3
# dest str3
# distance int64
#dep_delay int64
# carrier str2
### FILTERING ###
tbl2[list(i["carrier"] == "UA" and i["origin"] == 'EWR' for i in tbl2)]
#origin dest distance dep_delay carrier
#------ ---- -------- --------- -------
# EWR IAH 1400 2 UA
# EWR ORD 719 -4 UA
tbl2[(tbl2["carrier"] == "UA") & (tbl2["origin"] == "EWR")]
#origin dest distance dep_delay carrier
#------ ---- -------- --------- -------
# EWR IAH 1400 2 UA
# EWR ORD 719 -4 UA
### FILTER BY GROUPS ###
vstack(map(lambda x: x[(x["carrier"] == "UA") & (x["origin"] == "EWR")],
tbl2.group_by(sort(array(range(len(tbl2))) % 2)).groups))
#origin dest distance dep_delay carrier
#------ ---- -------- --------- -------
# EWR IAH 1400 2 UA
# EWR ORD 719 -4 UA
### GROUPING ###
grp = tbl2.group_by("origin")
### AGGREGATING ###
agg1 = Table(grp['origin', 'distance'].groups.aggregate(nanmedian), names = ["origin", "med_dist"])
#origin med_dist
#------ --------
# EWR 1065.0
# JFK 1089.0
# LGA 747.5
agg2 = Table(grp['origin', 'dep_delay'].groups.aggregate(nanmean), names = ["origin", "avg_delay"])
#origin avg_delay
#------ -------------------
# EWR -2.3333333333333335
# JFK -0.6666666666666666
# LGA -1.75
### JOINING ###
join(agg1, agg2, join_type = "inner", keys = "origin")
#origin med_dist avg_delay
#------ -------- -------------------
# EWR 1065.0 -2.3333333333333335
# JFK 1089.0 -0.6666666666666666
# LGA 747.5 -1.75
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment