Created
November 24, 2018 06:38
-
-
Save statcompute/f25492bb1287e0febfedfb04b299b90e to your computer and use it in GitHub Desktop.
Data wrangling with astropy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from astropy.io import ascii | |
from astropy.table import Table, join | |
from numpy import nanmean, nanmedian, array, sort | |
tbl1 = ascii.read("Downloads/nycflights.csv", format = "csv") | |
### SUBSETTING | |
sel_cols = ["origin", "dest", "distance", "dep_delay", "carrier"] | |
tbl2 = tbl1[sel_cols][range(10)] | |
tbl2.info | |
# name dtype | |
#--------- ----- | |
# origin str3 | |
# dest str3 | |
# distance int64 | |
#dep_delay int64 | |
# carrier str2 | |
### FILTERING ### | |
tbl2[list(i["carrier"] == "UA" and i["origin"] == 'EWR' for i in tbl2)] | |
#origin dest distance dep_delay carrier | |
#------ ---- -------- --------- ------- | |
# EWR IAH 1400 2 UA | |
# EWR ORD 719 -4 UA | |
tbl2[(tbl2["carrier"] == "UA") & (tbl2["origin"] == "EWR")] | |
#origin dest distance dep_delay carrier | |
#------ ---- -------- --------- ------- | |
# EWR IAH 1400 2 UA | |
# EWR ORD 719 -4 UA | |
### FILTER BY GROUPS ### | |
vstack(map(lambda x: x[(x["carrier"] == "UA") & (x["origin"] == "EWR")], | |
tbl2.group_by(sort(array(range(len(tbl2))) % 2)).groups)) | |
#origin dest distance dep_delay carrier | |
#------ ---- -------- --------- ------- | |
# EWR IAH 1400 2 UA | |
# EWR ORD 719 -4 UA | |
### GROUPING ### | |
grp = tbl2.group_by("origin") | |
### AGGREGATING ### | |
agg1 = Table(grp['origin', 'distance'].groups.aggregate(nanmedian), names = ["origin", "med_dist"]) | |
#origin med_dist | |
#------ -------- | |
# EWR 1065.0 | |
# JFK 1089.0 | |
# LGA 747.5 | |
agg2 = Table(grp['origin', 'dep_delay'].groups.aggregate(nanmean), names = ["origin", "avg_delay"]) | |
#origin avg_delay | |
#------ ------------------- | |
# EWR -2.3333333333333335 | |
# JFK -0.6666666666666666 | |
# LGA -1.75 | |
### JOINING ### | |
join(agg1, agg2, join_type = "inner", keys = "origin") | |
#origin med_dist avg_delay | |
#------ -------- ------------------- | |
# EWR 1065.0 -2.3333333333333335 | |
# JFK 1089.0 -0.6666666666666666 | |
# LGA 747.5 -1.75 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment