Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
def make_dataset(carrier_list, range_start = -60, range_end = 120, bin_width = 5):
# Check to make sure the start is less than the end!
assert range_start < range_end, "Start must be less than end!"
by_carrier = pd.DataFrame(columns=['proportion', 'left', 'right',
'f_proportion', 'f_interval',
'name', 'color'])
range_extent = range_end - range_start
# Iterate through all the carriers
for i, carrier_name in enumerate(carrier_list):
# Subset to the carrier
subset = flights[flights['name'] == carrier_name]
# Create a histogram with specified bins and range
arr_hist, edges = np.histogram(subset['arr_delay'],
bins = int(range_extent / bin_width),
range = [range_start, range_end])
# Divide the counts by the total to get a proportion and create df
arr_df = pd.DataFrame({'proportion': arr_hist / np.sum(arr_hist),
'left': edges[:-1], 'right': edges[1:] })
# Format the proportion
arr_df['f_proportion'] = ['%0.5f' % proportion for proportion in arr_df['proportion']]
# Format the interval
arr_df['f_interval'] = ['%d to %d minutes' % (left, right) for left,
right in zip(arr_df['left'], arr_df['right'])]
# Assign the carrier for labels
arr_df['name'] = carrier_name
# Color each carrier differently
arr_df['color'] = Category20_16[i]
# Add to the overall dataframe
by_carrier = by_carrier.append(arr_df)
# Overall dataframe
by_carrier = by_carrier.sort_values(['name', 'left'])
# Convert dataframe to column data source
return ColumnDataSource(by_carrier)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment