Skip to content

Instantly share code, notes, and snippets.

@smzn
Created January 6, 2024 08:29
Show Gist options
  • Save smzn/d941aecf83cdfdbd8108aa7a084e84c3 to your computer and use it in GitHub Desktop.
Save smzn/d941aecf83cdfdbd8108aa7a084e84c3 to your computer and use it in GitHub Desktop.
aggregation
# Build one row per station name from the trip table: ride counts split by
# bike type and rider type, total starts/ends, and an average coordinate.
# Every ride is counted once at its start station and once at its end
# station for the type/rider columns.
df = divvy_tripdata

# Row masks reused for both the start-station and end-station groupings.
is_electric = df['rideable_type'] == 'electric_bike'
is_classic = df['rideable_type'] == 'classic_bike'
is_member = df['member_casual'] == 'member'
is_casual = df['member_casual'] == 'casual'

# Electric and classic bike ride counts (actual counts, not rates) at start stations
electric_bike_start_counts = df[is_electric].groupby('start_station_name')['ride_id'].count()
classic_bike_start_counts = df[is_classic].groupby('start_station_name')['ride_id'].count()

# Electric and classic bike ride counts at end stations
electric_bike_end_counts = df[is_electric].groupby('end_station_name')['ride_id'].count()
classic_bike_end_counts = df[is_classic].groupby('end_station_name')['ride_id'].count()

# Combining start and end counts; fill_value=0 keeps stations that appear on
# only one side (a missing count really is zero rides).
electric_bike_counts = electric_bike_start_counts.add(electric_bike_end_counts, fill_value=0)
classic_bike_counts = classic_bike_start_counts.add(classic_bike_end_counts, fill_value=0)

# Member and casual ride counts (actual counts, not rates) at start stations
member_counts = df[is_member].groupby('start_station_name')['ride_id'].count()
casual_counts = df[is_casual].groupby('start_station_name')['ride_id'].count()

# Member and casual ride counts at end stations
member_end_counts = df[is_member].groupby('end_station_name')['ride_id'].count()
casual_end_counts = df[is_casual].groupby('end_station_name')['ride_id'].count()

# Combining start and end counts for members and casual users
member_counts = member_counts.add(member_end_counts, fill_value=0)
casual_counts = casual_counts.add(casual_end_counts, fill_value=0)

# Total start and end counts for each station
start_counts = df.groupby('start_station_name')['ride_id'].count()
end_counts = df.groupby('end_station_name')['ride_id'].count()

# Average latitude and longitude for each start station
average_lat = df.groupby('start_station_name')['start_lat'].mean()
average_lng = df.groupby('start_station_name')['start_lng'].mean()

# Average latitude and longitude for each end station
average_lat_end = df.groupby('end_station_name')['end_lat'].mean()
average_lng_end = df.groupby('end_station_name')['end_lng'].mean()

# Combining start and end station coordinates.
# A missing side must NOT be treated as 0 here: `add(fill_value=0) / 2` would
# halve the coordinate of any station seen only as a start or only as an end.
# Aligning the two series side by side and taking the row-wise mean skips the
# missing side (NaN), so such stations keep their single observed average,
# while stations seen on both sides still get the mean of the two averages.
average_lat = pd.concat([average_lat, average_lat_end], axis=1).mean(axis=1)
average_lng = pd.concat([average_lng, average_lng_end], axis=1).mean(axis=1)

# Combining the recalculated counts into a single DataFrame (indexed by station name)
aggregated_data = pd.DataFrame({
    'Electric Bike Count': electric_bike_counts,
    'Classic Bike Count': classic_bike_counts,
    'Start Count': start_counts,
    'End Count': end_counts,
    'Member Count': member_counts,
    'Casual Count': casual_counts,
    'Average Latitude': average_lat,
    'Average Longitude': average_lng,
})

# Filling NaN values with 0 (a station absent from a count series had no such rides).
# NOTE(review): this also turns a coordinate that is NaN on both sides into 0.0 —
# confirm that is acceptable for downstream use.
aggregated_data = aggregated_data.fillna(0)

# Columns to convert to integer values (the fill_value/NaN handling above
# leaves the count columns as floats).
columns_to_convert = ['Electric Bike Count', 'Classic Bike Count',
                      'Start Count', 'End Count',
                      'Member Count', 'Casual Count']

# Convert the specified columns to integer values
aggregated_data[columns_to_convert] = aggregated_data[columns_to_convert].astype(int)

# Displaying the new aggregated data (notebook cell output)
aggregated_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment