smzn/gist:1682054dbb07d007ad1d38d2e7abd261

## gistfile1.txt
import pandas as pd

# Per-station-pair travel-time summary, restricted to the stations that
# appear in the transition probability matrix.
#
# NOTE: `df` is an alias of `divvy_tripdata`, not a copy, so every column
# written below is also visible on `divvy_tripdata`.
df = divvy_tripdata

# Parse both trip timestamps into pandas datetimes (start first, then end).
for _timestamp_column in ('started_at', 'ended_at'):
    df[_timestamp_column] = pd.to_datetime(df[_timestamp_column])

# Trip duration as a timedelta, plus the same duration expressed in minutes.
df['travel_time'] = df['ended_at'] - df['started_at']
df['travel_time_min'] = df['travel_time'].dt.total_seconds() / 60

# One row per (start station, end station) pair: summed minutes, number of
# trips (non-null durations), and mean minutes.
station_stats = (
    df.groupby(['start_station_name', 'end_station_name'])
    .agg(
        total_travel_time_min=pd.NamedAgg(column='travel_time_min', aggfunc='sum'),
        number_of_trips=pd.NamedAgg(column='travel_time_min', aggfunc='count'),
        mean_travel_time_min=pd.NamedAgg(column='travel_time_min', aggfunc='mean'),
    )
    .reset_index()
)

# Station names are taken from the matrix's column labels, skipping the first
# label. NOTE(review): the original comment says the first column is "also a
# station name" and should be ignored -- confirm against the matrix layout.
used_stations = set(transition_probability_matrix.columns[1:])

# Keep only the pairs whose start AND end stations are both in used_stations.
filtered_station_stats = station_stats.loc[
    station_stats['start_station_name'].isin(used_stations)
    & station_stats['end_station_name'].isin(used_stations)
]

# Bare expression: displays the result when run as the last line of a
# notebook cell.
filtered_station_stats
	import pandas as pd

	# Summarise travel time for every (start station, end station) pair and
	# keep only the pairs whose stations appear in the transition matrix.
	#
	# NOTE: `df` refers to the same object as `divvy_tripdata`; the column
	# assignments below therefore modify that frame as well.
	df = divvy_tripdata

	# Timestamps -> datetime
	df['started_at'] = pd.to_datetime(df['started_at'])
	df['ended_at'] = pd.to_datetime(df['ended_at'])

	# Duration of each trip (timedelta), then the same value in minutes
	df['travel_time'] = df['ended_at'].sub(df['started_at'])
	df['travel_time_min'] = df['travel_time'].dt.total_seconds().div(60)

	# Aggregate per station pair: total minutes, trip count, mean minutes
	station_stats = df.groupby(['start_station_name', 'end_station_name']).agg(
		**{
			'total_travel_time_min': ('travel_time_min', 'sum'),
			'number_of_trips': ('travel_time_min', 'count'),
			'mean_travel_time_min': ('travel_time_min', 'mean'),
		}
	).reset_index()

	# Station names come from the matrix column labels after the first one.
	# NOTE(review): the original comment says the first column is "also a
	# station name" and is skipped on purpose -- verify against the matrix.
	used_stations = set(transition_probability_matrix.columns[1:])

	# Rows survive only when both endpoints are members of used_stations
	filtered_station_stats = station_stats.loc[
		lambda stats: stats['start_station_name'].isin(used_stations)
		& stats['end_station_name'].isin(used_stations)
	]

	# Bare expression so a notebook cell renders the filtered table
	filtered_station_stats