This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lance | |
import pyarrow as pa | |
from tqdm.auto import tqdm | |
import datasets | |
from transformers import AutoTokenizer | |
# Tokenizer for the code files: the one published with the
# "EleutherAI/gpt-neox-20b" checkpoint.
tokenizer_checkpoint = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop the two now-redundant date columns in one call. A single drop means a
# missing column raises KeyError before anything is removed, instead of
# leaving the frame with only one of the two columns dropped.
new_df_test.drop(columns=['start_date', 'end_date'], inplace=True)

# View the final dataset
new_df_test.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the weekday of every trip start and end. Each timestamp string is
# parsed with the DATETIME format; weekday() yields 0 for Monday through 6
# for Sunday.
week_start = [
    datetime.datetime.strptime(stamp, DATETIME).weekday()
    for stamp in start_time
]
week_end = [
    datetime.datetime.strptime(stamp, DATETIME).weekday()
    for stamp in end_time
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Work out how long each trip lasted: parse the end and start timestamps with
# the DATETIME format and subtract them (the result is a timedelta).
DATETIME = '%Y-%m-%d %H:%M:%S'
start_time = new_df_test['start_date'].values
end_time = new_df_test['end_date'].values
duration = []
for begin, finish in zip(start_time, end_time):
    finished_at = datetime.datetime.strptime(finish, DATETIME)
    began_at = datetime.datetime.strptime(begin, DATETIME)
    # NOTE(review): nothing visible here stores `difference` in `duration`;
    # presumably it is converted to seconds and appended in code that is not
    # shown -- confirm.
    difference = finished_at - began_at
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run replace_small_categorical_data over each categorical column in turn,
# pairing every column with the category names collected for it.
for column, names in (
    ('train_type', train_type_names),
    ('train_class', train_class_names),
    ('fare', fare_names),
):
    new_df_test = replace_small_categorical_data(
        new_df_test, column_name=column, categorical_names=names
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gather the category names of the three categorical columns. Each call
# returns a pair; only its second element (the names) is kept.
(_, train_type_names), (_, train_class_names), (_, fare_names) = [
    get_category_names(new_df_test, column_name=column)
    for column in ('train_type', 'train_class', 'fare')
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of how often each 'fare' value occurs, drawn on a symlog y-axis
ax = new_df_test['fare'].value_counts().plot(kind='bar')
ax.set_title('Distribution of Train Fare')
ax.set_xlabel('Train Fares')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')

# Widen the figure so the bars and tick labels have room
fig = ax.get_figure()
fig.set_size_inches(15.5, 5.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of how often each 'train_class' value occurs, drawn on a symlog
# y-axis
ax = new_df_test['train_class'].value_counts().plot(kind='bar')
ax.set_title('Distribution of Train Classes')
ax.set_xlabel('Train Classes')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')

# Widen the figure so the bars and tick labels have room
fig = ax.get_figure()
fig.set_size_inches(15.5, 5.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of how often each 'train_type' value occurs, drawn on a symlog
# y-axis
ax = new_df_test['train_type'].value_counts().plot(kind='bar')
ax.set_title('Distribution of Train Types')
ax.set_xlabel('Train Types')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')

# Widen the figure so the bars and tick labels have room
fig = ax.get_figure()
fig.set_size_inches(15.5, 5.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run replace_small_categorical_data on the 'origin' column with city_names,
# feeding it the current new_df_test.
# NOTE(review): the original note says new_df_test already carries the
# encoded 'destination' column at this point -- not visible here, confirm.
new_df_test = replace_small_categorical_data(
    new_df_test,
    column_name='origin',
    categorical_names=city_names,
)

# Let's look at the results
new_df_test.head()
Newer Older