Skip to content

Instantly share code, notes, and snippets.

View tanaymeh's full-sized avatar
🏖️
Working Remotely

Tanay Mehta tanaymeh

🏖️
Working Remotely
View GitHub Profile
import lance
import pyarrow as pa
from tqdm.auto import tqdm
import datasets
from transformers import AutoTokenizer
# We'll be using the GPT-NeoX tokenizer (the "EleutherAI/gpt-neox-20b" checkpoint)
# for tokenizing the code files.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# Remove the two raw date columns, one at a time and in place, now that they
# are redundant.
for redundant_column in ('start_date', 'end_date'):
    new_df_test.drop(columns=[redundant_column], inplace=True)
# Show the first rows of the resulting dataset
new_df_test.head()
# Extract the weekday of every trip's start and end timestamp.
# `datetime.weekday()` returns an int where Monday is 0 and Sunday is 6.
# This relies on `start_time`, `end_time` and the `DATETIME` format string
# already being defined, and on the `datetime` module being imported.
week_start = [
    datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in start_time
]
week_end = [
    datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in end_time
]
# Let's get the duration of the trip in seconds and then we will get the weekdays.
# The raw timestamp strings are pulled out of the dataframe as arrays first.
start_time = new_df_test['start_date'].values
end_time = new_df_test['end_date'].values
# strptime pattern used to parse the timestamp strings (year-month-day hour:minute:second)
DATETIME = '%Y-%m-%d %H:%M:%S'
duration = []
for i in range(len(start_time)):
    # end minus start yields a datetime.timedelta for trip i
    difference = datetime.datetime.strptime(end_time[i], DATETIME) - datetime.datetime.strptime(start_time[i], DATETIME)
    # NOTE(review): nothing visible here stores `difference` in `duration`;
    # the remainder of this loop body appears to be cut off -- confirm against
    # the full snippet before relying on `duration`.
# Encode each categorical column using the category names collected for it.
# The columns are processed in the order train_type, train_class, fare, and the
# dataframe returned by each call is fed into the next one.
for column, names in (
    ('train_type', train_type_names),
    ('train_class', train_class_names),
    ('fare', fare_names),
):
    new_df_test = replace_small_categorical_data(new_df_test, column_name=column, categorical_names=names)
# First get category names for each categorical column.
# get_category_names returns a pair; only its second element (the names) is
# kept here, the first element is discarded via `_`.
_, train_type_names = get_category_names(new_df_test, column_name='train_type')
_, train_class_names = get_category_names(new_df_test, column_name='train_class')
_, fare_names = get_category_names(new_df_test, column_name='fare')
# Bar chart of how many rows fall into each fare category,
# drawn with a symmetric-log y axis on a 15.5 x 5.5 inch figure.
fare_counts = new_df_test['fare'].value_counts()
ax = fare_counts.plot.bar()
ax.set_title('Distribution of Train Fare')
ax.set_xlabel('Train Fares')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')
fig = plt.gcf()
fig.set_size_inches(15.5, 5.5)
# Bar chart of how many rows fall into each train class,
# drawn with a symmetric-log y axis on a 15.5 x 5.5 inch figure.
class_counts = new_df_test['train_class'].value_counts()
ax = class_counts.plot.bar()
ax.set_title('Distribution of Train Classes')
ax.set_xlabel('Train Classes')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')
fig = plt.gcf()
fig.set_size_inches(15.5, 5.5)
# Bar chart of how many rows fall into each train type,
# drawn with a symmetric-log y axis on a 15.5 x 5.5 inch figure.
type_counts = new_df_test['train_type'].value_counts()
ax = type_counts.plot.bar()
ax.set_title('Distribution of Train Types')
ax.set_xlabel('Train Types')
ax.set_ylabel('Number of trains')
ax.set_yscale('symlog')
fig = plt.gcf()
fig.set_size_inches(15.5, 5.5)
# Encode the 'origin' column using `city_names`.
# We pass in the current 'new_df_test' dataframe (rather than a fresh copy) and
# rebind it to the result, so earlier changes to it are kept.
# NOTE(review): presumably 'destination' was already encoded on this dataframe
# in an earlier step that is not visible here -- confirm.
new_df_test = replace_small_categorical_data(new_df_test, column_name='origin', categorical_names=city_names)
# Let's look at the results.
new_df_test.head()