Tanay Mehta tanaymeh

## fused_gelu.cu
extern "C" {
#include <cuda_runtime.h>
#include <math.h>

// Tanh-based approximation of GELU for a single float:
//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// Returns the activated value for input x.
__device__ __forceinline__
float gelu(float x){
    // This is sqrt(2/pi) ~= 0.79788, not 2/pi (which would be ~= 0.63662).
    const float sqrt_2_over_pi = 0.7978845608028654f;
    // Coefficient of the cubic term inside the tanh argument.
    const float cubic_coeff = 0.044715f;
    const float inner = sqrt_2_over_pi * (x + cubic_coeff * x * x * x);
    return 0.5f * x * (1.0f + tanhf(inner));
}

## lance_dataset.py
import lance
import pyarrow as pa

from tqdm.auto import tqdm

import datasets
from transformers import AutoTokenizer

# We'll be using the GPT-NeoX-20B tokenizer ("EleutherAI/gpt-neox-20b") for
# tokenizing the code files
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

## spanish-ds-encodef-5.py
# Drop the two now-redundant date columns in a single in-place call, so the
# frame is never left with only one of them removed if a column is missing
new_df_test.drop(columns=['start_date', 'end_date'], inplace=True)

# View the final dataset
new_df_test.head()

## spanish-ds-encodef-4.py
# Extract the weekday index (Monday == 0 ... Sunday == 6) of every start and
# end timestamp, parsing the strings with the DATETIME format
week_start = [
    datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in start_time
]
week_end = [
    datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in end_time
]


## spanish-ds-encodef-3.py
# Let's get the duration of the trip in seconds and then we will get the weekdays
# Raw start/end date strings, pulled out of the frame as arrays
start_time = new_df_test['start_date'].values
end_time = new_df_test['end_date'].values

# strptime format of the date strings (year-month-day hour:minute:second)
DATETIME = '%Y-%m-%d %H:%M:%S'

# NOTE(review): nothing is appended to `duration` in the lines visible here;
# the snippet looks truncated after the subtraction - confirm against the
# full source.
duration = list()

for i in range(len(start_time)):
    # timedelta between the parsed end and start timestamps of row i
    difference = datetime.datetime.strptime(end_time[i], DATETIME) - datetime.datetime.strptime(start_time[i], DATETIME)

## spanish-ds-encodef-2.py
# Encode each categorical column in turn, pairing it with its category names
for _column, _names in (
    ('train_type', train_type_names),
    ('train_class', train_class_names),
    ('fare', fare_names),
):
    new_df_test = replace_small_categorical_data(
        new_df_test, column_name=_column, categorical_names=_names
    )

## spanish-ds-encodef-1.py
# Fetch the category names of the three categorical columns; the first item
# returned by get_category_names is not needed and is discarded into `_`
(_, train_type_names), (_, train_class_names), (_, fare_names) = [
    get_category_names(new_df_test, column_name=column)
    for column in ('train_type', 'train_class', 'fare')
]

## spanish-ds-plot-train-fare.py
# Bar chart of how many rows fall under each fare, on a symlog y-axis
axes = new_df_test.fare.value_counts().plot(kind='bar')
axes.set_title('Distribution of Train Fare')
axes.set_xlabel('Train Fares')
axes.set_ylabel('Number of trains')
axes.set_yscale('symlog')

# Resize the figure that holds the chart
fig = axes.get_figure()
fig.set_size_inches(15.5, 5.5)

## spanish-ds-plot-train-class.py
# Bar chart of how many rows fall under each train class, on a symlog y-axis
axes = new_df_test.train_class.value_counts().plot(kind='bar')
axes.set_title('Distribution of Train Classes')
axes.set_xlabel('Train Classes')
axes.set_ylabel('Number of trains')
axes.set_yscale('symlog')

# Resize the figure that holds the chart
fig = axes.get_figure()
fig.set_size_inches(15.5, 5.5)

## spanish-ds-plot-train-type.py
# Bar chart of how many rows fall under each train type, on a symlog y-axis
axes = new_df_test.train_type.value_counts().plot(kind='bar')
axes.set_title('Distribution of Train Types')
axes.set_xlabel('Train Types')
axes.set_ylabel('Number of trains')
axes.set_yscale('symlog')

# Resize the figure that holds the chart
fig = axes.get_figure()
fig.set_size_inches(15.5, 5.5)
	extern "C" {
	#include <cuda_runtime.h>
	#include <math.h>

	// Tanh-based approximation of GELU for a single float:
	//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
	// Returns the activated value for input x.
	__device__ __forceinline__
	float gelu(float x){
		// This is sqrt(2/pi) ~= 0.79788, not 2/pi (which would be ~= 0.63662).
		const float sqrt_2_over_pi = 0.7978845608028654f;
		// Coefficient of the cubic term inside the tanh argument.
		const float cubic_coeff = 0.044715f;
		const float inner = sqrt_2_over_pi * (x + cubic_coeff * x * x * x);
		return 0.5f * x * (1.0f + tanhf(inner));
	}
	import lance
	import pyarrow as pa

	from tqdm.auto import tqdm

	import datasets
	from transformers import AutoTokenizer

	# We'll be using the GPT-NeoX-20B tokenizer ("EleutherAI/gpt-neox-20b") for
	# tokenizing the code files
	tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
	# Drop the two now-redundant date columns in a single in-place call, so the
	# frame is never left with only one of them removed if a column is missing
	new_df_test.drop(columns=['start_date', 'end_date'], inplace=True)

	# View the final dataset
	new_df_test.head()
	# Extract the weekday index (Monday == 0 ... Sunday == 6) of every start and
	# end timestamp, parsing the strings with the DATETIME format.
	# Written as comprehensions: the original loop bodies had lost their
	# indentation, which is not valid Python.
	week_start = [datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in start_time]
	week_end = [datetime.datetime.strptime(stamp, DATETIME).weekday() for stamp in end_time]
	# Let's get the duration of the trip in seconds and then we will get the weekdays
	# Raw start/end date strings, pulled out of the frame as arrays
	start_time = new_df_test['start_date'].values
	end_time = new_df_test['end_date'].values

	# strptime format of the date strings (year-month-day hour:minute:second)
	DATETIME = '%Y-%m-%d %H:%M:%S'

	# NOTE(review): nothing is appended to `duration` in the lines visible here;
	# the snippet looks truncated after the subtraction - confirm against the
	# full source.
	duration = list()

	for i in range(len(start_time)):
		# timedelta between the parsed end and start timestamps of row i
		# (loop body re-indented; it had been flattened to the `for` level)
		difference = datetime.datetime.strptime(end_time[i], DATETIME) - datetime.datetime.strptime(start_time[i], DATETIME)
	# Encode each categorical column in turn, pairing it with its category names
	for _column, _names in (
		('train_type', train_type_names),
		('train_class', train_class_names),
		('fare', fare_names),
	):
		new_df_test = replace_small_categorical_data(
			new_df_test, column_name=_column, categorical_names=_names
		)
	# Fetch the category names of the three categorical columns; the first item
	# returned by get_category_names is not needed and is discarded into `_`
	(_, train_type_names), (_, train_class_names), (_, fare_names) = [
		get_category_names(new_df_test, column_name=column)
		for column in ('train_type', 'train_class', 'fare')
	]
	# Bar chart of how many rows fall under each fare, on a symlog y-axis
	axes = new_df_test.fare.value_counts().plot(kind='bar')
	axes.set_title('Distribution of Train Fare')
	axes.set_xlabel('Train Fares')
	axes.set_ylabel('Number of trains')
	axes.set_yscale('symlog')

	# Resize the figure that holds the chart
	fig = axes.get_figure()
	fig.set_size_inches(15.5, 5.5)
	# Bar chart of how many rows fall under each train class, on a symlog y-axis
	axes = new_df_test.train_class.value_counts().plot(kind='bar')
	axes.set_title('Distribution of Train Classes')
	axes.set_xlabel('Train Classes')
	axes.set_ylabel('Number of trains')
	axes.set_yscale('symlog')

	# Resize the figure that holds the chart
	fig = axes.get_figure()
	fig.set_size_inches(15.5, 5.5)
	# Bar chart of how many rows fall under each train type, on a symlog y-axis
	axes = new_df_test.train_type.value_counts().plot(kind='bar')
	axes.set_title('Distribution of Train Types')
	axes.set_xlabel('Train Types')
	axes.set_ylabel('Number of trains')
	axes.set_yscale('symlog')

	# Resize the figure that holds the chart
	fig = axes.get_figure()
	fig.set_size_inches(15.5, 5.5)