
Eduardo M Sala edumunozsala

@edumunozsala
edumunozsala / uris_yellow_2019.tsv
Created February 20, 2024 07:13
URIs Yellow Taxi 2019
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-01.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-02.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-03.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-04.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-05.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-06.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-07.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-08.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-09.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-10.csv.gz
http
@edumunozsala
edumunozsala / finetune_llama_v2.py
Created July 19, 2023 17:33 — forked from younesbelkada/finetune_llama_v2.py
Fine tune Llama v2 models on Guanaco Dataset
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
@edumunozsala
edumunozsala / edumunozsala.json
Last active May 21, 2023 11:43
LinkedIn profile in JSON
{
"public_identifier": "edumunozsala",
"profile_pic_url": "https://s3.us-west-000.backblazeb2.com/proxycurl/person/edumunozsala/profile?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=0004d7f56a0400b0000000001%2F20230521%2Fus-west-000%2Fs3%2Faws4_request&X-Amz-Date=20230521T110202Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=35f2bd1038c2893d6f8ed5bc16c787b837376f4cecd39d4f9d6c803e9b437201",
"background_cover_image_url": "https://s3.us-west-000.backblazeb2.com/proxycurl/person/edumunozsala/cover?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=0004d7f56a0400b0000000001%2F20230521%2Fus-west-000%2Fs3%2Faws4_request&X-Amz-Date=20230521T110202Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=b13820e856246444e5f2391103d2373e6ff9b61351206cb7a83b90d4e4d8e513",
"first_name": "Eduardo Mu\\u00f1oz",
"last_name": "Sala",
"full_name": "Eduardo Mu\\u00f1oz Sala",
"follower_count": null,
"occupation": "Responsable de Proyectos de Aplicaciones e Integraci\\u00f3n de datos at Berg\\u00e9
@edumunozsala
edumunozsala / multihead_attention.py
Created October 26, 2020 18:48
Multi-head attention for Transformer
class MultiHeadAttention(layers.Layer):
    def __init__(self, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads

    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.n_heads == 0
        # Calculate the dimension of every head or projection
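The preview cuts off inside build(). A minimal sketch of how such a layer typically continues, assuming per-head Dense projections and the scaled_dot_product_attention function from the gist below; names like d_head, query_lin, split_heads and final_lin are illustrative assumptions, not the author's exact code:

import tensorflow as tf
from tensorflow.keras import layers

class MultiHeadAttention(layers.Layer):
    def __init__(self, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads

    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.n_heads == 0
        # Dimension of every head or projection (assumed name: d_head)
        self.d_head = self.d_model // self.n_heads
        # Linear projections for queries, keys and values, plus the output projection
        self.query_lin = layers.Dense(self.d_model)
        self.key_lin = layers.Dense(self.d_model)
        self.value_lin = layers.Dense(self.d_model)
        self.final_lin = layers.Dense(self.d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, n_heads, seq_len, d_head)
        x = tf.reshape(x, (batch_size, -1, self.n_heads, self.d_head))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        queries = self.split_heads(self.query_lin(queries), batch_size)
        keys = self.split_heads(self.key_lin(keys), batch_size)
        values = self.split_heads(self.value_lin(values), batch_size)
        # scaled_dot_product_attention is the helper defined in the gist below
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        # Concatenate the heads and project back to d_model
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat = tf.reshape(attention, (batch_size, -1, self.d_model))
        return self.final_lin(concat)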
@edumunozsala
edumunozsala / scaled_dot_product_attention.py
Created October 26, 2020 18:45
Scaled dot product attention for Transformer
def scaled_dot_product_attention(queries, keys, values, mask):
    # Calculate the dot product, QK_transpose
    product = tf.matmul(queries, keys, transpose_b=True)
    # Get the scale factor
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    # Apply the scale factor to the dot product
    scaled_product = product / tf.math.sqrt(keys_dim)
    # Apply masking when it is required
    if mask is not None:
        scaled_product += (mask * -1e9)
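The preview stops right after the mask is applied. For context, the standard remainder is a softmax over the key axis followed by a weighted sum of the values; a self-contained sketch of the full function (not the gist's verbatim ending):

import tensorflow as tf

def scaled_dot_product_attention(queries, keys, values, mask):
    # Dot product of queries and keys, QK^T
    product = tf.matmul(queries, keys, transpose_b=True)
    # Scale by the square root of the key dimension
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    # Mask out padded or future positions with a large negative value
    if mask is not None:
        scaled_product += (mask * -1e9)
    # Softmax turns the scores into attention weights over the keys
    attention_weights = tf.nn.softmax(scaled_product, axis=-1)
    # Weighted sum of the values is the attention output
    return tf.matmul(attention_weights, values)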
@edumunozsala
edumunozsala / predict_att_seq2seq.py
Created October 11, 2020 17:12
Make predictions for our seq2seq with attention model
def predict_seq2seq_att(input_text, input_max_len, tokenizer_inputs, word2idx_outputs, idx2word_outputs):
    if input_text is None:
        input_text = input_data[np.random.choice(len(input_data))]
    print(input_text)
    # Tokenize the input text
    input_seq = tokenizer_inputs.texts_to_sequences([input_text])
    # Pad the sentence
    input_seq = pad_sequences(input_seq, maxlen=input_max_len, padding='post')
    # Get the encoder initial states
    en_initial_states = encoder.init_states(1)
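The preview ends right after the encoder states are initialised. A hedged sketch of how a greedy decoding loop for such a model usually continues; the encoder/decoder call signatures, the returned state layout, and the '<sos>'/'<eos>' token keys are assumptions about this codebase, not its actual API:

    # Run the encoder once, then feed the decoder its own predictions step by step
    en_outputs = encoder(tf.constant(input_seq), en_initial_states)
    de_state_h, de_state_c = en_outputs[1:]
    de_input = tf.constant([[word2idx_outputs['<sos>']]])

    out_words = []
    while True:
        # One decoder step with attention over the encoder outputs (assumed signature)
        de_output, de_state_h, de_state_c, alignment = decoder(
            de_input, (de_state_h, de_state_c), en_outputs[0])
        # Greedy choice: most likely next token id
        next_id = int(tf.argmax(de_output, axis=-1).numpy()[0][0])
        word = idx2word_outputs[next_id]
        if word == '<eos>' or len(out_words) >= input_max_len:
            break
        out_words.append(word)
        de_input = tf.constant([[next_id]])

    print(' '.join(out_words))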
@edumunozsala
edumunozsala / train_ckpt_seq2seq.py
Created October 11, 2020 16:33
Train and checkpoint the seq2seq model
# Create an Adam optimizer and clips gradients by norm
optimizer = tf.keras.optimizers.Adam(clipnorm=5.0)
# Create a checkpoint object to save the model
checkpoint_dir = './training_ckpt_seq2seq'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
losses, accuracies = main_train(encoder, decoder, dataset, EPOCHS, BATCH_SIZE, optimizer, checkpoint, checkpoint_prefix)
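After training, the same checkpoint object can bring the saved weights back, for example before running the prediction function above; a small usage sketch:

# Restore the most recent checkpoint written under checkpoint_dir
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest is not None:
    checkpoint.restore(latest)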
@edumunozsala
edumunozsala / loss_acc_fn_seq2seq.py
Created October 11, 2020 16:15
Create custom loss and accuracy functions for the seq2seq model
def loss_func(targets, logits):
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    # Mask padding values; they must not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    # Calculate the loss value
    loss = crossentropy(targets, logits, sample_weight=mask)
    return loss
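The gist title also mentions an accuracy function, but the preview is cut off before it appears. A masked-accuracy sketch in the same spirit (the name acc_func and its details are assumptions):

def acc_func(targets, logits):
    # Predicted token ids from the logits
    preds = tf.argmax(logits, axis=-1)
    targets = tf.cast(targets, preds.dtype)
    matches = tf.cast(tf.equal(targets, preds), tf.float32)
    # Ignore padding positions (target id 0) when averaging
    mask = tf.cast(tf.math.logical_not(tf.math.equal(targets, 0)), tf.float32)
    return tf.reduce_sum(matches * mask) / tf.reduce_sum(mask)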
@edumunozsala
edumunozsala / padding_seq2seq.py
Created October 11, 2020 16:07
Pad the input, target, and target-input sequences for the seq2seq model
# pad the input sequences
encoder_inputs = pad_sequences(input_sequences, maxlen=input_max_len, padding='post')
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])
# pad the decoder input sequences
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=target_max_len, padding='post')
print("decoder_inputs[0]:", decoder_inputs[0])
print("decoder_inputs.shape:", decoder_inputs.shape)
# pad the target output sequences
decoder_targets = pad_sequences(target_sequences, maxlen=target_max_len, padding='post')
@edumunozsala
edumunozsala / vocabularies_seq2seq.py
Created October 11, 2020 16:05
Create the vocabularies for the seq2seq model
# get the word to index mapping for input language
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))
# get the word to index mapping for output language
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' % len(word2idx_outputs))
# store number of output and input words for later
# remember to add 1 since indexing starts at 1
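The preview ends right before the sizes the last comment refers to are computed; the usual continuation (variable names assumed) is:

# Keras word_index starts at 1, so add 1 to get the vocabulary sizes
num_words_inputs = len(word2idx_inputs) + 1
num_words_output = len(word2idx_outputs) + 1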