Skip to content

Instantly share code, notes, and snippets.

View gautham20's full-sized avatar

Gautham Kumaran gautham20

View GitHub Profile
@gautham20
gautham20 / equal_group_sampling.py
Last active October 14, 2020 16:47
Stratified sampling of data in such a way that the distribution of the grouped column in the sample is almost the same as in the original data
# Stratified sampling of data in such a way that the distribution of the
# grouped column in the sample is almost the same as in the original data.
def group_sampler(group_data, total_df_len, n_samples):
    """Randomly sample one group in proportion to its share of the full data.

    Args:
        group_data: Rows of a single group; must support ``len()`` and
            ``.sample(n=...)`` (e.g. a pandas DataFrame).
        total_df_len: Number of rows in the full, ungrouped data.
        n_samples: Target size of the overall sample across all groups.

    Returns:
        The result of ``group_data.sample(n=k)`` where ``k`` is
        ``ceil(len(group_data) / total_df_len * n_samples)``, capped at the
        number of rows in the group.
    """
    group_len = len(group_data)
    # Rounding up means a non-empty group always contributes at least one row
    # (for n_samples > 0), so the combined sample can slightly exceed
    # n_samples.
    quota = int(np.ceil((group_len / total_df_len) * n_samples))
    # Sampling without replacement cannot return more rows than the group has.
    return group_data.sample(n=min(quota, group_len))
# Bind the dataset size and the overall sample size so the sampler takes only
# the group itself, which is the call shape GroupBy.apply uses.
group_sampler_200 = partial(group_sampler, total_df_len=len(filtered_cells), n_samples=200)
# Draw the proportional sample from every group of 'group_column'.
filtered_200_cells = filtered_cells.groupby('group_column', as_index=False).apply(group_sampler_200)
# Inspects the value range of every numeric column of `df` against integer
# dtype limits (presumably in order to downcast columns; that part is not
# visible here).
# NOTE(review): this excerpt was captured without its indentation and is cut
# off inside the first int8 range check, so it is not runnable as shown.
# `verbose` is not read in the visible lines.
def reduce_mem_usage(df, verbose=True):
# dtype names that are eligible for the range inspection below
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# summed df.memory_usage() divided by 1024**2 (presumably bytes -> MiB)
start_mem = df.memory_usage().sum() / 1024**2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
# observed value range of the column
c_min = df[col].min()
c_max = df[col].max()
# branch for dtypes whose name starts with 'int'
if str(col_type)[:3] == 'int':
# does the observed range lie strictly inside the int8 limits?
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
# Encoder module built around an nn.GRU.
# NOTE(review): this excerpt was captured without its indentation and is cut
# off inside the nn.GRU( call, so it is not runnable as shown. `device` and
# `rnn_dropout` are not read in the visible lines.
class RNNEncoder(nn.Module):
def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2):
super().__init__()
# keep the constructor arguments on the instance
self.sequence_len = sequence_len
self.hidden_size = hidden_size
self.input_feature_len = input_feature_len
self.num_layers = rnn_num_layers
# 2 when bidirectional is truthy, otherwise 1
self.rnn_directions = 2 if bidirectional else 1
self.gru = nn.GRU(
num_layers=rnn_num_layers,
class StoreItemDataset(Dataset):
    """Dataset configured by lists of categorical and numerical column names.

    NOTE(review): only the beginning of ``__init__`` is visible in this
    excerpt; how the stored attributes are used later cannot be confirmed
    from here.
    """

    def __init__(self, cat_columns=None, num_columns=None, embed_vector_size=None, decoder_input=True, ohe_cat_columns=False):
        """Store the column configuration and create empty bookkeeping state.

        Args:
            cat_columns: Categorical column names; an empty list when omitted.
            num_columns: Numerical column names; an empty list when omitted.
            embed_vector_size: Stored as ``cat_embed_vector_size``; an empty
                dict when omitted.
            decoder_input: Stored as ``pass_decoder_input``.
            ohe_cat_columns: Not read in the visible part of ``__init__``.
        """
        super().__init__()
        self.sequence_data = None
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared (and mutated) across every instance.
        self.cat_columns = [] if cat_columns is None else cat_columns
        self.num_columns = [] if num_columns is None else num_columns
        self.cat_classes = {}
        self.cat_embed_shape = []
        self.cat_embed_vector_size = embed_vector_size if embed_vector_size is not None else {}
        self.pass_decoder_input = decoder_input
class EncoderDecoderWrapper(nn.Module):
    """Module that holds an encoder and a decoder cell plus decoding settings.

    NOTE(review): only ``__init__`` is visible in this excerpt; the forward
    pass is not shown here.
    """

    def __init__(self, encoder, decoder_cell, output_size=3, teacher_forcing=0.3, sequence_len=336, decoder_input=True, device='cpu'):
        """Keep the sub-modules and configuration values on the instance.

        Args:
            encoder: Encoder object, stored as ``self.encoder``.
            decoder_cell: Decoder cell object, stored as ``self.decoder_cell``.
            output_size: Stored as ``self.output_size``.
            teacher_forcing: Stored as ``self.teacher_forcing``.
            sequence_len: Stored under the attribute name
                ``self.sequence_length``.
            decoder_input: Stored as ``self.decoder_input``.
            device: Stored as ``self.device``.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder_cell = decoder_cell
        self.output_size = output_size
        self.teacher_forcing = teacher_forcing
        # Attribute name differs from the parameter name (sequence_len).
        self.sequence_length = sequence_len
        self.decoder_input = decoder_input
        self.device = device
class DecoderCell(nn.Module):
    """Decoder building block: a GRU cell, a linear head and a dropout layer.

    NOTE(review): only ``__init__`` is visible in this excerpt; how the
    layers are combined in the forward pass is not shown here.
    """

    def __init__(self, input_feature_len, hidden_size, dropout=0.2):
        """Create the layers.

        Args:
            input_feature_len: ``input_size`` of the GRU cell.
            hidden_size: ``hidden_size`` of the GRU cell and input width of
                the linear head.
            dropout: Probability passed to ``nn.Dropout``.
        """
        super().__init__()
        self.decoder_rnn_cell = nn.GRUCell(
            input_size=input_feature_len,
            hidden_size=hidden_size,
        )
        # Linear head mapping hidden_size features to a single value.
        self.out = nn.Linear(hidden_size, 1)
        self.attention = False
        self.dropout = nn.Dropout(dropout)
# Encoder module built around an nn.GRU.
# NOTE(review): this excerpt was captured without its indentation and is cut
# off inside the nn.GRU( call, so it is not runnable as shown. `device` and
# `rnn_dropout` are not read in the visible lines.
class RNNEncoder(nn.Module):
def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2):
super().__init__()
# keep the constructor arguments on the instance
self.sequence_len = sequence_len
self.hidden_size = hidden_size
self.input_feature_len = input_feature_len
self.num_layers = rnn_num_layers
# 2 when bidirectional is truthy, otherwise 1
self.rnn_directions = 2 if bidirectional else 1
self.gru = nn.GRU(
num_layers=rnn_num_layers,
def get_similar_images_annoy(img_index, n=12):
    """Find the nearest neighbours of one image through the Annoy index ``t``.

    Prints the elapsed lookup time in milliseconds.

    Args:
        img_index: Positional row of the image in ``img_repr_df``; the same
            number is used as the item id when querying ``t``.
        n: Number of neighbour rows to return. The default of 12 matches the
            previously hard-coded query for 13 items with the first dropped.

    Returns:
        A tuple ``(base_img_id, base_label, neighbours)`` where the first two
        values come from columns 0 and 2 of the queried row and
        ``neighbours`` is ``img_repr_df.iloc[...]`` for the neighbour ids.
    """
    start = time.time()
    # Columns are read by position -- presumably 0 is the image id and 2 the
    # label; TODO confirm against the frame's schema.
    base_img_id, base_label = img_repr_df.iloc[img_index, [0, 2]]
    # Ask for one extra item because the first hit is discarded below
    # (presumably it is the query item itself).
    similar_img_ids = t.get_nns_by_item(img_index, n + 1)
    end = time.time()
    print(f'{(end - start) * 1000} ms')
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids[1:]]
# Example lookup: neighbours of the image at positional row 212693.
base_image, base_label, similar_images_df = get_similar_images_annoy(212693)
from annoy import AnnoyIndex
# Length of one representation vector, read from the first row by position so
# it does not depend on the frame having an index label 0.
feature_dim = len(img_repr_df['img_repr'].iloc[0])
t = AnnoyIndex(feature_dim, metric='euclidean')
# Item ids are the positional row numbers of img_repr_df, so ids returned by
# the index can be fed straight into img_repr_df.iloc.
for i, vector in enumerate(img_repr_df['img_repr']):
    t.add_item(i, vector)
# NOTE(review): the number of trees is taken from inference_data.c --
# presumably the number of classes; confirm this is the intended tree count.
_ = t.build(inference_data.c)
def get_similar_images(img_index, n=10):
    """Find the ``n`` most similar images by exhaustive cosine comparison.

    Prints the elapsed time in seconds.

    Args:
        img_index: Positional row of the query image in ``img_repr_df``.
        n: Number of similar rows to return (previously ignored; the slice
            was fixed at 10).

    Returns:
        A tuple ``(base_img_id, base_label, neighbours)`` where the first two
        values come from columns 0 and 2 of the queried row and
        ``neighbours`` is ``img_repr_df.iloc[...]`` ordered from most to
        least similar.
    """
    start = time.time()
    # Columns are read by position -- presumably id, representation vector
    # and label; TODO confirm against the frame's schema.
    base_img_id, base_vector, base_label = img_repr_df.iloc[img_index, [0, 1, 2]]
    # `cosine` is presumably a cosine *distance*, hence the ``1 -``.
    cosine_similarity = 1 - img_repr_df['img_repr'].apply(lambda x: cosine(x, base_vector))
    # argsort is ascending: skip the single top entry (presumably the query
    # image itself), take the n entries before it, and flip them so the most
    # similar comes first.
    similar_img_ids = np.argsort(cosine_similarity)[-(n + 1):-1][::-1]
    end = time.time()
    print(f'{end - start} secs')
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids]