Skip to content

Instantly share code, notes, and snippets.

@suhaskv
suhaskv / null_nan_train.py
Last active December 23, 2020 11:31
Check presence of Null and NaN values in the train data
# Count missing values in the train signals, one signal column per read.
# DataFrame.isnull() is an alias of DataFrame.isna(), so a single pass yields
# both the "Null" and the "NaN" figures; the original computed it twice.
null_arr = []
for sig in metadata_train['signal_id'].values:
    # Only the column named after this signal id is loaded
    # (presumably to keep memory usage low -- confirm against file size).
    sig_data = pd.read_parquet('/content/train.parquet',
                               engine='fastparquet', columns=[str(sig)])
    null_arr.append(sig_data.isna().sum())
# Kept under its original name so any later code reading nan_arr still works.
nan_arr = list(null_arr)
print(f"Number of Null values in train data: {np.sum(null_arr)}")
print(f"Number of NaN values in train data: {np.sum(nan_arr)}")
@suhaskv
suhaskv / null_nan_test.py
Created December 23, 2020 11:32
Check presence of Null and NaN in test data.
# Count missing values in the test signals, one signal column per read.
# DataFrame.isnull() is an alias of DataFrame.isna(), so a single pass yields
# both the "Null" and the "NaN" figures; the original computed it twice.
null_arr = []
for sig in metadata_test['signal_id'].values:
    # Only the column named after this signal id is loaded
    # (presumably to keep memory usage low -- confirm against file size).
    sig_data = pd.read_parquet('/content/test.parquet',
                               engine='fastparquet', columns=[str(sig)])
    null_arr.append(sig_data.isna().sum())
# Kept under its original name so any later code reading nan_arr still works.
nan_arr = list(null_arr)
print(f"Number of Null values in test data: {np.sum(null_arr)}")
print(f"Number of NaN values in test data: {np.sum(nan_arr)}")
@suhaskv
suhaskv / unique_values_train.py
Created December 23, 2020 11:34
VSB Power Line Blog - Check number of unique values in train data
# Report how many distinct values each metadata column holds.
for col in metadata_train.columns:
    distinct_values = metadata_train[col].unique()
    print(f"Number of unique values in {col} is {len(distinct_values)}")
@suhaskv
suhaskv / target_distribution.py
Created December 23, 2020 11:35
VSB Power Line Blog - Get the distribution of the target data
# Bar chart of the target classes, each bar labelled "count (percent%)".
splot = sns.countplot(x='target', data=metadata_train)
n_total = metadata_train['target'].shape[0]
# https://github.com/mwaskom/seaborn/issues/1582
for idx, bar in enumerate(splot.patches):
    # The patch index doubles as the target value here
    # (assumes the i-th bar corresponds to target == i -- TODO confirm).
    n_class = metadata_train[metadata_train['target'] == idx].shape[0]
    percent = np.round((n_class / n_total) * 100, 2)
    # Anchor the label at the horizontal centre of the bar, at its top edge.
    anchor = (bar.get_x() + bar.get_width() / 2, bar.get_height())
    splot.annotate(str(n_class) + f" ({percent}%)", anchor)
plt.title("Distribution of target classes")
plt.show()
@suhaskv
suhaskv / target_distribution_phase_wise.py
Created December 23, 2020 11:37
VSB Power Line Blog - Get the target distribution phase wise
# Count plot of signals per phase, split by target class.
splot = sns.countplot(x="phase", data=metadata_train, hue="target")
# Number of signals in phase 0; presumably every phase has the same count,
# so this serves as the per-phase total -- TODO confirm.
total_phases = metadata_train[metadata_train['phase']==0].shape[0]
num_points = []
# https://github.com/mwaskom/seaborn/issues/1582
for ind, p in enumerate(splot.patches):
    # phase cycles 0, 1, 2 over indices [0, 1, 2] and again over [3, 4, 5]
    phase = ind%3
    # target is 0 for indices [0, 1, 2] and 1 for indices [3, 4, 5]
    # (assumes the patches are ordered hue-first -- TODO confirm)
    tar = ind//3
@suhaskv
suhaskv / imbalance_ratio.py
Created December 23, 2020 11:38
VSB Power Line Blog - Get the imbalance ratio in each phase
# Per-phase ratio of target==0 rows to target==1 rows, then their average.
avg_imb = []
for phase in metadata_train['phase'].unique():
    in_phase = metadata_train.phase == phase
    negatives = metadata_train[in_phase & (metadata_train.target == 0)]
    positives = metadata_train[in_phase & (metadata_train.target == 1)]
    # len() of a DataFrame is its row count
    imbalance_ratio = len(negatives) / len(positives)
    avg_imb.append(imbalance_ratio)
    print(f"Imbalance ratio - phase {phase}: {imbalance_ratio}")
print(f"Average imbalance ratio: {np.round(np.sum(avg_imb)/len(avg_imb), 2)}")
@suhaskv
suhaskv / plot_raw_signal.py
Created December 23, 2020 11:39
VSB Power Line Blog - Plot the raw signal for a measurement id
# Plot the first six signals listed in the metadata, three per subplot.
# Each row of sig_list is read positionally; the label text pairs index 0
# with signal_id, 1 with id_measurement and 2 with phase, and the title
# pairs index 3 with target.
n_rows, n_cols = 2, 1
sig_list = metadata_train.head(6).values
plt.figure(figsize=(25,10))
for ind, val in enumerate(sig_list):
    # Rows 0-2 are drawn on subplot 1, rows 3-5 on subplot 2.
    subplot_no = (ind // 3) + 1
    plt.subplot(n_rows, n_cols, subplot_no)
    plt.plot(train[str(val[0])], label=f"signal_id-{val[0]}, phase-{val[2]}, id_measurement-{val[1]}")
    plt.legend()
    plt.title(f'Power Line Signal with target={val[3]}')
    plt.xlabel('Samples')
@suhaskv
suhaskv / dwt_decompose.py
Created December 23, 2020 11:42
VSB Power Line Blog - Decompose the signal into detail and approximation coefficients
# Multilevel discrete wavelet decomposition of the raw signal x.
#   wavelet 'db4' -> Daubechies 4
#   mode 'per'    -> PyWavelets' periodization signal-extension mode,
#                    chosen by the author because the raw signal is periodic
# The result is a list of coefficient arrays: approximation first, then the
# detail coefficients from the coarsest level down to the finest.
wavlt_coeffs = pywt.wavedec(x, wavelet='db4', mode='per')
@suhaskv
suhaskv / dwt_hard_thresholding.py
Created December 23, 2020 11:43
VSB Power Line Blog - Perform hard-thresholding on the 1st level detail coefficients
# Estimate the noise level (sigma) from the last entry of wavlt_coeffs, i.e.
# the finest-level detail coefficients when the list comes from pywt.wavedec.
# NOTE(review): this is the *mean* absolute deviation scaled by 1/0.6745; the
# classical robust estimator uses the *median* absolute deviation. Left
# unchanged so the denoised output stays the same -- confirm which is intended.
finest_detail = wavlt_coeffs[-1]
sigma = (1/0.6745) * np.mean(np.absolute(finest_detail - np.mean(finest_detail, None)), None)
# Universal threshold sigma * sqrt(2 * ln(n)) with n = len(x)
# (the Donoho-Johnstone "VisuShrink" threshold, not a SURE-based one).
Td = sigma * np.sqrt(2*np.log(len(x)))
# Hard-threshold every detail level (index 1 onward); the approximation
# coefficients at index 0 are left untouched.
wavlt_coeffs[1:] = [pywt.threshold(detail, value=Td, mode='hard')
                    for detail in wavlt_coeffs[1:]]
@suhaskv
suhaskv / dwt_reconstruct.py
Created December 23, 2020 11:44
VSB Power Line Blog - Reconstruct the signal using the coefficients
# Inverse multilevel DWT: rebuild the signal from the coefficient list using
# the 'db4' wavelet and the 'per' extension mode.
x_dn = pywt.waverec(wavlt_coeffs, wavelet='db4', mode='per')