Sulaiman Malik (smalik)

@smalik
smalik / mad.sql
Created September 27, 2018 21:17
SQL code for median absolute deviation (MAD)
-- median absolute deviation of value per id: median(|value - per-id median|)
SELECT id,
       MEDIAN(ABS(value - med)) AS mad
FROM (
    SELECT id, value, MEDIAN(value) OVER (PARTITION BY id) AS med
    FROM mytable
) t
GROUP BY id;
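A quick cross-check of the same statistic, as a sketch in pandas; the DataFrame df and its id and value columns are assumed to mirror mytable:

import pandas as pd

# per-id median, then the median of absolute deviations from it
med = df.groupby("id")["value"].transform("median")
mad_by_id = (df["value"] - med).abs().groupby(df["id"]).median()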
@smalik
smalik / hist_to_distr.py
Created May 10, 2018 17:03
Fit distribution to histogram
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# create some normal random noisy data
ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20
# plot a density histogram (the old normed= argument was removed from matplotlib; use density=)
plt.hist(ser, density=True)
# fit a normal distribution to the data and overlay its pdf
loc, scale = stats.norm.fit(ser)
x = np.linspace(ser.min(), ser.max(), 100)
plt.plot(x, stats.norm.pdf(x, loc, scale))
plt.show()
@smalik
smalik / fit_weibull.py
Created May 10, 2018 17:00
Fit a Weibull distribution to data and estimate its parameters
from scipy.stats import exponweib
from scipy.optimize import fmin
import numpy as np
# x is your data array (positive values)
# returns [shape, scale]
def fitweibull(x):
    # negative log-likelihood of a two-parameter Weibull (exponweib with a=1, loc=0)
    def optfun(theta):
        return -np.sum(np.log(exponweib.pdf(x, 1, theta[0], scale=theta[1], loc=0)))
    logx = np.log(x)
    # rough initial guesses for shape and scale, then minimize the negative log-likelihood
    shape0 = 1.2 / np.std(logx)
    scale0 = np.exp(np.mean(logx) + 0.572 / shape0)
    return fmin(optfun, [shape0, scale0], xtol=0.01, ftol=0.01, disp=0)
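A short usage sketch; the sample data and its true parameters below are made up for illustration:

import numpy as np

data = np.random.weibull(1.5, 1000) * 10.0   # synthetic sample: shape 1.5, scale 10
shape, scale = fitweibull(data)
print(shape, scale)                          # estimates should land near the true values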
@smalik
smalik / get_low_variance_columns.py
Created December 4, 2017 19:55
get_low_variance_columns
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
# Convenience wrapper around the VarianceThreshold transformer: pass in a pandas
# DataFrame and get a DataFrame back, plus the list of low-variance columns.
def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """Find columns whose variance is <= thresh; drop them when autoremove=True."""
    cols = [c for c in (columns or dframe.columns) if c not in (skip_columns or [])]
    vt = VarianceThreshold(threshold=thresh).fit(dframe[cols])
    low_variance = [c for c, keep in zip(cols, vt.get_support()) if not keep]
    if autoremove:
        dframe = dframe.drop(columns=low_variance)
    return dframe, low_variance

-- Vertica: compressed storage used, in GB, per schema and table
-- (first from column_storage, then from projection_storage)
SELECT anchor_table_schema,
       anchor_table_name,
       SUM(used_bytes) / (1024^3) AS used_compressed_gb
FROM v_monitor.column_storage
GROUP BY anchor_table_schema,
         anchor_table_name
ORDER BY SUM(used_bytes) DESC;

SELECT anchor_table_schema,
       anchor_table_name,
       SUM(used_bytes) / (1024^3) AS used_compressed_gb
FROM v_monitor.projection_storage
GROUP BY anchor_table_schema,
         anchor_table_name
ORDER BY SUM(used_bytes) DESC;

# Ripped off from http://stackoverflow.com/questions/21754319/rule-of-thumb-for-memory-size-of-datasets-in-r
# original author Carlos Cinelli
howMuchRAM <- function(ncol, nrow, cushion = 3) {
  # 40 bytes of overhead per column
  colBytes <- ncol * 40
  # 8 bytes per (numeric) cell
  cellBytes <- ncol * nrow * 8
  # rough estimate in GB, with a cushion for the copies R makes while working
  (colBytes + cellBytes) * cushion / 2^30
}
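The estimate is plain arithmetic, so the same rule of thumb works anywhere; for example, in Python, a hypothetical 1,000,000-row by 100-column numeric frame comes out at roughly:

ncol, nrow, cushion = 100, 1_000_000, 3
gb = (ncol * 40 + ncol * nrow * 8) * cushion / 2**30
print(round(gb, 2))   # about 2.24 GB once the 3x cushion is applied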
@smalik
smalik / filesplits.sh
Created November 6, 2015 18:32
script to split a large CSV file into smaller, sequentially named parts
#!/bin/bash
fname=devices1518.csv
HDR=$(head -1 "$fname")        # Pick up CSV header line to apply to each file
split -l 1000000 "$fname" prt  # Split the file into chunks of 1M lines each
n=1
for f in prt*                  # iterate over chunks
do
  echo "$HDR" > Part${n}       # Write out header to the new file
  cat "$f" >> Part${n}         # Add in the lines that were split off
  rm "$f"
  n=$((n + 1))
done
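When shell tools are not an option, the same split can be sketched in Python with pandas chunked reading; the file names below are illustrative:

import pandas as pd

# stream the CSV in 1M-row chunks; to_csv writes the header for each part by default
for n, chunk in enumerate(pd.read_csv("devices1518.csv", chunksize=1_000_000), start=1):
    chunk.to_csv(f"Part{n}.csv", index=False)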

# Point R at a library on the data partition, then install every package named in a CSV
.libPaths("/<Data partition>/R/library")
plist <- read.csv('~/Downloads/packages.csv')
install.packages(as.character(plist[, 1]))

# Get memory allocation on an object-by-object basis: loop over the workspace
# and print the size of every object.
for (itm in ls()) {
  print(formatC(c(itm, object.size(get(itm))), format = "d", big.mark = ",", width = 30), quote = F)
}
# Summarize R's internal memory use by object type
print(memory.profile())