Sulaiman Malik (smalik)

@smalik
smalik / mad.sql
Created September 27, 2018 21:17
SQL code for median absolute deviation (MAD)
-- median absolute deviation of value per id: median(|value - per-id median|)
SELECT id,
       MEDIAN(ABS(value - med)) AS mad
FROM (
    SELECT id, value, MEDIAN(value) OVER (PARTITION BY id) AS med
    FROM mytable
) t
GROUP BY id;
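A quick cross-check of the same statistic, as a sketch in pandas; the DataFrame df and its id and value columns are assumed to mirror mytable:

import pandas as pd

# per-id median, then the median of absolute deviations from it
med = df.groupby("id")["value"].transform("median")
mad_by_id = (df["value"] - med).abs().groupby(df["id"]).median()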
@smalik
smalik / hist_to_distr.py
Created May 10, 2018 17:03
Fit distribution to histogram
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# create some normal random noisy data
ser = 50*np.random.rand() * np.random.normal(10, 10, 100) + 20
# plot a density histogram (the old normed= argument was removed from matplotlib; use density=)
plt.hist(ser, density=True)
# fit a normal distribution to the data and overlay its pdf
loc, scale = stats.norm.fit(ser)
x = np.linspace(ser.min(), ser.max(), 100)
plt.plot(x, stats.norm.pdf(x, loc, scale))
plt.show()
@smalik
smalik / fit_weibull.py
Created May 10, 2018 17:00
Fit a Weibull distribution to data and estimate its parameters
from scipy.stats import exponweib
from scipy.optimize import fmin
import numpy as np
# x is your data array (positive values)
# returns [shape, scale]
def fitweibull(x):
    # negative log-likelihood of a two-parameter Weibull (exponweib with a=1, loc=0)
    def optfun(theta):
        return -np.sum(np.log(exponweib.pdf(x, 1, theta[0], scale=theta[1], loc=0)))
    logx = np.log(x)
    # rough initial guesses for shape and scale, then minimize the negative log-likelihood
    shape0 = 1.2 / np.std(logx)
    scale0 = np.exp(np.mean(logx) + 0.572 / shape0)
    return fmin(optfun, [shape0, scale0], xtol=0.01, ftol=0.01, disp=0)
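A short usage sketch; the sample data and its true parameters below are made up for illustration:

import numpy as np

data = np.random.weibull(1.5, 1000) * 10.0   # synthetic sample: shape 1.5, scale 10
shape, scale = fitweibull(data)
print(shape, scale)                          # estimates should land near the true values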
@smalik
smalik / get_low_variance_columns.py
Created December 4, 2017 19:55
get_low_variance_columns
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
# Convenience wrapper around the VarianceThreshold transformer: pass in a pandas
# DataFrame and get a DataFrame back, plus the list of low-variance columns.
def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """Find columns whose variance is <= thresh; drop them when autoremove=True."""
    cols = [c for c in (columns or dframe.columns) if c not in (skip_columns or [])]
    vt = VarianceThreshold(threshold=thresh).fit(dframe[cols])
    low_variance = [c for c, keep in zip(cols, vt.get_support()) if not keep]
    if autoremove:
        dframe = dframe.drop(columns=low_variance)
    return dframe, low_variance

-- Vertica: compressed storage used, in GB, per schema and table
-- (first from column_storage, then from projection_storage)
SELECT anchor_table_schema,
       anchor_table_name,
       SUM(used_bytes) / (1024^3) AS used_compressed_gb
FROM v_monitor.column_storage
GROUP BY anchor_table_schema,
         anchor_table_name
ORDER BY SUM(used_bytes) DESC;

SELECT anchor_table_schema,
       anchor_table_name,
       SUM(used_bytes) / (1024^3) AS used_compressed_gb
FROM v_monitor.projection_storage
GROUP BY anchor_table_schema,
         anchor_table_name
ORDER BY SUM(used_bytes) DESC;

# Ripped off from http://stackoverflow.com/questions/21754319/rule-of-thumb-for-memory-size-of-datasets-in-r
# original author Carlos Cinelli
howMuchRAM <- function(ncol, nrow, cushion = 3) {
  # 40 bytes of overhead per column
  colBytes <- ncol * 40
  # 8 bytes per (numeric) cell
  cellBytes <- ncol * nrow * 8
  # rough estimate in GB, with a cushion for the copies R makes while working
  (colBytes + cellBytes) * cushion / 2^30
}
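The estimate is plain arithmetic, so the same rule of thumb works anywhere; for example, in Python, a hypothetical 1,000,000-row by 100-column numeric frame comes out at roughly:

ncol, nrow, cushion = 100, 1_000_000, 3
gb = (ncol * 40 + ncol * nrow * 8) * cushion / 2**30
print(round(gb, 2))   # about 2.24 GB once the 3x cushion is applied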
@smalik
smalik / filesplits.sh
Created November 6, 2015 18:32
script to split a large CSV file into smaller, sequentially named parts
#!/bin/bash
fname=devices1518.csv
HDR=$(head -1 "$fname")        # Pick up CSV header line to apply to each file
split -l 1000000 "$fname" prt  # Split the file into chunks of 1M lines each
n=1
for f in prt*                  # iterate over chunks
do
  echo "$HDR" > Part${n}       # Write out header to the new file
  cat "$f" >> Part${n}         # Add in the lines that were split off
  rm "$f"
  n=$((n + 1))
done
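When shell tools are not an option, the same split can be sketched in Python with pandas chunked reading; the file names below are illustrative:

import pandas as pd

# stream the CSV in 1M-row chunks; to_csv writes the header for each part by default
for n, chunk in enumerate(pd.read_csv("devices1518.csv", chunksize=1_000_000), start=1):
    chunk.to_csv(f"Part{n}.csv", index=False)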

# Point R at a library on the data partition, then install every package named in a CSV
.libPaths("/<Data partition>/R/library")
plist <- read.csv('~/Downloads/packages.csv')
install.packages(as.character(plist[, 1]))

# Get memory allocation on an object-by-object basis: loop over the workspace
# and print the size of every object.
for (itm in ls()) {
  print(formatC(c(itm, object.size(get(itm))), format = "d", big.mark = ",", width = 30), quote = F)
}
# Summarize R's internal memory use by object type
print(memory.profile())