narulkargunjan

## python27_installation.sh
#Run as root

yum groupinstall "Development tools"
yum install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel

# --no-check-certificate Optional
cd /opt
wget --no-check-certificate https://www.python.org/ftp/python/2.7.6/Python-2.7.6.tar.xz
tar xf Python-2.7.6.tar.xz
cd Python-2.7.6

## HappyBase_Sample.py
import csv
import happybase
import time

batch_size = 1000
host = "0.0.0.0"
file_path = "Request_for_Information_Cases.csv"
namespace = "sample_data"
row_count = 0
start_time = time.time()

## NLP_Demo.py
import os
import codecs

data_directory = os.path.join('..', 'data',
                              'yelp_dataset_challenge_academic_dataset')

businesses_filepath = os.path.join(data_directory,
                                   'yelp_academic_dataset_business.json')

with codecs.open(businesses_filepath, encoding='utf_8') as f:

## image2image_match.R
## This code is based on the code of Roald Bradley Severtson:
## https://github.com/Microsoft/microsoft-r/tree/master/microsoft-ml/Samples/PreTrainedModels/ImageAnalytics/ImageFeaturizer

library(MicrosoftML)

## Change NA to the actual location of the script. Use the absolute path.
workingDir <- "C:/Users/redelang/Documents/Code/projects/image_featurizer/image_featurizer"

if (is.na(workingDir)){
  stop("The working directory needs to be set to the location of the script.")

## K_Means_Clustering.R
#read data in r

iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
View(iris)
summary(iris)
head(iris)

# Randomise the row order to make the data a little more realistic

iris<-iris[sample(1:nrow(iris)),]

## HivePartitioning.md

      
              1 file
            
          
              1 fork
            
          
              0 comments
            
          
              1 star
            
          
                narulkargunjan
                / HivePartitioning.md
            
            
              Created
              August 19, 2017 10:34
                — forked from tim-patterson/HivePartitioning.md
            
              
                Hive Partitioning
              
          
    Hive partitioning scheme for dealing with late arriving data etc.

Over the last few years I've been quite involved with using hive for big data analysis.
I've read many web tutorials and blogs about using Hadoop/Hive/Pig for data analysis, but all of them seem to be oversimplified and targeted at a "my first Hive query" kind of audience instead of showing how to structure Hive tables and queries for real-world use cases, e.g. years of data, recurring batch jobs to build aggregate/reporting tables, and having to deal with late-arriving data, etc.
Most of these tutorials look something like this:
Twitter Data -> hdfs/external hive table
external hive table -> hive query -> results.

  
## gridsearch_caret
##sources: http://caret.r-forge.r-project.org/training.html, http://cran.r-project.org/web/packages/caret/vignettes/caret.pdf

set.seed(107) #set seed to ensure reproduction if required

## do the setup for parallel processing as per the system available (ensure "allowparallel/seed" are set up accordingly)
## for unix/ubuntu etc:
#library(doMC)
#registerDoMC(cores = 5)
##for windows:
#library(doParallel)

## gridsearch_basic
# source: http://statcompute.wordpress.com/2013/06/01/grid-search-for-free-parameters-with-parallel-computing/
library(MASS)
data(Boston)
X <- I(as.matrix(Boston[-14]))
st.X <- scale(X)
Y <- I(as.matrix(Boston[14]))
boston <- data.frame(X = st.X, Y)

# DIVIDE THE WHOLE DATA INTO TWO SEPARATE SETS
set.seed(2013)

## package-list
#Below is a list of packages that are typically required by anyone working with data!
#Beware 1 - THIS IS, BY NO MEANS, A "COMPLETE" LIST, JUST WHAT I FEEL IS APPROPRIATE.
#Beware 2 - MAKE SURE THE INTERNET CONNECTION IS FAST AND STAYS UP THE WHOLE TIME.

install.packages("vars")
install.packages("forecast")
install.packages("ggplot2")
install.packages("rattle")
install.packages("caret")
install.packages("e1071")

## kanjo_base.
import os
import gzip
import json
import re
import string
import pprint
import esmre
from collections import defaultdict, deque
from senti_classifier import senti_classifier
import requests
	#Run as root

	yum groupinstall "Development tools"
	yum install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel

	# --no-check-certificate Optional
	cd /opt
	wget --no-check-certificate https://www.python.org/ftp/python/2.7.6/Python-2.7.6.tar.xz
	tar xf Python-2.7.6.tar.xz
	cd Python-2.7.6
	import csv
	import happybase
	import time

	batch_size = 1000
	host = "0.0.0.0"
	file_path = "Request_for_Information_Cases.csv"
	namespace = "sample_data"
	row_count = 0
	start_time = time.time()
	import os
	import codecs

	data_directory = os.path.join('..', 'data',
	'yelp_dataset_challenge_academic_dataset')

	businesses_filepath = os.path.join(data_directory,
	'yelp_academic_dataset_business.json')

	with codecs.open(businesses_filepath, encoding='utf_8') as f:
	## This code is based on the code of Roald Bradley Severtson:
	## https://github.com/Microsoft/microsoft-r/tree/master/microsoft-ml/Samples/PreTrainedModels/ImageAnalytics/ImageFeaturizer

	library(MicrosoftML)

	## Change NA to the actual location of the script. Use the absolute path.
	workingDir <- "C:/Users/redelang/Documents/Code/projects/image_featurizer/image_featurizer"

	if (is.na(workingDir)){
	stop("The working directory needs to be set to the location of the script.")
	#read data in r

	iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
	View(iris)
	summary(iris)
	head(iris)

	# Randomise the row order to make the data a little more realistic

	iris<-iris[sample(1:nrow(iris)),]
	##sources: http://caret.r-forge.r-project.org/training.html, http://cran.r-project.org/web/packages/caret/vignettes/caret.pdf

	set.seed(107) #set seed to ensure reproduction if required

	## do the setup for parallel processing as per the system available (ensure "allowparallel/seed" are set up accordingly)
	## for unix/ubuntu etc:
	#library(doMC)
	#registerDoMC(cores = 5)
	##for windows:
	#library(doParallel)
	# source: http://statcompute.wordpress.com/2013/06/01/grid-search-for-free-parameters-with-parallel-computing/
	library(MASS)
	data(Boston)
	X <- I(as.matrix(Boston[-14]))
	st.X <- scale(X)
	Y <- I(as.matrix(Boston[14]))
	boston <- data.frame(X = st.X, Y)

	# DIVIDE THE WHOLE DATA INTO TWO SEPARATE SETS
	set.seed(2013)
	#Below is a list of packages that are typically required by anyone working with data!
	#Beware 1 - THIS IS, BY NO MEANS, A "COMPLETE" LIST, JUST WHAT I FEEL IS APPROPRIATE.
	#Beware 2 - MAKE SURE THE INTERNET CONNECTION IS FAST AND STAYS UP THE WHOLE TIME.

	install.packages("vars")
	install.packages("forecast")
	install.packages("ggplot2")
	install.packages("rattle")
	install.packages("caret")
	install.packages("e1071")
	import os
	import gzip
	import json
	import re
	import string
	import pprint
	import esmre
	from collections import defaultdict, deque
	from senti_classifier import senti_classifier
	import requests