Sanjay Akella (avrsanjay)

  • Cloudwick Technologies Inc.
  • Newark, California
#snippet to get the date N days before a particular date
---------------------------------------------------------
from datetime import datetime, timedelta

start = datetime.strptime("20171012", "%Y%m%d")
end = start - timedelta(days=10)  # 10 days before the start date
print(start, end)
#snippet to get the previous day's date
---------------------------------------
date --date="yesterday" +%Y/%m/%d
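
The same thing in Python, for scripts that already use datetime (a minimal sketch):

from datetime import datetime, timedelta

yesterday = datetime.now() - timedelta(days=1)
print(yesterday.strftime("%Y/%m/%d"))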
avrsanjay / pyhive2csv.py
Created October 11, 2017 17:49
Hive to CSV using Python
import pyhs2
import getpass
import sys
import csv
if __name__ == "__main__":
    query = sys.argv[1]  # query without a semicolon at the end
    pwd = getpass.getpass()
    data = []
    header = []
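
The preview cuts off here; a hedged sketch of how the rest of the script might continue, assuming a HiveServer2 on localhost:10000 with PLAIN authentication (the host, port, user, and output file name are assumptions, not from the original gist):

    # continues the __main__ block above
    with pyhs2.connect(host='localhost', port=10000,
                       authMechanism='PLAIN', user='hive',
                       password=pwd, database='default') as conn:
        with conn.cursor() as cur:
            cur.execute(query)
            header = [col['columnName'] for col in cur.getSchema()]
            data = cur.fetch()

    with open('output.csv', 'w') as f:  # hypothetical output file
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(data)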
#!/usr/bin/python
# scaffolding for a script that shells out to commands and returns
# multiple values through a named tuple
import subprocess
import shlex
import time
import os, sys, argparse
import re, collections

DEBUG = True
return_collection = collections.namedtuple('multi_return', ['x', 'y', 'z'])
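
A hedged sketch of how the pieces above might fit together (the run_cmd helper and the meaning of the x/y/z fields are assumptions):

def run_cmd(cmd):
    # run a shell command; pack return code, stdout, and stderr into the tuple
    p = subprocess.Popen(shlex.split(cmd),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    return return_collection(x=p.returncode, y=out, z=err)

result = run_cmd('hostname')
if DEBUG:
    print(result.x, result.y, result.z)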
#!/bin/sh
#
# Sample init script that starts and stops a service on cluster nodes.
#
# chkconfig: - 95 07
# description: this script is used to set up nodes in a cluster
# processname: myscript

# the body was cut off in the preview; a standard skeleton would be (sketch):
case "$1" in
  start) ;;  # start the service here
  stop)  ;;  # stop the service here
  *) echo "Usage: $0 {start|stop}" ;;
esac
avrsanjay / Logger.scala
Last active August 27, 2019 13:32
Spark-Streaming-Example
package com.cloudwick.streaming.spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.Logging
/** Utility functions for Spark Streaming examples. */
object StreamingExamples extends Logging {
  /** Set reasonable logging levels for streaming if the user has not configured log4j. */
  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      logInfo("Setting log level to [WARN] for streaming example. " +
        "To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
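
For comparison, PySpark can set the same level directly on the context (sc is assumed to be an existing SparkContext):

sc.setLogLevel("WARN")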
avrsanjay / Kafka-Custom-Producer-ReadME
Last active January 23, 2016 10:15
This gist is useful for people writing their own Kafka producer.
Note: all of this was done on an HDP stand-alone cluster.
step1
-----
-> Keep your data source, i.e. Sample_Data_generator.csv, in the root folder of the project in your IDE (Eclipse in my case)
-> mvn clean
-> mvn install
-> find the .jar file in the target folder and upload it onto the HDP node
step2
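
The preview ends at step 2. For a quick end-to-end test of the topic, a minimal producer sketch in Python (assumes the kafka-python package and HDP's default broker port 6667; the host and topic names below are hypothetical):

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='sandbox.hortonworks.com:6667')
with open('Sample_Data_generator.csv') as f:
    for line in f:
        producer.send('sample_topic', line.strip().encode('utf-8'))
producer.flush()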
avrsanjay / weatherDLL.sql
Created January 20, 2016 23:23
Hive DDL for the NCDC weather dataset
create table ncdcweather(
    stn int,
    wban int,
    yearmoda int,
    temp double,
    temp_count int,
    dewp double,
    dewp_count int,
    slp double,
    slp_count int,
avrsanjay / extract.sh
Created January 20, 2016 23:21
Shell script to download the NCDC weather dataset, extract the files, convert them, remove headers, replace spaces with a comma delimiter, and merge everything into a single file
#!/bin/bash
# the (( )) arithmetic loop below is a bashism, so use bash rather than sh
count=0
mkdir -p ./tarfiles
for (( i = 1901; i < 2016; i++ ))
do
    wget -O ./tarfiles/$i.tar ftp://ftp.ncdc.noaa.gov/pub/data/gsod/$i/gsod_$i.tar
    mkdir -p ./tarfiles/extracted/$i
    tar -xvf ./tarfiles/$i.tar -C ./tarfiles/extracted/$i
    count=$(ls ./tarfiles/extracted/$i/ | wc -l)
    if [ $count -eq 0 ]
    then
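
The preview cuts off inside the empty-directory check. A hedged Python 3 sketch of the remaining steps the description promises (drop headers, replace runs of spaces with commas, merge into one file); the .op.gz layout inside each yearly tar and the output file name are assumptions:

import glob, gzip, re

with open('merged.csv', 'w') as out:
    for path in glob.glob('./tarfiles/extracted/*/*.op.gz'):
        with gzip.open(path, 'rt') as f:
            next(f)  # drop the header row of each station file
            for line in f:
                out.write(re.sub(' +', ',', line.strip()) + '\n')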
avrsanjay / test1.txt
Last active January 21, 2016 23:49
snippets to work with CSV & TSV files
#comma separated file
#getting only the required columns into an RDD (split each line once, then pick columns 0, 3, 4)
val csv = sc.textFile("C:/Users/avrsa/Downloads/zipcode.csv").map(_.split(",")).map(a => (a(0), a(3), a(4)))
#tab separated file
input : 01000 123:456:789
output: 01000,123
        01000,456
        01000,789
val tsv = sc.textFile("C:/Users/avrsa/Downloads/skuData.tsv").filter(_.nonEmpty).map(x => (x.split('\t')(0), x.split('\t')(1))).flatMapValues(_.split(':'))
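
The same TSV flatten in PySpark, for reference (a sketch assuming an existing SparkContext sc):

tsv = (sc.textFile("C:/Users/avrsa/Downloads/skuData.tsv")
         .filter(lambda line: line)
         .map(lambda line: (line.split('\t')[0], line.split('\t')[1]))
         .flatMapValues(lambda v: v.split(':')))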