Nitin Kumar (NitinKumar94)

>>> Invoking Spark class now >>>
Traceback (most recent call last):
  File "/usr/hdp/current/spark-client/AnalyticsJar/wordcount.py", line 29, in <module>
    sc = SparkContext(conf=conf)
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 107, in __init__
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 155, in _do_init
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 201, in _initialize_context
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/py4j/java_gateway.py", line 701, in __call__
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/py4j/protocol.py", line 300, in get_return_value
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns='uri:oozie:workflow:0.4' name='sparkjob'>
    <start to='spark-process' />
    <action name='spark-process'>
        <spark xmlns='uri:oozie:spark-action:0.1'>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>oozie.service.SparkConfigurationService.spark.configurations</name>
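
The workflow above is cut off inside the configuration block. For orientation, a minimal complete spark-action workflow usually has roughly the shape sketched below; the master, name, and jar values are illustrative placeholders, not values recovered from the gist:

<!-- A minimal sketch of a complete spark-action workflow, assuming a YARN
     cluster; master, name, and jar are illustrative placeholders. -->
<workflow-app xmlns='uri:oozie:workflow:0.4' name='sparkjob'>
    <start to='spark-process' />
    <action name='spark-process'>
        <spark xmlns='uri:oozie:spark-action:0.1'>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <name>wordcount</name>
            <jar>${nameNode}/user/oozie/apps/wordcount.py</jar>
        </spark>
        <ok to='end' />
        <error to='fail' />
    </action>
    <kill name='fail'>
        <message>Spark action failed: [${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name='end' />
</workflow-app>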
@NitinKumar94
NitinKumar94 / difference.py
Created November 20, 2015 11:08
Sample code for computing a difference
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def calc_diff():
    # Point SPARK_HOME at the local HDP Spark client before creating the context
    os.environ['SPARK_HOME'] = "/usr/hdp/current/spark-client/bin"
    conf = SparkConf().setMaster("local[*]").setAppName("TestApp")
    sc = SparkContext(conf=conf)
    sqlcontext = HiveContext(sc)
    df = sqlcontext.createDataFrame(
        [(1, "a", 23.0), (2, "B", 23.0), (3, "C", 33.0)], ("x1", "x2", "x3"))
    r = df.select("x3")
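
The gist ends right after the select, before any difference is actually computed. As a hedged illustration only (this continuation is an assumption, not the author's code), a column-wise difference inside calc_diff() could look like:

    # Hypothetical continuation: difference between two numeric columns
    diff_df = df.select((df.x3 - df.x1).alias("diff"))
    for row in diff_df.collect():
        print(row)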
@NitinKumar94
NitinKumar94 / Aggregate.q
Created May 5, 2016 06:30
Aggregation query for benchmarking
-- Query for basic aggregation for performance benchmarking
set hive.tez.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.tez.container.size=${hiveconf:CONTAINER_SIZE};
set hive.tez.cpu.vcores=${hiveconf:VCORES};
--set tez.grouping.split-count=15;

use benchmarking;

--explain
create table ${hiveconf:TARGET_TABLE_NAME} as
select
    b.subscriber_id as subscriber_id,
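
The query is truncated after its first select column. As a hedged sketch of the general shape such a benchmarking aggregation usually takes (every name beyond subscriber_id below is an illustrative assumption, not recovered from the gist):

-- Hypothetical shape only; the original column list and source table are cut off
create table ${hiveconf:TARGET_TABLE_NAME} as
select
    b.subscriber_id as subscriber_id,
    count(*)        as record_count,     -- assumed aggregate
    sum(b.duration) as total_duration    -- assumed column
from benchmarking.subscriber_usage b     -- assumed source table
group by b.subscriber_id;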
"""
This class is designed for demostrating side_effect functionality of mocked classes
NOTE: PropertiesReader is a utility for reading java style property files. Returns the property file as a dictionary
"""
import PropertiesReader
from GenericProperties import RESOURCES_PATH
import os
class FakeClass(object):
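
Since the class is shown only up to its declaration, here is a minimal self-contained sketch (not the original gist's code) of how unittest.mock's side_effect is typically wired to a fake like this, assuming PropertiesReader exposes a read(path) function:

from unittest import mock

# Hypothetical fake: return a fixed dict instead of parsing a property file.
def fake_read(path):
    return {"resources.path": "/tmp/resources"}

# side_effect makes the mock delegate to our fake implementation
# instead of returning a bare MagicMock.
reader = mock.MagicMock()
reader.read.side_effect = fake_read

props = reader.read("/tmp/app.properties")
assert props == {"resources.path": "/tmp/resources"}
reader.read.assert_called_once_with("/tmp/app.properties")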
-- Run this script after inserting records using Spark.
use test_inserts;

-- Should not spawn a map-reduce job; Hive answers count(*) from table
-- statistics immediately, which can give a wrong result after Spark inserts.
select count(*) from test_hive_inserts;

-- Should display all inserted records, from both Hive and Spark.
select * from test_hive_inserts;
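
As a hedged aside (not part of the original gist): the immediate, possibly wrong count comes from Hive answering count(*) out of table statistics, and disabling the stats shortcut forces a real scan and a correct count:

-- Not in the original gist: force Hive to actually compute count(*)
-- instead of reading it from possibly stale table statistics.
set hive.compute.query.using.stats=false;
select count(*) from test_hive_inserts;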
@NitinKumar94
NitinKumar94 / restart_hadoop_components.sh
Created September 16, 2016 11:13
This script restarts all Tez clients, Hive clients, HCat clients, the Hive server, the Hive metastore and the WebHCat server using the Ambari REST API
#!/bin/bash
curl -uadmin:admin -H 'X-Requested-By: ambari' -X POST -d '
{
    "RequestInfo":{
        "command":"RESTART",
        "context":"Restart Tez Clients on all nodes of the cluster",
        "operation_level":{
            "level":"HOST",
            "cluster_name":"SubexAnalytics"
hive> explain
select
    b.subscriber_id,
    b.handset,
    b.connection_type,
    b.agent_code,
    b.gender,
    b.age_in_network,
    b.rate_plan,
    b.caller_tune,
@NitinKumar94
NitinKumar94 / yarn-site.xml
Last active September 22, 2016 04:51
Configuring YARN queues
<!-- Here we are creating 3 queues (launcher1, launcher2 and launcher3) as children of the root queue -->
<!-- Make the following changes to the yarn-site.xml file -->
<configuration>
    <!-- This property is the default value for YARN; you may or may not set it explicitly -->
    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
    </property>
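
The snippet ends before the queues themselves are declared. As a hedged sketch, the three child queues would typically be defined with CapacityScheduler properties like the following; the 30/30/40 split is an illustrative assumption, and on stock Hadoop these settings conventionally live in capacity-scheduler.xml:

    <!-- Illustrative continuation, not from the original gist: declare the
         child queues and give each a share of cluster capacity (must sum to 100). -->
    <property>
        <name>yarn.scheduler.capacity.root.queues</name>
        <value>launcher1,launcher2,launcher3</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher1.capacity</name>
        <value>30</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher2.capacity</name>
        <value>30</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher3.capacity</name>
        <value>40</value>
    </property>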