Nitin Kumar (NitinKumar94)

>>> Invoking Spark class now >>>
Traceback (most recent call last):
  File "/usr/hdp/current/spark-client/AnalyticsJar/wordcount.py", line 29, in <module>
    sc = SparkContext(conf=conf)
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 107, in __init__
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 155, in _do_init
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/pyspark/context.py", line 201, in _initialize_context
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/py4j/java_gateway.py", line 701, in __call__
  File "/hadoop/yarn/local/filecache/662/spark-core_2.10-1.1.0.jar/py4j/protocol.py", line 300, in get_return_value
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns='uri:oozie:workflow:0.4' name='sparkjob'>
    <start to='spark-process' />
    <action name='spark-process'>
        <spark xmlns='uri:oozie:spark-action:0.1'>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <configuration>
                <property>
                    <name>oozie.service.SparkConfigurationService.spark.configurations</name>
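
The workflow above is cut off inside the configuration block. For orientation, a minimal complete spark-action workflow usually has roughly the shape sketched below; the master, name, and jar values are illustrative placeholders, not values recovered from the gist:

<!-- A minimal sketch of a complete spark-action workflow, assuming a YARN
     cluster; master, name, and jar are illustrative placeholders. -->
<workflow-app xmlns='uri:oozie:workflow:0.4' name='sparkjob'>
    <start to='spark-process' />
    <action name='spark-process'>
        <spark xmlns='uri:oozie:spark-action:0.1'>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>yarn-cluster</master>
            <name>wordcount</name>
            <jar>${nameNode}/user/oozie/apps/wordcount.py</jar>
        </spark>
        <ok to='end' />
        <error to='fail' />
    </action>
    <kill name='fail'>
        <message>Spark action failed: [${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name='end' />
</workflow-app>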
@NitinKumar94
NitinKumar94 / difference.py
Created November 20, 2015 11:08
Sample code for computing a difference
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def calc_diff():
    # Point SPARK_HOME at the local HDP Spark client before creating the context
    os.environ['SPARK_HOME'] = "/usr/hdp/current/spark-client/bin"
    conf = SparkConf().setMaster("local[*]").setAppName("TestApp")
    sc = SparkContext(conf=conf)
    sqlcontext = HiveContext(sc)
    df = sqlcontext.createDataFrame(
        [(1, "a", 23.0), (2, "B", 23.0), (3, "C", 33.0)], ("x1", "x2", "x3"))
    r = df.select("x3")
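
The gist ends right after the select, before any difference is actually computed. As a hedged illustration only (this continuation is an assumption, not the author's code), a column-wise difference inside calc_diff() could look like:

    # Hypothetical continuation: difference between two numeric columns
    diff_df = df.select((df.x3 - df.x1).alias("diff"))
    for row in diff_df.collect():
        print(row)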
@NitinKumar94
NitinKumar94 / Aggregate.q
Created May 5, 2016 06:30
Aggregation query for benchmarking
-- Query for basic aggregation for performance benchmarking
set hive.tez.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.tez.container.size=${hiveconf:CONTAINER_SIZE};
set hive.tez.cpu.vcores=${hiveconf:VCORES};
--set tez.grouping.split-count=15;

use benchmarking;

--explain
create table ${hiveconf:TARGET_TABLE_NAME} as
select
    b.subscriber_id as subscriber_id,
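
The query is truncated after its first select column. As a hedged sketch of the general shape such a benchmarking aggregation usually takes (every name beyond subscriber_id below is an illustrative assumption, not recovered from the gist):

-- Hypothetical shape only; the original column list and source table are cut off
create table ${hiveconf:TARGET_TABLE_NAME} as
select
    b.subscriber_id as subscriber_id,
    count(*)        as record_count,     -- assumed aggregate
    sum(b.duration) as total_duration    -- assumed column
from benchmarking.subscriber_usage b     -- assumed source table
group by b.subscriber_id;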
"""
This class is designed for demostrating side_effect functionality of mocked classes
NOTE: PropertiesReader is a utility for reading java style property files. Returns the property file as a dictionary
"""
import PropertiesReader
from GenericProperties import RESOURCES_PATH
import os
class FakeClass(object):
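
Since the class is shown only up to its declaration, here is a minimal self-contained sketch (not the original gist's code) of how unittest.mock's side_effect is typically wired to a fake like this, assuming PropertiesReader exposes a read(path) function:

from unittest import mock

# Hypothetical fake: return a fixed dict instead of parsing a property file.
def fake_read(path):
    return {"resources.path": "/tmp/resources"}

# side_effect makes the mock delegate to our fake implementation
# instead of returning a bare MagicMock.
reader = mock.MagicMock()
reader.read.side_effect = fake_read

props = reader.read("/tmp/app.properties")
assert props == {"resources.path": "/tmp/resources"}
reader.read.assert_called_once_with("/tmp/app.properties")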
-- Run this script after inserting records using Spark.
use test_inserts;

-- Should not spawn a map-reduce job; Hive answers count(*) from table
-- statistics immediately, which can give a wrong result after Spark inserts.
select count(*) from test_hive_inserts;

-- Should display all inserted records, from both Hive and Spark.
select * from test_hive_inserts;
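
As a hedged aside (not part of the original gist): the immediate, possibly wrong count comes from Hive answering count(*) out of table statistics, and disabling the stats shortcut forces a real scan and a correct count:

-- Not in the original gist: force Hive to actually compute count(*)
-- instead of reading it from possibly stale table statistics.
set hive.compute.query.using.stats=false;
select count(*) from test_hive_inserts;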
@NitinKumar94
NitinKumar94 / restart_hadoop_components.sh
Created September 16, 2016 11:13
This script restarts all Tez clients, Hive clients, HCat clients, the Hive server, the Hive metastore and the WebHCat server using the Ambari REST API
#!/bin/bash
curl -uadmin:admin -H 'X-Requested-By: ambari' -X POST -d '
{
    "RequestInfo":{
        "command":"RESTART",
        "context":"Restart Tez Clients on all nodes of the cluster",
        "operation_level":{
            "level":"HOST",
            "cluster_name":"SubexAnalytics"
hive> explain
select
    b.subscriber_id,
    b.handset,
    b.connection_type,
    b.agent_code,
    b.gender,
    b.age_in_network,
    b.rate_plan,
    b.caller_tune,
@NitinKumar94
NitinKumar94 / yarn-site.xml
Last active September 22, 2016 04:51
Configuring YARN queues
<!-- Here we are creating 3 queues (launcher1, launcher2 and launcher3) as children of the root queue -->
<!-- Make the following changes to the yarn-site.xml file -->
<configuration>
    <!-- This property is the default value for YARN; you may or may not set it explicitly -->
    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
    </property>
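
The snippet ends before the queues themselves are declared. As a hedged sketch, the three child queues would typically be defined with CapacityScheduler properties like the following; the 30/30/40 split is an illustrative assumption, and on stock Hadoop these settings conventionally live in capacity-scheduler.xml:

    <!-- Illustrative continuation, not from the original gist: declare the
         child queues and give each a share of cluster capacity (must sum to 100). -->
    <property>
        <name>yarn.scheduler.capacity.root.queues</name>
        <value>launcher1,launcher2,launcher3</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher1.capacity</name>
        <value>30</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher2.capacity</name>
        <value>30</value>
    </property>
    <property>
        <name>yarn.scheduler.capacity.root.launcher3.capacity</name>
        <value>40</value>
    </property>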