LeelaKrishna K leelakrishna

## positweets.hive
-- Remove the five tables named below if they already exist.
-- NOTE(review): presumably this lets the script be re-run from a clean
-- state - only the re-creation of raw_tweets and tweets is visible here.
drop table if exists raw_tweets;
drop table if exists tweets;
drop table if exists positive_hashtags_per_day;
drop table if exists count_positive_hashtags_per_day;
drop table if exists top5_positive_hashtags_per_day;

-- Staging table with a single string column named json.
-- NOTE(review): presumably each row holds one JSON-encoded tweet - confirm
-- against the contents of sample.json.
create table raw_tweets (json string);
-- LOCAL tells Hive to read sample.json from the local file system rather
-- than from HDFS. The rows are added to raw_tweets.
load data local inpath 'sample.json' into table raw_tweets;

create table tweets as

## apache-logs-hive.sql
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.

-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access.  They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,

## latency.markdown

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                leelakrishna
                / latency.markdown
            
            
              Created
              April 16, 2014 06:48
                — forked from hellerbarde/latency.markdown
            
          
    Latency numbers every programmer should know

L1 cache reference ......................... 0.5 ns
Branch mispredict ............................ 5 ns
L2 cache reference ........................... 7 ns
Mutex lock/unlock ........................... 25 ns
Main memory reference ...................... 100 ns             
Compress 1K bytes with Zippy ............. 3,000 ns  =   3 µs
Send 2K bytes over 1 Gbps network ....... 20,000 ns  =  20 µs
SSD random read ........................ 150,000 ns  = 150 µs

Read 1 MB sequentially from memory ..... 250,000 ns = 250 µs

  
## presto-vs-impala.txt
Three comparison points:
Presto + RCFile vs Impala + RCFile vs Impala + Parquet

Note: Query time, CPU utilization, Disk read tput (KBRead)

Impala v1.1.1
Presto v0.52
================================================================================================================================
Presto + RCFile:
select ss_sold_date_sk, count(*) from store_sales_rcfile group by 1 order by 1 limit 2000;

## pyget.py
#!/usr/bin/env python
#
# pyget.py
# A Python download accelerator
#
# by Benjamin Huthcins
# MIT License
#
# This file is incomplete and also relies on wget to download;
# see http://gist.github.com/424080 for a newer version

## spark-wordcount-sorted.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#

## SparkGrep.scala
package spark.example

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SparkGrep {
	def main(args: Array[String]) {
		if (args.length < 3) {
			System.err.println("Usage: SparkGrep <host> <input_file> <match_term>")

## hbase-rest-examples.sh
#!/usr/bin/env bash
#
# ===================================
# Experiments with the HBase REST API
# ===================================
#
# <http://hbase.apache.org/docs/r0.20.4/api/org/apache/hadoop/hbase/rest/package-summary.html>
#
# Usage:
#

## benchmark-commands.txt
Producer

Setup
bin/kafka-topics.sh --zookeeper esv4-hcl197.grid.linkedin.com:2181 --create --topic test-rep-one --partitions 6 --replication-factor 1
bin/kafka-topics.sh --zookeeper esv4-hcl197.grid.linkedin.com:2181 --create --topic test --partitions 6 --replication-factor 3

Single thread, no replication

bin/kafka-run-class.sh org.apache.kafka.clients.tools.ProducerPerformance test7 50000000 100 -1 acks=1 bootstrap.servers=esv4-hcl198.grid.linkedin.com:9092 buffer.memory=67108864 batch.size=8196

## gist:f90264365b892ef65203
kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives
Found 0 entries
kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives stats
Can't understand argument: stats
kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives -stats
Found 0 entries
kawasaki@hadoop11:~$ hadoop fs -ls dir1
Found 4 items
drwxr-xr-x   - kawasaki kawasaki          0 2014-04-09 06:38 dir1/a
-rw-r--r--   3 kawasaki kawasaki   75288655 2014-04-09 06:44 dir1/bigfile
	-- Remove the five tables named below if they already exist.
	-- NOTE(review): presumably this lets the script be re-run from a clean
	-- state - only the re-creation of raw_tweets and tweets is visible here.
	drop table if exists raw_tweets;
	drop table if exists tweets;
	drop table if exists positive_hashtags_per_day;
	drop table if exists count_positive_hashtags_per_day;
	drop table if exists top5_positive_hashtags_per_day;

	-- Staging table with a single string column named json.
	-- NOTE(review): presumably each row holds one JSON-encoded tweet - confirm
	-- against the contents of sample.json.
	create table raw_tweets (json string);
	-- LOCAL tells Hive to read sample.json from the local file system rather
	-- than from HDFS. The rows are added to raw_tweets.
	load data local inpath 'sample.json' into table raw_tweets;

	create table tweets as
	-- This is a Hive program. Hive is an SQL-like language that compiles
	-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
	-- Facebook, because it allows them to query enormous Hadoop data
	-- stores using a language much like SQL.

	-- Our logs are stored on the Hadoop Distributed File System, in the
	-- directory /logs/randomhacks.net/access. They're ordinary Apache
	-- logs in *.gz format.
	--
	-- We want to pretend that these gzipped log files are a database table,
	Three comparison points:
	Presto + RCFile vs Impala + RCFile vs Impala + Parquet

	Note: Query time, CPU utilization, Disk read tput (KBRead)

	Impala v1.1.1
	Presto v0.52
	================================================================================================================================
	Presto + RCFile:
	select ss_sold_date_sk, count(*) from store_sales_rcfile group by 1 order by 1 limit 2000;
	#!/usr/bin/env python
	#
	# pyget.py
	# A Python download accelerator
	#
	# by Benjamin Huthcins
	# MIT License
	#
	# This file is incomplete and also relies on wget to download;
	# see http://gist.github.com/424080 for a newer version
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	package spark.example

	import org.apache.spark.SparkContext
	import org.apache.spark.SparkContext._
	import org.apache.spark.SparkConf

	object SparkGrep {
	def main(args: Array[String]) {
	if (args.length < 3) {
	System.err.println("Usage: SparkGrep <host> <input_file> <match_term>")
	#!/usr/bin/env bash
	#
	# ===================================
	# Experiments with the HBase REST API
	# ===================================
	#
	# <http://hbase.apache.org/docs/r0.20.4/api/org/apache/hadoop/hbase/rest/package-summary.html>
	#
	# Usage:
	#
	Producer

	Setup
	bin/kafka-topics.sh --zookeeper esv4-hcl197.grid.linkedin.com:2181 --create --topic test-rep-one --partitions 6 --replication-factor 1
	bin/kafka-topics.sh --zookeeper esv4-hcl197.grid.linkedin.com:2181 --create --topic test --partitions 6 --replication-factor 3

	Single thread, no replication

	bin/kafka-run-class.sh org.apache.kafka.clients.tools.ProducerPerformance test7 50000000 100 -1 acks=1 bootstrap.servers=esv4-hcl198.grid.linkedin.com:9092 buffer.memory=67108864 batch.size=8196
	kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives
	Found 0 entries
	kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives stats
	Can't understand argument: stats
	kawasaki@hadoop11:~$ hdfs cacheadmin -listDirectives -stats
	Found 0 entries
	kawasaki@hadoop11:~$ hadoop fs -ls dir1
	Found 4 items
	drwxr-xr-x - kawasaki kawasaki 0 2014-04-09 06:38 dir1/a
	-rw-r--r-- 3 kawasaki kawasaki 75288655 2014-04-09 06:44 dir1/bigfile