LeelaKrishna K leelakrishna

## positweets.hive
-- Drop each table this script works with if it already exists
-- (`if exists` makes the drop a no-op when the table is absent).
drop table if exists raw_tweets;
drop table if exists tweets;
drop table if exists positive_hashtags_per_day;
drop table if exists count_positive_hashtags_per_day;
drop table if exists top5_positive_hashtags_per_day;

-- Staging table with a single string column named `json`.
create table raw_tweets (json string);
-- Load sample.json from the local filesystem (`local`) into the staging table.
load data local inpath 'sample.json' into table raw_tweets;

create table tweets as

## apache-logs-hive.sql
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.

-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access.  They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,

## latency.markdown

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                leelakrishna
                / latency.markdown
            
            
              Created
              April 16, 2014 06:48
                — forked from hellerbarde/latency.markdown
            
          
    Latency numbers every programmer should know

L1 cache reference ......................... 0.5 ns
Branch mispredict ............................ 5 ns
L2 cache reference ........................... 7 ns
Mutex lock/unlock ........................... 25 ns
Main memory reference ...................... 100 ns             
Compress 1K bytes with Zippy ............. 3,000 ns  =   3 µs
Send 2K bytes over 1 Gbps network ....... 20,000 ns  =  20 µs
SSD random read ........................ 150,000 ns  = 150 µs

Read 1 MB sequentially from memory ..... 250,000 ns = 250 µs

  
## presto-vs-impala.txt
Three comparison points:
Presto + RCFile vs Impala + RCFile vs Impala + Parquet

Note: Query time, CPU utilization, Disk read tput (KBRead)

Impala v1.1.1
Presto v0.52
================================================================================================================================
Presto + RCFile:
select ss_sold_date_sk, count(*) from store_sales_rcfile group by 1 order by 1 limit 2000;

## elasticsearch.yml
##################### Elasticsearch Configuration Example #####################

# This file contains an overview of various configuration settings,
# targeted at operations staff. Application developers should
# consult the guide at <http://elasticsearch.org/guide>.
#
# The installation procedure is covered at
# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/setup.html>.
#
# Elasticsearch comes with reasonable defaults for most settings,

## logging.yml
# you can override this by setting a system property, for example -Des.logger.level=DEBUG
es.logger.level: INFO
rootLogger: ${es.logger.level}, console, file
logger:
  # log action execution errors for easier debugging
  action: DEBUG
  # reduce the logging for aws, too much is logged under the default INFO
  com.amazonaws: WARN

  # gateway

## pyget.py
#!/usr/bin/env python
#
# pyget.py
# A Python download accelerator
#
# by Benjamin Huthcins
# MIT License
#
# This file is incomplete, it also relies on wget to download,
# see http://gist.github.com/424080 for newer version

## hifreqwords.scala
// Read sample.txt line by line (sc is presumably the shell's SparkContext -- confirm).
val f = sc.textFile("sample.txt")
// Word count: split each line on single spaces, pair every token with 1,
// then sum the 1s per token.
val wc = f.flatMap(l => l.split(" ")).map(word => (word,1)).reduceByKey(_ + _)
// Swap (word, count) to (count, word) so the count becomes the sort key.
val wc_swap = wc.map(_.swap)
// Sort by count with ascending = false (descending), using 1 partition.
val hifreq_words = wc_swap.sortByKey(false,1)
// Write the sorted (count, word) pairs to the output path "hifreq_words".
hifreq_words.saveAsTextFile("hifreq_words")
// Take the first 20 elements of the sorted result: the 20 most frequent words.
val top20 = hifreq_words.take(20)

## sparksql
// Bring the members of sqlContext into scope.
import sqlContext._
// One record of people.txt: a name and an integer age.
case class Person(name: String, age:Int)
// Parse each comma-separated line into a Person. The path must not contain a
// space after the last '/', otherwise a file named " people.txt" is looked up.
val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
// Register the parsed records under the table name "people".
people.registerAsTable("people")


## spark-wordcount-sorted.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
	-- Drop each table this script works with if it already exists
	-- (`if exists` makes the drop a no-op when the table is absent).
	drop table if exists raw_tweets;
	drop table if exists tweets;
	drop table if exists positive_hashtags_per_day;
	drop table if exists count_positive_hashtags_per_day;
	drop table if exists top5_positive_hashtags_per_day;

	-- Staging table with a single string column named `json`.
	create table raw_tweets (json string);
	-- Load sample.json from the local filesystem (`local`) into the staging table.
	load data local inpath 'sample.json' into table raw_tweets;

	create table tweets as
	-- This is a Hive program. Hive is an SQL-like language that compiles
	-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
	-- Facebook, because it allows them to query enormous Hadoop data
	-- stores using a language much like SQL.

	-- Our logs are stored on the Hadoop Distributed File System, in the
	-- directory /logs/randomhacks.net/access. They're ordinary Apache
	-- logs in *.gz format.
	--
	-- We want to pretend that these gzipped log files are a database table,
	Three comparison points:
	Presto + RCFile vs Impala + RCFile vs Impala + Parquet

	Note: Query time, CPU utilization, Disk read tput (KBRead)

	Impala v1.1.1
	Presto v0.52
	================================================================================================================================
	Presto + RCFile:
	select ss_sold_date_sk, count(*) from store_sales_rcfile group by 1 order by 1 limit 2000;
	##################### Elasticsearch Configuration Example #####################

	# This file contains an overview of various configuration settings,
	# targeted at operations staff. Application developers should
	# consult the guide at <http://elasticsearch.org/guide>.
	#
	# The installation procedure is covered at
	# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/setup.html>.
	#
	# Elasticsearch comes with reasonable defaults for most settings,
	# you can override this by setting a system property, for example -Des.logger.level=DEBUG
	es.logger.level: INFO
	rootLogger: ${es.logger.level}, console, file
	logger:
	# log action execution errors for easier debugging
	action: DEBUG
	# reduce the logging for aws, too much is logged under the default INFO
	com.amazonaws: WARN

	# gateway
	#!/usr/bin/env python
	#
	# pyget.py
	# A Python download accelerator
	#
	# by Benjamin Huthcins
	# MIT License
	#
	# This file is incomplete, it also relies on wget to download,
	# see http://gist.github.com/424080 for newer version
	// Read sample.txt line by line (sc is presumably the shell's SparkContext -- confirm).
	val f = sc.textFile("sample.txt")
	// Word count: split each line on single spaces, pair every token with 1,
	// then sum the 1s per token.
	val wc = f.flatMap(l => l.split(" ")).map(word => (word,1)).reduceByKey(_ + _)
	// Swap (word, count) to (count, word) so the count becomes the sort key.
	val wc_swap = wc.map(_.swap)
	// Sort by count with ascending = false (descending), using 1 partition.
	val hifreq_words = wc_swap.sortByKey(false,1)
	// Write the sorted (count, word) pairs to the output path "hifreq_words".
	hifreq_words.saveAsTextFile("hifreq_words")
	// Take the first 20 elements of the sorted result: the 20 most frequent words.
	val top20 = hifreq_words.take(20)
	// Bring the members of sqlContext into scope.
	import sqlContext._
	// One record of people.txt: a name and an integer age.
	case class Person(name: String, age:Int)
	// Parse each comma-separated line into a Person. The path must not contain a
	// space after the last '/', otherwise a file named " people.txt" is looked up.
	val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
	// Register the parsed records under the table name "people".
	people.registerAsTable("people")
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#