Skip to content

Instantly share code, notes, and snippets.

-- Remove the derived tables left behind by any earlier run
drop table if exists top5_positive_hashtags_per_day;
drop table if exists count_positive_hashtags_per_day;
drop table if exists positive_hashtags_per_day;
drop table if exists tweets;

-- Recreate the staging table (a single string column named json)
-- and fill it from the local file sample.json
drop table if exists raw_tweets;
create table raw_tweets (
    json string
);
load data local inpath 'sample.json' into table raw_tweets;
create table tweets as
-- This is a Hive program. Hive is an SQL-like language that compiles
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
-- Facebook, because it allows them to query enormous Hadoop data
-- stores using a language much like SQL.
-- Our logs are stored on the Hadoop Distributed File System, in the
-- directory /logs/randomhacks.net/access. They're ordinary Apache
-- logs in *.gz format.
--
-- We want to pretend that these gzipped log files are a database table,

Latency numbers every programmer should know

L1 cache reference ......................... 0.5 ns
Branch mispredict ............................ 5 ns
L2 cache reference ........................... 7 ns
Mutex lock/unlock ........................... 25 ns
Main memory reference ...................... 100 ns             
Compress 1K bytes with Zippy ............. 3,000 ns  =   3 µs
Send 2K bytes over 1 Gbps network ....... 20,000 ns  =  20 µs
SSD random read ........................ 150,000 ns  = 150 µs

Read 1 MB sequentially from memory ..... 250,000 ns = 250 µs

Three comparison points:
Presto + RCFile vs Impala + RCFile vs Impala + Parquet
Note: Query time, CPU utilization, Disk read tput (KBRead)
Impala v1.1.1
Presto v0.52
================================================================================================================================
Presto + RCFile:
-- Row count per ss_sold_date_sk, limited to the first 2000 keys in ascending order.
-- Grouping and ordering name the column explicitly instead of using position 1.
select
    ss_sold_date_sk,
    count(*)
from store_sales_rcfile
group by ss_sold_date_sk
order by ss_sold_date_sk
limit 2000;
##################### Elasticsearch Configuration Example #####################
# This file contains an overview of various configuration settings,
# targeted at operations staff. Application developers should
# consult the guide at <http://elasticsearch.org/guide>.
#
# The installation procedure is covered at
# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/setup.html>.
#
# Elasticsearch comes with reasonable defaults for most settings,
# you can override this by setting a system property, for example -Des.logger.level=DEBUG
# Default log level; rootLogger below picks it up through ${es.logger.level}
es.logger.level: INFO
rootLogger: ${es.logger.level}, console, file
# Per-logger overrides must be nested under "logger" to be part of that mapping
logger:
  # log action execution errors for easier debugging
  action: DEBUG
  # reduce the logging for aws, too much is logged under the default INFO
  com.amazonaws: WARN
# gateway
@leelakrishna
leelakrishna / pyget.py
Last active August 29, 2015 14:22 — forked from benhutchins/pyget.py
#!/usr/bin/env python
#
# pyget.py
# A Python download accelerator
#
# by Benjamin Hutchins
# MIT License
#
# This file is incomplete, it also relies on wget to download,
# see http://gist.github.com/424080 for newer version
@leelakrishna
leelakrishna / hifreqwords.scala
Last active January 29, 2021 13:01
Spark scala - most frequent words
// Read the input file as an RDD of lines
val f = sc.textFile("sample.txt")
// Split every line on single spaces and tally how often each token occurs
val wc = f.flatMap(_.split(" ")).map((_, 1)).reduceByKey((a, b) => a + b)
// Flip (word, count) into (count, word) so the count becomes the sort key
val wc_swap = wc.map { case (word, count) => (count, word) }
// Order by count, largest first, into one partition
val hifreq_words = wc_swap.sortByKey(ascending = false, numPartitions = 1)
hifreq_words.saveAsTextFile("hifreq_words")
// Collect the first 20 (count, word) pairs as an array
val top20 = hifreq_words.take(20)
// Bring the SQLContext members into scope (needed for registerAsTable on the RDD below)
import sqlContext._
// Row schema: one Person per input line
case class Person(name: String, age:Int)
// Parse each comma-separated line into a Person; the age field is trimmed before conversion.
// The path must not contain whitespace: a stray space after "resources/" would point at a non-existent file.
val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
// Register the RDD under the table name "people"
people.registerAsTable("people")
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#