
@cipri7329
cipri7329 / ubuntu14-cdh5.md
Last active July 5, 2017 09:15
ubuntu 14.04 desktop and cloudera cdh


UBUNTU setup

tools setup

sudo apt-get install flex git gnupg gperf libesd0-dev liblz4-tool libncurses5-dev \
  libsdl1.2-dev libwxgtk2.8-dev libxml2 libxml2-utils lzop openjdk-7-jdk openjdk-7-jre \
  pngcrush schedtool squashfs-tools xsltproc zip zlib1g-dev g++-multilib gcc-multilib \
  lib32ncurses5-dev lib32readline-gplv2-dev lib32z1-dev

sudo apt-get install gksu
@cipri7329
cipri7329 / scala-cheatsheet-code.md
Last active July 3, 2017 15:01
Scala Cheatsheet. Code samples

Scala Code Snippets

Scalar product

Given two lists of doubles, the scalar product is the sum of the products of each pair of corresponding elements.
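The definition above can be sketched in plain Python (the Scala version is not shown in the preview); `zip` pairs up the corresponding elements:

```python
def scalar_product(a, b):
    # Sum of products of corresponding elements; assumes equal-length lists.
    return sum(x * y for x, y in zip(a, b))

print(scalar_product([1.0, 2.0, 3.0], [4.0, 5.0, 6.0]))  # 32.0
```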

@cipri7329
cipri7329 / spark-aggregation-example.scala
Created December 20, 2016 08:10
spark aggregation example
//example from https://courses.bigdatauniversity.com/courses/course-v1:BigDataUniversity+BD0212EN+2016/ exercises
val input1 = sc.textFile("data/trips/*")
val header1 = input1.first // to skip the header row
val trips = input1.
  filter(_ != header1).
  map(_.split(",")).
  map(utils.Trip.parse(_))
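Without a Spark cluster, the header-skip pattern above (read the first line, then filter it out) can be sketched in plain Python; the column names and sample rows below are made up, standing in for `utils.Trip.parse`, whose fields the gist does not show:

```python
lines = [
    "id,duration,start",   # header row (hypothetical columns)
    "1,300,2016-01-01",
    "2,120,2016-01-02",
]
header = lines[0]  # first line is the header
# keep every line except the header, then split into fields
trips = [row.split(",") for row in lines if row != header]
print(trips)  # [['1', '300', '2016-01-01'], ['2', '120', '2016-01-02']]
```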
@cipri7329
cipri7329 / most-common-character-spark-scala.scala
Created December 12, 2016 16:00
Determine the most frequent character in the file, and how many times it was used
// Determine the most frequent character in the README and how many times it was used
// Spark and Scala
var charCounts2 = readme.flatMap(line => line.toList).
  filter(c => c != ' ' && c != '\n').  // compare Char to Char; String.equals on a Char never matches
  map(character => (character, 1)).
  reduceByKey((a, b) => a + b).
  reduce((a, b) => if (a._2 > b._2) a else b)
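For comparison, the same computation in plain (non-distributed) Python with `collections.Counter`; the sample text is made up:

```python
from collections import Counter

text = "spark and scala"
# count every character except spaces and newlines
counts = Counter(c for c in text if c not in (' ', '\n'))
char, n = counts.most_common(1)[0]
print(char, n)  # a 4
```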
@cipri7329
cipri7329 / flume-spooldir-hdfs.conf
Last active October 19, 2016 13:45
flume spooldir hdfs
wikiagent.sources = spool
wikiagent.channels = memChannel
wikiagent.sinks = HDFS
# source config
wikiagent.sources.spool.type = spooldir
wikiagent.sources.spool.channels = memChannel
wikiagent.sources.spool.spoolDir = /home/ubuntu/datalake/processed
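The preview cuts off before the channel and sink sections; a sketch of how they typically look for a memory channel and an HDFS sink (the capacity and HDFS path below are assumptions, not values from the gist):

```
# channel config (sketch; values are typical, not from the gist)
wikiagent.channels.memChannel.type = memory
wikiagent.channels.memChannel.capacity = 10000

# sink config
wikiagent.sinks.HDFS.type = hdfs
wikiagent.sinks.HDFS.channel = memChannel
wikiagent.sinks.HDFS.hdfs.path = hdfs://localhost:8020/user/flume/wiki
wikiagent.sinks.HDFS.hdfs.fileType = DataStream
```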
@cipri7329
cipri7329 / elasticsearch-sens-cheetsheet.md
Created October 14, 2016 09:14
Quick commands to test an Elasticsearch node
GET _cluster/state?pretty

GET _search
{
    "query": {
        "match_all": {}
    }
}
@cipri7329
cipri7329 / spark-cheetsheet.md
Last active October 13, 2016 14:09
Spark cheatsheet. Quick code samples

load from file

val moviesDump = sc.textFile("hdfs://localhost:8020/user/datalake/movies/ml-latest/movies.csv")

case class Movie(movieId : Integer, title : String, genres : List[String])
 
val movies = moviesDump.map(s => s.split(",")).filter(s => s(0) != "movieId")
    .map(
        s => Movie(s(0).toInt,
            s.slice(1, s.size - 1).mkString(","),  // re-join titles that contain commas
            s(s.size - 1).split('|').toList))      // MovieLens genres are pipe-separated
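The `slice`/`mkString` step exists because a naive `split(",")` breaks movie titles that contain commas; the same workaround in plain Python, on a made-up sample row:

```python
line = '1,"Movie, The (1995)",Comedy|Drama'
# naive split breaks the quoted title into two fields
parts = line.split(",")
movie_id = int(parts[0])
title = ",".join(parts[1:-1])   # re-join the split title with its comma
genres = parts[-1].split("|")   # MovieLens genres are pipe-separated
print(movie_id, title, genres)
```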
@cipri7329
cipri7329 / zeppelin0.6-cloudera5.7.md
Last active January 29, 2021 03:38
Zeppelin notebook with Cloudera
@cipri7329
cipri7329 / kafka_producer.py
Last active September 16, 2016 14:03
python kafka producer
import time
from kafka import KafkaProducer
import json
import base64
KAFKA_TOPIC = "scraped-data"
KAFKA_HOST = "localhost:9092"
producer = KafkaProducer(bootstrap_servers=KAFKA_HOST, value_serializer=lambda v: json.dumps(v).encode('utf-8'))
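The `value_serializer` turns each Python object into UTF-8 JSON bytes before sending; it can be checked in isolation, with no broker running (the sample payload is made up):

```python
import json

# same lambda as the producer's value_serializer
serializer = lambda v: json.dumps(v).encode('utf-8')
msg = serializer({"url": "http://example.com", "status": 200})
print(msg)  # b'{"url": "http://example.com", "status": 200}'
```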
@cipri7329
cipri7329 / kafka_consumer.py
Last active September 16, 2016 14:03
python kafka consumer
__author__ = 'user'

import base64
import json
import time

from kafka import KafkaConsumer
from kafka import TopicPartition
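The preview stops at the imports; the matching `value_deserializer` (the inverse of the producer's serializer) can be tested without a broker, and a typical consumer construction is sketched in the comments (topic and host mirror the producer gist, not the truncated code):

```python
import json

# Deserializer that inverts the producer's value_serializer (JSON bytes -> object).
deserializer = lambda m: json.loads(m.decode('utf-8'))
event = deserializer(b'{"url": "http://example.com"}')
print(event)  # {'url': 'http://example.com'}

# With a broker running, a consumer for the same topic would typically be:
# consumer = KafkaConsumer("scraped-data",
#                          bootstrap_servers="localhost:9092",
#                          value_deserializer=deserializer)
```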