NOTE: This is a question I found on StackOverflow which I’ve archived here, because the answer is so effing phenomenal.
If you are not into long explanations, see [Paolo Bergantino’s answer][2].
// Compute the per-key average of an RDD of (key, value) pairs.
val data = sc.parallelize(Seq(("A", 2), ("A", 4), ("B", 2), ("Z", 0), ("B", 10)))
// data: org.apache.spark.rdd.RDD[(java.lang.String, Int)] = ParallelCollectionRDD[31] at parallelize at <console>:12

// Pair each value with a count of 1, sum both components per key, then divide.
// FIX: the original `mapValues((_, 1)` was missing its closing parenthesis.
val avgValue = data.mapValues((_, 1))                      // (value, 1)
  .reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))       // (sum, count) per key
  .mapValues { case (sum, count) => (1.0 * sum) / count }  // multiply by 1.0 to force Double division
  .collectAsMap()
// avgValue: scala.collection.Map[java.lang.String,Double] = Map(Z -> 0.0, B -> 6.0, A -> 3.0)
// Per-customer purchase count: map each record to (customer, 1) and sum per key.
val b = data.map(x => (x._1, 1)).reduceByKey(_ + _).collect // Each customer how many times purchased

// Per-customer (customer, totalSpend, averageSpend).
// FIX: records are (customer, amount) Tuple2 pairs, so the amount is x._2 — `x._3`
// does not exist on a pair and fails to compile.
val z = data.map(x => (x._1, x._2))
  .mapValues(y => (y, 1))                            // (amount, 1)
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) // (totalSpend, purchaseCount)
  .map { x =>
    val (total, count) = x._2
    // FIX: use floating-point division; `total / count` was integer division
    // and silently truncated the average.
    (x._1, total, 1.0 * total / count)
  }.collect // How much average money spent and how many times purchased
# Install Sublime Text 3 on Ubuntu via the WebUpd8 PPA.
# NOTE(review): webupd8team is a third-party PPA — confirm it is still maintained
# before running; Sublime also ships an official apt repository nowadays.
sudo add-apt-repository ppa:webupd8team/sublime-text-3;
sudo apt-get update;
sudo apt-get install sublime-text-installer;
# Symlink the binary onto PATH so it can be launched as `sublime`.
sudo ln -s /usr/lib/sublime-text-3/sublime_text /usr/local/bin/sublime;
// Two ways to duplicate every element of a list: List(1, 2) -> List(1, 1, 2, 2).

// method 1: replace each element with two copies and flatten.
// FIX: `1 to 10 toList` relies on deprecated postfix operator syntax
// (requires `import scala.language.postfixOps`); call .toList explicitly.
val i = (1 to 10).toList
i.flatMap(List.fill(2)(_))

// method 2: the inner generator runs twice per element, yielding each `e` twice.
for { e <- i; y <- 1 to 2 } yield e
| def f(num : Int, arr : List[Int]) : List[Int] = { |
| #!/bin/bash -e | |
| # | |
| # Copyright 2014 Tony Burns | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # |
NOTE: This is a question I found on StackOverflow which I’ve archived here, because the answer is so effing phenomenal.
If you are not into long explanations, see [Paolo Bergantino’s answer][2].
| This gist includes hive ql scripts to create an external partitioned table for Syslog | |
| generated log files using regex serde; | |
| Use case: Count the number of occurrences of processes that got logged, by year, month, | |
| day and process. | |
| Includes: | |
| --------- | |
| Sample data and structure: 01-SampleDataAndStructure | |
| Data download: 02-DataDownload | |
| Data load commands: 03-DataLoadCommands |
| import argparse, datetime, logging | |
| import apache_beam as beam | |
| from apache_beam.options.pipeline_options import PipelineOptions | |
| from apache_beam.options.pipeline_options import SetupOptions | |
| class GetTimestampFn(beam.DoFn): | |
| """Prints element timestamp""" | |
| def process(self, element, timestamp=beam.DoFn.TimestampParam): |
| #!/bin/bash | |
| # This script will update a new workspace created on Cloud9 IDE with the latest packages. | |
| # In order to use it, create a update_workspace.sh file in your C9 Workspace and then make it executable using the command | |
| # `touch update_workspace.sh && chmod +x update_workspace.sh`. | |
| # Now you can open the update_workspace.sh file and copy/paste this full script, save and close. | |
| # Run the command `./update_workspace.sh` to execute the script. | |
| # | |
| # Alternatively you can use this command to download and make this script executable from github | |
| # wget -O update_workspace.sh https://gist.githubusercontent.com/aubort/836888f8aaeeeff75024c87e9c9199f0/raw && chmod +x update_workspace.sh |