Umberto Griffo (umbertogriffo)

umbertogriffo / RddAPI.scala
Last active January 29, 2020 12:57
This is a collection of examples of Apache Spark's RDD API. These examples aim to help me test the RDD functionality.
/*
This is a collection of examples of Apache Spark's RDD API. These examples aim to help me test the RDD functionality.
References:
http://spark.apache.org/docs/latest/programming-guide.html
http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html
*/
object RddAPI {
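The preview above stops at the object declaration. As a Spark-free illustration of the same idea, a plain `java.util.stream` pipeline mirrors the RDD `map`/`filter`/`reduce` chain; the data and names below are illustrative only, not from the gist:

```java
import java.util.Arrays;
import java.util.List;

// Spark-free sketch: java.util.stream mirrors the RDD map/filter/reduce chain.
// The List here is a local stand-in for an RDD; names are illustrative only.
public class RddLikeOps {
    public static int sumOfEvenSquares(List<Integer> data) {
        return data.stream()
                .map(x -> x * x)          // analogous to RDD.map
                .filter(x -> x % 2 == 0)  // analogous to RDD.filter
                .reduce(0, Integer::sum); // analogous to RDD.reduce
    }

    public static void main(String[] args) {
        System.out.println(sumOfEvenSquares(Arrays.asList(1, 2, 3, 4))); // prints 20
    }
}
```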
umbertogriffo / HashMapUtils.java
import java.util.*;
import java.util.Map.Entry;
import java.util.stream.Collectors;
/**
 * Created by Umberto on 16/05/2017.
 */
public class HashMapUtils {
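The preview shows only the imports of this utility class. Assuming the gist uses `Entry` and `Collectors` for the common task of sorting a map by value (an assumption, since the body is cut off), a minimal sketch could look like:

```java
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;

// Sketch only: the gist preview shows just the imports, so this assumes the
// utility sorts a map by value, a typical use of Entry plus Collectors.
public class MapSortSketch {
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        return map.entrySet().stream()
                .sorted(Entry.comparingByValue())
                .collect(Collectors.toMap(
                        Entry::getKey, Entry::getValue,
                        (a, b) -> a,           // merge function (keys are already unique)
                        LinkedHashMap::new));  // LinkedHashMap preserves the sorted order
    }
}
```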
umbertogriffo / TestPerformance.scala
Last active April 13, 2017 09:33
This Scala code tests the performance of the Euclidean distance implemented using the map-reduce pattern, treeReduce, and treeAggregate.
import org.apache.commons.lang.SystemUtils
import org.apache.spark.mllib.random.RandomRDDs._
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import scala.math.sqrt
/**
* Created by Umberto on 08/02/2017.
*/
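The preview cuts off before the benchmark body. The arithmetic being benchmarked is straightforward; as a plain-Java sketch of the map-reduce pattern (Spark's treeReduce/treeAggregate only change how partial sums are combined across partitions, not the computation itself):

```java
import java.util.stream.IntStream;

// Plain-Java sketch of the map-reduce pattern the Spark gist benchmarks:
// map each coordinate pair to a squared difference, reduce by summing,
// then take the square root of the total.
public class EuclideanDistance {
    public static double distance(double[] a, double[] b) {
        double sumOfSquares = IntStream.range(0, a.length)
                .mapToDouble(i -> (a[i] - b[i]) * (a[i] - b[i])) // map step
                .sum();                                          // reduce step
        return Math.sqrt(sumOfSquares);
    }
}
```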
umbertogriffo / Winner.java
Created February 15, 2017 09:02
Java 8 Streams Cookbook
package knowledgebase.java.stream;
import java.time.Duration;
import java.util.*;
import static java.util.stream.Collectors.*;
/**
* Created by Umberto on 15/02/2017.
* https://dzone.com/articles/a-java-8-streams-cookbook
*/
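The preview ends before any recipe appears. A representative recipe in the style of that cookbook is grouping and counting with `Collectors.groupingBy` and `counting`; the data below is illustrative, not taken from the gist:

```java
import java.util.List;
import java.util.Map;
import static java.util.stream.Collectors.*;

// Representative "cookbook" recipe: group a list and count occurrences
// with groupingBy + counting. The data is illustrative, not from the gist.
public class StreamRecipes {
    public static Map<String, Long> countByCountry(List<String> winners) {
        return winners.stream().collect(groupingBy(w -> w, counting()));
    }
}
```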
umbertogriffo / DataFrameSuite.scala
Last active February 12, 2020 06:13
DataFrameSuite lets you check whether two DataFrames are equal. You can assert exact equality with assertDataFrameEquals; when the DataFrames contain doubles or Spark MLlib Vectors, you can assert approximate equality with assertDataFrameApproximateEquals.
package test.com.idlike.junit.df
import breeze.numerics.abs
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Row}
/**
* Created by Umberto on 06/02/2017.
*/
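The core idea behind the approximate check can be sketched without Spark: two rows of doubles are "approximately equal" when every pair of values differs by at most a tolerance. This is a minimal illustration of that idea, not the suite's actual implementation, which works per Row and per MLlib Vector:

```java
// Spark-free sketch of the idea behind assertDataFrameApproximateEquals:
// two rows of doubles are approximately equal when every pair of values
// differs by at most a tolerance.
public class ApproxEquals {
    public static boolean approxEquals(double[] a, double[] b, double tol) {
        if (a.length != b.length) return false;
        for (int i = 0; i < a.length; i++) {
            if (Math.abs(a[i] - b[i]) > tol) return false;
        }
        return true;
    }
}
```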
umbertogriffo / Method1.java
Last active January 22, 2017 14:21
How to make the run() method of the class NoThreadSafe thread-safe in Java
public class Method1 {
/*
Adding synchronized to this method makes it thread-safe.
When synchronized is added to a static method, the Class object is the object which is locked.
*/
public static void main(String[] args) throws InterruptedException {
ProcessingThreadS pt = new ProcessingThreadS();
Thread t1 = new Thread(pt, "t1");
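The preview cuts off mid-setup. A minimal self-contained sketch of the fix the gist describes (a simplified stand-in for ProcessingThreadS, with a counter as the shared state) shows the effect of the static lock:

```java
// Minimal sketch of the fix described above: marking the method synchronized
// serializes access, and because the method is static the lock is the Class
// object (SafeCounter.class), so all threads contend on the same monitor.
public class SafeCounter {
    private static int count = 0;

    public static synchronized void increment() { // lock: SafeCounter.class
        count++;
    }

    public static int getCount() {
        return count;
    }

    public static void main(String[] args) throws InterruptedException {
        Runnable task = () -> { for (int i = 0; i < 10_000; i++) increment(); };
        Thread t1 = new Thread(task, "t1");
        Thread t2 = new Thread(task, "t2");
        t1.start(); t2.start();
        t1.join(); t2.join();
        System.out.println(getCount()); // always 20000 thanks to synchronized
    }
}
```

Without synchronized, the unguarded count++ (a read-modify-write) lets the two threads interleave and lose updates.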
umbertogriffo / UniqueId.java
Last active March 6, 2023 08:16
Generate Long ID from UUID
/**
* Generate unique ID from UUID in positive space
* Reference: http://www.gregbugaj.com/?p=587
* @return long value representing UUID
*/
private Long generateUniqueId()
{
long val = -1;
do
{
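The preview truncates inside the do-while. One way to complete it, following the retry-until-positive idea the comment describes (a sketch, not necessarily the gist's exact body):

```java
import java.util.UUID;

// Sketch completing the truncated loop above: take the UUID's most
// significant bits and retry until the value lands in positive space.
public class UniqueId {
    public static long generateUniqueId() {
        long val = -1;
        do {
            val = UUID.randomUUID().getMostSignificantBits();
        } while (val < 0);
        return val;
    }
}
```

The loop is needed because roughly half of all random UUIDs have a negative high half; retrying discards those.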
umbertogriffo / Transpose.scala
Created October 26, 2016 08:05
Utility Methods to Transpose a org.apache.spark.mllib.linalg.distributed.RowMatrix
def transposeRowMatrix(m: RowMatrix): RowMatrix = {
val transposedRowsRDD = m.rows.zipWithIndex.map{case (row, rowIndex) => rowToTransposedTriplet(row, rowIndex)}
.flatMap(x => x) // now we have triplets (newRowIndex, (newColIndex, value))
.groupByKey
.sortByKey().map(_._2) // sort rows and remove row indexes
.map(buildRow) // restore order of elements in each row and remove column indexes
new RowMatrix(transposedRowsRDD)
}
def rowToTransposedTriplet(row: Vector, rowIndex: Long): Array[(Long, (Long, Double))] = {
  // emit (newRowIndex, (newColIndex, value)) triplets for each element
  row.toArray.zipWithIndex.map { case (value, colIndex) => (colIndex.toLong, (rowIndex, value)) }
}
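The triplet trick exists because a distributed RowMatrix cannot be transposed by index arithmetic alone. Locally the same regrouping collapses to a direct index swap; a plain-Java sketch on a 2D array (illustrative only, not the gist's code) makes the mapping explicit:

```java
// Local sketch of what the distributed version above does: each value is
// conceptually emitted as a (newRowIndex, newColIndex, value) triplet and
// regrouped by new row. On a plain 2D array that is a direct index swap.
public class TransposeSketch {
    public static double[][] transpose(double[][] m) {
        double[][] t = new double[m[0].length][m.length];
        for (int row = 0; row < m.length; row++) {
            for (int col = 0; col < m[row].length; col++) {
                t[col][row] = m[row][col]; // triplet (col, row, value)
            }
        }
        return t;
    }
}
```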
umbertogriffo / ObjectPool.java
Created June 28, 2016 08:01
Generic Java object pool with minimalistic code
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @param <T>
*/
public abstract class ObjectPool<T> {
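The preview stops at the class declaration. The essential borrow/return cycle of such a pool can be sketched in a few lines (the gist's imports suggest it also shrinks the pool on a schedule via Executors; that part is omitted here):

```java
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

// Minimal sketch of the pool's borrow/return cycle. The gist likely adds a
// scheduled shrinker (its imports include Executors); omitted for brevity.
public abstract class SimplePool<T> {
    private final Queue<T> pool = new ConcurrentLinkedQueue<>();

    protected abstract T createObject(); // subclass decides what to pool

    public T borrowObject() {
        T obj = pool.poll();                        // reuse if available
        return obj != null ? obj : createObject();  // otherwise create fresh
    }

    public void returnObject(T obj) {
        if (obj != null) pool.offer(obj); // hand the instance back for reuse
    }
}
```

A ConcurrentLinkedQueue keeps borrow and return lock-free, which is why it is a natural backing structure for a minimalistic pool.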
umbertogriffo / Kmeans Readme.md
Last active March 8, 2024 13:40
Step-by-step code tutorial on implementing a basic k-means in Spark in order to cluster geo-located devices

DATASET

  • Download dataset here

CODE

  • Follow the well-commented code in kmeans.scala
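The two alternating steps of basic k-means that the tutorial implements in Spark can be sketched locally in plain Java: assign each point to its nearest centroid, then move each centroid to the mean of its assigned points, and repeat. Points here are double[2] pairs (e.g. latitude/longitude); data and names are illustrative, not from the tutorial:

```java
// Plain-Java sketch of the basic k-means loop: an assignment step (nearest
// centroid per point) alternating with an update step (centroid = mean of
// its assigned points) for a fixed number of iterations.
public class KMeansSketch {
    static double dist2(double[] a, double[] b) {
        double dx = a[0] - b[0], dy = a[1] - b[1];
        return dx * dx + dy * dy; // squared distance is enough for comparisons
    }

    public static int[] cluster(double[][] points, double[][] centroids, int iterations) {
        int[] assignment = new int[points.length];
        for (int it = 0; it < iterations; it++) {
            // Assignment step: nearest centroid for each point.
            for (int p = 0; p < points.length; p++) {
                int best = 0;
                for (int c = 1; c < centroids.length; c++) {
                    if (dist2(points[p], centroids[c]) < dist2(points[p], centroids[best])) {
                        best = c;
                    }
                }
                assignment[p] = best;
            }
            // Update step: move each centroid to the mean of its points.
            double[][] sums = new double[centroids.length][2];
            int[] counts = new int[centroids.length];
            for (int p = 0; p < points.length; p++) {
                sums[assignment[p]][0] += points[p][0];
                sums[assignment[p]][1] += points[p][1];
                counts[assignment[p]]++;
            }
            for (int c = 0; c < centroids.length; c++) {
                if (counts[c] > 0) {
                    centroids[c][0] = sums[c][0] / counts[c];
                    centroids[c][1] = sums[c][1] / counts[c];
                }
            }
        }
        return assignment;
    }
}
```

A production version would also check for convergence (assignments no longer changing) instead of a fixed iteration count, which is what Spark's MLlib implementation does with a tolerance parameter.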