Umberto Griffo umbertogriffo

## TwitterSentimentAnalysisAndN-gramWithHadoopAndHiveSQL.md

      
              1 file
            
          
              6 forks
            
          
              0 comments
            
          
              8 stars
            
          
                umbertogriffo
                / TwitterSentimentAnalysisAndN-gramWithHadoopAndHiveSQL.md
            
            
              Last active
              May 11, 2021 13:22
            
              
                Step by step Tutorial on Twitter Sentiment Analysis and n-gram with Hadoop and Hive SQL
              
          
    PREREQUISITES

* Download JSON Serde at:
* http://files.cloudera.com/samples/hive-serdes-1.0-SNAPSHOT.jar
* and rename it to hive-serdes-1.0.jar


Add Jar to HIVE_AUX_JARS_PATH of HiveServer2:

Copy the JAR files to the host on which HiveServer2 is running. Save the JARs to any directory you choose, and make a note of the path (e.g. create a directory in /usr/share/).


## HBaseBackup.rb
# Before running, check that the hbase.snapshot.enabled property in hbase-site.xml is set to true.
# To execute the script, launch this command from the shell: hbase shell HBaseBackup.rb

# Presumably the HBase root of the cluster that receives the backup — usage is not visible here.
# NOTE(review): with three slashes the URI authority is empty, so "srv2:8082" is read as part of
# the path rather than as host:port; "hdfs://srv2:8082/hbase" may be what is intended — confirm.
@clusterToSave = "hdfs:///srv2:8082/hbase"
# CHECK THE PATH OF HBase lib
# Shells out to list every jar in the HBase lib directory and turns each newline into a comma,
# yielding a single comma-separated string (the final newline also becomes a trailing comma).
@libjars = `ls /opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/hbase/*.jar | tr "\n" ","`
# Case-insensitive name patterns; presumably anything matching one of them is excluded from the
# backup — TODO confirm against the code that reads @ignore.
@ignore = [ /zipkin\..*/i, /.*_temp/i, /.*tmp/i, /test_.*/i, /.*_test/i, /.*_old/i ]
# Stored as a string; presumably the number of mappers handed to the export job — TODO confirm.
@mappers = "2"

include Java

## HBaseRestore.rb
# To execute the script, launch this command from the shell: hbase shell HBaseRestore.rb

include Java

java_import org.apache.hadoop.hbase.HBaseConfiguration
java_import org.apache.hadoop.hbase.client.HBaseAdmin
java_import org.apache.hadoop.hbase.snapshot.ExportSnapshot
java_import org.apache.hadoop.hbase.TableExistsException
java_import org.apache.hadoop.util.ToolRunner

## Kmeans Readme.md

      
              2 files
            
          
              1 fork
            
          
              1 comment
            
          
              1 star
            
          
                umbertogriffo
                / Kmeans Readme.md
            
            
              Last active
              March 8, 2024 13:40
            
              
                Step by step Code Tutorial on implementing a basic k-means in Spark in order to cluster geo-located devices
              
          
    DATASET


Download dataset here

CODE

* Follow the well-commented code kmeans.scala


## ObjectPool.java
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @param <T>
*/
public abstract class ObjectPool<T> {

## Transpose.scala
  /**
    * Builds the transpose of a [[RowMatrix]].
    *
    * Every row is paired with its index and expanded, via `rowToTransposedTriplet`, into
    * `(newRowIndex, (newColIndex, value))` triplets. The triplets are grouped by their new
    * row index, sorted by that index, and each group is turned back into a row by `buildRow`.
    *
    * @param m the matrix to transpose
    * @return a new RowMatrix whose rows are the rebuilt, transposed rows
    */
  def transposeRowMatrix(m: RowMatrix): RowMatrix = {
    // One (newRowIndex, (newColIndex, value)) triplet per entry produced by the helper.
    val triplets = m.rows.zipWithIndex.flatMap {
      case (row, rowIndex) => rowToTransposedTriplet(row, rowIndex)
    }

    // Gather the entries belonging to each transposed row and order the rows by index.
    val entriesByNewRow = triplets.groupByKey.sortByKey()

    // Drop the row indexes; buildRow (defined elsewhere) is expected to restore the order of
    // the elements inside each row and strip the column indexes.
    val transposedRows = entriesByNewRow
      .map { case (_, entries) => entries }
      .map(buildRow)

    new RowMatrix(transposedRows)
  }

  def rowToTransposedTriplet(row: Vector, rowIndex: Long): Array[(Long, (Long, Double))] = {

## UniqueId.java
    /**
     * Generate unique ID from UUID in positive space
     * Reference: http://www.gregbugaj.com/?p=587
     * @return long value representing UUID
     */
    private Long generateUniqueId()
    {
        long val = -1;
        do
        {

## Method1.java
public class Method1 {
/*
Adding synchronized to this method will make it thread-safe.
When synchronized is added to a static method, the Class object is the object which is locked.
*/
  public static void main(String[] args) throws InterruptedException {

		ProcessingThreadS pt = new ProcessingThreadS();

		Thread t1 = new Thread(pt, "t1");

## DataFrameSuite.scala
package test.com.idlike.junit.df

import breeze.numerics.abs
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Row}

/**
  * Created by Umberto on 06/02/2017.
  */

## Winner.java
package knowledgebase.java.stream;

import java.time.Duration;
import java.util.*;

import static java.util.stream.Collectors.*;

/**
 * Created by Umberto on 15/02/2017.
 * https://dzone.com/articles/a-java-8-streams-cookbook
	# Checking if the hbase.snapshot.enabled property in hbase-site.xml is set to true
	# To execute script launch this command on shell: hbase shell HBaseBackup.rb

	@clusterToSave = "hdfs:///srv2:8082/hbase"
	# CHECK THE PATH OF HBase lib
	@libjars = `ls /opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/hbase/*.jar \| tr "\n" ","`
	@ignore = [ /zipkin\../i, /._temp/i, /.tmp/i, /test_./i, /._test/i, /._old/i ]
	@mappers = "2"

	include Java
	# To execute script launch this command on shell: hbase shell HBaseRestore.rb

	include Java

	java_import org.apache.hadoop.hbase.HBaseConfiguration
	java_import org.apache.hadoop.hbase.client.HBaseAdmin
	java_import org.apache.hadoop.hbase.snapshot.ExportSnapshot
	java_import org.apache.hadoop.hbase.TableExistsException
	java_import org.apache.hadoop.util.ToolRunner
	import java.util.Queue;
	import java.util.concurrent.ConcurrentLinkedQueue;
	import java.util.concurrent.Executors;
	import java.util.concurrent.ScheduledExecutorService;
	import java.util.concurrent.TimeUnit;
	import java.util.concurrent.atomic.AtomicInteger;
	/**
	* @param <T>
	*/
	public abstract class ObjectPool<T> {
	/**
	  * Builds the transpose of a [[RowMatrix]].
	  *
	  * Each row is paired with its index and expanded, via `rowToTransposedTriplet`, into
	  * `(newRowIndex, (newColIndex, value))` triplets; the triplets are grouped and sorted by
	  * their new row index, and every group is converted back into a row by `buildRow`.
	  *
	  * @param m the matrix to transpose
	  * @return a new RowMatrix made of the rebuilt, transposed rows
	  */
	def transposeRowMatrix(m: RowMatrix): RowMatrix = {
		// One (newRowIndex, (newColIndex, value)) triplet per entry produced by the helper.
		val triplets = m.rows.zipWithIndex.flatMap {
			case (row, rowIndex) => rowToTransposedTriplet(row, rowIndex)
		}

		// Gather the entries of each transposed row and order the rows by index.
		val entriesByNewRow = triplets.groupByKey.sortByKey()

		// Drop the row indexes; buildRow (defined elsewhere) is expected to restore element
		// order within each row and strip the column indexes.
		val transposedRows = entriesByNewRow
			.map { case (_, entries) => entries }
			.map(buildRow)

		new RowMatrix(transposedRows)
	}

	def rowToTransposedTriplet(row: Vector, rowIndex: Long): Array[(Long, (Long, Double))] = {
	/**
	* Generate unique ID from UUID in positive space
	* Reference: http://www.gregbugaj.com/?p=587
	* @return long value representing UUID
	*/
	private Long generateUniqueId()
	{
	long val = -1;
	do
	{
	public class Method1 {
	/*
	Adding synchronized to this method will make it thread-safe.
	When synchronized is added to a static method, the Class object is the object which is locked.
	*/
	public static void main(String[] args) throws InterruptedException {

	ProcessingThreadS pt = new ProcessingThreadS();

	Thread t1 = new Thread(pt, "t1");
	package test.com.idlike.junit.df

	import breeze.numerics.abs
	import org.apache.spark.rdd.RDD
	import org.apache.spark.sql.functions.col
	import org.apache.spark.sql.{Column, DataFrame, Row}

	/**
	* Created by Umberto on 06/02/2017.
	*/
	package knowledgebase.java.stream;

	import java.time.Duration;
	import java.util.*;

	import static java.util.stream.Collectors.*;

	/**
	* Created by Umberto on 15/02/2017.
	* https://dzone.com/articles/a-java-8-streams-cookbook