Note: reference article
Basic query example:
-- Count requests per HTTP status code, most frequent first.
-- Fix: GROUP BY must name the non-aggregated column (status_code);
-- grouping by the aggregate's own alias (requests) is invalid/meaningless.
SELECT status_code,
       COUNT(status_code) AS requests
FROM fastly_logs.example_com
GROUP BY status_code
ORDER BY requests DESC
Note: reference article
Basic query example:
-- Count requests per HTTP status code, most frequent first.
-- Fix: GROUP BY must name the non-aggregated column (status_code);
-- grouping by the aggregate's own alias (requests) is invalid/meaningless.
SELECT status_code,
       COUNT(status_code) AS requests
FROM fastly_logs.example_com
GROUP BY status_code
ORDER BY requests DESC
Note: reference article
Basic query example:
-- Count requests per HTTP status code, most frequent first.
-- Fix: GROUP BY must name the non-aggregated column (status_code);
-- grouping by the aggregate's own alias (requests) is invalid/meaningless.
SELECT status_code,
       COUNT(status_code) AS requests
FROM fastly_logs.example_com
GROUP BY status_code
ORDER BY requests DESC
CREATE EXTERNAL TABLE IF NOT EXISTS default.table | |
( | |
`id` int, | |
`name` string, | |
`timestamp` string, | |
`is_debug` boolean | |
) | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' | |
WITH SERDEPROPERTIES ( | |
'escapeChar'='\\', |
1. mapPartitions() instead of map() — use when an expensive per-partition initialization (e.g. a DB connection) is needed, so it runs once per partition rather than once per record.
2. RDD parallelism: for RDDs with no parent, e.g. sc.parallelize(',,,', 4), the partition count can be given explicitly; unless specified, YARN will try to use as many CPU cores as are available.
   This can be tuned using the spark.default.parallelism property.
   - To find the default parallelism, use sc.defaultParallelism.
   rdd.getNumPartitions()
   rdd = sc.parallelize(<value>, numSlices=4)
   rdd.getNumPartitions()  # will return 4
/* Fill all 16 bytes of buf from the C library PRNG, one byte per rand() call.
 * NOTE(review): rand() is NOT cryptographically secure; if these bytes are
 * used as an IV or key material, switch to a CSPRNG (e.g. /dev/urandom or
 * the platform's crypto API) -- confirm intended use. */
void sixteenRandomBytes(unsigned char buf[16]) {
    unsigned char *p = buf;
    unsigned char *const end = buf + 16;
    while (p != end) {
        *p++ = (unsigned char)(rand() & 0xff);
    }
}
// PKCS #7 padding | |
// Do this before encrypting to get the message | |
// up to a multiple of 16 bytes. | |
size_t pad(unsigned char *buf, size_t messageLength) { |
from pyspark.sql.types import StringType, FloatType, StructField, StructType
from pyspark.sql import SparkSession, SQLContext, Row
import pyspark

# Spark initialization.
# Fix: the original chained SparkSession(spark_context).builder.... That mixes
# the SparkSession constructor with the class-level `builder` attribute: the
# session built by the constructor never gets Hive support, and getOrCreate()
# then produces/returns a separate session. Use the builder entry point only.
spark_session = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext for RDD-level APIs
# (preserves the `spark_context` name the original script defined).
spark_context = spark_session.sparkContext
package com.test; | |
import com.test.schema.ContactType; | |
import org.apache.kafka.clients.consumer.ConsumerConfig; | |
import org.apache.kafka.clients.consumer.ConsumerRecord; | |
import org.apache.spark.SparkConf; | |
import org.apache.spark.api.java.JavaPairRDD; | |
import org.apache.spark.api.java.function.*; | |
import org.apache.spark.streaming.Durations; | |
import org.apache.spark.streaming.api.java.JavaDStream; |
package com.experiments.calvin | |
import org.apache.kafka.common.serialization.StringDeserializer | |
import org.apache.spark.sql.SparkSession | |
import org.apache.spark.streaming.kafka010._ | |
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent | |
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe | |
import org.apache.spark.streaming.{Seconds, StreamingContext} | |
object ExampleApp extends App { |
## Python 2.7
## GlueLastRunDuration.py
## Version 1
## by Lydon Carter, October 2018
## USE
# Script to fetch a specific AWS Glue job and report the duration of
# its last run.
# Notes:
# -- The script will use the location you set up for your Glue context in the "Needed stuff" section.
# AWS Version 4 signing example
# Adapted from:
# http://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html
# https://docs.aws.amazon.com/general/latest/gr/sigv4_signing.html
# https://www.javaquery.com/2016/01/aws-version-4-signing-process-complete.html
# Lambda API (InvokeAsync):
# http://docs.aws.amazon.com/lambda/latest/dg/API_InvokeAsync.html
# See: http://docs.aws.amazon.com/general/latest/gr/sigv4_signing.html
# This version makes a POST request and passes the request parameters