
Phani Kumar Yadavilli wandermonk

wandermonk / redshift_date_dim.sql
Created March 5, 2022 06:09 — forked from muness/redshift_date_dim.sql
Redshift Date Dimension
-- Potentially quirky when it comes to week numbers.
BEGIN TRANSACTION;
DROP TABLE IF EXISTS numbers_small;
CREATE TABLE numbers_small (
number SMALLINT NOT NULL
) DISTSTYLE ALL SORTKEY (number);
INSERT INTO numbers_small VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9);
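The digit table above is the usual starting point for this pattern: cross-joining `numbers_small` with itself builds larger integer ranges, which `DATEADD` then turns into a date series. A minimal sketch of the same trick in Python (the start date is assumed for illustration):

```python
from datetime import date, timedelta

# numbers_small holds the digits 0-9, exactly as in the SQL above.
numbers_small = range(10)

# ones + 10*tens + 100*hundreds: the cross join of the digit table
# with itself three times yields every integer in 0..999 once.
numbers = sorted(o + 10 * t + 100 * h
                 for o in numbers_small
                 for t in numbers_small
                 for h in numbers_small)

# Each number becomes an offset from a fixed start date,
# the same role DATEADD plays in the SQL version.
start = date(2020, 1, 1)  # assumed start date, not from the gist
dates = [start + timedelta(days=n) for n in numbers]
```

In the SQL version the same expansion is done with `CROSS JOIN`s over `numbers_small` before inserting into the date dimension.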
wandermonk / vault_client.py
Created August 17, 2021 16:28
Sample client to work with vault
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry

import properties  # local helper module wrapping environment lookups

# Retry behaviour is driven by environment configuration
retries = properties.getenv('VAULT', 'REQUEST_RETRIES_ON_FAILURE')
backoff = properties.getenv('VAULT', 'REQUEST_BACKOFF_FACTOR')
timeout_in_seconds = properties.getenv('VAULT', 'REQUEST_TIMEOUT_IN_SECONDS')
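The imports above suggest a `requests.Session` with a retry/backoff adapter mounted. A minimal sketch of that setup (the function name and the status list are assumptions; `retries` and `backoff` stand in for the `properties.getenv` values read above):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session(retries=3, backoff=0.5):
    """Build a Session that retries failed Vault requests with backoff.

    In the gist, retries/backoff would come from properties.getenv('VAULT', ...).
    """
    retry = Retry(
        total=int(retries),
        backoff_factor=float(backoff),
        # Retry on throttling and transient server errors (assumed list)
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session = requests.Session()
    # Mount the adapter for both schemes so every request gets the policy
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session
```

A caller would then use `build_session(retries, backoff).get(vault_url, timeout=timeout_in_seconds)` so that transient failures are retried transparently.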
import os
import time
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

dag = DAG(
    dag_id="test_trigger_dag_operator",
    default_args={"owner": "Airflow", "start_date": datetime(2020, 3, 9)},
)
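The watchdog imports above point at the usual pattern: a filesystem event handler that triggers the DAG when a new file lands. A minimal sketch of that pattern without importing watchdog or Airflow (`FileTriggerHandler` and the `trigger` callable are hypothetical names; in the gist the trigger would be a `TriggerDagRunOperator` or an Airflow API call, and `on_created` mirrors watchdog's `FileSystemEventHandler.on_created` hook):

```python
class FileTriggerHandler:
    """Fires a DAG-trigger callback whenever a new file appears."""

    def __init__(self, trigger):
        # trigger: callable taking (dag_id, conf), standing in for
        # TriggerDagRunOperator / the Airflow trigger API.
        self.trigger = trigger

    def on_created(self, src_path):
        # Pass the new file's path to the triggered DAG run via conf,
        # so downstream tasks know which file to process.
        self.trigger("test_trigger_dag_operator", {"path": src_path})
```

With watchdog, an instance of such a handler would be registered on an `Observer` watching the landing directory, and each created file would start one DAG run.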
# Installation
sudo apt-get remove docker docker-engine docker.io
sudo apt-get install -y apt-transport-https ca-certificates curl software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo apt-key fingerprint 0EBFCD88
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update
sudo apt-get install -y docker-ce
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;
import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
The ElasticSearchIndexCreator class provides a static method to create the index and type.
The ElasticSearchWriter class indexes documents to the index created.
The MainApp is the executor class.
public class ElasticSearchIndexCreator {
    // Connect via the high-level REST client; host and port are assumed here
    private static RestHighLevelClient client = new RestHighLevelClient(
            RestClient.builder(new HttpHost("localhost", 9200, "http")));
wandermonk / gist:d6f604d2b2e7c74c49565c7400930439
Created September 8, 2017 09:39
Hadoop buffers improve read/write performance
Native OS filesystem implementations have fixed buffer sizes; changing them requires changes at the OS level. Hadoop, on the other hand, is a user-space implementation, so its read/write buffer sizes can be changed without modifying or reinstalling the OS.
Buffer sizes determine read/write performance because data passes through these buffers before any operation on the datasets in the I/O pipeline. Usually, the I/O buffer sizes should be a multiple of the OS filesystem buffer size. The Linux kernel provides sendfile(), which is said to achieve "zero copy": the data is not moved from kernel space to user space but stays buffered in the kernel.
HADOOP-3164 adopted Java NIO FileChannel's transferTo() and transferFrom() methods so that Hadoop can use the Linux sendfile() system call under the hood. Because the copying is done entirely within the kernel, sendfile() does not need to move the data through user space at all, which is where the "zero copy" performance gain comes from.
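Python exposes the same system call as `os.sendfile`, so the zero-copy path described above can be demonstrated directly. A minimal sketch (`kernel_copy` is an illustrative name; copying file-to-file with sendfile() is Linux-specific):

```python
import os

def kernel_copy(src_path, dst_path):
    """Copy a file with os.sendfile so the bytes stay in kernel space.

    This is the same mechanism FileChannel.transferTo() maps to on Linux:
    no read() into a user-space buffer followed by a write() back out.
    """
    with open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        size = os.fstat(src.fileno()).st_size
        offset = 0
        while offset < size:
            # sendfile may transfer fewer bytes than requested, so loop
            sent = os.sendfile(dst.fileno(), src.fileno(), offset, size - offset)
            if sent == 0:
                break
            offset += sent
    return offset
```

A conventional copy would instead read each chunk into a user-space buffer and write it back, paying two copies and two context switches per chunk; sendfile() collapses that into a single in-kernel transfer.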