General links
- https://dzone.com/articles/understanding-how-parquet
- http://blog.cloudera.com/blog/2014/05/how-to-convert-existing-data-into-parquet/
- https://github.com/qubole/streamx
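The links above cover how existing data can be converted into Parquet. As a minimal PySpark sketch of that workflow (the input/output paths and the event_date column are placeholders, not taken from the links):

from pyspark.sql import SparkSession

# Start a Spark session; in a real job this would run on the cluster.
spark = SparkSession.builder.appName("csv-to-parquet").getOrCreate()

# Read existing CSV data, letting Spark infer the schema.
df = spark.read.csv("/data/input/events.csv", header=True, inferSchema=True)

# Write the same rows back out as Parquet, partitioned by a hypothetical date column.
df.write.mode("overwrite").partitionBy("event_date").parquet("/data/output/events_parquet")

spark.stop()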
openapi: 3.0.0
info:
  version: 1.0.0
  title: Airflow events
  description: 'This file is used to hold schema definition, please refer to asyncapi.yaml for service description'
paths: {}
components:
  schemas:
    Audit:
      type: object
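Since this file only holds schema definitions, a quick sanity check is to load it and list the component schemas. A small sketch with PyYAML (the file name openapi.yaml is an assumption):

import yaml

# Load the schema-only OpenAPI document; the file name is assumed for illustration.
with open("openapi.yaml") as handle:
    spec = yaml.safe_load(handle)

# Print each schema defined under components.schemas, e.g. "Audit: object".
for name, schema in spec.get("components", {}).get("schemas", {}).items():
    print(f"{name}: {schema.get('type')}")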
CREATE TABLE `my_db_name`.`_my_table_name_new` (
  `my_table_name_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `tour_id` int(11) unsigned NOT NULL,
  `status` enum('new','pending','approved','rejected','deleted') NOT NULL DEFAULT 'new',
  `fields` varchar(600) NOT NULL,
  `comment` text NOT NULL,
  `reject_reasons` varchar(500) DEFAULT NULL,
  `reject_text` varchar(1000) DEFAULT NULL,
  `creation_timestamp` datetime NOT NULL,
  `requester_user_type` enum('staff','supplier') DEFAULT NULL,
6-89a4-11e8-88e2-0ac8e5ea0169:1-11 and binlog file 'db-server-mysql-bin.215447', pos=40508711, skipping 0 events plus 1 rows (io.debezium.connector.mysql.BinlogReader:1004)
[2019-05-15 07:41:57,968] INFO Creating thread debezium-mysqlconnector-db-stream.cloud_pxc-binlog-client (io.debezium.util.Threads:267)
[2019-05-15 07:41:57,968] INFO WorkerSourceTask{id=db-stream.dbz.my_db_name-0} Source task finished initialization and start (org.apache.kafka.connect.runtime.WorkerSourceTask:199)
[2019-05-15 07:41:59,969] ERROR Failed to properly convert data value for 'my_db_name.my_table_name.creation_timestamp' of type DATETIME for row [478482, 65180, 3, [123, 34, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 95, 105, 110, 102, 111, 114, 109, 97, 116, 105, 111, 110, 34, 58, 34, 100, 111, 110, 101, 34, 125], [91, 123, 34, 113, 117, 101, 115, 116, 105, 111, 110, 34, 58, 34, 87, 104, 97, 116, 32, 115, 104, 111, 117, 108, 100, 32, 98, 101, 32, 82, 69, 77, 79, 86, 69, 68, 32, 102, 114, 111, 109, 32, 121, 111, 117, 114, 3
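The ERROR line dumps the string columns of the failing row as raw byte values. Decoding them makes the row readable and helps identify which record trips the DATETIME conversion; a small sketch with the first byte list copied from the log above (it appears to correspond to the `fields` varchar column; the second list is truncated in the log and left out here):

# Byte values of the fourth row field from the Debezium error above.
field_bytes = [123, 34, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 95, 105,
               110, 102, 111, 114, 109, 97, 116, 105, 111, 110, 34, 58, 34, 100,
               111, 110, 101, 34, 125]

# Decode the byte values into text; prints {"additional_information":"done"}
print(bytes(field_bytes).decode("utf-8"))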
eval $(minikube docker-env)  # Use the VM's built-in Docker daemon
# Build the Spark base image
cd spark-2.2.0-k8s-0.5.0-bin-2.7.3
docker build . -f Dockerfile -t spark-history:v1  # Build the Docker image for the Spark base
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  labels:
    app: spark-shuffle-service
    spark-version: 2.2.0
  name: shuffle
spec:
  template:
    metadata:
bin/spark-submit \
  --deploy-mode cluster \
  --class org.apache.spark.examples.SparkPi \
  --master k8s://https://192.168.99.100:8443 \
  --kubernetes-namespace default \
  --conf spark.executor.instances=1 \
  --conf spark.app.name=spark-pi \
  --conf spark.executor.memory=500M \
  --conf spark.kubernetes.driver.docker.image=kubespark/spark-driver:v2.2.0-kubernetes-0.5.0 \
  --conf spark.kubernetes.executor.docker.image=kubespark/spark-executor:v2.2.0-kubernetes-0.5.0 \
import pprint

# ["id", "col1", "col2"]
dataset = [[1, 2, 3], [1, 6, 8], [2, 37, 0], [1, 2, 3], [1, 6, 8], [2, 60, 0]]

def duplicate_finder(subdataset):
    # Count occurrences of each id (first column), then keep the duplicated ones.
    ids = {}
    for record in subdataset:
        if record[0] in ids:
            ids[record[0]] += 1
        else:
            ids[record[0]] = 1
    return {key: count for key, count in ids.items() if count > 1}

pprint.pprint(duplicate_finder(dataset))