General links
- https://dzone.com/articles/understanding-how-parquet
- http://blog.cloudera.com/blog/2014/05/how-to-convert-existing-data-into-parquet/
- https://github.com/qubole/streamx
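The links above cover how existing data can be converted into Parquet. As a minimal PySpark sketch of that workflow (the input/output paths and the event_date column are placeholders, not taken from the links):

from pyspark.sql import SparkSession

# Start a Spark session; in a real job this would run on the cluster.
spark = SparkSession.builder.appName("csv-to-parquet").getOrCreate()

# Read existing CSV data, letting Spark infer the schema.
df = spark.read.csv("/data/input/events.csv", header=True, inferSchema=True)

# Write the same rows back out as Parquet, partitioned by a hypothetical date column.
df.write.mode("overwrite").partitionBy("event_date").parquet("/data/output/events_parquet")

spark.stop()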
openapi: 3.0.0
info:
  version: 1.0.0
  title: Airflow events
  description: 'This file is used to hold schema definition, please refer to asyncapi.yaml for service description'
paths: {}
components:
  schemas:
    Audit:
      type: object
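Since this file only holds schema definitions, a quick sanity check is to load it and list the component schemas. A small sketch with PyYAML (the file name openapi.yaml is an assumption):

import yaml

# Load the schema-only OpenAPI document; the file name is assumed for illustration.
with open("openapi.yaml") as handle:
    spec = yaml.safe_load(handle)

# Print each schema defined under components.schemas, e.g. "Audit: object".
for name, schema in spec.get("components", {}).get("schemas", {}).items():
    print(f"{name}: {schema.get('type')}")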
CREATE TABLE `my_db_name`.`_my_table_name_new` (
  `my_table_name_id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `tour_id` int(11) unsigned NOT NULL,
  `status` enum('new','pending','approved','rejected','deleted') NOT NULL DEFAULT 'new',
  `fields` varchar(600) NOT NULL,
  `comment` text NOT NULL,
  `reject_reasons` varchar(500) DEFAULT NULL,
  `reject_text` varchar(1000) DEFAULT NULL,
  `creation_timestamp` datetime NOT NULL,
  `requester_user_type` enum('staff','supplier') DEFAULT NULL,
6-89a4-11e8-88e2-0ac8e5ea0169:1-11 and binlog file 'db-server-mysql-bin.215447', pos=40508711, skipping 0 events plus 1 rows (io.debezium.connector.mysql.BinlogReader:1004)
[2019-05-15 07:41:57,968] INFO Creating thread debezium-mysqlconnector-db-stream.cloud_pxc-binlog-client (io.debezium.util.Threads:267)
[2019-05-15 07:41:57,968] INFO WorkerSourceTask{id=db-stream.dbz.my_db_name-0} Source task finished initialization and start (org.apache.kafka.connect.runtime.WorkerSourceTask:199)
[2019-05-15 07:41:59,969] ERROR Failed to properly convert data value for 'my_db_name.my_table_name.creation_timestamp' of type DATETIME for row [478482, 65180, 3, [123, 34, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 95, 105, 110, 102, 111, 114, 109, 97, 116, 105, 111, 110, 34, 58, 34, 100, 111, 110, 101, 34, 125], [91, 123, 34, 113, 117, 101, 115, 116, 105, 111, 110, 34, 58, 34, 87, 104, 97, 116, 32, 115, 104, 111, 117, 108, 100, 32, 98, 101, 32, 82, 69, 77, 79, 86, 69, 68, 32, 102, 114, 111, 109, 32, 121, 111, 117, 114, 3
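The ERROR line dumps the string columns of the failing row as raw byte values. Decoding them makes the row readable and helps identify which record trips the DATETIME conversion; a small sketch with the first byte list copied from the log above (it appears to correspond to the `fields` varchar column; the second list is truncated in the log and left out here):

# Byte values of the fourth row field from the Debezium error above.
field_bytes = [123, 34, 97, 100, 100, 105, 116, 105, 111, 110, 97, 108, 95, 105,
               110, 102, 111, 114, 109, 97, 116, 105, 111, 110, 34, 58, 34, 100,
               111, 110, 101, 34, 125]

# Decode the byte values into text; prints {"additional_information":"done"}
print(bytes(field_bytes).decode("utf-8"))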
eval $(minikube docker-env)  # Use the VM's built-in Docker daemon
# Build the Spark base image
cd spark-2.2.0-k8s-0.5.0-bin-2.7.3
docker build . -f Dockerfile -t spark-history:v1  # Build the Docker image for the Spark base
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  labels:
    app: spark-shuffle-service
    spark-version: 2.2.0
  name: shuffle
spec:
  template:
    metadata:
bin/spark-submit \
  --deploy-mode cluster \
  --class org.apache.spark.examples.SparkPi \
  --master k8s://https://192.168.99.100:8443 \
  --kubernetes-namespace default \
  --conf spark.executor.instances=1 \
  --conf spark.app.name=spark-pi \
  --conf spark.executor.memory=500M \
  --conf spark.kubernetes.driver.docker.image=kubespark/spark-driver:v2.2.0-kubernetes-0.5.0 \
  --conf spark.kubernetes.executor.docker.image=kubespark/spark-executor:v2.2.0-kubernetes-0.5.0 \
import pprint

# ["id", "col1", "col2"]
dataset = [[1, 2, 3], [1, 6, 8], [2, 37, 0], [1, 2, 3], [1, 6, 8], [2, 60, 0]]

def duplicate_finder(subdataset):
    # Count occurrences of each id (first column), then keep the duplicated ones.
    ids = {}
    for record in subdataset:
        if record[0] in ids:
            ids[record[0]] += 1
        else:
            ids[record[0]] = 1
    return {key: count for key, count in ids.items() if count > 1}

pprint.pprint(duplicate_finder(dataset))