Train Chen TrainTravel

## BaseQueryValidator.java
public class BaseQueryValidator {
    private static List<String> extractTableAliases(SqlNode node) {
        final List<String> tables = new ArrayList<>();

        // If order by comes in the query.
        if (node.getKind().equals(SqlKind.ORDER_BY)) {
            // Retrieve exact select.
            node = ((SqlSelect) ((SqlOrderBy) node).query).getFrom();
        } else {
            node = ((SqlSelect) node).getFrom();

## structured_streaming_kinesis_watermark.py
from pyspark.sql.functions import window

# configure reading from the stream
# The whole chain is wrapped in parentheses so it can span several lines
# (implicit line joining); every .option() call is closed before the next one.
kinesis_df = (
    spark.readStream.format("kinesis")
    .option("streamName", KINESIS_STREAM_NAME)
    .option("region", AWS_REGION)
    .option("roleArn", KINESIS_ACCESS_ROLE_ARN)
    .option("initialPosition", "latest")
    .load()
)

## collab_filter.py
import pandas as pd
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.sql import SparkSession, SQLContext
from sklearn.metrics.pairwise import cosine_similarity
if __name__ == "__main__":  # script entry point; run with "python collab_filter.py"
    # Application name handed to Spark through appName() below.
    app_name = "collab_filter_example"

    # Build a SparkSession with master "local", reusing an existing session
    # if there is one (getOrCreate).
    spark = SparkSession.builder.master("local").appName(app_name).getOrCreate()

## _file_formats.md

      
              2 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                TrainTravel
                / _file_formats.md
            
            
              Created
              January 30, 2024 08:05
                — forked from kzzzr/_file_formats.md
            
              
                File formats comparison: CSV, JSON, Parquet, ORC
              
          
    File formats comparison: CSV, JSON, Parquet, ORC

Key results

Whenever you need to store your data in S3, a data lake, or an external table, choose the file format wisely:

Parquet / ORC are the best options due to their efficient data layout, compression, and indexing capabilities.
Columnar formats allow for column projection and partition pruning (reading only relevant data!).
Binary formats enable schema evolution, which is well suited to a constantly changing business environment.


## jq-cheetsheet.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                TrainTravel
                / jq-cheetsheet.md
            
            
              Created
              May 13, 2022 09:15
                — forked from olih/jq-cheetsheet.md
            
              
                jq Cheat Sheet
              
          
    Processing JSON using jq

jq is useful to slice, filter, map and transform structured json data.
Installing jq

On Mac OS

brew install jq

  
## gist:05a234fe87db95355df746d73386e9e5
-- find out all the available catalog views
    SELECT
      n.nspname as "Schema",
      c.relname as "Name",
      CASE c.relkind WHEN 'r' THEN 'table' WHEN 'v' THEN 'view' WHEN 'i' THEN 'index' WHEN 'S' THEN 'sequence' WHEN 's' THEN 'special' END as "Type",
      r.rolname as "Owner"
    FROM pg_catalog.pg_class c
      JOIN pg_catalog.pg_roles r ON r.oid = c.relowner
      LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
    WHERE c.relkind IN ('v','')

## gist:5cd9df0f5ab05235f21af2ee620b72e3
-- createdb test_buffers;
-- pgbench -i -s 50 test_buffers;

-- psql test_buffers;
-- \dt+;
-- select * from pgbench_branches;

-- pgbench -S -c 8 -t 25000 test_buffers


## postgres_config.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                TrainTravel
                / postgres_config.md
            
            
              Created
              August 13, 2021 08:09
                — forked from rgreenjr/postgres_config.md
            
              
                PostgreSQL Configuration Optimization
              
          
    PostgreSQL Configuration Optimization

Memory

Only four values really matter:

shared_buffers: below 2GB of memory, set it to 20% of total memory; below 32GB, set it to 25% of total memory.


## postgres_queries_and_commands.sql
-- show running queries (pre 9.2)
SELECT procpid, age(clock_timestamp(), query_start), usename, current_query
FROM pg_stat_activity
WHERE current_query != '<IDLE>' AND current_query NOT ILIKE '%pg_stat_activity%'
ORDER BY query_start desc;

-- show running queries (9.2+)
-- From 9.2 on, idle sessions are reported through the "state" column; "query"
-- holds the text of the most recent statement and is never '<IDLE>'.
SELECT pid, age(clock_timestamp(), query_start), usename, query
FROM pg_stat_activity
WHERE state != 'idle' AND query NOT ILIKE '%pg_stat_activity%'

## ammonium - Scio - King Lear.scala
$ ./ammonium
Loading...
Welcome to the Ammonite Repl 0.8.0
(Scala 2.11.8 Java 1.8.0_112)

@ import $ivy.`org.jupyter-scala::scio:0.4.0-RC1`
import $ivy.$

@ {
  import jupyter.scio._
	public class BaseQueryValidator {
	private static List<String> extractTableAliases(SqlNode node) {
	final List<String> tables = new ArrayList<>();

	// If order by comes in the query.
	if (node.getKind().equals(SqlKind.ORDER_BY)) {
	// Retrieve exact select.
	node = ((SqlSelect) ((SqlOrderBy) node).query).getFrom();
	} else {
	node = ((SqlSelect) node).getFrom();
	from pyspark.sql.functions import window

	# configure reading from the stream
	kinesis_df = spark.readStream.format("kinesis")
	.option("streamName", KINESIS_STREAM_NAME)
	.option("region", AWS_REGION)
	.option("roleArn", KINESIS_ACCESS_ROLE_ARN
	.option("initialPosition", "latest")
	.load()
	import pandas as pd
	from pyspark.mllib.recommendation import ALS, Rating
	from pyspark.sql import SparkSession, SQLContext
	from sklearn.metrics.pairwise import cosine_similarity
	if __name__ == "__main__": # run this by typing "python collaborative_filter.py"
	app_name = "collab_filter_example"

	# create a Spark context
	spark = SparkSession.builder.master("local").appName(app_name).getOrCreate()
	-- find out all the available catalog views
	SELECT
	n.nspname as "Schema",
	c.relname as "Name",
	CASE c.relkind WHEN 'r' THEN 'table' WHEN 'v' THEN 'view' WHEN 'i' THEN 'index' WHEN 'S' THEN 'sequence' WHEN 's' THEN 'special' END as "Type",
	r.rolname as "Owner"
	FROM pg_catalog.pg_class c
	JOIN pg_catalog.pg_roles r ON r.oid = c.relowner
	LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
	WHERE c.relkind IN ('v','')
	-- createdb test_buffers;
	-- pgbench -i -s 50 test_buffers;

	-- psql test_buffers;
	-- \dt+;
	-- select * from pgbench_branches;

	-- pgbench -S -c 8 -t 25000 test_buffers
	-- show running queries (pre 9.2)
	SELECT procpid, age(clock_timestamp(), query_start), usename, current_query
	FROM pg_stat_activity
	WHERE current_query != '<IDLE>' AND current_query NOT ILIKE '%pg_stat_activity%'
	ORDER BY query_start desc;

	-- show running queries (9.2)
	SELECT pid, age(clock_timestamp(), query_start), usename, query
	FROM pg_stat_activity
	WHERE query != '<IDLE>' AND query NOT ILIKE '%pg_stat_activity%'
	$ ./ammonium
	Loading...
	Welcome to the Ammonite Repl 0.8.0
	(Scala 2.11.8 Java 1.8.0_112)

	@ import $ivy.`org.jupyter-scala::scio:0.4.0-RC1`
	import $ivy.$

	@ {
	import jupyter.scio._