Maria Karanasou mkaranasou

## pyspark_ml_dense_vectors_isolation_forest_example.py
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
from pyspark.ml.linalg import Vectors, VectorUDT

conf = SparkConf()
conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar')

spark = SparkSession \

## pyspark_vector_assembler_dense_and_sparse.py
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
import tempfile

conf = SparkConf()
conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar')

spark = SparkSession \

## pyspark_read_from_database.py
user = 'postgres'
password = 'secret'
db_driver = 'org.postgresql.Driver'
host = '127.0.0.1'
db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'

df = spark.read.format(
  'jdbc'
).options(
  url=db_url,

## pyspark_parallel_read_from_db.py
import os

q = '(select min(id) as min, max(id) as max from table_name where condition) as bounds'
user = 'postgres'
password = 'secret'
db_driver = 'org.postgresql.Driver'
host = '127.0.0.1'
db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'
partitions = os.cpu_count() * 2       # a good starting point
conn_properties = {

## view_tables_by_size.sql
-- source: https://makandracards.com/makandra/52141-postgresql-how-to-show-table-sizes
--
-- Show every relation outside the system schemas (indexes excluded),
-- with its human-readable total size, largest first.
SELECT
    n.nspname || '.' || c.relname                 AS "relation",
    pg_size_pretty(pg_total_relation_size(c.oid)) AS "total_size"
FROM pg_class AS c
LEFT JOIN pg_namespace AS n
       ON n.oid = c.relnamespace
WHERE n.nspname NOT IN ('pg_catalog', 'information_schema')
  AND n.nspname !~ '^pg_toast'   -- skip TOAST schemas
  AND c.relkind <> 'i'           -- 'i' = index
ORDER BY pg_total_relation_size(c.oid) DESC

## pgpartition_by_week_example.sql
-- File generated at: 2020-01-19 16:34:07.677439

-- customer_transactions_y2020_w1--------------------------------------------------------

-- Weekly partition covering Monday 2019-12-30 through Sunday 2020-01-05.
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w1 (
    -- Half-open range: an inclusive upper bound of '2020-01-05 23:59:59'
    -- would reject timestamps such as 23:59:59.5 on the last day.
    CHECK (transasction_date >= '2019-12-30 00:00:00' AND transasction_date < '2020-01-06 00:00:00')
) INHERITS (customer_transactions);
-- customer_transactions_y2020_w2--------------------------------------------------------

CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w2 (

## partition_creation_example.sql
-- Monthly partitions of customer_transactions for 2020.
-- Table names use the _m<month> suffix so they match the section headers and
-- the 'customer_transactions_y2020_m' || month name built by the insert
-- trigger function; the previous _w1/_w2 names also clashed with the weekly
-- partitions, letting IF NOT EXISTS silently skip creation.
-- Upper bounds are exclusive so timestamps with fractional seconds in the
-- last second of a month still land in the right partition.

-- customer_transactions_y2020_m1--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_m1 (
    CHECK (transasction_date >= '2020-01-01 00:00:00' AND transasction_date < '2020-02-01 00:00:00')
) INHERITS (customer_transactions);

-- customer_transactions_y2020_m2--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_m2 (
    CHECK (transasction_date >= '2020-02-01 00:00:00' AND transasction_date < '2020-03-01 00:00:00')
) INHERITS (customer_transactions);

## before_insert_trigger_example.sql
----- Attach before insert trigger to table --------
-- Drop any existing trigger of the same name first so this script can be re-run.
DROP TRIGGER IF EXISTS before_insert_customer_transactions_trigger
    ON customer_transactions;

-- Run customer_transactions_insert_trigger() for each row before it is inserted.
CREATE TRIGGER before_insert_customer_transactions_trigger
    BEFORE INSERT
    ON customer_transactions
    FOR EACH ROW
    EXECUTE PROCEDURE customer_transactions_insert_trigger();

## customer_transactions_insert_function_example.sql
CREATE OR REPLACE FUNCTION customer_transactions_insert_trigger()
RETURNS TRIGGER AS $$
DECLARE
  target_partitioned_by text;
  table_name text;
BEGIN
    -- get the month from the datetime transasction_date field
    SELECT cast(extract(month from NEW.transasction_date) AS TEXT) INTO target_partitioned_by;
    table_name = 'customer_transactions_y2020_m' || target_partitioned_by;

## postgres-cheatsheet.md

      
        
          
            
              
              1 file
            
          
          
            
              
              0 forks
            
          
          
            
              
              0 comments
            
          
          
            
              
              0 stars
            
          
        
        
          
              
          
          
            
                mkaranasou
                / postgres-cheatsheet.md
            
            
              Created
              November 15, 2019 10:58
                — forked from Kartones/postgres-cheatsheet.md
            
              
                PostgreSQL command line cheatsheet
              
          
        
      
        
  
      
    PSQL

Magic words:
psql -U postgres
Some interesting flags (to see all, use -h or --help depending on your psql version):

-E: will describe the underlying queries of the \ commands (cool for learning!)
-l: psql will list all databases and then exit (useful if the user you connect with doesn't have a default database, as on AWS RDS)
	from pyspark import SparkConf
	from pyspark.sql import SparkSession, functions as F, types as T
	from pyspark.ml.feature import VectorAssembler, StandardScaler
	from pyspark_iforest.ml.iforest import IForest, IForestModel
	from pyspark.ml.linalg import Vectors, VectorUDT

	conf = SparkConf()
	conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar')

	spark = SparkSession \
	user = 'postgres'
	password = 'secret'
	db_driver = 'org.postgresql.Driver'
	host = '127.0.0.1'
	db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'

	df = spark.read.format(
	'jdbc'
	).options(
	url=db_url,
	import os

	q = '(select min(id) as min, max(id) as max from table_name where condition) as bounds'
	user = 'postgres'
	password = 'secret'
	db_driver = 'org.postgresql.Driver'
	host = '127.0.0.1'
	db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'
	partitions = os.cpu_count() * 2 # a good starting point
	conn_properties = {
	-- source: https://makandracards.com/makandra/52141-postgresql-how-to-show-table-sizes

	SELECT nspname || '.' || relname AS "relation",
	pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size"
	FROM pg_class C
	LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
	WHERE nspname NOT IN ('pg_catalog', 'information_schema')
	AND C.relkind <> 'i'
	AND nspname !~ '^pg_toast'
	ORDER BY pg_total_relation_size(C.oid) DESC
	-- File generated at: 2020-01-19 16:34:07.677439

	-- customer_transactions_y2020_w1--------------------------------------------------------

	CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w1 (
	CHECK (transasction_date >= '2019-12-30 00:00:00' AND transasction_date <= '2020-01-05 23:59:59' )
	) INHERITS (customer_transactions);
	-- customer_transactions_y2020_w2--------------------------------------------------------

	CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w2 (
	-- Monthly partitions: names use the _m<month> suffix to match the headers
	-- (previously _w1/_w2); upper bounds are exclusive so timestamps with
	-- fractional seconds in the last second of a month are still accepted.
	-- customer_transactions_y2020_m1--------------------------------------------------------
	CREATE TABLE IF NOT EXISTS customer_transactions_y2020_m1 (
	CHECK (transasction_date >= '2020-01-01 00:00:00' AND transasction_date < '2020-02-01 00:00:00')
	) INHERITS (customer_transactions);

	-- customer_transactions_y2020_m2--------------------------------------------------------
	CREATE TABLE IF NOT EXISTS customer_transactions_y2020_m2 (
	CHECK (transasction_date >= '2020-02-01 00:00:00' AND transasction_date < '2020-03-01 00:00:00')
	) INHERITS (customer_transactions);
	----- Attach before insert trigger to table --------
	DROP TRIGGER IF EXISTS before_insert_customer_transactions_trigger on customer_transactions;
	CREATE TRIGGER before_insert_customer_transactions_trigger
	BEFORE INSERT ON customer_transactions
	FOR EACH ROW EXECUTE PROCEDURE customer_transactions_insert_trigger();
	CREATE OR REPLACE FUNCTION customer_transactions_insert_trigger()
	RETURNS TRIGGER AS $$
	DECLARE
	target_partitioned_by text;
	table_name text;
	BEGIN
	-- get the month from the datetime transasction_date field
	SELECT cast(extract(month from NEW.transasction_date) AS TEXT) INTO target_partitioned_by;
	table_name = 'customer_transactions_y2020_m' || target_partitioned_by;