Skip to content

Instantly share code, notes, and snippets.

View mkaranasou's full-sized avatar
🏠
Working from home

Maria Karanasou mkaranasou

🏠
Working from home
View GitHub Profile
@mkaranasou
mkaranasou / pyspark_ml_dense_vectors_isolation_forest_example.py
Created March 24, 2020 14:48
Gather the features as dense vectors with a udf to train an Isolation Forest
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
from pyspark.ml.linalg import Vectors, VectorUDT
conf = SparkConf()
conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar')
spark = SparkSession \
@mkaranasou
mkaranasou / pyspark_vector_assembler_dense_and_sparse.py
Created March 24, 2020 14:30
VectorAssembler example - dense and sparse output
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark_iforest.ml.iforest import IForest, IForestModel
import tempfile
conf = SparkConf()
conf.set('spark.jars', '/full/path/to/spark-iforest-2.4.0.jar')
spark = SparkSession \
@mkaranasou
mkaranasou / pyspark_read_from_database.py
Last active March 21, 2020 13:01
Pyspark read from database (Postgres)
user = 'postgres'
password = 'secret'
db_driver = 'org.postgresql.Driver'
host = '127.0.0.1'
db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'
df = spark.read.format(
'jdbc'
).options(
url=db_url,
@mkaranasou
mkaranasou / pyspark_parallel_read_from_db.py
Last active March 14, 2023 05:38
Parallel read from db with pyspark
import os
q = '(select min(id) as min, max(id) as max from table_name where condition) as bounds'
user = 'postgres'
password = 'secret'
db_driver = 'org.postgresql.Driver'
host = '127.0.0.1'
db_url = f'jdbc:postgresql://{host}:5432/dbname?user={user}&password={password}'
partitions = os.cpu_count() * 2 # a good starting point
conn_properties = {
@mkaranasou
mkaranasou / view_tables_by_size.sql
Created March 10, 2020 10:32
Get the tables by size - Postgres
-- source: https://makandracards.com/makandra/52141-postgresql-how-to-show-table-sizes
SELECT nspname || '.' || relname AS "relation",
pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size"
FROM pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE nspname NOT IN ('pg_catalog', 'information_schema')
AND C.relkind <> 'i'
AND nspname !~ '^pg_toast'
ORDER BY pg_total_relation_size(C.oid) DESC
@mkaranasou
mkaranasou / pgpartition_by_week_example.sql
Created January 19, 2020 17:13
Output of pgpartition - partition by week for 2020
-- File generated at: 2020-01-19 16:34:07.677439
-- customer_transactions_y2020_w1--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w1 (
CHECK (transasction_date >= '2019-12-30 00:00:00' AND transasction_date <= '2020-01-05 23:59:59' )
) INHERITS (customer_transactions);
-- customer_transactions_y2020_w2--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w2 (
@mkaranasou
mkaranasou / partition_creation_example.sql
Created January 19, 2020 17:01
An example for generating data partitions by month - Postgres
-- customer_transactions_y2020_m1--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w1 (
CHECK (transasction_date >= '2020-01-01 00:00:00' AND transasction_date <= '2020-01-31 23:59:59' )
) INHERITS (customer_transactions);
-- customer_transactions_y2020_m2--------------------------------------------------------
CREATE TABLE IF NOT EXISTS customer_transactions_y2020_w2 (
CHECK (transasction_date >= '2020-02-01 00:00:00' AND transasction_date <= '2020-02-29 23:59:59' )
) INHERITS (customer_transactions);
@mkaranasou
mkaranasou / before_insert_trigger_example.sql
Created January 19, 2020 16:55
Before insert trigger example - Postgres
----- Attach before insert trigger to table --------
DROP TRIGGER IF EXISTS before_insert_customer_transactions_trigger on customer_transactions;
CREATE TRIGGER before_insert_customer_transactions_trigger
BEFORE INSERT ON customer_transactions
FOR EACH ROW EXECUTE PROCEDURE customer_transactions_insert_trigger();
@mkaranasou
mkaranasou / customer_transactions_insert_function_example.sql
Created January 19, 2020 16:54
Example of a data partitioning by month function - Postgres
CREATE OR REPLACE FUNCTION customer_transactions_insert_trigger()
RETURNS TRIGGER AS $$
DECLARE
target_partitioned_by text;
table_name text;
BEGIN
-- get the month from the datetime transasction_date field
SELECT cast(extract(month from NEW.transasction_date) AS TEXT) INTO target_partitioned_by;
table_name = 'customer_transactions_y2020_m' || target_partitioned_by;
@mkaranasou
mkaranasou / postgres-cheatsheet.md
Created November 15, 2019 10:58 — forked from Kartones/postgres-cheatsheet.md
PostgreSQL command line cheatsheet

PSQL

Magic words:

psql -U postgres

Some interesting flags (to see all, use -h or --help depending on your psql version):

  • -E: will describe the underlaying queries of the \ commands (cool for learning!)
  • -l: psql will list all databases and then exit (useful if the user you connect with doesn't has a default database, like at AWS RDS)