nuria nuria

## json_blobs_are_evil
You might have thought , “mmm this is un-structured data I want to put in this record , let me use a json field and we will see later what we want to do with it.”   Stop right there! These are the reasons why postgres JSON columns are evil and we should not use them as they cause a lot of problems.

(this applies to JSON stored in varchar columns and proper JSON blobs)

- **They grow unbounded to major sizes.** We had an incident in which code gone wild persisted 1GB of json in the users.details column, this affected the database but also cascaded to other parts of our system. The 1GB JSON column also broke our search pipeline as kafka messages are capped to 8MB so that record could not be processed. Until we cleaned up it halted processing of changes for indexing of classes on elastic search.
    - **Particularly problematic when used to store a logs of events**. We have used JSON fields to store logs of events which have a many-to-one relationship with the primary record the JSON field is stored on. We’ve

## percolator_opensearch_example.sh
DELETE /percolator-queries

PUT percolator-queries
{
  "mappings": {
    "properties": {
      "search": {
        "properties": {
          "query": {
            "type": "percolator"

## example_window_function.sql
with base_data as (
--
select date_trunc('day', received_at) as received_at, count(*) as events_received
 from some_com.performance_largest_contentful_paint where received_at > '2022-01-01'
group by date_trunc('day', received_at) order by date_trunc('day', received_at)

)

select received_at, events_received ,
avg(events_received) over (partition by date_trunc('month',received_at)) as monthly_avg ,

## window_function_inside_case_statement.sql
select *,
	case
		when confirmed_at is not null and price_dollars > 0
		then row_number() over (
			partition by user_id, price_dollars > 0
			order by confirmed_at
		)
		else null
	end as user_enroll_rank,
	case

## gist:5fba9f7517114bdff6961bb0838aa963
select 'alter table ' || table_schema || '.' || table_name || ' alter column experiment_assignments type varchar(65535);'
from svv_columns
where table_schema = 'some'
    and column_name = 'some'
    and character_maximum_length = 512;

## delete_honeycomb_columns_no_op_by_default.py
#!/usr/bin/env python3

# usage: hny-column-cleanup.py [-h] -k API_KEY -d DATASET [-m {hidden,spammy}]
# Honeycomb Dataset Column Cleanup tool
# arguments:
#   -h, --help            show this help message and exit
#   -k API_KEY, --api-key API_KEY
#                         Honeycomb API key
#   -d DATASET, --dataset DATASET
#                         Honeycomb Dataset

## 28_day_running_percentile.sql
with  base_data as (
    select received_at, date_trunc('day', received_at)::date as end_date ,
           date_add('day', -28, date_trunc('day', received_at))::date start_date, load_time
    from outschool_com.performance_largest_contentful_paint
    where
      received_at > date_add('day',-90, sysdate::date)
    and load_time > 0
    and load_time < 60000
    --order by md5('seed' || received_at)
    ),

## redshift_connector_playground.sh


#!/bin/bash
set -e

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
source ${DIR}/../../scripts/utils.sh

if [ ! -f ${DIR}/RedshiftJDBC4-1.2.20.1043.jar ]
then

## produce-kafka-rest-jsonschema.sh
curl -X POST -H "Content-Type: application/vnd.kafka.jsonschema.v2+json" \
 -H "Accept: application/vnd.kafka.v2+json" \
 --data '{"value_schema_id":100101,   "records": [{"value": {"name": "testUser", "planet":"moon"} } ] }' \
   "http://localhost:8082/topics/rest-proxy-test"

## kafkacat_auth
kafkacat -F ./kafkacat.auth -p 0  -t postgres.public.activities -o 31992 -u -c1 | hexdump -C


sasl.username=
sasl.password=

sasl.mechanisms=PLAIN
# Required for correctness in Apache Kafka clients prior to 2.6
#client.dns.lookup=use_all_dns_ips
# Best practice for Kafka producer to prevent data loss
	You might have thought , “mmm this is un-structured data I want to put in this record , let me use a json field and we will see later what we want to do with it.” Stop right there! These are the reasons why postgres JSON columns are evil and we should not use them as they cause a lot of problems.

	(this applies to JSON stored in varchar columns and proper JSON blobs)

	- They grow unbounded to major sizes. We had an incident in which code gone wild persisted 1GB of json in the users.details column, this affected the database but also cascaded to other parts of our system. The 1GB JSON column also broke our search pipeline as kafka messages are capped to 8MB so that record could not be processed. Until we cleaned up it halted processing of changes for indexing of classes on elastic search.
	- Particularly problematic when used to store a logs of events. We have used JSON fields to store logs of events which have a many-to-one relationship with the primary record the JSON field is stored on. We’ve
	DELETE /percolator-queries

	PUT percolator-queries
	{
	"mappings": {
	"properties": {
	"search": {
	"properties": {
	"query": {
	"type": "percolator"
	with base_data as (
	--
	select date_trunc('day', received_at) as received_at, count(*) as events_received
	from some_com.performance_largest_contentful_paint where received_at > '2022-01-01'
	group by date_trunc('day', received_at) order by date_trunc('day', received_at)

	)

	select received_at, events_received ,
	avg(events_received) over (partition by date_trunc('month',received_at)) as monthly_avg ,
	select *,
	case
	when confirmed_at is not null and price_dollars > 0
	then row_number() over (
	partition by user_id, price_dollars > 0
	order by confirmed_at
	)
	else null
	end as user_enroll_rank,
	case
	select 'alter table ' \|\| table_schema \|\| '.' \|\| table_name \|\| ' alter column experiment_assignments type varchar(65535);'
	from svv_columns
	where table_schema = 'some'
	and column_name = 'some'
	and character_maximum_length = 512;
	#!/usr/bin/env python3

	# usage: hny-column-cleanup.py [-h] -k API_KEY -d DATASET [-m {hidden,spammy}]
	# Honeycomb Dataset Column Cleanup tool
	# arguments:
	# -h, --help show this help message and exit
	# -k API_KEY, --api-key API_KEY
	# Honeycomb API key
	# -d DATASET, --dataset DATASET
	# Honeycomb Dataset
	with base_data as (
	select received_at, date_trunc('day', received_at)::date as end_date ,
	date_add('day', -28, date_trunc('day', received_at))::date start_date, load_time
	from outschool_com.performance_largest_contentful_paint
	where
	received_at > date_add('day',-90, sysdate::date)
	and load_time > 0
	and load_time < 60000
	--order by md5('seed' \|\| received_at)
	),


	#!/bin/bash
	set -e

	DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
	source ${DIR}/../../scripts/utils.sh

	if [ ! -f ${DIR}/RedshiftJDBC4-1.2.20.1043.jar ]
	then
	curl -X POST -H "Content-Type: application/vnd.kafka.jsonschema.v2+json" \
	-H "Accept: application/vnd.kafka.v2+json" \
	--data '{"value_schema_id":100101, "records": [{"value": {"name": "testUser", "planet":"moon"} } ] }' \
	"http://localhost:8082/topics/rest-proxy-test"
	kafkacat -F ./kafkacat.auth -p 0 -t postgres.public.activities -o 31992 -u -c1 \| hexdump -C


	sasl.username=
	sasl.password=

	sasl.mechanisms=PLAIN
	# Required for correctness in Apache Kafka clients prior to 2.6
	#client.dns.lookup=use_all_dns_ips
	# Best practice for Kafka producer to prevent data loss