Skip to content

Instantly share code, notes, and snippets.

View dsmdavid's full-sized avatar

David Sanchez dsmdavid

View GitHub Profile
create or replace table "SUPPLIER_ORIGINAL" as
select * from "SNOWFLAKE_SAMPLE_DATA"."TPCH_SF1"."SUPPLIER"; -- 10000 suppliers
create or replace table "SUPPLIER_FEW_CHANGES" clone "SUPPLIER_ORIGINAL";
-- introduce some errors
select count(*) from SUPPLIER_ORIGINAL
where s_address like '%Z%'; --3190 rows will be modified
update "SUPPLIER_FEW_CHANGES"
set S_ADDRESS = replace(S_ADDRESS, 'Z','z');
select approximate_similarity (mh) from
(
(select minhash(100, *) as mh from "LINEITEM_ORIGINAL" )
union all
(select minhash(100, *) as mh from "LINEITEM_FEW_CHANGES" )
);
-- ~4m25 (17487 rows)
select
*
from "TEST_DB"."DAVID_TEST"."LINEITEM_ORIGINAL"
where hash(L_COMMENT,L_SUPPKEY,L_DISCOUNT,L_EXTENDEDPRICE,L_COMMITDATE,L_LINESTATUS,L_QUANTITY,L_RETURNFLAG,L_LINENUMBER,L_PARTKEY,L_SHIPDATE,L_SHIPINSTRUCT,L_TAX,L_ORDERKEY,L_RECEIPTDATE,L_SHIPMODE)
not in (select
hash(L_COMMENT,L_SUPPKEY,L_DISCOUNT,L_EXTENDEDPRICE,L_COMMITDATE,L_LINESTATUS,L_QUANTITY,L_RETURNFLAG,L_LINENUMBER,L_PARTKEY,L_SHIPDATE,L_SHIPINSTRUCT,L_TAX,L_ORDERKEY,L_RECEIPTDATE,L_SHIPMODE)
from "TEST_DB"."DAVID_TEST"."LINEITEM_FEW_CHANGES");
hash_agg(
L_COMMENT,
L_SUPPKEY,
L_DISCOUNT,
L_EXTENDEDPRICE,
L_COMMITDATE,
L_LINESTATUS,
L_QUANTITY,
L_RETURNFLAG,
L_LINENUMBER,
-- ~ 45s
select hash_agg(*), 'original' as s_
from "LINEITEM_ORIGINAL"
union all
select hash_agg(*), 'exact_copy' as s_
from "LINEITEM_EXACT_COPY" ;
select hash_agg(*), 'original' as s_
from "LINEITEM_ORIGINAL"
union all
-- preparation:
-- create three identical tables derived from Snowflake's sample data TPCH_SF100
-- introduce some small variations in one of them
set var_db = 'test_db';
set var_schema = 'david_test';
use database identifier($var_db);
use schema identifier($var_schema);
create or replace table "LINEITEM_ORIGINAL" as
/*
This is the companion code to
https://infinitelambda.atlassian.net/wiki/spaces/HANDBOOK/pages/2079784966/Data+validation+after+refactoring+or+whenever+you+want+to+compare+two+datasets
about using data validation / regression tests
*/
-- preparation:
-- create three identical tables derived from Snowflake's sample data TPCH_SF100
-- introduce some small variations in one of them
use role data_engineering;
@dsmdavid
dsmdavid / get_lat_lon_exif_pil.py
Last active May 27, 2019 13:22 — forked from erans/get_lat_lon_exif_pil.py
Get Latitude and Longitude from EXIF using PIL
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
def get_exif_data(image):
"""Returns a dictionary from the exif data of an PIL Image item. Also converts the GPS Tags"""
exif_data = {}
info = image._getexif()
if info:
for tag, value in info.items():
decoded = TAGS.get(tag, tag)
@dsmdavid
dsmdavid / index.html
Created December 16, 2018 21:55
Tableau JS
<div class="buttons">
<button class="apply_RegionFilter btn" style="background-color: #a8dba2; ">Europe</button>
<button class="apply_RegionFilter btn" style="background-color: #f5a8a3; ">Middle East</button>
<button class="apply_RegionFilter btn" style="background-color: #afc8e3; ">The Americas</button>
<button class="apply_RegionFilter btn" style="background-color: #d0bde0; ">Oceania</button>
<button class="apply_RegionFilter btn" style="background-color: #ffc898; ">Asia</button>
<button class="apply_RegionFilter btn" style="background-color: #cdb2ac; ">Africa</button>
<button class="selectAll_RegionFilter btn btn-outline">All</button>
</div>
<div id="tableauViz"></div>