Mimoune djouallah
-- Query01 (TPC-H Q1; the excerpt was cut off, remainder restored
-- from the standard TPC-H definition)
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM lineitem
WHERE l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY l_returnflag, l_linestatus
ORDER BY l_returnflag, l_linestatus;
djouallah / delta.py
Created October 25, 2023 12:50
Write to OneLake using Python
%%time
!pip install -q duckdb
!pip install -q deltalake
import duckdb
from deltalake.writer import write_deltalake
from trident_token_library_wrapper import PyTridentTokenLibrary

# Acquire an Azure AD token for OneLake storage (Fabric notebook helper)
aadToken = PyTridentTokenLibrary.get_access_token("storage")

sf = 1  # scale factor
for x in range(sf):
    con = duckdb.connect()
    # the gist is cut off here; a plausible continuation is sketched below
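The loop body is truncated in the scrape. A minimal sketch of a plausible continuation (it would sit inside the loop), assuming the gist generates TPC-H data with DuckDB's tpch extension and writes one table to a OneLake abfss path; the workspace/lakehouse path is a placeholder and the exact storage-option keys may vary across deltalake versions:

# hypothetical continuation, not the gist's actual code
con.sql("INSTALL tpch")
con.sql("LOAD tpch")
con.sql(f"CALL dbgen(sf={sf})")                       # generate TPC-H tables in memory
lineitem = con.sql("SELECT * FROM lineitem").arrow()  # export one table as Arrow
write_deltalake(
    "abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse.Lakehouse/Tables/lineitem",
    lineitem,
    mode="overwrite",
    storage_options={"bearer_token": aadToken, "use_fabric_endpoint": "true"},
)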
djouallah / gist:998571cf7560fb697ed174d1ef65b7fe
Created July 5, 2023 11:14
Load the latest Iceberg metadata file
import boto3
import pandas as pd

s3_client = boto3.client('s3')
bucket = 'xxxxxxx'
prefix = 'zzzz/yyyyyy/metadata'

# Page through every object under the Iceberg metadata prefix
paginator = s3_client.get_paginator('list_objects_v2')
response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

file_names = pd.DataFrame(columns=['file', 'date'])
for response in response_iterator:
    for object_data in response['Contents']:
        # record each key and its last-modified timestamp
        # (the excerpt is truncated here; this append restores the implied step)
        file_names.loc[len(file_names)] = [object_data['Key'], object_data['LastModified']]
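The excerpt ends inside the listing loop. A short sketch, assuming the goal stated in the title (find the newest metadata file), of how the collected listing could be reduced to the latest *.metadata.json; the suffix filter is an assumption based on Iceberg's usual naming:

# hypothetical final step: newest *.metadata.json wins
metadata = file_names[file_names['file'].str.endswith('.metadata.json')]
latest_key = metadata.sort_values('date', ascending=False).iloc[0]['file']
print(f"latest Iceberg metadata: s3://{bucket}/{latest_key}")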
# V-Order settings; you can ignore these two lines
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

# Load CSV files from the default lakehouse (make sure you click on the pin)
from pyspark.sql.types import *
df = spark.read.option("header", "true").format("csv").load("Files/csv/*.csv")
df.write.mode("overwrite").format("delta").save("Tables/tablecsv")
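To confirm the write, the Delta table can be read straight back from the Tables area (same path as above):

# read the table back to verify the write
spark.read.format("delta").load("Tables/tablecsv").show(5)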
from datetime import datetime, date, timedelta
import urllib.request as urllib2
import tempfile
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import re, shutil
from urllib.request import urlopen
import os
import adlfs
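Only the import block of this snippet survives in the scrape. A minimal sketch of what such a pipeline could look like, inferred solely from the imports above (urlopen, tempfile, pyarrow.dataset, adlfs): download a file to a temp location, then write it out as a Parquet dataset on ADLS. Every name, URL, and path below is a placeholder, not the gist's actual code:

# hypothetical reconstruction based only on the imports above
fs = adlfs.AzureBlobFileSystem(account_name="myaccount", anon=False)  # placeholder account
with tempfile.NamedTemporaryFile(suffix=".csv") as tmp:
    with urlopen("https://example.com/data.csv") as resp:             # placeholder URL
        shutil.copyfileobj(resp, tmp)                                 # stream download to disk
    tmp.flush()
    table = pa.Table.from_pandas(pd.read_csv(tmp.name))               # load into Arrow
    ds.write_dataset(table, "container/path", format="parquet", filesystem=fs)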
let
    parquetfunctin = (params) =>
        let
            Parquet = Parquet.Document(Binary.Buffer(Web.Contents("https://nyc-tlc.s3.amazonaws.com/trip+data/", [RelativePath = params])))
        in
            Parquet,
select `WKT`,
       `C1`,
       `C2`
from
    (
        select `WKT`,
               sum(`area`) as `C1`,
               min(`WKT`) as `C2`
        from
            (
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "usermeta": {
    "deneb": {
      "build": "1.1.0.20220119#7e76f47",
      "metaVersion": 1,
      "provider": "vegaLite",
      "providerVersion": "5.2.0"
    },
    "interactivity": {
// DAX Query
DEFINE
    VAR __DS0FilterTable =
        TREATAS(
            {"Coal", "Renewable", "Fuel"},
            'UNITARCHIVE'[Technology]
        )

    VAR __DS0FilterTable2 =
        FILTER(
            KEEPFILTERS(VALUES('UNITARCHIVE'[SETTLEMENTDATE])),
WITH xx AS (
    SELECT
        "australia" AS key,
        state,
        suburbs,
        longitude,
        latitude,
        IRSAD
    FROM