#!/usr/bin/env bash
CWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# print each date in a range, one day at a time (GNU date)
start='2019-01-01'
end='2019-02-01'
start=$(date -d "$start" +%Y%m%d)
end=$(date -d "$end" +%Y%m%d)
while [[ $start -le $end ]]
do
    echo "$(date -d "$start" +%Y-%m-%d)"
    start=$(date -d "$start + 1 day" +%Y%m%d)
done
# ssh
mkdir -p ~/.ssh
touch ~/.ssh/authorized_keys
chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys
# get user's ram usage
function ram {
    # total %MEM per user (4th column of ps), sorted ascending;
    # exact match ($1 == user) avoids partial regex matches like the original $1 ~ user
    for _user in $(ps haux | awk '{print $1}' | sort -u); do
        ps haux | awk -v user="${_user}" '$1 == user { sum += $4 } END { print user, sum }'
    done | sort -n -k2,2
}
# zsh: source a bash config file under sh emulation
emulate sh -c "source ~/.bash_profile"
# find
## files modified in the last 10 minutes, excluding two directories
find . -type f -mmin -10 -not -path "./scheduler/*" -not -path "./dag_processor_manager/*"
## same search, then grep the matches for 'error' (case-insensitive)
find . -type f -mmin -10 -not -path "./scheduler/*" -not -path "./dag_processor_manager/*" -exec grep -i 'error' {} +
# rsync
## trailing slash on the source syncs the directory's contents,
## so no nested tests/ folder is created inside the destination
rsync -avzp --del -e "ssh -p myport" user@hostname:/var/www/tests/ /var/www/tests
[alias]
st = "status -s -b"
c = "commit"
# --all
# --verbose
br = "branch -a -v"
co = "checkout"
cb = "checkout -b"
# Short hash, relative date and message.
logd = "log --pretty='%C(yellow)%h %C(cyan)%ar %Creset%s'"
# --graph:
# Draw a text-based graphical representation of the commit history on
# the left hand side of the output.
# --decorate:
# Print out the ref names of any commits that are shown. Defaults to
# short format, so the ref name prefixes refs/heads/, refs/tags/ and
# refs/remotes/ will not be printed.
logline = "log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit"
hs = "log --pretty='%C(yellow)%h %C(cyan)%cd %Cblue%aN%C(auto)%d %Creset%s' --graph --date=relative --date-order"
conda env list
conda create --name dagster-3.9.11 python=3.9.11
# activate the target env first (e.g. `conda activate dagster-3.9.11`) so pip installs into it
pip install ipykernel
python -m ipykernel install --user --name python3.9.4-spark-nb --display-name "Python 3.9 (spark-nb)"
import itertools

def chunks(l, n):
    """Yield successive n-sized chunks from l (requires len(), e.g. a list)."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

def chunks_iter(l, n):
    """Yield successive n-sized tuples from any iterable, including generators."""
    it = iter(l)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk
def range_generator(upper, batch_size):
    """Return (start, end) index pairs covering [0, upper) in batch_size steps.

    Note: the last end may exceed upper; clamp with min(end, upper) if needed.
    """
    return [
        (i, i + batch_size)
        for i in range(0, upper, batch_size)
    ]
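# Quick sanity check of the three helpers above (expected values shown as comments):
list(chunks([1, 2, 3, 4, 5], 2))      # [[1, 2], [3, 4], [5]]
list(chunks_iter(iter(range(5)), 2))  # [(0, 1), (2, 3), (4,)]
range_generator(5, 2)                 # [(0, 2), (2, 4), (4, 6)]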
import os
import sys
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
display(
HTML("<style>.container {width: 100% ! important;}</style>")
)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 400)
%matplotlib inline
sns.set()
%load_ext autoreload
%autoreload 2
xls = pd.ExcelFile('/apps/jupyter/users/vietvu/FS/reports/monthly_report/20220215/final/20220211_vnpt_fe_fs_final.xlsx')
print(f"Sheets: {xls.sheet_names}")
df_detail = xls.parse('Sheet1')
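# Optional convenience (same xls object as above): parse every sheet into a dict of DataFrames
dfs = {name: xls.parse(name) for name in xls.sheet_names}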
-- Oracle: all tables visible to the current user
select * from all_tables;
-- Oracle: object grants, filtered by owner or grantee
select * from table_privileges where owner = '' or grantee = '';
-- Postgres: tables in schemas starting with "data"
select * from pg_tables where schemaname like 'data%';
-- create table
spark.sparkContext.getConf().getAll()
from delta.tables import DeltaTable
deltaTable = DeltaTable.forPath(spark, "/data/events/")
deltaTable.delete("date = '2017-01-01'") # predicate using SQL formatted string
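# delete also accepts a Column predicate, equivalent to the string form above
import pyspark.sql.functions as F
deltaTable.delete(F.col("date") == "2017-01-01")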
# compact a partition by rewriting it with fewer files
# (did not work at the time of testing)
table = "/data/events/"   # example path; reuse the table above
numFiles = 16             # example target file count
partition = "date = '2021-10-20'"
(
    spark.read
    .format("delta")
    .load(table)
    .where(partition)
    .repartition(numFiles)
    .write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", partition)
    .save(table)
)
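# Newer Delta Lake releases (delta-spark >= 2.0) expose compaction directly;
# a minimal sketch, assuming the table path and partition variables above:
from delta.tables import DeltaTable
DeltaTable.forPath(spark, table).optimize().where(partition).executeCompaction()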
# delta asVersion (time travel: read the table as of a past version)
df = (
    spark.read
    .format("delta")
    .option("versionAsOf", "5238")
    .load("/path/to/my/table")
)
# delta history
fullHistoryDF = deltaTable.history()
# join indicator
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def merge_indicator(
        df1: DataFrame,
        df2: DataFrame,
        col: str,
        how: str) -> DataFrame:
    """
    Merge dataframes and append a `_merge` column (both, right_only,
    left_only), like pandas' merge(indicator=True).
    """
    col2 = col + '2'
    df2 = df2.withColumnRenamed(col, col2)
    # join df1 with df2 (the original joined df2 with itself: a bug)
    sdf = df1.join(df2, df1[col] == df2[col2], how)
    out_df = (sdf
              .withColumn('_merge',
                          F.when(sdf[col].isNull(), 'right_only')
                          .when(sdf[col2].isNull(), 'left_only')
                          .otherwise('both'))
              .withColumn(col, F.coalesce(sdf[col], sdf[col2]))
              .drop(col2))
    return out_df
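# Illustrative usage with throwaway frames; a full outer join exercises all three flags:
df_a = spark.createDataFrame([(1,), (2,)], ['id'])
df_b = spark.createDataFrame([(2,), (3,)], ['id'])
merge_indicator(df_a, df_b, 'id', 'full').show()
# id=1 -> left_only, id=2 -> both, id=3 -> right_only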
# mount data from Google Drive for using in this notebook
from pathlib import Path
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_path = Path("/content/gdrive/My Drive/")
# logs
## show logs since a date (--since takes timestamps like 2022-01-01 or durations like 24h)
docker logs --since 2022-01-01 fsprod3.7
## stop a container from being auto-restarted
docker update --restart no redis
# disable TLS certificate verification globally (insecure; only for trusted/internal hosts)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context