V. Petkov (vepetkov)
Munich, Germany
# Parse the whole git history and show files larger than 1 MiB (2^20 bytes)
git rev-list --objects --all |
git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' |
sed -n 's/^blob //p' |
awk '$2 >= 2^20' |
sort --numeric-sort --key=2 |
cut -c 1-12,41- |
$(command -v gnumfmt || echo numfmt) --field=2 --to=iec-i --suffix=B --padding=7 --round=nearest
vepetkov / snowflake_upload_local.py
Created December 20, 2019 12:56
Snowflake Upload Local Files from Python
import os
import snowflake.connector

# Connect using the SnowSQL credentials from the environment variables
ctx = snowflake.connector.connect(
    authenticator="snowflake",
    user=os.getenv("SNOWSQL_USER"),
    password=os.getenv("SNOWSQL_PWD"),
    account=os.getenv("SNOWSQL_ACCOUNT"),
    warehouse=os.getenv("SNOWSQL_WAREHOUSE")
)
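The preview stops at the connection; a minimal sketch of the upload step, assuming a hypothetical stage (@my_stage), table, and local file (PUT and COPY INTO are standard Snowflake SQL run through the connector's cursor), might look like this:

# Hedged sketch: stage, table, and file names below are assumptions, not from the gist
cur = ctx.cursor()
cur.execute("PUT file:///tmp/data.csv @my_stage AUTO_COMPRESS=TRUE")
cur.execute("COPY INTO my_table FROM @my_stage/data.csv.gz FILE_FORMAT = (TYPE = CSV SKIP_HEADER = 1)")
cur.close()
ctx.close()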
vepetkov / direnvrc
Created July 14, 2023 09:14
Load .venv automatically using DirEnv
# Store in ~/.config/direnv/direnvrc to run for all folders automatically
# Check if a virtualenv is already loaded
if [[ -z "${VIRTUAL_ENV_PROMPT}" ]] ; then
  if [ ! -d ".venv" ] ; then
    echo "Installing virtualenv for $(python -V)"
    python -m venv .venv
  fi
  echo "Activating $(python -V) virtualenv from .venv"
  source .venv/bin/activate
fi
vepetkov / hdfs_pq_access.py
Created September 4, 2018 11:10
Python HDFS + Parquet (hdfs3, PyArrow + libhdfs, HdfsCLI + Knox)
##################################################################
## Native hdfs access (only on the cluster)
# conda install -c conda-forge libhdfs3=2.3.0=1 hdfs3 --yes
import hdfs3
import pandas as pd
nameNodeHost = 'hadoopnn1.localdomain'
nameNodeIPCPort = 8020
hdfs = hdfs3.HDFileSystem(nameNodeHost, port=nameNodeIPCPort)
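A minimal sketch of reading a Parquet file through the hdfs3 file object (the HDFS paths are assumptions; pyarrow.parquet.read_table accepts any file-like object):

import pyarrow.parquet as pq

# Hedged sketch: the directory and file paths below are placeholders, not from the gist
print(hdfs.ls('/user/hive/warehouse'))
with hdfs.open('/user/hive/warehouse/db1.db/table1/part-00000.parquet', 'rb') as f:
    df = pq.read_table(f).to_pandas()
print(df.head())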
vepetkov / gitlab_backup.py
Created September 4, 2018 11:29
Backup All GitLab Projects
## pip install python-gitlab gitpython
import gitlab # python-gitlab
from git import Repo # gitpython
import os, time
##########################
### Python Gitlab Config: ~/.python-gitlab.cfg
# [global]
# default = GitLab
# ssl_verify = true
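The preview shows only the imports and the config; a minimal sketch of the backup loop, assuming the "GitLab" section from the config above and a hypothetical target directory, could be:

# Hedged sketch: the backup directory and listing arguments are assumptions
gl = gitlab.Gitlab.from_config("GitLab")
backup_dir = os.path.expanduser("~/gitlab-backup")
for project in gl.projects.list(all=True, membership=True):
    target = os.path.join(backup_dir, project.path_with_namespace)
    if not os.path.exists(target):
        Repo.clone_from(project.ssh_url_to_repo, target)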
vepetkov / hive_csv2avro.py
Last active November 12, 2020 20:45
Convert a CSV to Hive DDL + AVRO Schema (with type inference)
#!/usr/bin/python
import pandas
import sys
import argparse
import string
import subprocess
import json
import textwrap
import re
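The preview stops at the imports; purely as an illustration of the type-inference idea, a minimal sketch (the dtype-to-Hive mapping, sample file, and table name are assumptions, not the gist's actual logic) might be:

# Hedged sketch: infer column types from a sample of rows and emit a Hive DDL
HIVE_TYPES = {"int64": "BIGINT", "float64": "DOUBLE", "bool": "BOOLEAN", "object": "STRING"}

df = pandas.read_csv("sample.csv", nrows=1000)
cols = ["`{}` {}".format(c, HIVE_TYPES.get(str(t), "STRING")) for c, t in df.dtypes.items()]
ddl = "CREATE EXTERNAL TABLE sample (\n  " + ",\n  ".join(cols) + "\n) STORED AS AVRO;"
print(ddl)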
vepetkov / pyhive_sample.py
Last active May 19, 2020 12:59
PyHive Sample
from pyhive import hive
import pandas as pd
from vdom import pre
# Nteract Data Explorer
pd.options.display.html.table_schema = True # Data Explorer On!
pd.options.display.max_rows = None # Send all the data! (careful!)
def getHiveConn(host, username, port=10000, schema="db_user1"):
    return hive.connect(host=host, port=port, username=username, database=schema, auth=None)
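A minimal usage sketch (host, user, and query are hypothetical; pandas.read_sql accepts the DBAPI connection returned by getHiveConn, though recent pandas versions warn that only SQLAlchemy connectables are officially supported):

# Hedged sketch: connection details and query are assumptions, not from the gist
conn = getHiveConn("hive.localdomain", "user1")
df = pd.read_sql("SELECT * FROM db_user1.some_table LIMIT 100", conn)
df.head()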
# Test SMTP with STARTTLS
openssl s_client -showcerts -connect smtp.office365.com:587 -servername smtp.office365.com -starttls smtp
# Test IMAP with SSL/TLS
openssl s_client -showcerts -connect outlook.office365.com:993 -servername outlook.office365.com
# Test POP3 with SSL/TLS
openssl s_client -showcerts -connect outlook.office365.com:995 -servername outlook.office365.com
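For comparison, roughly the same STARTTLS check can be scripted from Python with smtplib (host and port as in the openssl example above; this is a sketch, not part of the original gist):

import smtplib
import ssl

# Hedged sketch: connect, upgrade to TLS, and print the negotiated certificate subject
with smtplib.SMTP("smtp.office365.com", 587, timeout=10) as smtp:
    smtp.ehlo()
    smtp.starttls(context=ssl.create_default_context())
    smtp.ehlo()
    print(smtp.sock.getpeercert()["subject"])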
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import tarfile
import json
def get_tar_metadata(file_name):
    tar = tarfile.open(file_name, encoding="iso8859-1")
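    # Hedged continuation, not in the gist preview: collect per-member metadata as JSON
    members = [
        {"name": m.name, "size": m.size, "mtime": m.mtime}
        for m in tar.getmembers()
    ]
    tar.close()
    return json.dumps(members)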
# Get the Hive Symlinks
aws s3 ls s3://<BUCKET>/hive/ --recursive | awk '{print "s3://<BUCKET>/"$4}'
# Delete all data files for the selected Hive partition
aws s3 cp s3://<BUCKET>/hive/dt=2019-07-24-00-00/symlink.txt - | xargs -I {} sh -c 'aws s3 rm {}'
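The same partition cleanup can be sketched with boto3 (the bucket and key below mirror the <BUCKET> placeholder above and are not real values):

import boto3

# Hedged sketch: read the symlink manifest and delete each data file it points to
s3 = boto3.client("s3")
manifest = s3.get_object(Bucket="<BUCKET>", Key="hive/dt=2019-07-24-00-00/symlink.txt")
for line in manifest["Body"].read().decode("utf-8").splitlines():
    bucket, key = line.replace("s3://", "").split("/", 1)
    s3.delete_object(Bucket=bucket, Key=key)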