V. Petkov (vepetkov)

Munich, Germany
@vepetkov
vepetkov / hdfs_pq_access.py
Created September 4, 2018 11:10
Python HDFS + Parquet (hdfs3, PyArrow + libhdfs, HdfsCLI + Knox)
##################################################################
## Native hdfs access (only on the cluster)
# conda install -c conda-forge libhdfs3=2.3.0=1 hdfs3 --yes
import hdfs3
import pandas as pd
nameNodeHost = 'hadoopnn1.localdomain'
nameNodeIPCPort = 8020
hdfs = hdfs3.HDFileSystem(nameNodeHost, port=nameNodeIPCPort)
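# A minimal continuation sketch, assuming a Parquet file at a hypothetical path:
# read it from HDFS into pandas via PyArrow using the filesystem handle above.
import pyarrow.parquet as pq
with hdfs.open('/user/hive/warehouse/sample.parquet', 'rb') as f:  # hypothetical path
    table = pq.read_table(f)
df = table.to_pandas()
df.head()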
@vepetkov
vepetkov / gitlab_backup.py
Created September 4, 2018 11:29
Backup All GitLab Projects
## pip install python-gitlab gitpython
import gitlab # python-gitlab
from git import Repo # gitpython
import os, time
##########################
### Python Gitlab Config: ~/.python-gitlab.cfg
# [global]
# default = GitLab
# ssl_verify = true
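# A hedged sketch of the backup loop itself; the config section name, backup
# directory, and clone/pull logic are assumptions, not the gist's exact code.
gl = gitlab.Gitlab.from_config("GitLab")           # section name from ~/.python-gitlab.cfg
backupDir = os.path.expanduser("~/gitlab-backup")  # hypothetical backup target
for project in gl.projects.list(all=True):
    target = os.path.join(backupDir, project.path_with_namespace)
    if os.path.isdir(target):
        Repo(target).remotes.origin.pull()         # refresh an existing clone
    else:
        Repo.clone_from(project.ssh_url_to_repo, target)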
@vepetkov
vepetkov / hive_csv2avro.py
Last active November 12, 2020 20:45
Convert a CSV to Hive DDL + AVRO Schema (with type inference)
#!/usr/bin/python
import pandas
import sys
import argparse
import string
import subprocess
import json
import textwrap
import re
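# A hedged sketch of the type-inference step only; the dtype-to-Hive mapping and
# sample size below are assumptions, not the gist's actual logic.
def infer_hive_columns(csv_path, nrows=1000):
    """Infer Hive column types from a CSV sample using pandas dtypes."""
    dtype_map = {"int64": "BIGINT", "float64": "DOUBLE", "bool": "BOOLEAN",
                 "datetime64[ns]": "TIMESTAMP", "object": "STRING"}
    sample = pandas.read_csv(csv_path, nrows=nrows)
    return [(col, dtype_map.get(str(dtype), "STRING")) for col, dtype in sample.dtypes.items()]
# e.g. "CREATE TABLE t (" + ", ".join("{} {}".format(c, t) for c, t in infer_hive_columns("data.csv")) + ")"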
@vepetkov
vepetkov / pyhive_sample.py
Last active May 19, 2020 12:59
PyHive Sample
from pyhive import hive
import pandas as pd
from vdom import pre
# Nteract Data Explorer
pd.options.display.html.table_schema = True # Data Explorer On!
pd.options.display.max_rows = None # Send all the data! (careful!)
def getHiveConn(host, username, port=10000, schema="db_user1"):
    return hive.connect(host=host, port=port, username=username, database=schema, auth=None)
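# A short usage sketch; the host, user and table names are placeholders.
conn = getHiveConn("hadoopnn1.localdomain", "user1")
df = pd.read_sql("SELECT * FROM db_user1.some_table LIMIT 100", conn)
df.describe()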
@vepetkov
vepetkov / kubectl-resources.sh
Created September 12, 2018 10:38
kubectl: get requested resources for all pods
# Get the requested resources for all pods by container
kubectl get pods -ao jsonpath='{range .items[*]}{@.metadata.name}{"\n"}{range @.spec.containers[*]}{"\t"}{@.name}{" cpu:"}{@.resources.requests.cpu}{" mem:"}{@.resources.requests.memory}{"\n"}{end}{end}'
@vepetkov
vepetkov / ssl_keystores.sh
Last active May 28, 2019 12:37
PKCS12 & JKS keystores from a signed cert, private key and DigiCert CA chain
# Concatenate the Root and SubCA certs from DigiCert
# to get the full certification chain
cat DigiCert_Global_Root_CA.pem DigiCertSHA2SecureServerCA.pem > DigiCertCA_Chain.pem
# Generate a new PKCS12 key store from the signed cert, the private key
# and the CA chain
openssl pkcs12 -export \
  -in my_cert_signed.crt \
  -inkey my_cert_key.pem \
  -chain -CAfile DigiCertCA_Chain.pem \
  -name "my_cert" -out my_cert.keystore.p12
# Convert the PKCS12 store to a JKS keystore (assumed follow-up step for the JKS part of the title)
keytool -importkeystore -srckeystore my_cert.keystore.p12 -srcstoretype PKCS12 \
  -destkeystore my_cert.keystore.jks -deststoretype JKS
# Unzip in-place (i.e. in the folder containing the file and not the current one)
find . -type f -name "*.zip" | xargs -P4 -I fileName sh -c 'unzip -o -d "$(dirname "fileName")" "fileName" && rm "fileName"'
# Gzip all CSV extracted from the ZIP files
find . -type f -name "*.csv" -print0 | xargs -0 -n1 -P4 gzip
@vepetkov
vepetkov / read_orc.py
Created May 7, 2019 15:20
Read a local ORC file in Python and convert it to a DF
import pandas as pd
import pyarrow.orc as orc
# Open the ORC file written by Hive and read only the needed columns into pandas
file0 = open('/hive/warehouse/000000_0', 'rb')
data0 = orc.ORCFile(file0)
df0 = data0.read(columns=['_col10', '_col50']).to_pandas()
file0.close()
df0.describe()
# Get the Hive Symlinks
aws s3 ls s3://<BUCKET>/hive/ --recursive | awk '{print "s3://<BUCKET>/"$4}'
# Delete all data files for the selected Hive partition
aws s3 cp s3://<BUCKET>/hive/dt=2019-07-24-00-00/symlink.txt - | xargs -I {} sh -c 'aws s3 rm {}'
@vepetkov
vepetkov / snowflake_upload_local.py
Created December 20, 2019 12:56
Snowflake Upload Local Files from Python
import os
import snowflake.connector
ctx = snowflake.connector.connect(
authenticator="snowflake",
user=os.getenv("SNOWSQL_USER"),
password=os.getenv("SNOWSQL_PWD"),
account=os.getenv("SNOWSQL_ACCOUNT"),
warehouse=os.getenv("SNOWSQL_WAREHOUSE")
)
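# A hedged continuation sketch: stage the local files and load them into a table;
# the database, table and file path are assumptions, not the gist's code.
cs = ctx.cursor()
try:
    cs.execute("USE DATABASE my_db")
    cs.execute("USE SCHEMA public")
    cs.execute("PUT file:///tmp/data/*.csv @%my_table")   # upload to the table stage
    cs.execute("COPY INTO my_table FILE_FORMAT = (TYPE = CSV SKIP_HEADER = 1)")
finally:
    cs.close()
    ctx.close()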