@jitsejan
jitsejan / Dockerfile
Last active April 9, 2024 11:15
PySpark, Docker and S3
FROM jupyter/pyspark-notebook
USER root
# Add essential packages
RUN apt-get update && apt-get install -y build-essential curl git gnupg2 nano apt-transport-https software-properties-common
# Set locale
RUN apt-get update && apt-get install -y locales \
&& echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
&& locale-gen
# Add config to Jupyter notebook
COPY jupyter/jupyter_notebook_config.py /home/jovyan/.jupyter/
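The preview ends at the Jupyter config. As a rough sketch (not from the gist) of how a notebook in this container could read from S3 via the s3a connector; the hadoop-aws version, credentials and bucket below are placeholders:
# Sketch: SparkSession inside the jupyter/pyspark-notebook container reading
# from S3 through s3a. Package version, keys and bucket are placeholders.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("pyspark-docker-s3")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.access.key", "<AWS_ACCESS_KEY_ID>")
    .config("spark.hadoop.fs.s3a.secret.key", "<AWS_SECRET_ACCESS_KEY>")
    .getOrCreate()
)
df = spark.read.parquet("s3a://my-bucket/path/")  # placeholder bucket
df.show()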
@jitsejan
jitsejan / main.py
Created May 8, 2018 16:13
Splunk dependency checker
""" main.py """
from argparse import ArgumentParser
from lxml import etree as ET
import getpass
import glob
import os
import re
import requests
from requests.auth import HTTPBasicAuth
from requests.packages.urllib3.exceptions import InsecureRequestWarning
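Only the imports survive in this preview. A minimal sketch of how a checker built on these imports might query the Splunk REST API; the host, port, endpoint and password prompt are assumptions, not taken from the gist:
# Sketch: list the apps installed on a Splunk host over its management API,
# silencing the self-signed-certificate warning the imports hint at.
# Endpoint and credential handling are assumptions.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_installed_apps(host, username):
    """ Return the XML feed of apps installed on the given Splunk host. """
    password = getpass.getpass(f"Password for {username}: ")
    response = requests.get(
        f"https://{host}:8089/services/apps/local",
        auth=HTTPBasicAuth(username, password),
        verify=False,
    )
    response.raise_for_status()
    return ET.fromstring(response.content)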
@jitsejan
jitsejan / main.py
Last active February 13, 2024 12:54
Azure Table Storage to Pandas DataFrame
import pandas as pd
from azure.cosmosdb.table.tableservice import TableService
CONNECTION_STRING = "DUMMYSTRING"
SOURCE_TABLE = "DUMMYTABLE"
def set_table_service():
""" Set the Azure Table Storage service """
return TableService(connection_string=CONNECTION_STRING)
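The preview ends after the service helper. A sketch of a plausible next step, loading the table into a DataFrame; the function name and the lack of a filter are my assumptions:
# Sketch: read all entities from SOURCE_TABLE into a Pandas DataFrame.
# No filter or column selection is applied; that is an assumption.
def get_dataframe_from_table(table_service):
    """ Return the rows of SOURCE_TABLE as a Pandas DataFrame """
    entities = table_service.query_entities(SOURCE_TABLE)
    return pd.DataFrame([dict(entity) for entity in entities])

table_service = set_table_service()
df = get_dataframe_from_table(table_service)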
@jitsejan
jitsejan / _write_dataframe_to_csv_on_s3.py
Last active February 13, 2024 12:53
Write a Pandas dataframe to CSV format on AWS S3.
import boto3
from io import StringIO
def _write_dataframe_to_csv_on_s3(dataframe, filename):
""" Write a dataframe to a CSV on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
# Create buffer
csv_buffer = StringIO()
# Write dataframe to buffer
dataframe.to_csv(csv_buffer, sep="|", index=False)
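The preview cuts off before the upload. A hedged completion of the missing step, assuming a DESTINATION bucket constant like the Parquet variant below:
# Sketch of the truncated upload step: push the CSV buffer to S3 with boto3.
# DESTINATION is an assumed bucket name, mirroring the Parquet gist below.
s3_resource = boto3.resource("s3")
s3_resource.Object(DESTINATION, filename).put(Body=csv_buffer.getvalue())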
@jitsejan
jitsejan / _write_dataframe_to_parquet_on_s3.py
Created October 5, 2018 13:35
Write a Pandas dataframe to Parquet format on AWS S3.
# Note: make sure `s3fs` is installed in order to make Pandas use S3.
# Credentials for AWS in the normal location ~/.aws/credentials
def _write_dataframe_to_parquet_on_s3(dataframe, filename):
""" Write a dataframe to a Parquet on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
output_file = f"s3://{DESTINATION}/{filename}/data.parquet"
dataframe.to_parquet(output_file)
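A short usage sketch; the bucket name and sample frame are placeholders:
# Usage sketch: DESTINATION is a placeholder bucket; requires s3fs, as noted above.
import pandas as pd

DESTINATION = "my-data-bucket"
df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})
_write_dataframe_to_parquet_on_s3(df, "my_table")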
@jitsejan
jitsejan / deploy.sh
Created May 13, 2019 13:29
Terraform, AWS Lambda and Looker
#!/usr/bin/env bash
export PKG_DIR="python"
export PY_VERSION="python3.7"
printf "\033[1;33m[1/3] Creating packages for Lambda \033[0m\n"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
LAMBDA_DIR="sources/lambda-functions"
FULL_DIR=${SCRIPT_DIR}/${LAMBDA_DIR}
printf "\033[1;35m> Checking for Lambda functions in ${FULL_DIR} \033[0m\n"
for fldr in ${FULL_DIR}/*; do
    # Assumed completion of the truncated loop: install each function's
    # dependencies into the package dir and zip it for Terraform to pick up.
    printf "\033[1;35m> Packaging ${fldr} \033[0m\n"
    pip install -r "${fldr}/requirements.txt" -t "${fldr}/${PKG_DIR}" --quiet
    (cd "${fldr}" && zip -r9 -q "${fldr##*/}.zip" .)
done
@jitsejan
jitsejan / check_glue_jobs.py
Created June 10, 2019 11:09
Check the running AWS Glue jobs and print a link to either the logs or S3.
import boto3
import datetime
JOB_NAME = 'mi-glue-job-run-queries-dev'
REGION = 'eu-west-1'
TIME_FORMAT = '%y-%m-%d %H:%M'
GLUE_URL = "https://{region}.console.aws.amazon.com/glue/home?region={region}#jobRun:jobName={job_name};jobRunId={run_id}"
S3_URL = "https://s3.console.aws.amazon.com/s3/buckets/datalake/{table_name}"
CW_URL = "https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logEventViewer:group=/aws-glue/jobs/error;stream={run_id}"
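Only the constants appear in this preview. A sketch of how they might be combined with boto3; the state check and the choice of URL are assumptions, and S3_URL is left out because the table name is not derivable here:
# Sketch: list recent runs of the Glue job and print a console link per run,
# using the CloudWatch error-log URL for failed runs and the Glue console
# URL otherwise. The decision logic is an assumption, not from the gist.
glue = boto3.client("glue", region_name=REGION)
for run in glue.get_job_runs(JobName=JOB_NAME)["JobRuns"]:
    started = run["StartedOn"].strftime(TIME_FORMAT)
    if run["JobRunState"] == "FAILED":
        url = CW_URL.format(region=REGION, run_id=run["Id"])
    else:
        url = GLUE_URL.format(region=REGION, job_name=JOB_NAME, run_id=run["Id"])
    print(f"{started} {run['JobRunState']}: {url}")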
@jitsejan
jitsejan / minio_spark.py
Created June 30, 2019 22:52
Reading and writing to MinIO from Spark
from pyspark import SparkContext, SparkConf, SQLContext
import os
os.environ['HADOOP_HOME'] = '/opt/hadoop/'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['LD_LIBRARY_PATH'] = '/opt/hadoop/lib/native'
os.environ['SPARK_DIST_CLASSPATH'] = "/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*"
os.environ['SPARK_HOME'] = '/opt/spark/'
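The preview stops at the environment setup. A sketch of the MinIO-specific wiring, pointing the s3a connector at a local MinIO server; the endpoint, keys, bucket and paths are placeholders:
# Sketch: direct the s3a filesystem at a local MinIO endpoint instead of AWS S3.
# Endpoint, credentials and bucket are placeholders, not taken from the gist.
conf = (
    SparkConf()
    .setAppName("minio-spark")
    .set("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
    .set("spark.hadoop.fs.s3a.access.key", "minio-access-key")
    .set("spark.hadoop.fs.s3a.secret.key", "minio-secret-key")
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("s3a://some-bucket/input/")  # placeholder path
df.write.mode("overwrite").parquet("s3a://some-bucket/output/")  # placeholder path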
@jitsejan
jitsejan / trigger_glue.py
Created August 9, 2019 10:25
Trigger AWS Glue job
import boto3
ENV = "dev"
ETL_GLUE_JOB = "my-glue-job"
REGION = "eu-west-1"
session = boto3.session.Session(profile_name=ENV)
glue = session.client('glue', REGION)
def trigger_glue(file_path):
    """ Trigger the Glue job for the given file path """
    # Assumed body (the preview is truncated here): passing the path as a
    # job argument is a guess at the original intent.
    return glue.start_job_run(
        JobName=ETL_GLUE_JOB,
        Arguments={"--file_path": file_path},
    )
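A call could then look like this (the path is a placeholder):
trigger_glue("s3://my-bucket/incoming/data.csv")  # placeholder path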
@jitsejan
jitsejan / export_gsheet_to_s3.py
Created August 13, 2019 11:44
Export a Google sheet to S3
# Note: make sure the Google Sheet is published to the web first
import boto3
import json
import pandas as pd
import pyarrow
import requests
GOOGLE_URL = "https://spreadsheets.google.com/feeds/list/{key}/{worksheet}/public/full?alt={format}"
GOOGLE_SHEET_ID = '1234567-abcedf'
GOOGLE_SHEET_TITLE = 'My title'
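The preview stops at the constants. A sketch of the export flow the imports suggest; the worksheet index, the old "gsx$" feed keys and the bucket are assumptions, and the feeds endpoint only works for sheets published to the web, as the note says:
# Sketch: fetch the published sheet as JSON, flatten the legacy "gsx$"-prefixed
# feed entries into rows, and write the result to S3 as Parquet (needs s3fs).
# Worksheet index, column handling and the bucket are assumptions.
response = requests.get(
    GOOGLE_URL.format(key=GOOGLE_SHEET_ID, worksheet=1, format="json")
)
entries = response.json()["feed"]["entry"]
rows = [
    {key[4:]: value["$t"] for key, value in entry.items() if key.startswith("gsx$")}
    for entry in entries
]
df = pd.DataFrame(rows)
df.to_parquet("s3://my-bucket/google-sheet.parquet")  # placeholder bucket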