jitsejan / trigger_glue.py
Created August 9, 2019 10:25
Trigger AWS Glue job
import boto3
ENV = "dev"
ETL_GLUE_JOB = "my-glue-job"
REGION = "eu-west-1"
session = boto3.session.Session(profile_name=ENV)
glue = session.client('glue', REGION)
def trigger_glue(file_path):
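    # The gist preview is truncated here. A minimal sketch of the body,
    # assuming the job receives the file path as a Glue job argument
    # (the "--file_path" argument name is an assumption, not from the gist).
    response = glue.start_job_run(
        JobName=ETL_GLUE_JOB,
        Arguments={"--file_path": file_path},
    )
    # start_job_run returns the identifier of the newly started run
    return response["JobRunId"]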
jitsejan / doughnut_plot.py
Created July 7, 2019 13:23
Create a doughnut plot with Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
plt.rcParams["figure.figsize"] = (12,12)
# Improve style of the plots
style.use('seaborn-deep')
style.use('ggplot')
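The preview stops after the styling. A minimal sketch of the doughnut itself, using placeholder categories and values; the hole comes from giving the wedges a width smaller than the pie radius:
labels = ["A", "B", "C", "D"]   # placeholder categories
sizes = [35, 30, 20, 15]        # placeholder values
fig, ax = plt.subplots()
ax.pie(
    sizes,
    labels=labels,
    autopct="%1.0f%%",
    startangle=90,
    wedgeprops={"width": 0.4},  # width < 1 hollows out the centre of the pie
)
ax.set_title("Doughnut plot")
plt.show()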
jitsejan / minio_spark.py
Created June 30, 2019 22:52
Reading and writing to MinIO from Spark
from pyspark import SparkContext, SparkConf, SQLContext
import os
os.environ['HADOOP_HOME'] = '/opt/hadoop/'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['LD_LIBRARY_PATH'] = '/opt/hadoop/lib/native'
os.environ['SPARK_DIST_CLASSPATH'] = "/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*"
os.environ['SPARK_HOME'] = '/opt/spark/'
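The preview only shows the environment setup. A minimal sketch of the Spark side, assuming MinIO listens on localhost:9000 with placeholder credentials, a bucket called demo-bucket, and the hadoop-aws jars on the classpath; the fs.s3a settings are the standard Hadoop S3A options:
conf = SparkConf().setAppName("minio-example")
conf.set("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")  # assumed MinIO endpoint
conf.set("spark.hadoop.fs.s3a.access.key", "minio-access-key")     # placeholder credentials
conf.set("spark.hadoop.fs.s3a.secret.key", "minio-secret-key")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")          # MinIO expects path-style URLs
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Write a small dataframe to MinIO and read it back
df = sqlContext.createDataFrame([("a", 1), ("b", 2)], ["letter", "number"])
df.write.mode("overwrite").parquet("s3a://demo-bucket/example")
sqlContext.read.parquet("s3a://demo-bucket/example").show()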
jitsejan / check_glue_jobs.py
Created June 10, 2019 11:09
Check the running AWS Glue jobs and print a link to either the logs or S3.
import boto3
import datetime
JOB_NAME = 'mi-glue-job-run-queries-dev'
REGION = 'eu-west-1'
TIME_FORMAT = '%y-%m-%d %H:%M'
GLUE_URL = "https://{region}.console.aws.amazon.com/glue/home?region={region}#jobRun:jobName={job_name};jobRunId={run_id}"
S3_URL = "https://s3.console.aws.amazon.com/s3/buckets/datalake/{table_name}"
CW_URL = "https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logEventViewer:group=/aws-glue/jobs/error;stream={run_id}"
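The preview ends at the URL templates. A minimal sketch of the check itself, assuming failed runs link to the CloudWatch error log, succeeded runs to the data on S3 (with the table named after the job, which is a guess), and anything else to the Glue console:
glue = boto3.client("glue", region_name=REGION)

for run in glue.get_job_runs(JobName=JOB_NAME)["JobRuns"]:
    state = run["JobRunState"]
    started = run["StartedOn"].strftime(TIME_FORMAT)
    if state == "FAILED":
        link = CW_URL.format(region=REGION, run_id=run["Id"])
    elif state == "SUCCEEDED":
        link = S3_URL.format(table_name=JOB_NAME)  # assumed: table named after the job
    else:
        link = GLUE_URL.format(region=REGION, job_name=JOB_NAME, run_id=run["Id"])
    print(f"{started} [{state}] {link}")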
jitsejan / deploy.sh
Created May 13, 2019 13:29
Terraform, AWS Lambda and Looker
#!/usr/bin/env bash
export PKG_DIR="python"
export PY_VERSION="python3.7"
printf "\033[1;33m[1/3] Creating packages for Lambda \033[0m\n"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
LAMBDA_DIR="sources/lambda-functions"
FULL_DIR=${SCRIPT_DIR}/${LAMBDA_DIR}
printf "\033[1;35m> Checking for Lambda functions in ${FULL_DIR} \033[0m\n"
for fldr in ${FULL_DIR}/*
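do
    # The gist preview is truncated at the for-line. A minimal sketch of the
    # loop body, assuming each folder holds a Lambda function whose Python
    # dependencies get installed into ${PKG_DIR} and zipped for Terraform to
    # pick up; the exact steps are an assumption, not from the gist.
    printf "\033[1;35m> Packaging ${fldr##*/} \033[0m\n"
    pushd "${fldr}" > /dev/null
    rm -rf "${PKG_DIR}" && mkdir -p "${PKG_DIR}"
    ${PY_VERSION} -m pip install --quiet -r requirements.txt -t "${PKG_DIR}"
    zip -r -q lambda.zip "${PKG_DIR}"
    popd > /dev/null
done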
jitsejan / blobspark.py
Created February 26, 2019 12:38
Interacting with Blob storage (no Spark writing)
from azure.storage.blob import BlockBlobService
import pandas as pd
import pyarrow.parquet as pq
from io import BytesIO
from configparser import RawConfigParser
from pyspark import SparkConf, SparkContext, SQLContext
CONTAINER_NAME = "userjj"
BLOB_NAME = "characters.parquet"
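The preview stops at the constants. A minimal sketch of the read path, assuming the storage account name and key sit in a local config file (the file name, section and keys are placeholders):
config = RawConfigParser()
config.read("config.cfg")  # assumed credentials file
blob_service = BlockBlobService(
    account_name=config.get("blob", "account_name"),  # placeholder section/keys
    account_key=config.get("blob", "account_key"),
)
# Download the Parquet blob into memory and load it with pyarrow/pandas
blob = blob_service.get_blob_to_bytes(CONTAINER_NAME, BLOB_NAME)
dataframe = pq.read_table(BytesIO(blob.content)).to_pandas()
print(dataframe.head())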
jitsejan / _write_dataframe_to_parquet_on_s3.py
Created October 5, 2018 13:35
Write a Pandas dataframe to Parquet format on AWS S3.
# Note: make sure `s3fs` is installed in order to make Pandas use S3.
# Credentials for AWS in the normal location ~/.aws/credentials
def _write_dataframe_to_parquet_on_s3(dataframe, filename):
""" Write a dataframe to a Parquet on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
output_file = f"s3://{DESTINATION}/{filename}/data.parquet"
dataframe.to_parquet(output_file)
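The preview does not show the DESTINATION constant the function relies on. A minimal usage sketch, assuming it is simply the name of the target S3 bucket:
import pandas as pd

DESTINATION = "my-datalake-bucket"  # assumed: name of the target S3 bucket

df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})  # placeholder data
_write_dataframe_to_parquet_on_s3(df, "characters")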
jitsejan / _write_dataframe_to_csv_on_s3.py
Last active February 13, 2024 12:53
Write a Pandas dataframe to CSV format on AWS S3.
import boto3
from io import StringIO
def _write_dataframe_to_csv_on_s3(dataframe, filename):
""" Write a dataframe to a CSV on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
# Create buffer
csv_buffer = StringIO()
# Write dataframe to buffer
dataframe.to_csv(csv_buffer, sep="|", index=False)
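    # The preview is truncated here. A minimal sketch of the upload step,
    # assuming DESTINATION holds the target bucket name (it is not shown above).
    s3 = boto3.resource("s3")
    s3.Object(DESTINATION, filename).put(Body=csv_buffer.getvalue())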
jitsejan / main.py
Last active February 13, 2024 12:54
Azure Table Storage to a Pandas DataFrame
import pandas as pd
from azure.cosmosdb.table.tableservice import TableService
CONNECTION_STRING = "DUMMYSTRING"
SOURCE_TABLE = "DUMMYTABLE"
def set_table_service():
""" Set the Azure Table Storage service """
return TableService(connection_string=CONNECTION_STRING)
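The preview stops after the service helper. A minimal sketch of the read side, assuming the table is small enough to pull in one go with query_entities and load straight into a dataframe (the helper name is made up):
def get_dataframe_from_table_storage():
    """ Read SOURCE_TABLE into a Pandas dataframe """
    table_service = set_table_service()
    entities = table_service.query_entities(SOURCE_TABLE)
    return pd.DataFrame([dict(entity) for entity in entities])

df = get_dataframe_from_table_storage()
print(df.head())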
version: '3.2'
services:
  scraper:
    build:
      context: .
      dockerfile: ./Dockerfile
    command: scrapy crawl insolvencydirect
    environment:
      - HUB_PORT_4444_TCP_ADDR=hub
      - HUB_PORT_4444_TCP_PORT=4444