Jitse-Jan jitsejan

@jitsejan
jitsejan / export_gsheet_to_s3.py
Created Aug 13, 2019
Export a Google sheet to S3
# Note: make sure the Google Sheet is published to the web first
import boto3
import json
import pandas as pd
import pyarrow
import requests
GOOGLE_URL = "https://spreadsheets.google.com/feeds/list/{key}/{worksheet}/public/full?alt={format}"
GOOGLE_SHEET_ID = '1234567-abcedf'
GOOGLE_SHEET_TITLE = 'My title'
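The preview stops at the constants. A minimal sketch of how the export might continue, reusing GOOGLE_URL, GOOGLE_SHEET_ID and GOOGLE_SHEET_TITLE from above and assuming the sheet is published to the web, that the (now deprecated) list feed still returns JSON with gsx$-prefixed column keys, and that the bucket name is a placeholder:

import io
import boto3
import pandas as pd
import requests

BUCKET = "my-data-bucket"  # placeholder, not part of the gist

def read_public_sheet(key, worksheet="od6"):
    """Read a published Google Sheet via the list feed ("od6" historically pointed at the first worksheet)."""
    url = GOOGLE_URL.format(key=key, worksheet=worksheet, format="json")
    entries = requests.get(url).json()["feed"]["entry"]
    # Column values live under keys prefixed with "gsx$"
    rows = [{k.replace("gsx$", ""): v["$t"]
             for k, v in entry.items() if k.startswith("gsx$")}
            for entry in entries]
    return pd.DataFrame(rows)

def write_parquet_to_s3(dataframe, bucket, key):
    """Serialise the dataframe to Parquet in memory (via pyarrow) and upload it with boto3."""
    buffer = io.BytesIO()
    dataframe.to_parquet(buffer)
    boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=buffer.getvalue())

dataframe = read_public_sheet(GOOGLE_SHEET_ID)
write_parquet_to_s3(dataframe, BUCKET, f"{GOOGLE_SHEET_TITLE}.parquet")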
@jitsejan
jitsejan / trigger_lambda.py
Created Aug 9, 2019
Trigger AWS Lambda function
import boto3
import json
REGION = "eu-west-1"
ENV = "dev"
FUNCTION_NAME = 'lambda-function-name'
session = boto3.session.Session(profile_name=ENV)
lambda_client = session.client('lambda', REGION)
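The preview ends after the client is created. A minimal sketch of the actual invocation, assuming a synchronous call with an illustrative payload:

payload = {"key": "value"}  # illustrative payload, not part of the gist
response = lambda_client.invoke(
    FunctionName=FUNCTION_NAME,
    InvocationType="RequestResponse",  # use "Event" for an asynchronous trigger
    Payload=json.dumps(payload),
)
print(json.loads(response["Payload"].read()))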
@jitsejan
jitsejan / trigger_glue.py
Created Aug 9, 2019
Trigger AWS Glue job
import boto3
ENV = "dev"
ETL_GLUE_JOB = "my-glue-job"
REGION = "eu-west-1"
session = boto3.session.Session(profile_name=ENV)
glue = session.client('glue', REGION)
def trigger_glue(file_path):
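The function body is cut off in the preview. A minimal sketch of how it might continue, passing the file path to the job as a job argument (the argument name --file_path is an assumption):

    """Start the Glue job for the given file and return the run id."""
    response = glue.start_job_run(
        JobName=ETL_GLUE_JOB,
        Arguments={"--file_path": file_path},  # argument name is an assumption
    )
    return response["JobRunId"]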
@jitsejan
jitsejan / doughnut_plot.py
Created Jul 7, 2019
Create a doughnut plot with Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
plt.rcParams["figure.figsize"] = (12,12)
# Improve style of the plots
style.use('seaborn-deep')
style.use('ggplot')
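The preview stops after the style settings. A minimal sketch of the doughnut itself with made-up data: a doughnut is simply a pie chart whose wedges only cover part of the radius, which Matplotlib exposes through the wedge width.

labels = ["A", "B", "C", "D"]   # example data, not part of the gist
sizes = [15, 30, 45, 10]

fig, ax = plt.subplots()        # picks up the 12x12 figsize set above
ax.pie(sizes, labels=labels, autopct="%1.1f%%",
       wedgeprops={"width": 0.4})  # width < 1 turns the pie into a doughnut
ax.set_title("Doughnut plot")
plt.show()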
@jitsejan
jitsejan / minio_spark.py
Created Jun 30, 2019
Reading and writing to MinIO from Spark
from pyspark import SparkContext, SparkConf, SQLContext
import os
os.environ['HADOOP_HOME'] = '/opt/hadoop/'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['LD_LIBRARY_PATH'] = '/opt/hadoop/lib/native'
os.environ['SPARK_DIST_CLASSPATH'] = "/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*"
os.environ['SPARK_HOME'] = '/opt/spark/'
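The preview only shows the environment setup. A minimal sketch of the Spark side, using the newer SparkSession API instead of the SQLContext imported above; the MinIO endpoint, credentials, bucket and paths are placeholders:

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("minio-example")
    .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")  # local MinIO server (placeholder)
    .config("spark.hadoop.fs.s3a.access.key", "minio-access-key")     # placeholder
    .config("spark.hadoop.fs.s3a.secret.key", "minio-secret-key")     # placeholder
    .config("spark.hadoop.fs.s3a.path.style.access", "true")          # MinIO requires path-style access
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# Read from and write back to MinIO through the s3a:// scheme
df = spark.read.json("s3a://my-bucket/input/")
df.write.mode("overwrite").parquet("s3a://my-bucket/output/")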
@jitsejan
jitsejan / check_glue_jobs.py
Created Jun 10, 2019
Check the running AWS Glue jobs and print a link to either the logs or S3.
import boto3
import datetime
JOB_NAME = 'mi-glue-job-run-queries-dev'
REGION = 'eu-west-1'
TIME_FORMAT = '%y-%m-%d %H:%M'
GLUE_URL = "https://{region}.console.aws.amazon.com/glue/home?region={region}#jobRun:jobName={job_name};jobRunId={run_id}"
S3_URL = "https://s3.console.aws.amazon.com/s3/buckets/datalake/{table_name}"
CW_URL = "https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logEventViewer:group=/aws-glue/jobs/error;stream={run_id}"
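The preview stops at the URL templates. A minimal sketch of how the check might look, assuming running jobs link to the Glue console, successful runs to the output data in S3 (the table name is a placeholder), and everything else to the CloudWatch error log:

glue = boto3.client("glue", region_name=REGION)

for run in glue.get_job_runs(JobName=JOB_NAME)["JobRuns"]:
    started = run["StartedOn"].strftime(TIME_FORMAT)
    if run["JobRunState"] == "RUNNING":
        # Link to the run in the Glue console
        print(started, GLUE_URL.format(region=REGION, job_name=JOB_NAME, run_id=run["Id"]))
    elif run["JobRunState"] == "SUCCEEDED":
        # Link to the job output in S3 (table name is a placeholder)
        print(started, S3_URL.format(table_name="my_table"))
    else:
        # Link to the CloudWatch error log stream for this run
        print(started, CW_URL.format(region=REGION, run_id=run["Id"]))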
@jitsejan
jitsejan / deploy.sh
Created May 13, 2019
Terraform, AWS Lambda and Looker
#!/usr/bin/env bash
export PKG_DIR="python"
export PY_VERSION="python3.7"
printf "\033[1;33m[1/3] Creating packages for Lambda \033[0m\n"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
LAMBDA_DIR="sources/lambda-functions"
FULL_DIR=${SCRIPT_DIR}/${LAMBDA_DIR}
printf "\033[1;35m> Checking for Lambda functions in ${FULL_DIR} \033[0m\n"
for fldr in ${FULL_DIR}/*
@jitsejan
jitsejan / blobspark.py
Created Feb 26, 2019
Interacting with Blob storage (no Spark writing)
from azure.storage.blob import BlockBlobService
import pandas as pd
import pyarrow.parquet as pq
from io import BytesIO
from configparser import RawConfigParser
from pyspark import SparkConf, SparkContext, SQLContext
CONTAINER_NAME = "userjj"
BLOB_NAME = "characters.parquet"
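The preview ends at the constants. A minimal sketch of reading the Parquet blob into pandas, continuing from the imports above and assuming the storage credentials live in a local config file whose name, section and keys are placeholders:

# Config file name, section and key names are assumptions
config = RawConfigParser()
config.read("config.cfg")
blob_service = BlockBlobService(
    account_name=config.get("blob", "account_name"),
    account_key=config.get("blob", "account_key"),
)

# Download the Parquet blob into memory and convert it to a pandas dataframe
blob = blob_service.get_blob_to_bytes(CONTAINER_NAME, BLOB_NAME)
data_frame = pq.read_table(BytesIO(blob.content)).to_pandas()
print(data_frame.head())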
@jitsejan
jitsejan / _write_dataframe_to_parquet_on_s3.py
Created Oct 5, 2018
Write a Pandas dataframe to Parquet format on AWS S3.
# Note: make sure `s3fs` is installed in order to make Pandas use S3.
# Credentials for AWS in the normal location ~/.aws/credentials
def _write_dataframe_to_parquet_on_s3(dataframe, filename):
""" Write a dataframe to a Parquet on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
output_file = f"s3://{DESTINATION}/{filename}/data.parquet"
dataframe.to_parquet(output_file)
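DESTINATION is not defined in the preview; a usage sketch with a placeholder bucket, assuming s3fs is installed so pandas can write straight to S3:

import pandas as pd

DESTINATION = "my-data-bucket"  # placeholder, not part of the gist

df = pd.DataFrame({"name": ["Leia", "Luke"], "side": ["light", "light"]})
_write_dataframe_to_parquet_on_s3(df, "characters")
# -> writes s3://my-data-bucket/characters/data.parquet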
@jitsejan
jitsejan / _write_dataframe_to_csv_on_s3.py
Last active Mar 1, 2019
Write a Pandas dataframe to CSV format on AWS S3.
import boto3
from io import StringIO
def _write_dataframe_to_csv_on_s3(dataframe, filename):
""" Write a dataframe to a CSV on S3 """
print("Writing {} records to {}".format(len(dataframe), filename))
# Create buffer
csv_buffer = StringIO()
# Write dataframe to buffer
dataframe.to_csv(csv_buffer, sep="|", index=False)
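The preview stops after the buffer is filled. A sketch of how the upload step might finish, assuming DESTINATION holds the target bucket name (a placeholder that is not shown in the preview):

    # Upload the buffer contents to S3
    s3_resource = boto3.resource("s3")
    s3_resource.Object(DESTINATION, f"{filename}/data.csv").put(Body=csv_buffer.getvalue())

Called as _write_dataframe_to_csv_on_s3(df, "characters"), this would write the pipe-separated file to s3://<DESTINATION>/characters/data.csv.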