This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Jupyter + PySpark base image with common build tooling and a UTF-8 locale.
FROM jupyter/pyspark-notebook

# Package installs below require root; the base image runs as jovyan.
USER root

# Add essential build/dev packages
RUN apt-get update && apt-get install -y build-essential curl git gnupg2 nano apt-transport-https software-properties-common

# Set locale to en_US.UTF-8
RUN apt-get update && apt-get install -y locales \
    && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
    && locale-gen

# Add config to Jupyter notebook (jovyan is the default notebook user)
COPY jupyter/jupyter_notebook_config.py /home/jovyan/.jupyter/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" main.py """ | |
from argparse import ArgumentParser | |
from lxml import etree as ET | |
import getpass | |
import glob | |
import os | |
import re | |
import requests | |
from requests.auth import HTTPBasicAuth | |
from requests.packages.urllib3.exceptions import InsecureRequestWarning |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

from azure.cosmosdb.table.tableservice import TableService

# NOTE(review): placeholder values -- replace with real connection
# string / table name (ideally from environment/config, not source).
CONNECTION_STRING = "DUMMYSTRING"
SOURCE_TABLE = "DUMMYTABLE"


def set_table_service():
    """Return an Azure Table Storage service bound to CONNECTION_STRING."""
    return TableService(connection_string=CONNECTION_STRING)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
from io import StringIO | |
# NOTE(review): the trailing "| |" tokens on these lines are scraping
# artifacts from a rendered page table, not part of the original source.
# NOTE(review): this fragment is cut off by the 10-line gist preview --
# the step that actually uploads csv_buffer to S3 (presumably via the
# boto3 client imported above) is not visible here; do not treat this
# function as complete.
def _write_dataframe_to_csv_on_s3(dataframe, filename): | |
""" Write a dataframe to a CSV on S3 """ | |
print("Writing {} records to {}".format(len(dataframe), filename)) | |
# Create buffer | |
csv_buffer = StringIO() | |
# Write dataframe to buffer (pipe-delimited, no index column) | |
dataframe.to_csv(csv_buffer, sep="|", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: make sure `s3fs` is installed in order to make Pandas use S3.
# Credentials for AWS in the normal location ~/.aws/credentials
def _write_dataframe_to_parquet_on_s3(dataframe, filename):
    """Write a dataframe as a Parquet file to S3.

    Args:
        dataframe: pandas DataFrame to persist.
        filename: logical dataset name, used as the key prefix under the
            destination bucket.
    """
    print("Writing {} records to {}".format(len(dataframe), filename))
    # DESTINATION (the bucket name) is expected to be defined at module
    # level elsewhere in this file -- TODO confirm.
    # The scraped copy had a literal "(unknown)" placeholder here; the
    # otherwise-unused `filename` parameter is the intended path segment.
    output_file = f"s3://{DESTINATION}/{filename}/data.parquet"
    # pandas delegates the S3 write to s3fs/pyarrow.
    dataframe.to_parquet(output_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# NOTE(review): trailing "| |" tokens are page-scraping artifacts, and the
# `for` loop at the bottom is cut off by the 10-line gist preview
# (no do/done body visible) -- this fragment is incomplete.
export PKG_DIR="python" | |
export PY_VERSION="python3.7" | |
printf "\033[1;33m[1/3] Creating packages for Lambda \033[0m\n" | |
# Resolve the directory containing this script, regardless of the CWD
# it is invoked from.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" | |
LAMBDA_DIR="sources/lambda-functions" | |
FULL_DIR=${SCRIPT_DIR}/${LAMBDA_DIR} | |
printf "\033[1;35m> Checking for Lambda functions in ${FULL_DIR} \033[0m\n" | |
# Iterate over each Lambda function folder (loop body not visible here).
for fldr in ${FULL_DIR}/*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3
import datetime

# Glue job / console configuration for run-status reporting.
JOB_NAME = 'mi-glue-job-run-queries-dev'
REGION = 'eu-west-1'
TIME_FORMAT = '%y-%m-%d %H:%M'

# Deep-link templates into the AWS console; placeholders are filled with
# str.format(region=..., job_name=..., run_id=..., table_name=...).
GLUE_URL = "https://{region}.console.aws.amazon.com/glue/home?region={region}#jobRun:jobName={job_name};jobRunId={run_id}"
S3_URL = "https://s3.console.aws.amazon.com/s3/buckets/datalake/{table_name}"
CW_URL = "https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#logEventViewer:group=/aws-glue/jobs/error;stream={run_id}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkContext, SparkConf, SQLContext
import os

# Point PySpark at the local Hadoop/Spark/JVM installation. These must be
# set before a SparkContext is created (the JVM reads them at launch).
os.environ['HADOOP_HOME'] = '/opt/hadoop/'
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['LD_LIBRARY_PATH'] = '/opt/hadoop/lib/native'
# Full Hadoop classpath (equivalent of `hadoop classpath` output).
os.environ['SPARK_DIST_CLASSPATH'] = "/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*"
os.environ['SPARK_HOME'] = '/opt/spark/'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3

# Environment / job configuration for triggering the Glue ETL job.
ENV = "dev"
ETL_GLUE_JOB = "my-glue-job"
REGION = "eu-west-1"

# Module-level session and client, created once at import time and
# shared by the functions below (uses the named AWS profile matching ENV).
session = boto3.session.Session(profile_name=ENV)
glue = session.client('glue', REGION)
def trigger_glue(file_path): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: make sure the Google Sheet is published to the web first
import json

import boto3
import pandas as pd
import pyarrow
import requests

# Public Google Sheets list feed (legacy gdata API); placeholders are
# filled with str.format(key=..., worksheet=..., format=...).
GOOGLE_URL = "https://spreadsheets.google.com/feeds/list/{key}/{worksheet}/public/full?alt={format}"
# NOTE(review): dummy/sample identifiers -- replace for real use.
GOOGLE_SHEET_ID = '1234567-abcedf'
GOOGLE_SHEET_TITLE = 'My title'
Newer | Older