Vincent Claes vincentclaes

@vincentclaes
vincentclaes / scala-spark-datetype-to-timestamptype
Last active May 18, 2018 13:21
convert DateType to TimestampType because a DataFrame read from Parquet cannot publish DateType columns to a Hive table (see HIVE-6384)
import org.apache.spark.sql.types.{DateType, TimestampType}
import org.apache.spark.sql.DataFrame

/**
 * Convert DateType columns to TimestampType because a DataFrame read from Parquet
 * cannot publish DateType columns to a Hive table (HIVE-6384).
 * https://stackoverflow.com/questions/37357009/cloudera-5-6-parquet-does-not-support-date-see-hive-6384
 * @param df spark dataframe
 * @return spark dataframe with every DateType column cast to TimestampType
 */
def convertDateToTimestamp(df: DataFrame): DataFrame = {
  val convertedDf = df.columns.foldLeft(df) { (memoDf, colName) =>
    memoDf.schema(colName).dataType match {
      // only DateType columns get cast; everything else passes through untouched
      case DateType => memoDf.withColumn(colName, memoDf.col(colName).cast(TimestampType))
      case _        => memoDf
    }
  }
  convertedDf
}
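For reference, a rough PySpark equivalent of the same conversion (not from the gist, just a sketch of the same idea):

from pyspark.sql.types import DateType, TimestampType

def convert_date_to_timestamp(df):
    # cast every DateType column to TimestampType so the DataFrame
    # can be written to a Hive table (HIVE-6384)
    for field in df.schema.fields:
        if isinstance(field.dataType, DateType):
            df = df.withColumn(field.name, df[field.name].cast(TimestampType()))
    return df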
def handler(event, context):
    """
    extract the contents from the emails and
    put the resulting object back to s3.
    """
    # get the source key and bucket from
    # the event that triggered this lambda function
    source_key = event['Records'][0]['s3']['object']['key']
    source_bucket = event['Records'][0]['s3']['bucket']['name']
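    # --- sketch continuation, not the gist's actual code: read the raw email,
    # extract its contents and write the result back to s3. Assumes boto3 and
    # json are imported at module level; extract_contents and the "extracted/"
    # prefix are hypothetical.
    s3 = boto3.client("s3")
    raw_email = s3.get_object(Bucket=source_bucket, Key=source_key)["Body"].read()
    contents = extract_contents(raw_email)
    s3.put_object(
        Bucket=source_bucket,
        Key="extracted/" + source_key,
        Body=json.dumps(contents),
    )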
import unittest
import os
from serverless_data_pipeline.lambda_function import extract
from moto import mock_s3
import boto3
import json
@mock_s3
class TestExtract(unittest.TestCase):
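    # --- sketch of a possible test body, not the gist's actual code: the
    # class-level @mock_s3 decorator gives every test method an in-memory S3,
    # so the bucket below only exists inside the test. Bucket, key and
    # payload are assumptions.
    def test_extract_puts_result_back_to_s3(self):
        s3 = boto3.client("s3", region_name="eu-west-1")
        s3.create_bucket(
            Bucket="emails",
            CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
        )
        s3.put_object(Bucket="emails", Key="raw/mail.eml", Body=b"raw email body")
        event = {"Records": [{"s3": {"bucket": {"name": "emails"},
                                     "object": {"key": "raw/mail.eml"}}}]}
        extract.handler(event, None)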
import os
import signal
import subprocess
import unittest
from unittest import mock
import boto3
from pyspark.sql import SparkSession
import glue_job
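Only the imports survive this preview; a minimal sketch of how a test around glue_job might be wired up with a local SparkSession (the transform function and its behaviour are assumptions, not the gist's code):

class TestGlueJob(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # a small local SparkSession stands in for the Glue runtime
        cls.spark = SparkSession.builder.master("local[1]").appName("glue-job-test").getOrCreate()

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()

    def test_transform_keeps_rows(self):
        df = self.spark.createDataFrame([("a", 1)], ["key", "value"])
        result = glue_job.transform(df)  # hypothetical function in glue_job
        self.assertEqual(result.count(), 1)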
@vincentclaes
vincentclaes / build-and-push-docker-image-to-aws-ecr.sh
Created January 23, 2020 18:43
script to build a docker image and push it to AWS ECR
# The name of our algorithm
export AWS_PROFILE=   # your AWS profile
algorithm_name=       # name of your algorithm
region=               # aws region e.g. eu-central-1
account=$(aws sts get-caller-identity --query Account --output text)
fullname="${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest"
# log in to ECR, then build, tag and push the image
aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin "${account}.dkr.ecr.${region}.amazonaws.com"
docker build -t ${algorithm_name} .
docker tag ${algorithm_name} ${fullname}
docker push ${fullname}
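Not part of the gist, but a quick boto3 check that the push landed, assuming the ECR repository is named after algorithm_name:

import boto3

ecr = boto3.client("ecr", region_name="eu-central-1")
# raises ImageNotFoundException when the tag is missing from the repository
response = ecr.describe_images(
    repositoryName="my-algorithm",  # fill in your algorithm_name
    imageIds=[{"imageTag": "latest"}],
)
print(response["imageDetails"][0]["imagePushedAt"])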
@vincentclaes
vincentclaes / secure-serverless-s3-deployment-user.json
Created January 24, 2020 08:25
prevent the serverless deployment user from reading data from your s3 buckets
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor2",
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": [
                "arn:aws:s3:::*-serverlessdeploymentbuck*"
            ]
        }
    ]
}
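The preview only shows the Allow on the serverless deployment bucket; IAM's policy simulator can confirm the user gets no read access elsewhere. A sketch, assuming the policy above is saved as policy.json (the data-bucket ARN is hypothetical):

import boto3

iam = boto3.client("iam")
with open("policy.json") as f:
    policy = f.read()

response = iam.simulate_custom_policy(
    PolicyInputList=[policy],
    ActionNames=["s3:GetObject"],
    ResourceArns=["arn:aws:s3:::my-data-bucket/some/key"],
)
for result in response["EvaluationResults"]:
    # anything other than "allowed" means the read is blocked
    print(result["EvalActionName"], result["EvalDecision"])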
@vincentclaes
vincentclaes / moto_mock_s3_example.py
Last active October 21, 2020 16:50
example of how to use moto
import boto3
import json
import os
import unittest
from moto import mock_s3
from my_project.lambda_functions import lambda_handler
class TestLambda(unittest.TestCase):

    @mock_s3
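    # --- sketch continuation, not the gist's actual code: @mock_s3 above
    # mocks S3 for this method; bucket, key and event shape are assumptions.
    def test_lambda_handler(self):
        s3 = boto3.client("s3", region_name="eu-central-1")
        s3.create_bucket(
            Bucket="my-bucket",
            CreateBucketConfiguration={"LocationConstraint": "eu-central-1"},
        )
        s3.put_object(Bucket="my-bucket", Key="input.json", Body=json.dumps({"foo": "bar"}))
        event = {"bucket": "my-bucket", "key": "input.json"}
        lambda_handler(event, None)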
@vincentclaes
vincentclaes / serverless.yml
Created March 6, 2020 19:41
example of how to configure a lambda with an s3 event
service: my-pipeline

provider:
  name: aws
  region: eu-central-1
  runtime: python3.7

functions:
  extract:
    handler: lambda_function/extract.handler
    events:
      - s3:
          bucket: my-pipeline-bucket # assumed bucket name; the preview cuts off here
          event: s3:ObjectCreated:*
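To exercise the handler locally before deploying, a stub of the S3 event this trigger would deliver (bucket and key are placeholders):

from lambda_function import extract

event = {
    "Records": [
        {"s3": {"bucket": {"name": "my-pipeline-bucket"},
                "object": {"key": "raw/mail.eml"}}}
    ]
}
extract.handler(event, None)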
# remove all local branches except for the default branch; pick the line
# matching your default branch name (running both would delete master and main)
git branch | grep -v "master" | xargs git branch -d
git branch | grep -v "main" | xargs git branch -d
# fetch and prune: delete remote-tracking refs that no longer exist on the remote
git fetch -pv
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"cloudformation:CreateUploadBucket",
"cloudformation:CancelUpdateStack",
"cloudformation:CreateStack",
"cloudformation:DeleteStack",