@justinnaldzin
justinnaldzin / list_s3_objects.py
Last active June 10, 2020 04:39
Listing objects, keys, versions, and delete markers in an S3 bucket.
import boto3
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.
    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with this prefix (optional).
    :param suffix: Only fetch objects whose keys end with this suffix (optional).
    """
    # Paginate through list_objects_v2 so buckets with more than 1,000 keys are fully listed.
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(suffix):
                yield obj
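The preview stops inside the docstring. Since the gist's description also covers versions and delete markers, here is a minimal sketch of listing those with boto3's list_object_versions paginator; the helper name get_matching_s3_versions is illustrative, not from the gist.

import boto3

def get_matching_s3_versions(bucket, prefix=""):
    """Yield every object version and delete marker under a prefix."""
    paginator = boto3.client("s3").get_paginator("list_object_versions")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # "Versions" holds object versions, "DeleteMarkers" holds delete markers;
        # either key may be missing from a given page.
        for version in page.get("Versions", []):
            yield version
        for marker in page.get("DeleteMarkers", []):
            yield marker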
@justinnaldzin
justinnaldzin / aws_cloudwatch_logs_delete.sh
Last active May 4, 2020 21:08
Delete Logs from Amazon CloudWatch
LOG_GROUP='my/log/group'
AWS_PROFILE='my-profile'
# Delete all log streams within a log group
aws logs describe-log-streams --profile $AWS_PROFILE --log-group-name $LOG_GROUP --query 'logStreams[*].logStreamName' --output table | awk '{print $2}' | grep -v ^$ | while read x; do aws logs delete-log-stream --profile $AWS_PROFILE --log-group-name $LOG_GROUP --log-stream-name $x; done
# Delete log streams starting with string
aws logs describe-log-streams --profile $AWS_PROFILE --log-group-name $LOG_GROUP --query 'logStreams[?starts_with(logStreamName,`2020/04/3`)].logStreamName' --output table | awk '{print $2}' | grep -v ^$ | while read x; do aws logs delete-log-stream --profile $AWS_PROFILE --log-group-name $LOG_GROUP --log-stream-name $x; done
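The same cleanup can be done from Python with boto3 instead of the CLI; a minimal sketch using the placeholders above.

import boto3

LOG_GROUP = 'my/log/group'
AWS_PROFILE = 'my-profile'

session = boto3.Session(profile_name=AWS_PROFILE)
logs = session.client("logs")

# Delete every log stream in the log group, paginating through describe_log_streams.
paginator = logs.get_paginator("describe_log_streams")
for page in paginator.paginate(logGroupName=LOG_GROUP):
    for stream in page["logStreams"]:
        logs.delete_log_stream(logGroupName=LOG_GROUP, logStreamName=stream["logStreamName"])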
@justinnaldzin
justinnaldzin / process_logging.py
Created June 24, 2019 19:20
Python logging to both standard out and file
import os
import sys
import logging
FORMATTER = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(message)s')
LOG_FOLDER = 'log'
def get_console_handler():
    # Stream log records to standard out using the shared formatter.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(FORMATTER)
    return console_handler
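The preview ends at the console handler. A sketch of how a file handler and a combined logger could look, assuming the gist follows the usual two-handler pattern; the handler names and log file name here are illustrative.

def get_file_handler(log_file=os.path.join(LOG_FOLDER, 'process.log')):
    # Write log records to a file inside LOG_FOLDER, creating the folder if needed.
    os.makedirs(LOG_FOLDER, exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(FORMATTER)
    return file_handler

def get_logger(logger_name):
    # Attach both handlers so records go to standard out and to the log file.
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(get_console_handler())
    logger.addHandler(get_file_handler())
    return logger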
@justinnaldzin
justinnaldzin / bigquery_delete_multiple_tables.sh
Created February 26, 2019 23:24
Delete all tables matching a grep pattern in a BigQuery dataset
# Delete all tables matching a grep pattern in a BigQuery dataset
DATASET=my_dataset
TABLE_PATTERN=my_table_
# Confirm the table names before deleting
for TABLE in `bq ls --max_results=10000 $DATASET | grep TABLE | grep $TABLE_PATTERN | awk '{print $1}'`; do echo $TABLE; done
# Delete the tables; USE WITH CAUTION!
for TABLE in `bq ls --max_results=10000 $DATASET | grep TABLE | grep $TABLE_PATTERN | awk '{print $1}'`; do echo $TABLE; bq rm -f -t $DATASET.$TABLE; done
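The same pattern in Python with the google-cloud-bigquery client, as an alternative sketch to the bq one-liners; DATASET and TABLE_PATTERN mirror the placeholders above.

from google.cloud import bigquery

DATASET = "my_dataset"
TABLE_PATTERN = "my_table_"

client = bigquery.Client()
for table in client.list_tables(DATASET):
    if TABLE_PATTERN in table.table_id:
        print(f"Deleting {DATASET}.{table.table_id}")
        # USE WITH CAUTION: this permanently drops the table.
        client.delete_table(table.reference)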
@justinnaldzin
justinnaldzin / gcp_kms_encrypt_decrypt.py
Created February 15, 2019 21:05
GCP Cloud KMS encrypting and decrypting data
from google.cloud import kms_v1
def encrypt(project_id, location_id, key_ring_id, crypto_key_id, plaintext):
    """Encrypts input plaintext data using the provided symmetric CryptoKey."""
    # Creates an API client for the KMS API.
    client = kms_v1.KeyManagementServiceClient()
    # The resource name of the CryptoKey.
    name = (f"projects/{project_id}/locations/{location_id}/"
            f"keyRings/{key_ring_id}/cryptoKeys/{crypto_key_id}")
    # Assumes the pre-2.0 google-cloud-kms client API, where encrypt takes name and plaintext positionally.
    response = client.encrypt(name, plaintext)
    return response.ciphertext
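The gist also covers decryption; a sketch of the matching decrypt helper, assuming the same pre-2.0 google-cloud-kms client as the import above (the 2.0+ client takes a request dict instead of positional arguments).

def decrypt(project_id, location_id, key_ring_id, crypto_key_id, ciphertext):
    """Decrypts ciphertext that was encrypted with the given symmetric CryptoKey."""
    client = kms_v1.KeyManagementServiceClient()
    # The resource name of the CryptoKey.
    name = (f"projects/{project_id}/locations/{location_id}/"
            f"keyRings/{key_ring_id}/cryptoKeys/{crypto_key_id}")
    # Assumes the pre-2.0 client API: decrypt takes name and ciphertext positionally.
    response = client.decrypt(name, ciphertext)
    return response.plaintext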

Unique ID column

Generate a unique identifier that consistently produces the same result for a given row, based on that row's values. The ID column is positioned as the first column in the DataFrame.

from pyspark.sql.functions import sha2, concat_ws

columns = df.columns
df = df.withColumn(id_col, sha2(concat_ws("||", *df.columns), 256))
df = df.select([id_col] + columns)
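A self-contained sketch of the same idea, using a toy DataFrame and assuming row_id as the name bound to id_col (both are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws

spark = SparkSession.builder.appName("unique-id-demo").getOrCreate()
df = spark.createDataFrame([("alice", 34), ("bob", 29)], ["name", "age"])

id_col = "row_id"
columns = df.columns
# Hash every column value joined with "||" so identical rows always map to the same ID.
df = df.withColumn(id_col, sha2(concat_ws("||", *columns), 256))
df = df.select([id_col] + columns)
df.show(truncate=False)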
@justinnaldzin
justinnaldzin / google_cloud_composer_manually_trigger_dag.sh
Created February 12, 2019 02:51
Google Cloud Composer - Manually trigger DAG runs using Airflow v1.10+
# Google Cloud Composer - Manually trigger DAG runs using Airflow v1.10+
ENVIRONMENT_NAME=my-composer
LOCATION=us-east1
# Trigger DAG - individual
DAG_ID=my_daily_dag
EXEC_DATE=2019-02-11
gcloud composer environments run ${ENVIRONMENT_NAME} --location ${LOCATION} trigger_dag -- -r manual__${EXEC_DATE} -e ${EXEC_DATE} ${DAG_ID}
# Trigger DAG - multiple
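The "multiple" case is cut off above; one way to sketch it is to loop over execution dates and shell out to the same gcloud command from Python (the dates, DAG ID, and environment are the placeholders from above).

import subprocess

ENVIRONMENT_NAME = "my-composer"
LOCATION = "us-east1"
DAG_ID = "my_daily_dag"

# Backfill several daily runs by triggering one manual DAG run per execution date.
for exec_date in ["2019-02-09", "2019-02-10", "2019-02-11"]:
    subprocess.run(
        ["gcloud", "composer", "environments", "run", ENVIRONMENT_NAME,
         "--location", LOCATION, "trigger_dag", "--",
         "-r", f"manual__{exec_date}", "-e", exec_date, DAG_ID],
        check=True,
    )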

Load data from BigQuery

Using the BigQuery client library

pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery
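The preview ends at the import. A minimal sketch of loading query results into a pandas DataFrame with the client library; the query and public dataset are illustrative, and to_dataframe() requires pandas to be installed.

from google.cloud import bigquery

client = bigquery.Client()
query = """
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_2013`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
"""
# Run the query and pull the result set down as a pandas DataFrame.
df = client.query(query).to_dataframe()
print(df)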
@justinnaldzin
justinnaldzin / aws_s3_unzip_files.sh
Created November 15, 2018 16:42
Copy zip files from S3 to local directory, unzip and upload to S3
# Copy zip files from S3 to local directory, unzip and upload to S3
aws s3 cp s3://bucket/folder/ . --recursive
for f in *.zip; do unzip "$f"; done
aws s3 cp . s3://bucket/folder/ --recursive --exclude "*.zip"
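A Python sketch of the same workflow with boto3 and zipfile, for cases where shelling out to the CLI is not an option; the bucket and prefix are placeholders.

import os
import zipfile
import boto3

BUCKET = "bucket"
PREFIX = "folder/"

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=BUCKET, Prefix=PREFIX):
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if not key.endswith(".zip"):
            continue
        local_zip = os.path.basename(key)
        # Download the archive, extract it locally, then upload the extracted files.
        s3.download_file(BUCKET, key, local_zip)
        with zipfile.ZipFile(local_zip) as zf:
            zf.extractall(".")
            for member in zf.namelist():
                if member.endswith("/"):
                    continue  # skip directory entries
                s3.upload_file(member, BUCKET, PREFIX + member)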
@justinnaldzin
justinnaldzin / aws_utility.py
Created October 17, 2018 20:09
Listing objects and keys in an S3 bucket
import boto3
def get_matching_s3_objects(bucket, prefix='', suffix=''):
    """
    Fetch objects in an S3 bucket.
    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).