Skip to content

Instantly share code, notes, and snippets.

View jastang's full-sized avatar

Jason Tang jastang

View GitHub Profile
@jastang
jastang / iam-auth-role.json
Created January 25, 2019 22:04
rds-iam-auth-role
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"rds-db:connect"
],
"Resource": [
"arn:aws:rds-db:your-region:XXXXXXXXXX:dbuser:*/benchsci"
@jastang
jastang / trust-policy.json
Created January 25, 2019 22:01
iam-rds-trust-policy
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"AWS": [
"arn:aws:iam::XXXXXXXXX:role/your-lambda-function-role"
],
"Service": "lambda.amazonaws.com"
@jastang
jastang / serverless.yml
Created January 25, 2019 20:24
Serverless configuration for RDS authentication
provider:
name: aws
runtime: python3.7
stage: 'production'
region: us-east-1
iamRoleStatements:
- Effect: "Allow"
Action:
- "rds:*"
- "sts:*"
@jastang
jastang / create_iam_user.sql
Last active December 21, 2018 20:57
rds-postgres iam auth
-- Create the database user that will sign in through IAM authentication.
-- No password is set here; WITH LOGIN only allows the role to open connections.
CREATE USER benchsci WITH LOGIN; 
-- Granting the rds_iam role is what switches this user to IAM database
-- authentication on RDS PostgreSQL (see the AWS RDS IAM auth documentation).
GRANT rds_iam TO benchsci;
-- optional
-- Lets the user read and advance sequences in one schema. <schema> is a
-- placeholder: replace it with the real schema name before running.
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA <schema> TO benchsci;
@jastang
jastang / udpate_metastore.py
Last active August 30, 2018 20:03
Lambda function to update the Glue catalog
"""
First, start the crawler if it is not already running.
"""
# The crawler is named "<client>-<DATA_FREQUENCY>"; build that name once and
# reuse it for both the lookup and the start request.
crawler_name = '%s-%s' % (client, DATA_FREQUENCY)
crawler = glue.get_crawler(Name=crawler_name)
if crawler['Crawler']['State'] != 'READY':
    # Anything other than READY is treated as "busy": do not try to start it.
    print('Crawler is not in a READY state or already running! Checking the catalog instead.')
else:
    response = glue.start_crawler(Name=crawler_name)
@jastang
jastang / metadata_validator.py
Last active August 30, 2018 20:04
Lambda function for metadata validation
def lambda_handler(event, context):
records = event['Records']
for record in records:
b = record['s3']['bucket']['name']
s3_file = record['s3']['object']['key']
# Load the metadata validation configuration for this client
conf = json.loads(s3.meta.client.get_object(Bucket=b, Key=VALIDATION_CONFIG)['Body'].read(),
object_pairs_hook=OrderedDict)
def lambda_handler(event, context):
if event is None:
print("Couldn't find the object or byte range!")
return
# The byte range is passed from the Chunking function
start_byte = event['start_byte']
end_byte = event['end_byte']
bucket = event['bucket']
key = event['key']
@jastang
jastang / chunk_s3.py
Created March 9, 2018 00:08
Divide large S3 objects for preprocessing
while end_byte <= objectsize:
# Ensure the end_byte is a carriage return, so the line splits work.
end_byte = scan_to_eol(end_byte, objectsize, bucket, key)
# Invoke the cleaning function
ctx = {
"start_byte": start_byte,
"end_byte": end_byte,
"bucket": bucket,
@jastang
jastang / replicate_s3.py
Last active March 12, 2018 20:17
Moving an S3 object from point A to point B
def lambda_handler(event, context):
sb = event['Src_Bucket']
sk = event['Src_Key']
db = event['Dest_Bucket']
dk = event['Dest_Key']
src = {
'Bucket': sb,
'Key': sk
}
@jastang
jastang / read_json_s3.py
Last active March 12, 2018 20:17
Reading a JSON configuration from S3
s3 = boto3.resource('s3')
# Download the JSON configuration through the resource's underlying client.
# OrderedDict keeps the keys in the order they appear in the file.
raw_conf = s3.meta.client.get_object(Bucket=b, Key='my-config.json')['Body'].read()
conf = json.loads(raw_conf, object_pairs_hook=OrderedDict)
bucket = s3.Bucket(name=b)
# Object listing under the 'sftp' prefix -- metadata only, no object bodies.
src = bucket.objects.filter(Prefix='sftp')
# Heuristic: with N mandatory files configured, take the keys of the N most
# recently modified uploads as the candidates to validate.
newest_first = sorted(src, key=lambda x: x.last_modified, reverse=True)
mandatory_objects = [o.key for o in newest_first[:len(conf['mandatory_files'])]]
# mandatory_objects now holds those keys, newest first, ready for validation.