Created May 22, 2018 at 19:09.
Save ftfarias/e34a39a4e079e2412ceb7ab87b4228ea to your computer and use it in GitHub Desktop.
How to read files from Amazon AWS S3 line by line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv
from gzip import GzipFile
from io import TextIOWrapper

import boto3
import elasticsearch
# Prefix (and bucket) under which the gzipped CSV objects live.
fact_key = "/2018/05/15/mycsv_files"
BUCKET = 'csv_data'

print(f'Reading files at {fact_key}')

# Optional positional MFA token: when supplied, we assume an IAM role via
# STS and talk to S3 with the temporary credentials; otherwise we fall back
# to the current user's default credentials.
parser = argparse.ArgumentParser(description='S3 Reader')
parser.add_argument('token', type=str, help='6 digit mfa token', default='', nargs='?')
args = parser.parse_args()
token = args.token

if token:
    # If supplied, the MFA token authenticates through the STS API.
    sts_client = boto3.client('sts')
    print('Assuming Role...')
    # From the response that contains the assumed role, get the temporary
    # credentials that can be used to make subsequent API calls.
    assumed_role = sts_client.assume_role(
        RoleArn="arn:aws:iam::1234:role/developer-role",
        RoleSessionName="currentRoleSession",
        DurationSeconds=3600,
        SerialNumber="arn:aws:iam::1234:mfa/felipe.farias",
        TokenCode=token,
    )
    credentials = assumed_role['Credentials']
    # SECURITY: never echo the secret access key or session token to stdout.
    # Print only the non-sensitive fields so the session is still identifiable.
    print('Credentials:')
    print({'AccessKeyId': credentials['AccessKeyId'],
           'Expiration': credentials.get('Expiration')})
    # Use the temporary credentials that AssumeRole returns to make a
    # connection to Amazon S3.
    s3 = boto3.client(
        's3',
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken'],
    )
else:
    # No token supplied, so run with the current user's credentials.
    s3 = boto3.client('s3')
def process_file(key, bucket=None):
    """Stream one gzip-compressed CSV object from S3 and count its lines.

    Args:
        key: S3 object key of a gzipped, ';'-delimited CSV file.
        bucket: bucket name; defaults to the module-level BUCKET when None.
            (Resolved at call time so the default tracks the module constant.)

    The object body is decompressed and decoded on the fly, so the whole
    file is never held in memory. The first 10 rows are echoed for
    inspection and a progress marker is printed every 1,000,000 rows.
    """
    if bucket is None:
        bucket = BUCKET
    print(f'processing key {key}')
    count = 0
    response = s3.get_object(Bucket=bucket, Key=key)
    # Layer the streaming body: gzip-decompress, then decode bytes -> text.
    # The encoding is made explicit instead of relying on the locale default.
    gzipped = GzipFile(None, 'rb', fileobj=response['Body'])
    with TextIOWrapper(gzipped, encoding='utf-8') as data:
        input_csv = csv.reader(data, delimiter=';', quotechar='"')
        for line in input_csv:
            if count % 1000000 == 0:
                # Progress heartbeat (also fires once at row 0).
                print(f'{count:,}')
            if count < 10:
                print(line)
            count += 1
    print(f'Processed {count:,} lines')
# Read all files under bucket/prefix. A list_objects_v2 paginator is used
# because plain list_objects silently truncates the listing at 1000 keys.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=BUCKET, Prefix=fact_key):
    # 'Contents' is absent from a page when the prefix matches no objects.
    for row in page.get('Contents', []):
        process_file(row['Key'])
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.