Skip to content

Instantly share code, notes, and snippets.

@pistachiyoda
Created March 19, 2018 15:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pistachiyoda/235ce579080e0a8b2e4b4f0dacc5e757 to your computer and use it in GitHub Desktop.
Save pistachiyoda/235ce579080e0a8b2e4b4f0dacc5e757 to your computer and use it in GitHub Desktop.
import json
import urllib.parse
import boto3
import gzip
import csv
from datetime import datetime
from operator import itemgetter, attrgetter
def lambda_handler(event, context):
# 使用するバケット
bucket = 'BUCKET_NAME'
########各月の最新データへのパスをKeys[]に格納する########
def json_serial(obj):
if isinstance(obj, datetime):
return obj.isoformat()
raise TypeError ("Type %s not serializable" % type(obj))
# そのバケット内のフォルダごとの最新のオブジェクトを取得
# Keysが格納されているPrefixを格納する配列
Prefixes = []
# 最新データへのパスを格納する配列
Keys = []
client = boto3.client('s3')
result = client.list_objects_v2(Bucket=bucket, Prefix='/billingreport/', Delimiter='/')
for o in result.get('CommonPrefixes'):
prefix = o.get('Prefix')
Prefixes.append(prefix)
for prefix in Prefixes:
response = client.list_objects(
Bucket=bucket,
Prefix=prefix
)
str_response = json.loads(json.dumps(response, default=json_serial))
contents = str_response['Contents']
#content部をLastModifiedで並び替え、最新のものを取得
sorted_contents = sorted(contents,key=lambda x:x['LastModified'],reverse=True)
#もしlatest_contentに格納されたのがcsv.gzだったら格納、ちがったら2周目
for latest_content in sorted_contents:
if 'csv.gz' in latest_content['Key']:
Keys.append(latest_content['Key'])
break
else:
continue
########Keys[]にパスが格納された各月のデータをトリムしてマージ
client = boto3.client('s3')
fieldnames = [
'lineItem/UsageStartDate',
'lineItem/ProductCode',
'lineItem/BlendedCost',
'lineItem/UsageType',
'lineItem/CurrencyCode',
'lineItem/LineItemDescription',
'product/productFamily',
'product/region',
'resourceTags/user:Cost'
]
try:
with open('/tmp/processed.csv', 'w') as f:
writer = csv.DictWriter(f,fieldnames=fieldnames)
writer.writeheader()
for key in Keys:
response = client.get_object(Bucket=bucket, Key=key)
gz = gzip.GzipFile(fileobj=response['Body'])
with open('/tmp/origin.csv', 'w') as f:
f.write(gz.read().decode('utf-8'))
with open('/tmp/origin.csv') as f:
reader = csv.DictReader(f)
with open('/tmp/processed.csv','a') as csvfile:
writer = csv.DictWriter(csvfile,fieldnames=fieldnames)
for row in reader:
writer.writerow(
{
'lineItem/UsageStartDate': row['lineItem/UsageStartDate'],
'lineItem/ProductCode': row['lineItem/ProductCode'],
'lineItem/BlendedCost': row['lineItem/BlendedCost'],
'lineItem/CurrencyCode': row['lineItem/CurrencyCode'],
'lineItem/UsageType': row['lineItem/UsageType'],
'lineItem/LineItemDescription': row['lineItem/LineItemDescription'],
'product/productFamily': row['product/productFamily'],
'product/region': row['product/region'],
'resourceTags/user:Cost': row['resourceTags/user:Cost'],
}
)
with open('/tmp/processed.csv', 'r') as f:
client.put_object(
Bucket=bucket,
Key='latest/marged_processed_latest.csv',
Body=f.read()
)
return response['ContentType']
except Exception as e:
print(e)
print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
raise e
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment