@kkrishnan90
Created October 13, 2022 09:03
single-page-report.py
"""Generate a single-page AWS vs GCP cost comparison report (PDF) from an AWS
invoice PDF and a cost-mapping CSV, triggered by a Cloud Storage event."""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os, time
from os import path
from google.cloud import storage
import re
import pdfplumber
import datetime
from collections import namedtuple
import functions_framework
import tempfile

# Module-level state shared by the extraction helpers below.
page_lines = []
final_lines = []
Row = namedtuple('Row', ['service_name', 'region', 'description', 'price', 'page_number'])
service_margin = ''
service_margin_index = 0
words = []
dfs = {}
def extract_line_items(pdf_file_path, csv_file_path, base_file_name):
    temp_output_path = '/tmp/awsvsgcp.pdf'
    temp_dir_path = '/tmp/'
    print('Temp output pdf report is saved at {}'.format(temp_output_path))
    report = PdfPages(temp_output_path)
    service_margin = ''
    service_margin_index = 0
    # Extract every word (with font and position metadata) from the invoice PDF.
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            words_extracted = page.extract_words(extra_attrs=['fontname', 'size', 'page_number'])
            page_lines.append(words_extracted)
    for idx, lines in enumerate(page_lines):
        dfs[idx] = pd.DataFrame(lines)
    df = pd.concat(dfs.values())
    # Group words that share a baseline into one text line per page.
    df2 = df.groupby(['page_number', 'bottom'], as_index=False).agg(
        {'x0': 'first', 'text': lambda x: ' '.join(x), 'fontname': 'first', 'size': 'first'})
    # The line two rows below the 'Details' heading fixes the left margin of service names.
    for idx, row in df2.iterrows():
        if 'Details' in row['text']:
            service_margin = df2['x0'][idx + 2]
            service_margin_index = idx + 2
    print('Service Margin:{},{}'.format(service_margin, service_margin_index))
    # Every line starting at that margin is a "<service name> ... $<price>" item.
    for idx, row in df2.loc[service_margin_index:].iterrows():
        page_number = row['page_number']
        if row['x0'] == service_margin:
            service_name = row['text'].split('$')[0].strip()
            region_name = ''
            description = ''
            price = float(row['text'].replace(',', '').split('$')[-1])
            final_lines.append(Row(service_name=service_name, region=region_name,
                                   description=description, price=price, page_number=page_number))
    df3 = pd.DataFrame(final_lines)
    df3.drop_duplicates(subset=['service_name'], keep='first', inplace=True)
    df3.reset_index(inplace=True, drop=True)
    df3.sort_values('price', ascending=False, inplace=True)
    # Set up figures and axes
    fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(40, 40))
    # Create pie chart
    price_np_array = df3['price'].to_numpy()
    service_np_array = df3['service_name'].to_numpy()
    pie_figure = plt.figure(figsize=(40, 5))
    # pie_figure.suptitle('AWS Bill Breakup', fontsize=22, y=1)
    title = ax[0].set_title('AWS Bill Breakup', fontsize=22, fontweight='bold')
    title.set_ha("center")
    percents = df3['price'] * 100 / price_np_array.sum()
    labels = ['%s, %1.1f %%' % (l, s) for l, s in zip(service_np_array, percents)]
    patches, texts = ax[0].pie(price_np_array, startangle=0, radius=1.2)
    ax[0].legend(patches, labels, title="Services", loc="upper right",
                 fontsize=13, bbox_to_anchor=(0, 0.8), ncol=3)
    pie_figure.subplots_adjust(left=2, right=3)
    # Create service-level comparison charts between AWS & GCP
    df = pd.read_csv(csv_file_path)
    df_s3 = df.query('ProductCode=="AmazonS3"')
    df_ec2 = df.query('ProductCode=="AmazonEC2"')
    df_rds = df.query('ProductCode=="AmazonRDS"')
    if not df_s3.empty:
        # Create plots for S3
        df_s3_aws_cost = df_s3['Source Cost']
        df_s3_product_type = df_s3['ProductType']
        df_s3_gcp_cost = df_s3['GCP Cost']
        N_S3 = len(df_s3_product_type)
        ind_s3 = np.arange(N_S3)
        s3_fig = plt.figure(figsize=(40, 5))
        width = 0.3
        ax[1].bar(ind_s3, df_s3_aws_cost, width, label='AWS', color='orange')
        ax[1].bar(ind_s3 + width, df_s3_gcp_cost, width, label='GCP', color='royalblue')
        ax[1].set_xlabel('Product Type')
        ax[1].set_ylabel('Price (in USD)')
        ax[1].set_title('AWS S3 vs GCP GCS Cost Comparison', fontsize=22, fontweight='bold')
        ax[1].set_xticks(ind_s3 + width / 2, df_s3_product_type)
        ax[1].legend(loc='best')
        # report.savefig(s3_fig, bbox_inches='tight')
    if not df_ec2.empty:
        # Create plots for EC2
        df_ec2_aws_cost = df_ec2['Source Cost']
        df_ec2_product_type = df_ec2['ProductType'].str.split(':').str[1]
        df_gce_gcp_cost = df_ec2['GCP Cost']
        N_EC2 = len(df_ec2_product_type)
        ind_ec2 = np.arange(N_EC2)
        ec2_fig = plt.figure(figsize=(40, 5))
        width = 0.3
        ax[2].bar(ind_ec2, df_ec2_aws_cost, width, label='AWS', color='orange')
        ax[2].bar(ind_ec2 + width, df_gce_gcp_cost, width, label='GCP', color='royalblue')
        ax[2].set_xlabel('Product Type')
        ax[2].set_ylabel('Price (in USD)')
        ax[2].set_title('AWS EC2 vs GCP GCE Cost Comparison', fontsize=22, fontweight='bold')
        ax[2].set_xticks(ind_ec2 + width / 2, df_ec2_product_type)
        ax[2].legend(loc='best')
        # report.savefig(ec2_fig, bbox_inches='tight')
    if not df_rds.empty:
        # Create plots for RDS
        df_rds_aws_cost = df_rds['Source Cost']
        df_rds_product_type = df_rds['ProductType'].str.split(':').str[1]
        df_csql_gcp_cost = df_rds['GCP Cost']
        N_RDS = len(df_rds_product_type)
        ind_rds = np.arange(N_RDS)
        rds_fig = plt.figure(figsize=(40, 5))
        width = 0.3
        ax[3].bar(ind_rds, df_rds_aws_cost, width, label='AWS', color='orange')
        ax[3].bar(ind_rds + width, df_csql_gcp_cost, width, label='GCP', color='royalblue')
        ax[3].set_xlabel('Product Type')
        ax[3].set_ylabel('Price (in USD)')
        ax[3].set_title('AWS RDS vs GCP Cloud SQL Cost Comparison', fontsize=22, fontweight='bold')
        ax[3].set_xticks(ind_rds + width / 2, df_rds_product_type)
        ax[3].legend(loc='best')
        # report.savefig(rds_fig, bbox_inches='tight')
    report.savefig(fig)
    report.close()
    print('Listing files in dir : {}'.format(os.listdir(temp_dir_path)))
    while not os.path.exists(temp_output_path):
        print('Checking if file exists')
        time.sleep(5)
    if os.path.isfile(temp_output_path):
        print('File exists!')
        root = path.dirname(path.abspath(__file__))
        return path.join(root, temp_output_path)
    return ""
@functions_framework.cloud_event
def extract(cloud_event):
    data = cloud_event.data
    event_id = cloud_event["id"]
    event_type = cloud_event["type"]
    bucket = data["bucket"]
    name = data["name"]
    metageneration = data["metageneration"]
    timeCreated = data["timeCreated"]
    updated = data["updated"]
    print(f"Event Data: {data}")
    print(f"Event ID: {event_id}")
    print(f"Event type: {event_type}")
    print(f"Bucket: {bucket}")
    print(f"File: {name}")
    print(f"Metageneration: {metageneration}")
    print(f"Created: {timeCreated}")
    print(f"Updated: {updated}")
    object_path = 'gs://' + bucket + '/' + name
    # Object names are expected to look like: user=<name>/t=<timestamp>/<file>.pdf
    m = re.findall(r"(user=.*)/(t=.*)/(.*)$", name)
    user_part = m[0][0]
    time_part = m[0][1]
    file_name = m[0][2]
    file_prefix = user_part + "/" + time_part
    base_file_name = file_name.split('.')[0]
    username = user_part.split('=')[1]
    ui_timestamp = time_part.split('=')[1]
    print(f"Adding new request for processing {object_path}.")
    report_output_bucket = os.environ.get('AWS_REPORT_BUCKET', 'AWS_REPORT_BUCKET variable is not set.')
    pdf_file_path = "/tmp/" + file_name
    download_blob_to_file(bucket, name, pdf_file_path)
    # The original gist left csv_file_path blank here (a syntax error); as an
    # assumption, the AWS-to-GCP cost-mapping CSV is taken to sit in /tmp under
    # the same base name as the invoice PDF. Adjust this path to your setup.
    csv_file_path = "/tmp/" + base_file_name + ".csv"
    local_report_location = extract_line_items(pdf_file_path=pdf_file_path,
                                               csv_file_path=csv_file_path,
                                               base_file_name=base_file_name)
    if local_report_location:
        gcs_uri = upload_blob_from_filename(file_prefix, local_report_location)
def download_blob_to_file(bucket_name, source_blob_name, file_path):
    """Downloads a blob from the bucket to a local file."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(file_path)
def upload_blob_from_filename(file_prefix, f):
    """Uploads the generated report to the output bucket and returns its gs:// URI."""
    report_output_bucket = os.environ.get('AWS_REPORT_BUCKET', 'AWS_REPORT_BUCKET variable is not set.')
    file_name = file_prefix + "/" + f.split('/')[-1]
    print('File name inside upload blob is {}'.format(file_name))
    print('File inside upload blob is {}'.format(f))
    storage_client = storage.Client()
    bucket = storage_client.bucket(report_output_bucket)
    blob = bucket.blob(file_name)
    print('Before upload begins, listing files in dir: {}'.format(os.listdir('/tmp/')))
    blob.upload_from_filename(f)
    print("Successfully done!")
    return "gs://" + report_output_bucket + "/" + file_name