Skip to content

Instantly share code, notes, and snippets.

@saurabhwahile
Last active April 3, 2020 04:35
Show Gist options
  • Save saurabhwahile/efbcdbf12be8c5df48873a1404f07b79 to your computer and use it in GitHub Desktop.
Save saurabhwahile/efbcdbf12be8c5df48873a1404f07b79 to your computer and use it in GitHub Desktop.
import os
import sys
from datetime import datetime, timedelta
import re
from io import BytesIO
import subprocess
import base64
import imaplib
import email
import camelot
import tempfile
import pandas as pd
STATEMENT_PASSWORD = '<password>'
"""
Pass a file buffer like object ex: open('file.pdf', 'wb') and the password to the statement file
"""
def parse_ecas(f_buffer, password=None):
f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
f_temp.write(f_buffer.read())
f_temp.close()
tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
os.unlink(f_temp.name)
funds_list = []
data = None
for table in tables:
if len(table.df.columns)!=7:
continue
for i, row in table.df.iterrows():
if not pd.isnull(row[0]) and re.match('(\d*)\/(\d*)',row[0]):
if data:
funds_list.append(data)
data = {
"folio_no" : row[0],
"scheme_name":row[1],
"unit_balance":row[2],
"dt":row[3],
"nav":row[4],
"market_value":row[5],
"registrar":row[6]
}
else:
if data:
data['scheme_name']+= ' '+ row[1] if not pd.isnull(row[1]) else ''
funds_list.append(data)
df = pd.DataFrame(funds_list)
df['dt'] = pd.to_datetime(df['dt'])
df['market_value'] = df['market_value'].str.replace(',','').astype(float)
df['nav'] = df['nav'].str.replace(',','').astype(float)
df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)
return df
def parse_ecas_transactions(f_buffer, password=None):
f_temp = tempfile.TemporaryFile('w+b', suffix='.pdf', delete=False)
f_temp.write(f_buffer.read())
f_temp.close()
tables = camelot.read_pdf(f_temp.name, password=password, flavor='stream', pages='all')
os.unlink(f_temp.name)
data = []
state = 0
for t in tables:
for i, row in t.df.iterrows():
re_folio_no = re.match('Folio No:(.*)', row[0])
re_transaction_dt = re.match('(\d*-[a-zA-Z]*-\d*)', row[0])
if re_transaction_dt and not (pd.isnull(row[3]) or row[3]==''):
transaction_dt = re_transaction_dt[1]
re_fund_name = re.match('(.*)\(Advisor:(.*)', fund_name)
if re_fund_name:
fund_name = re_fund_name[1]
data.append({
'folio_no': folio_no.replace(' ', ''),
'fund_name': fund_name,
'dt': row[0],
'type': row[1],
'amount': row[3],
'units': row[4],
'nav': row[5],
'unit_balance': row[6]
})
state = 2
if state==1:
fund_name += row[0]
if re_folio_no:
folio_no = re_folio_no[1]
fund_name = ''
state = 1
df = pd.DataFrame(data)
df['dt'] = pd.to_datetime(df['dt'])
df['amount'] = df['amount'].str.replace(',','').astype(float)
df['units'] = df['units'].str.replace(',','').astype(float)
df['nav'] = df['nav'].str.replace(',','').astype(float)
df['unit_balance'] = df['unit_balance'].str.replace(',','').astype(float)
return df
"""
Download the tika-app here https://tika.apache.org/download.html
"""
def read_pdf_tika(file_buffer, password):
p = subprocess.Popen(f'java -jar C:\\bin\\tika-app.jar --password={password} --text', shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
return p.communicate(file_buffer.read())[0]
"""
Checks if the statement is a summary statement or if its a detailed statement
"""
def is_ecas_summary(file_buffer, password):
text = read_pdf_tika(file_buffer, password)
detailed_statement_date = None
for line in text.decode('utf-8',errors='ignore').split('\n'):
re_date_summary = re.match('(.*)As on (\d*-[a-zA-Z]*-\d*)(.*)', line)
if re_date_summary:
print(f'Summary | {re_date_summary[2]}')
return True
re_date_detailed = re.match('(.*)(\d*-[a-zA-Z]*-\d*) To (\d*-[a-zA-Z]*-\d*)(.*)', line)
if re_date_detailed:
detailed_statement_date = re_date_detailed[0]
print(f'Detailed Statement | {detailed_statement_date}')
return False
"""
Scan your inbox for E-Cas mails
"""
def scan_ecas_statement(FROM_EMAIL, FROM_PWD, SMTP_SERVER, SMTP_PORT):
mail = imaplib.IMAP4_SSL(SMTP_SERVER)
mail.login(FROM_EMAIL,FROM_PWD)
mail.select('inbox')
todays_date = datetime.strftime(datetime.now()-timedelta(days=1), "%d-%b-%Y")
tomorrows_date = datetime.strftime(datetime.now()+timedelta(days=1), "%d-%b-%Y")
typ, data = mail.search(None, 'SUBJECT "Consolidated Account Statement - CAMS Mailback Request" (SINCE "'+todays_date+'" BEFORE "'+tomorrows_date+'")')
mail_ids = data[0].decode()
id_list = mail_ids.split()
if len(id_list) == 0:
print('No Mails Received')
return
id_list.sort(reverse=True)
for i in id_list:
typ, data = mail.fetch(str(i), '(RFC822)' )
for response_part in data:
if isinstance(response_part, tuple):
msg = email.message_from_string(response_part[1].decode())
print(i)
for part in msg.walk():
if part.get_content_maintype() == 'multipart':
continue
if 'application/octet-stream' not in part.get('Content-Type'):
continue
statement_file = BytesIO(part.get_payload(decode=True))
if is_ecas_summary(statement_file, STATEMENT_PASSWORD):
statement_file.seek(0)
df = parse_ecas(statement_file, STATEMENT_PASSWORD)
else:
statement_file.seek(0)
df = parse_ecas_transactions(statement_file, STATEMENT_PASSWORD)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment