Skip to content

Instantly share code, notes, and snippets.

@akshaykarnawat
Created February 9, 2019 20:55
Show Gist options
  • Save akshaykarnawat/2f76964957de63ee5d5383e0c8aa7ee3 to your computer and use it in GitHub Desktop.
Getting SDR data from DTCC public repository
# the structure of the code needs to change and the script needs to be parameterized with functions
# but gets the job done for the data needed
import io
import zipfile
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
def getParamString():
    """Return the current UTC time as epoch milliseconds, as a 13-digit string.

    DTCC's cache-busting ``_`` query parameter expects a 13-character
    millisecond timestamp. The original built it by slicing the string form
    of ``total_seconds()``, which depends on platform float formatting (the
    author's own comment flags this breaking on Windows); integer arithmetic
    produces the same 13-digit value deterministically.
    """
    epoch_ms = int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds() * 1000)
    return str(epoch_ms)
def main():
    """Download SDR cumulative-slice data from the DTCC public repository.

    Walks the GTR landing page to find the cumulative-slice iframe, follows
    it to enumerate the per-grid pages, downloads every zip archive they
    link to, and reads each CSV inside into a pandas DataFrame.

    Returns the list of DataFrames (one per CSV found), or None if the
    landing page could not be fetched.

    Fixes versus the pasted original: ``return`` sat at module level (a
    SyntaxError), ``io`` was used without being imported, one response was
    never closed, and ``df`` was overwritten on every iteration so only the
    last CSV survived.
    """
    url = 'https://rtdata.dtcc.com/gtr'
    res = requests.get(url=url)
    if res.status_code != 200:
        return None
    # reuse the landing page's cookies/headers on the follow-up requests
    cookies = res.cookies
    headers = res.headers
    soup = BeautifulSoup(res.text, 'html.parser')
    # the iframe's src points at the cumulative-slice grid page
    urlToCall = soup.find(
        'iframe', attrs={'class': 'gridBody cumulativeSliceGrid'}).get('src', '')
    res.close()

    # subsequent call: fetch the cumulative page to enumerate the data grids
    res = requests.get(url=urlToCall, params={'_': getParamString()},
                       cookies=cookies, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    dataset = soup.findAll('div', attrs={'class': 'trackerGrid pollingGrid'})
    res.close()

    frames = []
    # make calls to the actual data available
    for d in dataset:
        print('Getting data for', d.get('id', ''))
        durl = urljoin(urlToCall, d.get('sourcefile', ''))
        res = requests.get(url=durl, params={'_': getParamString()},
                           cookies=cookies, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        res.close()
        ddata = [a.get('href') for a in soup.findAll('a')]
        # each link is a zip file; download it and load every CSV inside
        for dd in ddata:
            res = requests.get(dd)
            content = res.content
            res.close()  # was leaked in the original
            z = zipfile.ZipFile(io.BytesIO(content))
            for f in z.filelist:
                if '.csv' in f.filename:
                    # collect instead of clobbering a single `df` variable
                    frames.append(pd.read_csv(z.open(f)))
    return frames


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment