broschke/tableau_audit

## tableau_audit
#!/usr/bin/env python
# coding: utf-8

import xml.etree.ElementTree as ET
import tableauserverclient as TSC
import pandas as pd
import numpy as np
import os
import shutil
from pathlib import Path
import glob
import zipfile
from datetime import datetime


USERNAME = os.getenv('AD_USER')
PW = os.getenv('AD_PASS')
SERVER = ''
SITE_ID = ''

tableau_auth = TSC.TableauAuth(USERNAME, PW, site_id=SITE_ID)

request_options = TSC.RequestOptions(pagesize=1000)

server = TSC.Server(SERVER, use_server_version=True)

with server.auth.sign_in(tableau_auth):
    all_workbooks, pagination_item = server.workbooks.get(req_options=request_options)
    workbook_list = [[workbook.id, workbook.name, workbook.project_name] for workbook in all_workbooks]

workbook_ids = [item[0] for item in workbook_list]

p = Path(os.getcwd())

if not os.path.exists(str(p)+'\\wb_downloads'):
    os.makedirs(str(p)+'\\wb_downloads')
else:
    shutil.rmtree(str(p)+'\\wb_downloads')
    os.makedirs(str(p)+'\\wb_downloads')

os.chdir(str(p)+'\\wb_downloads')

with server.auth.sign_in(tableau_auth):
    for wb in workbook_ids:
        try:
            file_path = server.workbooks.download(wb)
        except:
            print(f'Workbook {wb} could not be found.')
        print("\nDownloaded workbook to {0}.".format(file_path))

print("Download complete!")

twb_files = glob.glob('*.twb*')

for file in twb_files:
    if file[-4:] == 'twbx':
        with zipfile.ZipFile(file, 'r') as pack_wb:
            pack_wb.extractall(os.getcwd())

twb_files = glob.glob('*.twb')

for file in twb_files:
    pre, ext = os.path.splitext(file)
    os.rename(file, pre + '.xml')

xml_files = glob.glob('*.xml')

wb_name_list = []
wb_table_list = []
wb_query_list = []

print('Parsing data sources')

for file in xml_files:
    root = ET.parse(file).getroot()
    for elem in root.iter('relation'):
        wb_table_list.append(elem.get('table'))
        wb_name_list.append(file)
    for elem in root.iter('relation'):
        wb_query_list.append(elem.text)

wb_name_list = [wb[:-4] for wb in wb_name_list]
wb_query_list = [query.replace('\r\n',' ') if query != None else query for query in wb_query_list]
wb_query_list = [query.replace('\n',' ') if query != None else query for query in wb_query_list]

df_query = pd.DataFrame(zip(wb_name_list, wb_table_list, wb_query_list), columns=['workbook', 'table', 'query'])

df_query = df_query[df_query.table != '[Extract].[Extract]']
df_query = df_query.reset_index(drop=True)

df_query = df_query.replace(r'^\s*$', np.nan, regex=True)

df_query = df_query.dropna(thresh=2)

df_query = df_query.drop_duplicates()

print('Parsing calculated fields')

calcDict = {}
calcList = []
for file in xml_files:
    root = ET.parse(file).getroot()
    for item in root.findall('.//column[@caption]'):
        if item.find(".//calculation") is None:
            continue
        else:
            calcDict[item.attrib['name']] = '[' + item.attrib['caption'] + ']'

    for item in root.findall('.//column[@caption]'):
        if item.find(".//calculation") is None:
            continue
        else:
            if item.find(".//calculation[@formula]") is None:
                continue
            else:
                wb_name = file[:-4]
                calc_caption = '[' + item.attrib['caption'] + ']'
                calc_name = item.attrib['name']
                calc_raw_formula = item.find(".//calculation").attrib['formula']
                calc_comment = ''
                calc_formula = ''
                for line in calc_raw_formula.split('\r\n'):
                    if line.startswith('//'):
                        calc_comment = calc_comment + line + ' '
                    else:
                        calc_formula = calc_formula + line + ' '
                for name, caption in calcDict.items():
                    calc_formula = calc_formula.replace(name, caption)

                calc_row = (wb_name, calc_caption, calc_name, calc_formula, calc_comment)
                calcList.append(list(calc_row))

df_calcs = pd.DataFrame(calcList, columns=['Workbook', 'Name', 'Remote Name', 'Formula', 'Comment'])

# remove duplicate rows from data frame
df_calcs = df_calcs.drop_duplicates(subset=None, keep='first', inplace=False)

today = datetime.now().date()

with pd.ExcelWriter(f'{today}_zico_audit_{SITE_ID}.xlsx') as writer:
    df_query.to_excel(writer, sheet_name='data_sources', index=False)
    df_calcs.to_excel(writer, sheet_name='calc_fields', index=False)


with Path(os.getcwd()+f'\\{today}_zico_audit_{SITE_ID}.xlsx') as file:
    destination = Path(os.path.dirname(os.getcwd()))
    shutil.copy(file, destination)

os.chdir('..')
shutil.rmtree(str(Path(os.getcwd()))+'\\wb_downloads')

print("Audit complete.")
	#!/usr/bin/env python
	# coding: utf-8

	import xml.etree.ElementTree as ET
	import tableauserverclient as TSC
	import pandas as pd
	import numpy as np
	import os
	import shutil
	from pathlib import Path
	import glob
	import zipfile
	from datetime import datetime


	USERNAME = os.getenv('AD_USER')
	PW = os.getenv('AD_PASS')
	SERVER = ''
	SITE_ID = ''

	tableau_auth = TSC.TableauAuth(USERNAME, PW, site_id=SITE_ID)

	request_options = TSC.RequestOptions(pagesize=1000)

	server = TSC.Server(SERVER, use_server_version=True)

	with server.auth.sign_in(tableau_auth):
	all_workbooks, pagination_item = server.workbooks.get(req_options=request_options)
	workbook_list = [[workbook.id, workbook.name, workbook.project_name] for workbook in all_workbooks]

	workbook_ids = [item[0] for item in workbook_list]

	p = Path(os.getcwd())

	if not os.path.exists(str(p)+'\\wb_downloads'):
	os.makedirs(str(p)+'\\wb_downloads')
	else:
	shutil.rmtree(str(p)+'\\wb_downloads')
	os.makedirs(str(p)+'\\wb_downloads')

	os.chdir(str(p)+'\\wb_downloads')

	with server.auth.sign_in(tableau_auth):
	for wb in workbook_ids:
	try:
	file_path = server.workbooks.download(wb)
	except:
	print(f'Workbook {wb} could not be found.')
	print("\nDownloaded workbook to {0}.".format(file_path))

	print("Download complete!")

	twb_files = glob.glob('.twb')

	for file in twb_files:
	if file[-4:] == 'twbx':
	with zipfile.ZipFile(file, 'r') as pack_wb:
	pack_wb.extractall(os.getcwd())

	twb_files = glob.glob('*.twb')

	for file in twb_files:
	pre, ext = os.path.splitext(file)
	os.rename(file, pre + '.xml')

	xml_files = glob.glob('*.xml')

	wb_name_list = []
	wb_table_list = []
	wb_query_list = []

	print('Parsing data sources')

	for file in xml_files:
	root = ET.parse(file).getroot()
	for elem in root.iter('relation'):
	wb_table_list.append(elem.get('table'))
	wb_name_list.append(file)
	for elem in root.iter('relation'):
	wb_query_list.append(elem.text)

	wb_name_list = [wb[:-4] for wb in wb_name_list]
	wb_query_list = [query.replace('\r\n',' ') if query != None else query for query in wb_query_list]
	wb_query_list = [query.replace('\n',' ') if query != None else query for query in wb_query_list]

	df_query = pd.DataFrame(zip(wb_name_list, wb_table_list, wb_query_list), columns=['workbook', 'table', 'query'])

	df_query = df_query[df_query.table != '[Extract].[Extract]']
	df_query = df_query.reset_index(drop=True)

	df_query = df_query.replace(r'^\s*$', np.nan, regex=True)

	df_query = df_query.dropna(thresh=2)

	df_query = df_query.drop_duplicates()

	print('Parsing calculated fields')

	calcDict = {}
	calcList = []
	for file in xml_files:
	root = ET.parse(file).getroot()
	for item in root.findall('.//column[@caption]'):
	if item.find(".//calculation") is None:
	continue
	else:
	calcDict[item.attrib['name']] = '[' + item.attrib['caption'] + ']'

	for item in root.findall('.//column[@caption]'):
	if item.find(".//calculation") is None:
	continue
	else:
	if item.find(".//calculation[@formula]") is None:
	continue
	else:
	wb_name = file[:-4]
	calc_caption = '[' + item.attrib['caption'] + ']'
	calc_name = item.attrib['name']
	calc_raw_formula = item.find(".//calculation").attrib['formula']
	calc_comment = ''
	calc_formula = ''
	for line in calc_raw_formula.split('\r\n'):
	if line.startswith('//'):
	calc_comment = calc_comment + line + ' '
	else:
	calc_formula = calc_formula + line + ' '
	for name, caption in calcDict.items():
	calc_formula = calc_formula.replace(name, caption)

	calc_row = (wb_name, calc_caption, calc_name, calc_formula, calc_comment)
	calcList.append(list(calc_row))

	df_calcs = pd.DataFrame(calcList, columns=['Workbook', 'Name', 'Remote Name', 'Formula', 'Comment'])

	# remove duplicate rows from data frame
	df_calcs = df_calcs.drop_duplicates(subset=None, keep='first', inplace=False)

	today = datetime.now().date()

	with pd.ExcelWriter(f'{today}_zico_audit_{SITE_ID}.xlsx') as writer:
	df_query.to_excel(writer, sheet_name='data_sources', index=False)
	df_calcs.to_excel(writer, sheet_name='calc_fields', index=False)


	with Path(os.getcwd()+f'\\{today}_zico_audit_{SITE_ID}.xlsx') as file:
	destination = Path(os.path.dirname(os.getcwd()))
	shutil.copy(file, destination)

	os.chdir('..')
	shutil.rmtree(str(Path(os.getcwd()))+'\\wb_downloads')

	print("Audit complete.")