Skip to content

Instantly share code, notes, and snippets.

@broschke
Created September 14, 2020 19:39
Show Gist options
  • Save broschke/8778148352d9884e1c05e40cd6a4ff0a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import xml.etree.ElementTree as ET
import tableauserverclient as TSC
import pandas as pd
import numpy as np
import os
import shutil
from pathlib import Path
import glob
import zipfile
# Tableau Server connection settings, filled in before running.
# NOTE(review): credentials are hard-coded in source here — consider
# loading them from environment variables or a secrets store instead.
USERNAME = ''
PW = ''
SERVER = ''  # e.g. 'https://tableau.example.com'
SITE_ID = ''  # empty string targets the server's default site
def get_datasources():
    """Audit all Tableau Server workbooks for table connections and custom SQL.

    Downloads every workbook on the server into a temporary ``wb_downloads``
    subfolder, parses each workbook's XML for ``relation`` elements (which
    carry the table name and any custom SQL text), and writes the cleaned
    results to ``zico_audit.csv`` in the directory the script was started in.

    Side effects: changes the working directory while running (restored on
    exit), and creates then removes the ``wb_downloads`` subfolder.
    """
    tableau_auth = TSC.TableauAuth(USERNAME, PW, site_id=SITE_ID)
    request_options = TSC.RequestOptions(pagesize=1000)
    server = TSC.Server(SERVER, use_server_version=True)

    # Grab the list of workbook IDs up front; IDs are required for downloading.
    with server.auth.sign_in(tableau_auth):
        all_workbooks, _pagination = server.workbooks.get(req_options=request_options)
        workbook_ids = [workbook.id for workbook in all_workbooks]

    # Work inside a clean subfolder of the starting directory.
    # pathlib keeps the paths portable (the original hard-coded '\\').
    base_dir = Path.cwd()
    download_dir = base_dir / 'wb_downloads'
    if download_dir.exists():
        shutil.rmtree(download_dir)  # start from a clean slate
    os.makedirs(download_dir)
    os.chdir(download_dir)
    try:
        _download_workbooks(server, tableau_auth, workbook_ids)
        df = _parse_workbooks()
        # Write the audit next to the downloads, then copy it one folder up
        # so it survives the cleanup below.
        df.to_csv('zico_audit.csv', index=False)
        shutil.copy(Path.cwd() / 'zico_audit.csv', base_dir)
    finally:
        # Always restore the cwd and remove the scratch folder, even on error.
        os.chdir(base_dir)
        shutil.rmtree(download_dir)
    print("Audit complete.")


def _download_workbooks(server, tableau_auth, workbook_ids):
    """Download every workbook into the cwd and unpack packaged (.twbx) files."""
    with server.auth.sign_in(tableau_auth):
        for wb_id in workbook_ids:
            file_path = server.workbooks.download(wb_id)
            print("\nDownloaded workbook to {0}.".format(file_path))
    # A .twbx is a zip archive containing the .twb plus extracts; unpack it
    # so the XML workbook is available for parsing.
    for packed in glob.glob('*.twbx'):
        with zipfile.ZipFile(packed, 'r') as pack_wb:
            pack_wb.extractall(os.getcwd())


def _parse_workbooks():
    """Parse every .twb (Tableau workbook XML) in the cwd.

    Returns a cleaned DataFrame with columns ``workbook``, ``table`` and
    ``query`` — one row per ``relation`` element found.
    """
    wb_names, wb_tables, wb_queries = [], [], []
    for wb_file in glob.glob('*.twb'):
        # A .twb file is plain XML; no need to rename it first.
        root = ET.parse(wb_file).getroot()
        # One pass per file collects table, workbook name and query together
        # (the original iterated `relation` twice for the same data).
        for elem in root.iter('relation'):
            wb_names.append(Path(wb_file).stem)
            wb_tables.append(elem.get('table'))
            query = elem.text
            if query is not None:
                # Flatten multi-line custom SQL onto one line.
                query = query.replace('\r\n', ' ').replace('\n', ' ')
            wb_queries.append(query)
    df = pd.DataFrame(zip(wb_names, wb_tables, wb_queries),
                      columns=['workbook', 'table', 'query'])
    # Clean: drop Tableau's internal extract relation, blank out
    # whitespace-only cells, require at least 2 non-null values per row,
    # and de-duplicate.
    df = df[df.table != '[Extract].[Extract]']
    df = df.reset_index(drop=True)
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.dropna(thresh=2)
    df = df.drop_duplicates()
    return df
# Run the audit only when executed as a script, not on import.
if __name__ == '__main__':
    get_datasources()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment