Created
September 14, 2020 19:39
-
-
Save broschke/8778148352d9884e1c05e40cd6a4ff0a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import xml.etree.ElementTree as ET | |
import tableauserverclient as TSC | |
import pandas as pd | |
import numpy as np | |
import os | |
import shutil | |
from pathlib import Path | |
import glob | |
import zipfile | |
# Input user credentials. Fill these in before running: USERNAME/PW/SITE_ID
# are passed to TSC.TableauAuth and SERVER to TSC.Server below.
# NOTE(review): credentials are stored in plain text here — consider reading
# them from environment variables instead.
USERNAME = ''
PW = ''
SERVER = ''
SITE_ID = ''
def get_datasources():
    """Download all Tableau Server workbooks and extract database table
    connections and custom SQL queries into ``zico_audit.csv``.

    Side effects:
        * Creates a ``wb_downloads`` subfolder in the current working
          directory, fills it with downloaded workbooks, and removes it
          again when finished.
        * Writes ``zico_audit.csv`` to the current working directory.
    """
    # Authenticate against the server; pagesize=1000 so a single page
    # covers the site's workbook list.
    tableau_auth = TSC.TableauAuth(USERNAME, PW, site_id=SITE_ID)
    request_options = TSC.RequestOptions(pagesize=1000)
    server = TSC.Server(SERVER, use_server_version=True)

    # Grab the list of workbook IDs (required for downloading).
    with server.auth.sign_in(tableau_auth):
        all_workbooks, _pagination = server.workbooks.get(req_options=request_options)
        workbook_ids = [workbook.id for workbook in all_workbooks]

    # Create a fresh subfolder in the current directory for the files.
    # pathlib keeps this portable (the original hard-coded '\\' separators).
    base_dir = Path(os.getcwd())
    download_dir = base_dir / 'wb_downloads'
    if download_dir.exists():
        shutil.rmtree(download_dir)  # start from a clean slate
    os.makedirs(download_dir)
    os.chdir(download_dir)

    # Download all workbooks into the subfolder.
    with server.auth.sign_in(tableau_auth):
        for wb_id in workbook_ids:
            file_path = server.workbooks.download(wb_id)
            print("\nDownloaded workbook to {0}.".format(file_path))

    # Extract packaged workbooks (.twbx are zip archives containing a .twb).
    for file in glob.glob('*.twbx'):
        with zipfile.ZipFile(file, 'r') as pack_wb:
            pack_wb.extractall(os.getcwd())

    # A .twb file is XML; rename so the parsing step below can glob '*.xml'.
    for file in glob.glob('*.twb'):
        pre, _ext = os.path.splitext(file)
        os.rename(file, pre + '.xml')

    # Parse each workbook: every <relation> element carries the table name
    # in its 'table' attribute and any custom SQL as its element text.
    # One pass over iter('relation') keeps the three lists aligned (the
    # original iterated twice for no benefit).
    wb_name_list = []
    wb_table_list = []
    wb_query_list = []
    for file in glob.glob('*.xml'):
        root = ET.parse(file).getroot()
        for elem in root.iter('relation'):
            wb_name_list.append(file[:-4])  # strip the '.xml' suffix
            wb_table_list.append(elem.get('table'))
            query = elem.text
            if query is not None:
                # Flatten multi-line custom SQL onto a single line.
                query = query.replace('\r\n', ' ').replace('\n', ' ')
            wb_query_list.append(query)

    # Make a Pandas dataframe of all lists, then clean it.
    df = pd.DataFrame(zip(wb_name_list, wb_table_list, wb_query_list),
                      columns=['workbook', 'table', 'query'])
    df = df[df.table != '[Extract].[Extract]']  # drop extract pseudo-tables
    df = df.reset_index(drop=True)
    df = df.replace(r'^\s*$', np.nan, regex=True)  # blank strings -> NaN
    df = df.dropna(thresh=2)  # keep rows with at least 2 non-null cells
    df = df.drop_duplicates()

    # Output to csv.
    df.to_csv('zico_audit.csv', index=False)

    # Move the output one folder up, then delete the subfolder and contents.
    # (The original wrapped the Path in a 'with' statement — Path is not a
    # context manager; that usage is removed in Python 3.13.)
    shutil.copy(download_dir / 'zico_audit.csv', base_dir)
    os.chdir(base_dir)
    shutil.rmtree(download_dir)
    print("Audit complete.")
# Run the audit only when executed as a script, not when imported.
if __name__ == '__main__':
    get_datasources()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment