Template code for querying the Tableau Metadata API for all sites on a server
import pandas as pd
from pandas import json_normalize  # pandas.io.json.json_normalize is deprecated; import from the top level
from tableau_api_lib import TableauServerConnection
from tableau_api_lib.utils import flatten_dict_column, flatten_dict_list_column
from tableau_api_lib.utils.querying import get_sites_dataframe

# using personal access tokens is preferred; otherwise, comment those details out and use username / password
tableau_server_config = {
    'my_env': {
        'server': 'https://10ax.online.tableau.com',  # replace with your own server
        'api_version': '3.8',  # replace with your REST API version
        'personal_access_token_name': '<PAT NAME>',
        'personal_access_token_secret': '<PAT SECRET>',
        # 'username': '<USERNAME>',
        # 'password': '<PASSWORD>',
        'site_name': 'your-pretty-site-name',  # if accessing your default site, set this to ''
        'site_url': 'YourSiteContentUrl'  # if accessing your default site, set this to ''
    }
}

# define the GraphQL queries to run against the Metadata API
query_workbooks = """
{
  workbooks {
    workbook_name: name
    workbook_id: luid
    workbook_project: projectName
    views {
      view_type: __typename
      view_name: name
      view_id: luid
    }
    upstreamTables {
      upstr_table_name: name
      upstr_table_id: luid
      upstreamDatabases {
        upstr_db_name: name
        upstr_db_type: connectionType
        upstr_db_id: luid
        upstr_db_isEmbedded: isEmbedded
      }
    }
    upstreamDatasources {
      upstr_ds_name: name
      upstr_ds_id: luid
      upstr_ds_project: projectName
    }
    embeddedDatasources {
      emb_ds_name: name
    }
    upstreamFlows {
      flow_name: name
      flow_id: luid
      flow_project: projectName
    }
  }
}
"""

query_databases = """
{
  databaseServers {
    database_hostname: hostName
    database_port: port
    database_id: luid
  }
}
"""
def get_metadata_json(conn, query, content_type):
    # run a GraphQL query against the Metadata API and return the relevant JSON content
    results = conn.metadata_graphql_query(query)
    results_json = results.json()['data'][content_type]
    return results_json

def get_workbook_metadata_df(json_data):
    # flatten workbook-level fields; the nested lists are handled by the functions below
    df = json_normalize(json_data)
    cols_to_drop = ['views', 'upstreamTables', 'upstreamDatasources', 'embeddedDatasources', 'upstreamFlows']
    df.drop(columns=cols_to_drop, inplace=True)
    return df

def get_view_metadata_df(json_data):
    df = json_normalize(data=json_data, record_path='views', meta='workbook_id')
    return df

def get_tables_metadata_df(json_data):
    df = json_normalize(data=json_data, record_path='upstreamTables', meta='workbook_id')
    df = flatten_dict_list_column(df, col_name='upstreamDatabases')
    return df

def get_database_metadata_df(json_data):
    return pd.DataFrame(json_data)

def get_datasource_metadata_df(json_data):
    wb_upstream_ds_df = json_normalize(data=json_data, record_path='upstreamDatasources', meta='workbook_id')
    wb_embedded_ds_df = json_normalize(data=json_data, record_path='embeddedDatasources', meta='workbook_id')
    df = wb_upstream_ds_df.merge(wb_embedded_ds_df, how='left', on='workbook_id')
    return df

def get_flow_metadata_df(json_data):
    df = json_normalize(data=json_data, record_path='upstreamFlows', meta='workbook_id')
    return df

def get_combined_metadata_df(wb_json, db_json):
    # build each slice of metadata, then join everything back onto the workbook rows
    wb_df = get_workbook_metadata_df(wb_json)
    wb_views_df = get_view_metadata_df(wb_json)
    wb_tables_df = get_tables_metadata_df(wb_json)
    db_df = get_database_metadata_df(db_json)
    wb_ds_df = get_datasource_metadata_df(wb_json)
    wb_flows_df = get_flow_metadata_df(wb_json)
    combined_df = wb_df.merge(wb_views_df, how='left', on='workbook_id')
    combined_df = combined_df.merge(wb_tables_df, how='left', on='workbook_id')
    combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id')
    combined_df = combined_df.merge(wb_ds_df, how='left', on='workbook_id')
    combined_df = combined_df.merge(wb_flows_df, how='left', on='workbook_id')
    return combined_df

def add_contextual_columns(df, site_name):
    # pd.datetime has been removed from pandas; use pd.Timestamp instead
    df['summary_date'] = pd.Timestamp.now()
    df['site_name'] = site_name
    return df
conn = TableauServerConnection(tableau_server_config, 'my_env')
conn.sign_in()
sites_df = get_sites_dataframe(conn)
all_sites_combined_df = pd.DataFrame()
for index, site in sites_df.iterrows():
    print(f"querying data from site '{site['contentUrl']}'...")
    conn.switch_site(site['contentUrl'])
    wb_query_results_json = get_metadata_json(conn, query_workbooks, 'workbooks')
    db_query_results_json = get_metadata_json(conn, query_databases, 'databaseServers')
    combined_df = get_combined_metadata_df(wb_query_results_json, db_query_results_json)
    combined_df = add_contextual_columns(combined_df, conn.site_name)
    # DataFrame.append has been removed from pandas; concatenate instead
    all_sites_combined_df = pd.concat([all_sites_combined_df, combined_df], sort=False, ignore_index=True)
# output the resulting data to a CSV file
all_sites_combined_df.to_csv('impact_analysis_milestone1.csv', header=True, index=False)
conn.sign_out()
@ivmonnier

Hello,
All your work is great; this is exactly what I need to get a better idea of what is going on on my Tableau Server.

I have a problem, though:
My Tableau Server is 2020.3.1, API version 3.9.

When I run the script, I get an error:
Traceback (most recent call last):
File "", line 138, in
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 43, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 154, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\tableau_server_connection.py", line 196, in switch_site
self.site_name = self.query_site().json()['site']['name']
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 154, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\tableau_server_connection.py", line 288, in query_site
parameter_dict=parameter_dict).get_endpoint()
TypeError: __init__() got an unexpected keyword argument 'include_usage_flag'

I ran the script from line 1 to line 138 and got the error.
It executes print(f"querying data from site '{site['contentUrl']}'...") and
conn.switch_site(site['contentUrl'])
for the 1st site, then stops when moving on to the 2nd.

There are 8 sites on my server.

Would you know what I could do about this?

Thank you

Ivan

@divinorum-webb (Author)

Hey Ivan, I am getting the same error when I use a Jupyter notebook, but I do not get it when running the exact same code in PyCharm. It's odd: the error seems to indicate that the 'include_usage_flag' argument is being passed and that no such parameter is defined in the 'SiteEndpoint' class, but if that were the case, the code would also raise a TypeError when run in PyCharm, outside of Jupyter.

I'll keep looking into it, but I recommend trying to run your code outside of Jupyter if possible as a short-term solution.

Hope that helps you!

@divinorum-webb (Author)

Following up, please run pip install -U tableau-api-lib and try again. Let me know if you are still getting the error, but it is resolved for me now (both in Jupyter and using Python directly).

@koteshbhupathi

Hi Gurus,

Can you please help me sign in to the server without hitting an SSL certificate verification exception?

It looks like my client program is failing with an SSL error when I request a connection.

My code snippet is below.

from tableau_api_lib import TableauServerConnection

tableau_server_config = {
    'tableau_prod': {
        'server': 'https://xxxxx.edwards.com.edwards.com/',
        'api_version': '3.7',
        'username': 'xxxx',
        'password': 'xxxxx',
        'site_name': 'Finance',
        'site_url': 'https://xxxxx.edwards.com/#/site/Finance/home'
    }
}

conn = TableauServerConnection(tableau_server_config)
conn.sign_in()
conn.create_site(site_name='estam_temp', content_url='estamTemp')
conn.sign_out()

[attached screenshot of the SSL error]

@ivmonnier

Hello @divinorum-webb,
thank you for your reply.
1st: I had the problem using PyCharm, unlike you.
2nd: I ran "pip install -U tableau-api-lib"; tableau-api-lib was upgraded from 0.0.100 to 0.0.102.

Now it works.

Thank you.

Ivan

@koteshbhupathi

Hi Ivan,

Thanks for your quick response. I will try it, but meanwhile, where should I add add_http_options({'verify': False}) in the snippet I posted above, in case upgrading tableau-api-lib doesn't work?
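For reference, the add_http_options call mentioned above goes on the connection object after it is constructed and before signing in. A minimal sketch, assuming the config dict from the snippet above (disabling SSL verification is only appropriate for testing or trusted internal servers):

from tableau_api_lib import TableauServerConnection

conn = TableauServerConnection(tableau_server_config)
# skip SSL certificate verification on every request this connection makes
conn.add_http_options({'verify': False})
conn.sign_in()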

@ivmonnier

Hello @koteshbhupathi,

I am not sure that my reply will help you; I do not know much about SSL and certificates.

What I can say is that my server is on HTTPS, with a certificate issued by my company, and I do not write anything related to that in the connection config. Here is the way I connect:
tableau_server_config_src = {
    'tableau_prod_myserver': {
        'server': 'https://myserver.sub.mycompany.fr',
        'api_version': '3.9',
        'username': user,
        'password': mdp_ts,
        'site_name': '',
        'site_url': ''
    }
}

I leave site_name and site_url as empty strings; the site names and URLs are managed by the code that follows.

Hope this helps

Ivan

@divinorum-webb (Author)

Hi @koteshbhupathi, one issue for sure is that your site_url value should just be the site name as it appears in your URL; in this case, it looks like it will be either 'Finance' or 'finance'.

This value is what Tableau sometimes refers to in its documentation as the 'content URL'. The reason it differs from your site name is that site names often contain spaces or other characters that are not valid in a URL string. For example, if my site name were 'Tableau is Great', then the site URL would remove the spaces and be something like 'TableauisGreat'.

So to recap, keep your site name but change the site_url value to exactly how the site name is represented in your URL when logged into that site on Tableau Server.

As a side note, if you leave site_name and site_url empty then you will log into the default site for the server.
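To make that concrete, here is a sketch of the corrected config (the server and credentials are placeholders carried over from the snippet above; 'Finance' assumes that is how the site appears in the browser's URL, and it may be lowercase on your server):

tableau_server_config = {
    'tableau_prod': {
        'server': 'https://xxxxx.edwards.com/',
        'api_version': '3.7',
        'username': 'xxxx',
        'password': 'xxxxx',
        'site_name': 'Finance',  # the site's display name
        'site_url': 'Finance'    # the content URL: just the site name as it appears in the URL
    }
}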

@YEM-1

YEM-1 commented Jan 14, 2021

In the function call sites_df = get_sites_dataframe(conn), I get this error:

PaginationError: 
        The Tableau Server REST API method query_sites did not return paginated results. 
        Please verify that your connection is logged in and has a valid auth token.
        If using personal access tokens, note that only one session can be active at a time using a single token.
        Also note that the extract_pages() method wrapping this call is intended for paginated results only.
        Not all Tableau Server REST API methods support pagination. 

Any ideas or troubleshooting steps appreciated.

Full, un-redacted error:

Traceback (most recent call last):

  File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 14, in get_page_attributes
    pagination = query['pagination']

KeyError: 'pagination'


During handling of the above exception, another exception occurred:

Traceback (most recent call last):

  File "C:\tableau2.py", line 142, in <module>
    sites_df = get_sites_dataframe(conn,site_names = site)

  File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\querying\sites.py", line 18, in get_sites_dataframe
    sites_df = pd.DataFrame(get_all_site_fields(conn))

  File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\querying\sites.py", line 13, in get_all_site_fields
    all_sites = extract_pages(conn.query_sites, parameter_dict={'fields': 'fields=_all_'})

  File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 51, in extract_pages
    page_number, page_size, total_available = get_page_attributes(query, query_func)

  File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 20, in get_page_attributes
    raise PaginationError(query_func)

PaginationError: 
        The Tableau Server REST API method query_sites did not return paginated results. 
        Please verify that your connection is logged in and has a valid auth token.
        If using personal access tokens, note that only one session can be active at a time using a single token.
        Also note that the extract_pages() method wrapping this call is intended for paginated results only.
        Not all Tableau Server REST API methods support pagination. 

@divinorum-webb (Author)

@YEM-1 Let's start out by checking if you have the latest version of tableau-api-lib installed. I have seen this same pagination error in older versions of the library, and it is possible you are on an older version.

First open your command line prompt (if you use a virtual environment, make sure the desired one is active) and run pip install -U tableau-api-lib.

If you then attempt to run your code, does it work? If not, you can get more information about what's wrong by inspecting the connection object for the active endpoint, server response, and contents of the server response.

Additionally, you can try hitting the REST API for "Query Sites" directly:
response = conn.query_sites()

Assuming your connection object is conn, you can then inspect it as described above in the following ways:

  1. conn.active_endpoint shows the most recent endpoint the connection has used (the URL used in the request for data).
  2. response.status_code shows the status code the server returned; you can cross-reference it against the Tableau Server REST API documentation to understand what the code means.
  3. response.content shows the content of the server response, which may include a text description of why your request failed.
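Put together, a minimal inspection snippet (assuming your connection object is named conn, as above):

response = conn.query_sites()

print(conn.active_endpoint)   # the URL the connection last requested
print(response.status_code)   # HTTP status code returned by the server
print(response.content)       # raw response body, often including an error description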

Hope that helps!

@Larad90

Larad90 commented Mar 5, 2021

After debugging the issue I've detailed below, I realize why this is happening: one of my sites doesn't have database servers. As a way around it, I added an if statement to add missing column names with null values, like:

if 'database_id' not in df.columns:
    df['database_id'] = ""

But I realize this isn't an efficient fix, since I came across other sites missing a few other columns as well. If there is a more efficient way to do this, please let me know. Thanks, and I really appreciate all the education you've put out there.


I am facing a very strange issue: when I run this whole code I get a KeyError: 'database_id' from combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id'). At first I thought it was because the JSON output from get_metadata_json() was converted to a pandas df with df = json_normalize(json_data), so I added that, but I still get the error.
However, the weird thing is that while debugging I ran the code outside of the loop and created an appended dataset without trouble. When I go back to running it in a loop as it is here, it throws the error. I am really lost as to what's happening here. Any help is much appreciated. Thanks!

Stacktrace:

Traceback (most recent call last):
File ".\switch_site.py", line 155, in
combined_df = get_combined_metadata_df(wb_query_results_json, db_query_results_json)
File ".\switch_site.py", line 128, in get_combined_metadata_df
combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id')
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py", line 7297, in merge
validate=validate,
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 86, in merge
validate=validate,
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 627, in init
) = self._get_merge_keys()
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 983, in _get_merge_keys
right_keys.append(right._get_label_or_level_values(rk))
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py", line 1692, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'database_id'

@divinorum-webb (Author)

If you know all the columns you expect to come through, you could try defining a placeholder dataframe before appending any of your generated dataframes to it.

For example, let's say you're anticipating these columns to be returned: ['col_a', 'col_b', 'col_c'].

You could define your placeholder dataframe like this: df = pd.DataFrame(columns=['col_a', 'col_b', 'col_c']).
Now even if one of your dataframes doesn't have all three columns (maybe it returned only ['col_a', 'col_c']), appending it to the placeholder will work, and row values for the columns not present will default to NaN.

Hope that helps!
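A minimal sketch of that idea (using pd.concat, since DataFrame.append has since been removed from pandas):

import pandas as pd

# placeholder with every expected column
placeholder_df = pd.DataFrame(columns=['col_a', 'col_b', 'col_c'])

# a partial result that is missing 'col_b'
partial_df = pd.DataFrame({'col_a': [1], 'col_c': [3]})

# the missing column comes through as NaN
combined_df = pd.concat([placeholder_df, partial_df], sort=False, ignore_index=True)
print(combined_df)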

@Larad90

Larad90 commented Mar 6, 2021

> If you know all the columns you expect to come through, you could try defining a placeholder dataframe before appending any of your generated dataframes to it. […]

Thanks!
This is happening in the merge statement, when merging db_df with combined_df: when 'database_id' isn't present, there is no column to merge on.
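One way to apply the placeholder idea to this exact merge is to normalize the database dataframe's columns up front. A sketch, reworking the gist's get_database_metadata_df helper with pandas' reindex (the expected column list mirrors the aliases in query_databases):

def get_database_metadata_df(json_data):
    expected_cols = ['database_hostname', 'database_port', 'database_id']
    df = pd.DataFrame(json_data)
    # reindex adds any column missing from the result (e.g., on sites with no
    # database servers) and fills it with NaN, so the later merge always has
    # a 'database_id' column to join on
    return df.reindex(columns=expected_cols)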
