-
-
Save divinorum-webb/0a4fccc8beb1e4bcadd708c53d1cdec8 to your computer and use it in GitHub Desktop.
from datetime import datetime

import pandas as pd
from pandas.io.json import json_normalize

from tableau_api_lib import TableauServerConnection
from tableau_api_lib.utils import flatten_dict_column, flatten_dict_list_column
from tableau_api_lib.utils.querying import get_sites_dataframe
# Connection configuration for tableau-api-lib.
# Using personal access tokens is preferred; otherwise, comment those details
# out and use username / password instead.
tableau_server_config = {
    'my_env': {
        'server': 'https://10ax.online.tableau.com',  # replace with your own server
        'api_version': '3.8',  # replace with your REST API version
        'personal_access_token_name': '<PAT NAME>',
        'personal_access_token_secret': '<PAT SECRET>',  # fixed: placeholder was missing the closing '>'
        # 'username': '<USERNAME>',
        # 'password': '<PASSWORD>',
        'site_name': 'your-pretty-site-name',  # if accessing your default site, set this to ''
        'site_url': 'YourSiteContentUrl'  # if accessing your default site, set this to ''
    }
}
# define the GraphQL queries to run against the Metadata API
# This query pulls, for each workbook: its views, its upstream tables (with the
# databases behind each table), its upstream and embedded data sources, and any
# upstream Prep flows. The GraphQL aliases (e.g. `workbook_name: name`) rename
# fields so the flattened DataFrame columns stay unambiguous after the
# downstream merges on 'workbook_id' / 'upstr_db_id'.
query_workbooks = """
{
workbooks {
workbook_name: name
workbook_id: luid
workbook_project: projectName
views {
view_type: __typename
view_name: name
view_id: luid
}
upstreamTables {
upstr_table_name: name
upstr_table_id: luid
upstreamDatabases {
upstr_db_name: name
upstr_db_type: connectionType
upstr_db_id: luid
upstr_db_isEmbedded: isEmbedded
}
}
upstreamDatasources {
upstr_ds_name: name
upstr_ds_id: luid
upstr_ds_project: projectName
}
embeddedDatasources {
emb_ds_name: name
}
upstreamFlows {
flow_name: name
flow_id: luid
flow_project: projectName
}
}
}
"""
# This query lists the server's database servers; 'database_id' is the join key
# used later to attach host/port details to each workbook's upstream databases.
query_databases = """
{
databaseServers {
database_hostname: hostName
database_port: port
database_id: luid
}
}
"""
def get_metadata_json(conn, query, content_type):
    """Run a GraphQL query against the Metadata API and return its payload.

    conn: a signed-in TableauServerConnection.
    query: the GraphQL query string to execute.
    content_type: the top-level key under 'data' to extract (e.g. 'workbooks').
    """
    response = conn.metadata_graphql_query(query)
    return response.json()['data'][content_type]
def get_workbook_metadata_df(json_data):
    """Flatten workbook metadata JSON into a DataFrame of workbook-level columns.

    Drops the nested list columns (views, tables, datasources, flows); those are
    flattened separately and merged back on 'workbook_id'.
    """
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    df = pd.json_normalize(json_data)
    cols_to_drop = ['views', 'upstreamTables', 'upstreamDatasources', 'embeddedDatasources', 'upstreamFlows']
    # Only drop columns that are actually present: on sites where no workbook
    # has (say) upstream flows, json_normalize never creates that column and an
    # unconditional drop would raise a KeyError.
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    return df
def get_view_metadata_df(json_data):
    """Flatten each workbook's 'views' list into one row per view.

    The parent 'workbook_id' is carried onto every view row so the result can
    be merged back to the workbook-level DataFrame.
    """
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    df = pd.json_normalize(data=json_data, record_path='views', meta='workbook_id')
    return df
def get_tables_metadata_df(json_data):
    """Flatten each workbook's 'upstreamTables' list into one row per table.

    Each table row's nested 'upstreamDatabases' dict-list is then expanded into
    flat columns via tableau_api_lib's flatten_dict_list_column helper.
    """
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    df = pd.json_normalize(data=json_data, record_path='upstreamTables', meta='workbook_id')
    df = flatten_dict_list_column(df, col_name='upstreamDatabases')
    return df
def get_database_metadata_df(json_data):
    """Build a DataFrame of database-server metadata, one row per server."""
    db_df = pd.DataFrame(json_data)
    return db_df
def get_datasource_metadata_df(json_data):
    """Flatten upstream and embedded datasource lists and join them per workbook."""
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    wb_upstream_ds_df = pd.json_normalize(data=json_data, record_path='upstreamDatasources', meta='workbook_id')
    wb_embedded_ds_df = pd.json_normalize(data=json_data, record_path='embeddedDatasources', meta='workbook_id')
    # Left join keeps every upstream-datasource row even when a workbook has no
    # embedded datasources.
    df = wb_upstream_ds_df.merge(wb_embedded_ds_df, how='left', on='workbook_id')
    return df
def get_flow_metadata_df(json_data):
    """Flatten each workbook's 'upstreamFlows' list into one row per flow.

    The parent 'workbook_id' is carried onto every flow row for merging.
    """
    # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize.
    df = pd.json_normalize(data=json_data, record_path='upstreamFlows', meta='workbook_id')
    return df
def get_combined_metadata_df(wb_json, db_json):
    """Combine workbook, view, table, database, datasource, and flow metadata.

    Returns one wide DataFrame keyed on 'workbook_id', with database-server
    details joined on the upstream database LUID where available.
    """
    wb_df = get_workbook_metadata_df(wb_json)
    wb_views_df = get_view_metadata_df(wb_json)
    wb_tables_df = get_tables_metadata_df(wb_json)
    db_df = get_database_metadata_df(db_json)
    wb_ds_df = get_datasource_metadata_df(wb_json)
    wb_flows_df = get_flow_metadata_df(wb_json)
    combined_df = wb_df.merge(wb_views_df, how='left', on='workbook_id')
    combined_df = combined_df.merge(wb_tables_df, how='left', on='workbook_id')
    # A site with no database servers yields an empty db_df with no
    # 'database_id' column (likewise 'upstr_db_id' may be absent when no
    # workbook has upstream databases); merging on a missing column raises
    # KeyError, so only merge when both join keys exist.
    if 'upstr_db_id' in combined_df.columns and 'database_id' in db_df.columns:
        combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id')
    combined_df = combined_df.merge(wb_ds_df, how='left', on='workbook_id')
    combined_df = combined_df.merge(wb_flows_df, how='left', on='workbook_id')
    return combined_df
def add_contextual_columns(df, site_name):
    """Stamp every row with the extraction timestamp and the site it came from.

    Uses the stdlib datetime module: the pd.datetime alias was deprecated and
    then removed from pandas, so pd.datetime.now() raises on current versions.
    """
    df['summary_date'] = datetime.now()
    df['site_name'] = site_name
    return df
# Sign in, iterate over every site on the server, pull workbook and database
# metadata from each, and write the combined results to a single CSV.
conn = TableauServerConnection(tableau_server_config, 'my_env')
conn.sign_in()
sites_df = get_sites_dataframe(conn)

site_dfs = []
for index, site in sites_df.iterrows():
    print(f"querying data from site '{site['contentUrl']}'...")
    conn.switch_site(site['contentUrl'])
    wb_query_results_json = get_metadata_json(conn, query_workbooks, 'workbooks')
    db_query_results_json = get_metadata_json(conn, query_databases, 'databaseServers')
    combined_df = get_combined_metadata_df(wb_query_results_json, db_query_results_json)
    combined_df = add_contextual_columns(combined_df, conn.site_name)
    site_dfs.append(combined_df)

# DataFrame.append was removed in pandas 2.0; collect the per-site frames and
# concatenate once at the end (this also avoids re-copying the accumulated
# frame on every iteration).
all_sites_combined_df = pd.concat(site_dfs, sort=False, ignore_index=True) if site_dfs else pd.DataFrame()

# output the resulting data to a CSV file
all_sites_combined_df.to_csv('impact_analysis_milestone1.csv', header=True, index=False)
conn.sign_out()
Hey Ivan, I am getting the same error when I use a Jupyter notebook, but I do not get the error when running the exact same code in PyCharm. It's odd because the error would seem to indicate that the 'include_usage_flag' argument is being passed and that there is no such parameter defined in the 'SiteEndpoint' class, but if that were the case then the code would also raise a TypeError outside of Jupyter when run in PyCharm.
I'll keep looking into it, but I recommend trying to run your code outside of Jupyter if possible as a short-term solution.
Hope that helps you!
Following up, please run pip install -U tableau-api-lib
and try again. Let me know if you are still getting the error, but it is resolved for me now (both in Jupyter and using Python directly).
Hi Gurus,
can you please help me sign in to the server without hitting an SSL certificate verification exception?
It looks like when I request a connection, my client program fails with an SSL error.
My code snippet as below.
from tableau_api_lib import TableauServerConnection
tableau_server_config = {
'tableau_prod': {
'server': 'https://xxxxx.edwards.com.edwards.com/',
'api_version': '3.7',
'username': 'xxxx',
'password': 'xxxxx',
'site_name': 'Finance',
'site_url': 'https://xxxxx.edwards.com/#/site/Finance/home'
}
}
conn = TableauServerConnection(tableau_server_config)
conn = TableauServerConnection(tableau_server_config)
conn.sign_in()
conn.create_site(site_name='estam_temp', content_url='estamTemp')
conn.sign_out()
Hello @divinorum-webb,
thank you for your reply.
1st: I had the problem using PyCharm, unlike you.
2nd: I ran "pip install -U tableau-api-lib", the tableau-api-lib has been upgraded from tableau-api-lib-0.0.100 to tableau-api-lib-0.0.102.
Now it works.
Thank you.
Ivan
Hi ivan,
Thanks very much for your quick response. I will try it, but meanwhile, where should I add the "add_http_options({'verify': False})" call in the snippet below, in case upgrading tableau-api-lib doesn't work?
Hello @koteshbhupathi,
I am not sure that my reply will help you; I do not know much about SSL and certificates.
What I can say is that my server is in https, with a certificate issued by my company, and that I do not write anything related to that in the connection string. Here is the way I connect:
tableau_server_config_src = {
'tableau_prod_myserver': {
'server': 'https://myserver.sub.mycompany.fr',
'api_version': '3.9',
'username': user,
'password': mdp_ts,
'site_name': '',
'site_url': ''
}
}
I leave site_name and site_url as empty strings, the site(s) name(s) and url are managed by the following code.
Hope this helps
Ivan
Hi @koteshbhupathi, one issue for sure is that your site_url
value should just be the site name as it appears in your URL; in this case, it looks like it will be either 'Finance' or 'finance'.
This value is what Tableau sometimes refers to in its documentation as the 'content URL'. The reason it is different from your site name is that often times site names have spaces or other characters in them which are not valid characters in a URL string. For example, if my site name were 'Tableau is Great' then the site URL would remove the spaces and be something like 'TableauisGreat'.
So to recap, keep your site name but change the site_url
value to exactly how the site name is represented in your URL when logged into that site on Tableau Server.
As a side note, if you leave site_name
and site_url
empty then you will log into the default site for the server.
in this function call sites_df = get_sites_dataframe(conn)
, I get this error
PaginationError:
The Tableau Server REST API method query_sites did not return paginated results.
Please verify that your connection is logged in and has a valid auth token.
If using personal access tokens, note that only one session can be active at a time using a single token.
Also note that the extract_pages() method wrapping this call is intended for paginated results only.
Not all Tableau Server REST API methods support pagination.
Any ideas or troubleshooting steps appreciated.
Full, un-redacted error:
Traceback (most recent call last):
File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 14, in get_page_attributes
pagination = query['pagination']
KeyError: 'pagination'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\tableau2.py", line 142, in <module>
sites_df = get_sites_dataframe(conn,site_names = site)
File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\querying\sites.py", line 18, in get_sites_dataframe
sites_df = pd.DataFrame(get_all_site_fields(conn))
File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\querying\sites.py", line 13, in get_all_site_fields
all_sites = extract_pages(conn.query_sites, parameter_dict={'fields': 'fields=_all_'})
File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 51, in extract_pages
page_number, page_size, total_available = get_page_attributes(query, query_func)
File "c:\users\appdata\local\programs\python\python38-32\lib\site-packages\tableau_api_lib\utils\pagination.py", line 20, in get_page_attributes
raise PaginationError(query_func)
PaginationError:
The Tableau Server REST API method query_sites did not return paginated results.
Please verify that your connection is logged in and has a valid auth token.
If using personal access tokens, note that only one session can be active at a time using a single token.
Also note that the extract_pages() method wrapping this call is intended for paginated results only.
Not all Tableau Server REST API methods support pagination.
@YEM-1 Let's start out by checking if you have the latest version of tableau-api-lib
installed. I have seen this same pagination error in older versions of the library, and it is possible you are on an older version.
First open your command line prompt (if you use a virtual environment, make sure the desired one is active) and run pip install -U tableau-api-lib
.
If you then attempt to run your code, does it work? If not, you can get more information about what's wrong by inspecting the connection object for the active endpoint, server response, and contents of the server response.
Additionally, you can try hitting the REST API for "Query Sites" directly:
response = conn.query_sites()
Assuming your connection object is conn
, you can then inspect it as described above in the following ways:
conn.active_endpoint
inspects the most recent endpoint the connection has used (the URL used in the request for data)response.status_code
inspects the status code the server returned; you can cross-reference against the Tableau Server REST API documentation to understand what the code means.response.content
inspects the content of the server response, and this may include a text description of why your request failed.
Hope that helps!
**After debugging the issue I've detailed below I realize why this is happening- one my sites doesn't have database servers. As a way around it I added an If statement to add missing column names with null values. like:
if 'database_id' not in df.columns:
df['database_id']= ""
but I realize this isn't an efficient fix since I did come across other sites not having all few other columns. So If there is a more efficient way to do this please let me know. Thanks and I really appreciate all the education you've put out there.
I am facing a very strange issue: When I run this whole code I get a KeyError: 'database_id' from combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id'). At first I thought it was because the Json output from def get_metadata_json() was converted to pandas df with df = json_normalize(json_data). So added that but I still this error.
However, the weird thing is that while debugging I ran the code outside of the loop and created an appended dataset, and it worked. When I go back to running it in a loop as it is here, it throws the error. I am really lost as to what's happening here. Any help is much appreciated. Thanks!
Stacktrace:
Traceback (most recent call last):
File ".\switch_site.py", line 155, in
combined_df = get_combined_metadata_df(wb_query_results_json, db_query_results_json)
File ".\switch_site.py", line 128, in get_combined_metadata_df
combined_df = combined_df.merge(db_df, how='left', left_on='upstr_db_id', right_on='database_id')
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\frame.py", line 7297, in merge
validate=validate,
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 86, in merge
validate=validate,
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 627, in init
) = self._get_merge_keys()
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\reshape\merge.py", line 983, in _get_merge_keys
right_keys.append(right._get_label_or_level_values(rk))
File "C:\Users\alaksh1\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py", line 1692, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'database_id'
If you know all the columns you expect to come through, you could try defining a placeholder dataframe before appending any of your generated dataframes to it.
For example, let's say you're anticipating these columns to be returned: ['col_a', 'col_b', 'col_c'].
You could define your placeholder dataframe like this: df = pd.DataFrame(columns=['col_a', 'col_b', 'col_c'])
.
Now even if one of your dataframes doesn't have all three columns (maybe it has returned columns ['col_a', 'col_c']), appending it to the placeholder will work, and row values for the columns not present will default to NaN.
Hope that helps!
If you know all the columns you expect to come through, you could try defining a placeholder dataframe before apppending any of your generated dataframes to it.
For example, let's say you're anticipating these columns to be returned: ['col_a', 'col_b', 'col_c'].
You could define your placeholder dataframe like this:
df = pd.DataFrame(columns=['col_a', 'col_b', 'col_c'])
.
Now even if one of your dataframes doesn't have all three columns (maybe it has returned columns ['col_a', 'col_c']), appending it to the placeholder will work, and row values for the columns not present will default to NaN. Hope that helps!
Thanks!
This is happening in merge statement, while merging wb_df with combined_df. So when ‘database_id’ isn’t present it doesn’t have a column to merge on.
Hello,
all your work is great, this is exactly what I need to have a better idea of what is going on on my Tableau Server.
I have a problem though:
My Tableau Server is 2020.3.1, API: 3.9
When I run the script, I have an error:
Traceback (most recent call last):
File "", line 138, in
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 43, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 154, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\tableau_server_connection.py", line 196, in switch_site
self.site_name = self.query_site().json()['site']['name']
File "C:\Python36\lib\site-packages\tableau_api_lib\decorators\verification.py", line 154, in wrapper
return func(self, *args, **kwargs)
File "C:\Python36\lib\site-packages\tableau_api_lib\tableau_server_connection.py", line 288, in query_site
parameter_dict=parameter_dict).get_endpoint()
TypeError: init() got an unexpected keyword argument 'include_usage_flag'
I have run the script from line 1 to 138, and I have the error.
It does print(f"querying data from site '{site['contentUrl']}'...") and
conn.switch_site(site['contentUrl'])
from the 1st site to the 2nd and stops.
There are 8 sites on my server.
Would you know what I could do about this?
Thank you
Ivan