Scripts for downloading and analyzing Zendesk Help Center contents
""" | |
Script for analyzing Zendesk Help Center articles | |
Extracts the following statistics from article body content: | |
- word count | |
- average sentence length | |
- number of sentences that are too long (as defined by the LONG_SENT_LIMIT | |
parameter) | |
- readability scores (Flesch Reading Ease and Automated Readability Index) | |
- number of images | |
- ratio of images to text (images per 100 words) | |
Uses BeautifulSoup to parse HTML, spaCy to tokenize text, and textstat for | |
readability formula calculations. Uses pandas to read input JSON into a | |
DataFrame and to manipulate the data. | |
""" | |
import pandas as pd
from bs4 import BeautifulSoup
import en_core_web_sm  # spaCy language model; run python3 -m spacy download en_core_web_sm to download
import textstat
from unicodedata import normalize
import string
from os.path import splitext

PUNCT = set(string.punctuation + string.whitespace)
LONG_SENT_LIMIT = 24
def clean_html(html_text):
    """Parse HTML text and clean artifacts"""
    if not html_text:
        return ""
    soup = BeautifulSoup(html_text, 'html.parser')
    text = soup.get_text()
    text_norm = normalize('NFKD', text)
    cleaned = text_norm.replace('\n', ' ').rstrip()
    return cleaned


def get_num_imgs(html_text):
    """Parse HTML text and return the number of <img> tags"""
    if not html_text:
        return 0
    soup = BeautifulSoup(html_text, 'html.parser')
    imgs = soup.find_all('img')
    return len(imgs)
def len_without_punct(tokens):
    """Return the number of non-punctuation tokens in a sentence or text"""
    return sum(token.text not in PUNCT for token in tokens)


def img_to_word_count_ratio(df_row):
    """Calculate number of images per 100 words in an article"""
    word_count = df_row.loc['word_count']
    if not word_count:
        return 0.0  # Avoid division by zero for articles with no word tokens
    raw_count = df_row.loc['image_count'] / word_count
    return round((raw_count * 100), 1)
if __name__ == "__main__": | |
nlp = en_core_web_sm.load() # Load spaCy language model | |
kb_export_filename = input('Zendesk article download file: ').lower() | |
export_date = splitext(kb_export_filename)[0][-10:] | |
section_export_filename = input('Zendesk sections download file: ').lower() | |
    # Read the JSON exports into pandas DataFrames. Disable date and axis
    # conversion so that the article and section ID numbers used as the index
    # are not interpreted as datetime objects.
    print('Reading article export file')
    df = pd.read_json(kb_export_filename,
                      convert_dates=False,
                      convert_axes=False,
                      orient='index')
    section_df = pd.read_json(section_export_filename,
                              convert_dates=False,
                              convert_axes=False,
                              orient='index')
    # Drop rows if there's no body text
    df.dropna(subset=['body'], inplace=True)

    # Convert date columns to datetime objects
    date_cols = {
        'created_at': 'datetime64[ns]',
        'edited_at': 'datetime64[ns]',
        'updated_at': 'datetime64[ns]'
    }
    df = df.astype(dtype=date_cols)
    # Add section names
    df['section_name'] = df.apply(
        lambda row: section_df.loc[str(row.section_id), 'name'], axis=1)

    # Clean the HTML code
    print('Cleaning HTML')
    df['body_cleaned'] = df.apply(lambda row: clean_html(row.loc['body']),
                                  axis=1)

    # Tokenize the body text using spaCy
    print('Tokenizing text')
    df['body_tokenized'] = df.apply(lambda row: nlp(row.loc['body_cleaned']),
                                    axis=1)

    # Calculate word counts and average sentence lengths
    print('Adding analysis columns: word count and sentence length')
    df['word_count'] = df.apply(
        lambda row: len_without_punct(row.loc['body_tokenized']), axis=1)
    df['avg_sentence_length'] = [
        round(len_without_punct(row) / len(list(row.sents)))
        for row in df.loc[:, 'body_tokenized']
    ]
    # Count the number of sentences longer than LONG_SENT_LIMIT words in an article
    df['long_sentence_count'] = [
        sum(len_without_punct(sent) > LONG_SENT_LIMIT for sent in row.sents)
        for row in df.loc[:, 'body_tokenized']
    ]
    # Add readability scores
    print('Adding analysis columns: readability scores')
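    # For reference, the standard formula definitions (textstat's counting
    # heuristics, e.g. syllable estimation, may differ slightly):
    #   Flesch Reading Ease         = 206.835 - 1.015 * (words / sentences)
    #                                 - 84.6 * (syllables / words)
    #   Automated Readability Index = 4.71 * (characters / words)
    #                                 + 0.5 * (words / sentences) - 21.43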
    # Flesch Reading Ease
    df['FRE'] = df.apply(
        lambda row: textstat.flesch_reading_ease(row.loc['body_cleaned']),
        axis=1)
    # Automated Readability Index
    df['ARI'] = df.apply(lambda row: textstat.automated_readability_index(
        row.loc['body_cleaned']),
                         axis=1)

    # Get the number of images
    print('Adding analysis columns: images')
    df['image_count'] = df.apply(lambda row: get_num_imgs(row.loc['body']),
                                 axis=1)
    df['img_to_words_ratio'] = df.apply(
        lambda row: img_to_word_count_ratio(row), axis=1)
    # Define output columns and their order
    output_columns = [
        'name', 'title', 'html_url', 'section_id', 'section_name', 'author_id',
        'created_at', 'edited_at', 'updated_at', 'draft', 'outdated',
        'promoted', 'label_names', 'locale', 'word_count',
        'avg_sentence_length', 'long_sentence_count', 'FRE', 'ARI',
        'image_count', 'img_to_words_ratio'
    ]
    # Reindex the output DataFrame for easier reading
    output_index = pd.Index(output_columns)
    output_df = df.reindex(columns=output_index)

    # Write output file
    with open(f'output/analysis_complete_data_{export_date}.csv', 'w') as fp:
        fp.write(output_df.to_csv())
"""Script for downloading Zendesk Help Center articles in JSON format""" | |
# Based on the following scripts: | |
# https://develop.zendesk.com/hc/en-us/articles/360001074288-Zendesk-REST-API-tutorial-Backing-up-your-knowledge-base-with-Python | |
# https://github.com/scotthavard92/Zendesk-Help-Center-Export-Scripts/blob/master/Article_Information_Export/Zendesk_Guide_Article_Export.py | |
# Relevant Zendesk API documentation: | |
# https://developer.zendesk.com/rest_api/docs/support/introduction#security-and-authentication | |
# https://developer.zendesk.com/rest_api/docs/help_center/articles | |
# https://developer.zendesk.com/rest_api/docs/help_center/sections | |
import requests
from requests.auth import HTTPBasicAuth
import json
import sys
from getpass import getpass
from datetime import datetime

# Define JSON fields that can be discarded (either not in use or do not
# contain useful data). See Zendesk API documentation for available fields.
# Note: 'id' is excluded because it is used as a dictionary key when combining
# the API request results.
EXCLUDE = [
    'id', 'user_segment_id', 'permission_group_id', 'url', 'comments_disabled',
    'position', 'vote_sum', 'vote_count', 'outdated_locales'
]
DATE = str(datetime.today().date())
def get_api_content(url, user, token, api_content='articles', exclude=()):
    """Fetch all pages of articles or sections and return them keyed by ID."""
    api_output = dict()
    # Loop until there's no next_page in the response
    while url:
        try:
            response = requests.get(url, auth=HTTPBasicAuth(user, token))
            response.raise_for_status()  # Raise HTTP error if any
        except requests.exceptions.HTTPError as err:
            print(err)
            sys.exit(1)
        # Read the response into JSON
        data = response.json()
        sys.stdout.write("\rFetching {} page {:^3}/{:^3}".format(
            api_content, data['page'], data['page_count']))
        sys.stdout.flush()
        # data[api_content] ('articles' or 'sections') is a list of dicts
        for entry in data[api_content]:
            key = entry['id']
            api_output[key] = dict()
            for field in entry.keys():
                if field in exclude:
                    continue  # Skip excluded fields
                api_output[key][field] = entry[field]
        url = data['next_page']
    return api_output
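# get_api_content returns a dict keyed by numeric article/section ID, e.g.
# (hypothetical values) {360001234567: {'title': '...', 'body': '...', ...}, ...}.
# json.dump below serializes those integer keys as JSON strings.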
# Set the request parameters
articles_url = input('Articles API endpoint URL: ').lower()
# Use your Zendesk sign-in email and an API token as credentials
user = f'{input("Zendesk user email: ").lower()}/token'
token = getpass(prompt="Zendesk API token: ")
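# Zendesk API token authentication (see the security-and-authentication link
# above) expects the username in the form "email_address/token", with the API
# token as the password, hence the "/token" suffix appended to the email.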
articles = get_api_content(articles_url,
                           user,
                           token,
                           api_content='articles',
                           exclude=EXCLUDE)
# Write the articles dict to a JSON file
with open(f'downloads/zd_articles_{DATE}.json', 'w') as fp:
    json.dump(articles, fp)
    print(f'\nWrote article query response to file {fp.name}.\n')
# Get section data
sections_url = input('Sections API endpoint URL: ').lower()
sections = get_api_content(sections_url,
                           user,
                           token,
                           api_content='sections',
                           exclude=EXCLUDE)
with open(f'downloads/kb_sections_{DATE}.json', 'w') as fp:
    json.dump(sections, fp)
    print(f'\nWrote section query response to file {fp.name}.')