Scripts for downloading and analyzing Zendesk Help Center contents
"""
Script for analyzing Zendesk Help Center articles
Extracts the following statistics from article body content:
- word count
- average sentence length
- number of sentences that are too long (as defined by the LONG_SENT_LIMIT
parameter)
- readability scores (Flesch Reading Ease and Automated Readability Index)
- number of images
- ratio of images to text (images per 100 words)
Uses BeautifulSoup to parse HTML, spaCy to tokenize text, and textstat for
readability formula calculations. Uses pandas to read input JSON into a
DataFrame and to manipulate the data.
"""
import pandas as pd
from bs4 import BeautifulSoup
import en_core_web_sm # spaCy language model; run python3 -m spacy download en_core_web_sm to download
import textstat
from unicodedata import normalize
import string
from os.path import splitext
PUNCT = set(string.punctuation + string.whitespace)
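# Sentences with more than LONG_SENT_LIMIT non-punctuation tokens are flagged
# as "too long" in the analysis below.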
LONG_SENT_LIMIT = 24


def clean_html(html_text):
    """Parse HTML text and clean artifacts"""
    if not html_text:
        return ""
    soup = BeautifulSoup(html_text, 'html.parser')
    text = soup.get_text()
    text_norm = normalize('NFKD', text)
    cleaned = text_norm.replace('\n', ' ').rstrip()
    return cleaned
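# Side note: the NFKD normalization in clean_html also folds compatibility
# characters such as non-breaking spaces (U+00A0, common in exported HTML)
# into their plain equivalents.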


def get_num_imgs(html_text):
    """Parse HTML text and return the number of <img> tags"""
    if not html_text:
        return 0
    soup = BeautifulSoup(html_text, 'html.parser')
    imgs = soup.find_all('img')
    return len(imgs)


def len_without_punct(tokens):
    """Return the number of non-punctuation tokens in a sentence or text"""
    return sum(token.text not in PUNCT for token in tokens)
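# Illustrative example (token boundaries depend on the spaCy model): for
# nlp("Hello, world!") the tokens are "Hello", ",", "world" and "!", so
# len_without_punct() would return 2.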


def img_to_word_count_ratio(df_row):
    """Calculate number of images per 100 words in an article"""
    raw_count = df_row.loc['image_count'] / df_row.loc['word_count']
    return round((raw_count * 100), 1)
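# Example with made-up numbers: an article with 3 images and 600 words yields
# round(3 / 600 * 100, 1) == 0.5 images per 100 words.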
if __name__ == "__main__":
nlp = en_core_web_sm.load() # Load spaCy language model
kb_export_filename = input('Zendesk article download file: ').lower()
export_date = splitext(kb_export_filename)[0][-10:]
section_export_filename = input('Zendesk sections download file: ').lower()
# Read the JSON export into a pandas DataFrame. Don't convert dates to
# ensure that the article ID numbers are not interpreted as datetime objects.
print('Reading article export file')
df = pd.read_json(kb_export_filename,
convert_dates=False,
convert_axes=False,
orient='index')
section_df = pd.read_json(section_export_filename,
convert_dates=False,
convert_axes=False,
orient='index')
# Drop rows if there's no body text
df.dropna(subset=['body'], inplace=True)
# Convert date columns to datetime objects
date_cols = {
'created_at': 'datetime64',
'edited_at': 'datetime64',
'updated_at': 'datetime64'
}
df = df.astype(dtype=date_cols)
# Add section names
df['section_name'] = df.apply(
lambda row: section_df.loc[str(row.section_id), 'name'], axis=1)
# Clean the HTML code
print('Cleaning HTML')
df['body_cleaned'] = df.apply(lambda row: clean_html(row.loc['body']),
axis=1)
# Tokenize the body text using spaCy
print('Tokenizing text')
df['body_tokenized'] = df.apply(lambda row: nlp(row.loc['body_cleaned']),
axis=1)
# Calculate word counts and average sentence lengths
print('Adding analysis columns: word count and sentence length')
df['word_count'] = df.apply(
lambda row: len_without_punct(row.loc['body_tokenized']), axis=1)
df['avg_sentence_length'] = [
round(len_without_punct(row) / len(list(row.sents)))
for row in df.loc[:, 'body_tokenized']
]
# Count the number of sentences longer than 24 words in an article
df['long_sentence_count'] = [
sum(len_without_punct(sent) > LONG_SENT_LIMIT for sent in row.sents)
for row in df.loc[:, 'body_tokenized']
]
# Add readability scores
print('Adding analysis columns: readability scores')
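    # Interpretation note: higher Flesch Reading Ease scores (roughly 0-100)
    # indicate easier text, while the Automated Readability Index approximates
    # a US school grade level.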
    # Flesch Reading Ease
    df['FRE'] = df.apply(
        lambda row: textstat.flesch_reading_ease(row.loc['body_cleaned']),
        axis=1)
    # Automated Readability Index
    df['ARI'] = df.apply(
        lambda row: textstat.automated_readability_index(row.loc['body_cleaned']),
        axis=1)
    # Get the number of images
    print('Adding analysis columns: images')
    df['image_count'] = df.apply(lambda row: get_num_imgs(row.loc['body']),
                                 axis=1)
    df['img_to_words_ratio'] = df.apply(
        lambda row: img_to_word_count_ratio(row), axis=1)
    # Define output columns and their order
    output_columns = [
        'name', 'title', 'html_url', 'section_id', 'section_name', 'author_id',
        'created_at', 'edited_at', 'updated_at', 'draft', 'outdated',
        'promoted', 'label_names', 'locale', 'word_count',
        'avg_sentence_length', 'long_sentence_count', 'FRE', 'ARI',
        'image_count', 'img_to_words_ratio'
    ]
    # Reindex the output DataFrame for easier reading
    output_index = pd.Index(output_columns)
    output_df = df.reindex(columns=output_index)
    # Write output file
    with open(f'output/analysis_complete_data_{export_date}.csv', 'w') as fp:
        fp.write(output_df.to_csv())
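
# The downloader below is the second file in this gist; it produces the JSON
# exports that the analysis script above reads.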
"""Script for downloading Zendesk Help Center articles in JSON format"""
# Based on the following scripts:
# https://develop.zendesk.com/hc/en-us/articles/360001074288-Zendesk-REST-API-tutorial-Backing-up-your-knowledge-base-with-Python
# https://github.com/scotthavard92/Zendesk-Help-Center-Export-Scripts/blob/master/Article_Information_Export/Zendesk_Guide_Article_Export.py
# Relevant Zendesk API documentation:
# https://developer.zendesk.com/rest_api/docs/support/introduction#security-and-authentication
# https://developer.zendesk.com/rest_api/docs/help_center/articles
# https://developer.zendesk.com/rest_api/docs/help_center/sections
import requests
from requests.auth import HTTPBasicAuth
import json
import sys
from getpass import getpass
from datetime import datetime
# Define JSON fields that can be discarded (either not in use or do not
# contain useful data). See Zendesk API documentation for available fields.
# Note: 'id' is excluded because it is used as a dictionary key when combining
# the API request results.
EXCLUDE = [
'id', 'user_segment_id', 'permission_group_id', 'url', 'comments_disabled',
'position', 'vote_sum', 'vote_count', 'outdated_locales'
]
DATE = str(datetime.today().date())


def get_api_content(url, user, token, api_content='articles', exclude=()):
    """Fetch all pages from a Help Center endpoint, keyed by entry id."""
    api_output = dict()
    # Loop until there's no next_page in the response
    while url:
        try:
            response = requests.get(url, auth=HTTPBasicAuth(user, token))
            response.raise_for_status()  # Raise HTTP error if any
        except requests.exceptions.HTTPError as err:
            print(err)
            sys.exit(1)
        # Read the response into JSON
        data = response.json()
        sys.stdout.write("\rFetching {} page {:^3}/{:^3}".format(
            api_content, data['page'], data['page_count']))
        sys.stdout.flush()
        # data['articles'] is a list of dicts
        for entry in data[api_content]:
            key = entry['id']
            api_output[key] = dict()
            for field in entry.keys():
                if field in exclude:
                    continue  # Skip excluded fields
                api_output[key][field] = entry[field]
        url = data['next_page']
    return api_output
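# The returned dict maps each entry's 'id' to its remaining fields, which is
# why 'id' itself is listed in EXCLUDE and why the analysis script can load
# the JSON dump with orient='index'.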


# Set the request parameters
articles_url = input('Articles API endpoint URL: ').lower()
# Authenticate with your Zendesk Support email address and an API token
user = f'{input("Zendesk user email: ").lower()}/token'
token = getpass(prompt="Zendesk API token: ")
articles = get_api_content(articles_url,
                           user,
                           token,
                           api_content='articles',
                           exclude=EXCLUDE)
# Write the articles dict to a JSON file
with open(f'downloads/zd_articles_{DATE}.json', 'w') as fp:
    json.dump(articles, fp)
    print(f'\nWrote article query response to file {fp.name}.\n')
# Get section data
sections_url = input('Sections API endpoint URL: ').lower()
sections = get_api_content(sections_url,
                           user,
                           token,
                           api_content='sections',
                           exclude=EXCLUDE)
with open(f'downloads/kb_sections_{DATE}.json', 'w') as fp:
    json.dump(sections, fp)
    print(f'\nWrote section query response to file {fp.name}.')