Scripts for downloading and analyzing Zendesk Help Center contents
"""
Script for analyzing Zendesk Help Center articles
Extracts the following statistics from article body content:
- word count
- average sentence length
- number of sentences that are too long (as defined by the LONG_SENT_LIMIT
parameter)
- readability scores (Flesch Reading Ease and Automated Readability Index)
- number of images
- ratio of images to text (images per 100 words)
Uses BeautifulSoup to parse HTML, spaCy to tokenize text, and textstat for
readability formula calculations. Uses pandas to read input JSON into a
DataFrame and to manipulate the data.
"""
import pandas as pd
from bs4 import BeautifulSoup
import en_core_web_sm # spaCy language model; run python3 -m spacy download en_core_web_sm to download
import textstat
from unicodedata import normalize
import string
from os.path import splitext
PUNCT = set(string.punctuation + string.whitespace)
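# Sentences with more than LONG_SENT_LIMIT non-punctuation tokens are flagged
# as "too long" in the analysis below.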
LONG_SENT_LIMIT = 24


def clean_html(html_text):
    """Parse HTML text and clean artifacts"""
    if not html_text:
        return ""
    soup = BeautifulSoup(html_text, 'html.parser')
    text = soup.get_text()
    text_norm = normalize('NFKD', text)
    cleaned = text_norm.replace('\n', ' ').rstrip()
    return cleaned
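# Side note: the NFKD normalization in clean_html also folds compatibility
# characters such as non-breaking spaces (U+00A0, common in exported HTML)
# into their plain equivalents.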


def get_num_imgs(html_text):
    """Parse HTML text and return the number of <img> tags"""
    if not html_text:
        return 0
    soup = BeautifulSoup(html_text, 'html.parser')
    imgs = soup.find_all('img')
    return len(imgs)


def len_without_punct(tokens):
    """Return the number of non-punctuation tokens in a sentence or text"""
    return sum(token.text not in PUNCT for token in tokens)
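# Illustrative example (token boundaries depend on the spaCy model): for
# nlp("Hello, world!") the tokens are "Hello", ",", "world" and "!", so
# len_without_punct() would return 2.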


def img_to_word_count_ratio(df_row):
    """Calculate number of images per 100 words in an article"""
    raw_count = df_row.loc['image_count'] / df_row.loc['word_count']
    return round((raw_count * 100), 1)
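# Example with made-up numbers: an article with 3 images and 600 words yields
# round(3 / 600 * 100, 1) == 0.5 images per 100 words.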
if __name__ == "__main__":
nlp = en_core_web_sm.load() # Load spaCy language model
kb_export_filename = input('Zendesk article download file: ').lower()
export_date = splitext(kb_export_filename)[0][-10:]
section_export_filename = input('Zendesk sections download file: ').lower()
# Read the JSON export into a pandas DataFrame. Don't convert dates to
# ensure that the article ID numbers are not interpreted as datetime objects.
print('Reading article export file')
df = pd.read_json(kb_export_filename,
convert_dates=False,
convert_axes=False,
orient='index')
section_df = pd.read_json(section_export_filename,
convert_dates=False,
convert_axes=False,
orient='index')
# Drop rows if there's no body text
df.dropna(subset=['body'], inplace=True)
# Convert date columns to datetime objects
date_cols = {
'created_at': 'datetime64',
'edited_at': 'datetime64',
'updated_at': 'datetime64'
}
df = df.astype(dtype=date_cols)
# Add section names
df['section_name'] = df.apply(
lambda row: section_df.loc[str(row.section_id), 'name'], axis=1)
# Clean the HTML code
print('Cleaning HTML')
df['body_cleaned'] = df.apply(lambda row: clean_html(row.loc['body']),
axis=1)
# Tokenize the body text using spaCy
print('Tokenizing text')
df['body_tokenized'] = df.apply(lambda row: nlp(row.loc['body_cleaned']),
axis=1)
# Calculate word counts and average sentence lengths
print('Adding analysis columns: word count and sentence length')
df['word_count'] = df.apply(
lambda row: len_without_punct(row.loc['body_tokenized']), axis=1)
df['avg_sentence_length'] = [
round(len_without_punct(row) / len(list(row.sents)))
for row in df.loc[:, 'body_tokenized']
]
# Count the number of sentences longer than 24 words in an article
df['long_sentence_count'] = [
sum(len_without_punct(sent) > LONG_SENT_LIMIT for sent in row.sents)
for row in df.loc[:, 'body_tokenized']
]
# Add readability scores
print('Adding analysis columns: readability scores')
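    # Interpretation note: higher Flesch Reading Ease scores (roughly 0-100)
    # indicate easier text, while the Automated Readability Index approximates
    # a US school grade level.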
    # Flesch Reading Ease
    df['FRE'] = df.apply(
        lambda row: textstat.flesch_reading_ease(row.loc['body_cleaned']),
        axis=1)
    # Automated Readability Index
    df['ARI'] = df.apply(
        lambda row: textstat.automated_readability_index(row.loc['body_cleaned']),
        axis=1)
    # Get the number of images
    print('Adding analysis columns: images')
    df['image_count'] = df.apply(lambda row: get_num_imgs(row.loc['body']),
                                 axis=1)
    df['img_to_words_ratio'] = df.apply(
        lambda row: img_to_word_count_ratio(row), axis=1)
    # Define output columns and their order
    output_columns = [
        'name', 'title', 'html_url', 'section_id', 'section_name', 'author_id',
        'created_at', 'edited_at', 'updated_at', 'draft', 'outdated',
        'promoted', 'label_names', 'locale', 'word_count',
        'avg_sentence_length', 'long_sentence_count', 'FRE', 'ARI',
        'image_count', 'img_to_words_ratio'
    ]
    # Reindex the output DataFrame for easier reading
    output_index = pd.Index(output_columns)
    output_df = df.reindex(columns=output_index)
    # Write output file
    with open(f'output/analysis_complete_data_{export_date}.csv', 'w') as fp:
        fp.write(output_df.to_csv())
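
# The downloader below is the second file in this gist; it produces the JSON
# exports that the analysis script above reads.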
"""Script for downloading Zendesk Help Center articles in JSON format"""
# Based on the following scripts:
# https://develop.zendesk.com/hc/en-us/articles/360001074288-Zendesk-REST-API-tutorial-Backing-up-your-knowledge-base-with-Python
# https://github.com/scotthavard92/Zendesk-Help-Center-Export-Scripts/blob/master/Article_Information_Export/Zendesk_Guide_Article_Export.py
# Relevant Zendesk API documentation:
# https://developer.zendesk.com/rest_api/docs/support/introduction#security-and-authentication
# https://developer.zendesk.com/rest_api/docs/help_center/articles
# https://developer.zendesk.com/rest_api/docs/help_center/sections
import requests
from requests.auth import HTTPBasicAuth
import json
import sys
from getpass import getpass
from datetime import datetime
# Define JSON fields that can be discarded (either not in use or do not
# contain useful data). See Zendesk API documentation for available fields.
# Note: 'id' is excluded because it is used as a dictionary key when combining
# the API request results.
EXCLUDE = [
'id', 'user_segment_id', 'permission_group_id', 'url', 'comments_disabled',
'position', 'vote_sum', 'vote_count', 'outdated_locales'
]
DATE = str(datetime.today().date())


def get_api_content(url, user, token, api_content='articles', exclude=()):
    """Fetch all pages from a Help Center endpoint, keyed by entry id."""
    api_output = dict()
    # Loop until there's no next_page in the response
    while url:
        try:
            response = requests.get(url, auth=HTTPBasicAuth(user, token))
            response.raise_for_status()  # Raise HTTP error if any
        except requests.exceptions.HTTPError as err:
            print(err)
            sys.exit(1)
        # Read the response into JSON
        data = response.json()
        sys.stdout.write("\rFetching {} page {:^3}/{:^3}".format(
            api_content, data['page'], data['page_count']))
        sys.stdout.flush()
        # data['articles'] is a list of dicts
        for entry in data[api_content]:
            key = entry['id']
            api_output[key] = dict()
            for field in entry.keys():
                if field in exclude:
                    continue  # Skip excluded fields
                api_output[key][field] = entry[field]
        url = data['next_page']
    return api_output
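# The returned dict maps each entry's 'id' to its remaining fields, which is
# why 'id' itself is listed in EXCLUDE and why the analysis script can load
# the JSON dump with orient='index'.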


# Set the request parameters
articles_url = input('Articles API endpoint URL: ').lower()
# Authenticate with your Zendesk Support email address and an API token
user = f'{input("Zendesk user email: ").lower()}/token'
token = getpass(prompt="Zendesk API token: ")
articles = get_api_content(articles_url,
                           user,
                           token,
                           api_content='articles',
                           exclude=EXCLUDE)
# Write the articles dict to a JSON file
with open(f'downloads/zd_articles_{DATE}.json', 'w') as fp:
    json.dump(articles, fp)
    print(f'\nWrote article query response to file {fp.name}.\n')
# Get section data
sections_url = input('Sections API endpoint URL: ').lower()
sections = get_api_content(sections_url,
                           user,
                           token,
                           api_content='sections',
                           exclude=EXCLUDE)
with open(f'downloads/kb_sections_{DATE}.json', 'w') as fp:
    json.dump(sections, fp)
    print(f'\nWrote section query response to file {fp.name}.')