Skip to content

Instantly share code, notes, and snippets.

@sfboss
Last active June 3, 2024 13:11
Show Gist options
  • Save sfboss/0728e0caeb9dbc6b334243bf225a4d2e to your computer and use it in GitHub Desktop.
Save sfboss/0728e0caeb9dbc6b334243bf225a4d2e to your computer and use it in GitHub Desktop.
test
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "BGnPntdaXhYN"
},
"source": [
"# Trailhead Assistant"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EZ47C_htXo1S"
},
"source": [
"## **Trailhead Details**\n",
"\n",
"Summaries, keywords, and related topics for Trailhead links, to help you connect topics and concepts as you learn Salesforce."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yjNx_cNJ7958"
},
"source": [
"\n",
"## Get Trailhead URLs from Salesforce\n",
"\n",
"Type in a URL for a trailhead to be analyzed and run all cells. Your output should guide you from there."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jx6_h5k3UMPV"
},
"outputs": [],
"source": [
"#@title { form-width: \"80%\" }\n",
"# URL to analyze. select_url() treats it as a sitemap when it contains 'sitemap';\n",
"# otherwise the links are extracted from the page itself.\n",
"selected_url = \"https://trailhead.salesforce.com/content/learn/modules\" #@param {type:\"string\"}\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YFU6MJMx7dFD"
},
"source": [
"## Parse Sitemap to get Valid Trailhead URLs with Pagination"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4l4bVZ1yB_h0"
},
"source": [
"### Trailhead URLs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 416
},
"id": "oSdIQCdi5GBw",
"outputId": "05b43014-abed-46f7-da97-8f609d56c81f"
},
"outputs": [],
"source": [
"import requests\n",
"import xml.etree.ElementTree as ET\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"from ipywidgets import interact, SelectMultiple, Layout, Button\n",
"import ipywidgets as widgets\n",
"from newspaper import Article\n",
"\n",
"def parse_sitemap(sitemap_url, timeout=30):\n",
"    \"\"\"Fetch an XML sitemap and return the URLs it lists.\n",
"\n",
"    Args:\n",
"        sitemap_url: Address of a sitemaps.org-style XML document.\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Returns:\n",
"        A list with the stripped text of the <loc> element of every direct\n",
"        child of the root, in document order. Children without a <loc>, or\n",
"        with an empty one, are skipped.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the server answers with an error status.\n",
"        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.\n",
"    \"\"\"\n",
"    response = requests.get(sitemap_url, timeout=timeout)\n",
"    # Fail loudly on error pages instead of trying to parse them as XML.\n",
"    response.raise_for_status()\n",
"    root = ET.fromstring(response.content)\n",
"    loc_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'\n",
"    urls = []\n",
"    for entry in root:\n",
"        loc = entry.find(loc_tag)\n",
"        # An empty <loc/> has text None, and pretty-printed sitemaps pad the URL\n",
"        # with whitespace/newlines.\n",
"        if loc is not None and loc.text and loc.text.strip():\n",
"            urls.append(loc.text.strip())\n",
"    return urls\n",
"\n",
"def parse_urls(selected_url):\n",
"    \"\"\"Download a page and return the links found in it.\n",
"\n",
"    Args:\n",
"        selected_url: Address of the page to scan for links.\n",
"\n",
"    Returns:\n",
"        A list of link targets found in the page, resolved against\n",
"        selected_url so relative links become absolute. Empty hrefs are dropped.\n",
"    \"\"\"\n",
"    from urllib.parse import urljoin\n",
"\n",
"    article = Article(selected_url)\n",
"    article.download()\n",
"    article.parse()\n",
"    # get_urls() takes the page markup (or parsed document), not the page address.\n",
"    hrefs = article.extractor.get_urls(article.html)\n",
"    return [urljoin(selected_url, href) for href in hrefs if href]\n",
"\n",
"def paginate_urls(urls, page_size=10):\n",
"    \"\"\"Split a list of URLs into fixed-size pages.\n",
"\n",
"    Args:\n",
"        urls: Sequence of URLs to paginate.\n",
"        page_size: Maximum number of URLs per page; must be at least 1.\n",
"\n",
"    Returns:\n",
"        A tuple (paginated_urls, num_pages): the list of pages (each a slice\n",
"        of urls) and the number of pages. Empty input gives ([], 0).\n",
"\n",
"    Raises:\n",
"        ValueError: If page_size is less than 1.\n",
"    \"\"\"\n",
"    if page_size < 1:\n",
"        raise ValueError('page_size must be a positive integer')\n",
"    paginated_urls = [urls[start:start + page_size] for start in range(0, len(urls), page_size)]\n",
"    return paginated_urls, len(paginated_urls)\n",
"\n",
"def select_url(sitemap_url):\n",
"    \"\"\"Show a paged picker for the URLs found at sitemap_url.\n",
"\n",
"    URLs are read with parse_sitemap() when the address contains 'sitemap',\n",
"    otherwise with parse_urls(). A slider chooses the page, a multi-select box\n",
"    lists that page's URLs, and a button prints the current selection. Picking\n",
"    an entry stores it in the module-level selected_url.\n",
"\n",
"    Args:\n",
"        sitemap_url: Address of a sitemap, or of a page to scan for links.\n",
"\n",
"    Returns:\n",
"        The module-level selected_url as it is when the widgets are displayed,\n",
"        i.e. before any interaction with them.\n",
"    \"\"\"\n",
"    if 'sitemap' in sitemap_url:\n",
"        urls = parse_sitemap(sitemap_url)\n",
"    else:\n",
"        urls = parse_urls(sitemap_url)\n",
"    print(sitemap_url)\n",
"    print(urls)\n",
"    paginated_urls, num_pages = paginate_urls(urls)\n",
"\n",
"    if num_pages == 0:\n",
"        # Nothing to page through: the slider and list box would have no valid state.\n",
"        print('No URLs found.')\n",
"        return selected_url\n",
"\n",
"    def view_urls(page):\n",
"        # The slider counts pages from 1; the list of pages is 0-based.\n",
"        page_urls = paginated_urls[page - 1]\n",
"        selected = SelectMultiple(options=page_urls, layout=Layout(width='auto', height='200px'))\n",
"        display(selected)\n",
"\n",
"        def update_selection(change):\n",
"            global selected_url\n",
"            # SelectMultiple reports a tuple, but selected_url is later handed to\n",
"            # Article() as a single address: keep the first pick and ignore an\n",
"            # empty selection.\n",
"            if selected.value:\n",
"                selected_url = selected.value[0]\n",
"\n",
"        selected.observe(update_selection, names='value')\n",
"\n",
"    # min=1: page 0 would index paginated_urls[-1] and silently show the last page.\n",
"    interact(view_urls, page=widgets.IntSlider(min=1, max=num_pages, step=1, value=1))\n",
"\n",
"    def on_button_clicked(b):\n",
"        global selected_url\n",
"        print(\"Selected URL:\", selected_url)\n",
"\n",
"    button = Button(description=\"Get Selected URL\")\n",
"    button.on_click(on_button_clicked)\n",
"    display(button)\n",
"\n",
"    return selected_url\n",
"\n",
"# select_url() returns as soon as the widgets are displayed, so `url` holds the\n",
"# value selected_url had at call time, not a later pick made in the widget.\n",
"url = select_url(selected_url)\n",
"\n",
"print (url)\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oliNfcbZCDOQ"
},
"source": [
"## Trailhead Details Datatable"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "IvmbYCXKSNX-"
},
"outputs": [],
"source": [
"#@title Trailhead Details { vertical-output: true }\n",
"print(\"Starting analysis...\")\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"# NOTE(review): an earlier cell already imports newspaper, so on a fresh runtime\n",
"# this install has to run before that cell does; consider moving it to the top.\n",
"%pip install -q newspaper3k pandas\n",
"\n",
"import newspaper\n",
"from newspaper import Article\n",
"from google.colab.patches import cv2_imshow\n",
"import cv2\n",
"import nltk\n",
"import pandas as pd\n",
"import os\n",
"\n",
"def get_article_image(article):\n",
"    \"\"\"Download the article's lead image and return its raw bytes.\n",
"\n",
"    The bytes are also saved as 'logo.png' in the working directory.\n",
"\n",
"    Args:\n",
"        article: Object whose top_image attribute holds the image URL.\n",
"\n",
"    Returns:\n",
"        The image content as bytes, or b'' when there is no top image or the\n",
"        download fails.\n",
"    \"\"\"\n",
"    import requests\n",
"\n",
"    imurl = article.top_image\n",
"    if not imurl:\n",
"        return b''\n",
"    try:\n",
"        # Fetch in-process instead of interpolating a scraped URL into a shell command.\n",
"        response = requests.get(imurl, timeout=30)\n",
"        response.raise_for_status()\n",
"    except requests.RequestException as exc:\n",
"        # Best effort: a missing image should not abort the whole analysis.\n",
"        print(f'Could not download article image: {exc}')\n",
"        return b''\n",
"    image_data = response.content\n",
"    with open('logo.png', 'wb') as f:\n",
"        f.write(image_data)\n",
"    return image_data\n",
"\n",
"def get_article_details(url):\n",
"    \"\"\"Download, parse and NLP-process the page at url and collect its fields.\n",
"\n",
"    Args:\n",
"        url: Address of the page to analyze.\n",
"\n",
"    Returns:\n",
"        A dict with the keys text, url, html, keywords, summary, title,\n",
"        meta_description, meta_keywords, meta_data, tags, additional_data\n",
"        and image (the bytes returned by get_article_image).\n",
"    \"\"\"\n",
"    page = Article(url)\n",
"    page.download()\n",
"    page.parse()\n",
"    page.nlp()\n",
"\n",
"    # Key order is kept as-is because it becomes the DataFrame column order.\n",
"    collected = {\n",
"        'text': page.text,\n",
"        'url': url,\n",
"        'html': page.html,\n",
"        'keywords': page.keywords,\n",
"        'summary': page.summary,\n",
"        'title': page.title,\n",
"        'meta_description': page.meta_description,\n",
"        'meta_keywords': page.meta_keywords,\n",
"        'meta_data': page.meta_data,\n",
"        'tags': page.tags,\n",
"        'additional_data': page.additional_data,\n",
"        'image': get_article_image(page),\n",
"    }\n",
"    print(\"Analysis finished!\")\n",
"    return collected\n",
"\n",
"\n",
"# Analyze the URL chosen in the form / picker above rather than a hard-coded one;\n",
"# fall back to the Trailhead module index when nothing has been chosen.\n",
"DEFAULT_URL = 'https://trailhead.salesforce.com/content/learn/modules'\n",
"selected_url = globals().get('selected_url') or DEFAULT_URL\n",
"if isinstance(selected_url, (list, tuple)):\n",
"    # A multi-select widget stores a tuple of picks; analyze the first one.\n",
"    selected_url = selected_url[0]\n",
"\n",
"# Presumably the tokenizer data article.nlp() relies on; quiet=True hides the download log.\n",
"nltk.download('punkt', quiet=True)\n",
"\n",
"# NOTE(review): meant to silence curl; confirm curl actually reads CURL_PROGRESS.\n",
"os.environ['CURL_PROGRESS'] = 'no'\n",
"article_details = get_article_details(selected_url)\n",
"df = pd.DataFrame([article_details])\n",
"df.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9H4aSVxTCIny"
},
"source": [
"## Individual Data Points\n",
"The Text, URL, HTML, Keywords, and Summary columns from the above datatable."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iRW_Tx5IAFp1"
},
"source": [
"### Text\n",
"The text column contains the main text content extracted from the article using newspaper3k."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "nNLlU4u3Apy8"
},
"outputs": [],
"source": [
"#@title Displaying the value of the 'text' column { vertical-output: true }\n",
"# Extracted body text of the single analyzed article (row label 0).\n",
"df.loc[0, 'text']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WDOxhzwVApeG"
},
"source": [
"### URL\n",
"The url column represents the URL of the article."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "3rarCzQhA3GM"
},
"outputs": [],
"source": [
"#@title Displaying the value of the 'URL' column\n",
"# Address the article was fetched from (row label 0).\n",
"df.loc[0, 'url']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vNZ2CU8IA-rG"
},
"source": [
"### HTML\n",
"The html column contains the raw HTML content of the article.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "Wvoj4BGvA-rG"
},
"outputs": [],
"source": [
"#@title Displaying the value of the 'HTML' column { vertical-output: true }\n",
"from IPython.display import display, HTML\n",
"# NOTE(review): this renders the fetched page's markup inside the notebook output,\n",
"# including whatever styles/scripts it carries; confirm that is intended.\n",
"display(HTML(df.loc[0, 'html']))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qz76v8QpA-9X"
},
"source": [
"### Keywords\n",
"\n",
"The keywords column contains a list of keywords extracted from the article."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "zA-PsmxgA-9X"
},
"outputs": [],
"source": [
"#@title Displaying the value of the 'Keywords' column { vertical-output: true }\n",
"# Keyword list produced for the single analyzed article (row label 0).\n",
"df.loc[0, 'keywords']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "e8g-cGF0A_2V"
},
"source": [
"### Summary\n",
"The summary column provides a summarized version of the article's content."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "JQfGWLN2A_2V"
},
"outputs": [],
"source": [
"#@title Displaying the value of the 'Summary' column { vertical-output: true }\n",
"# Generated summary of the single analyzed article (row label 0).\n",
"df.loc[0, 'summary']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BFmw2acVGo8r"
},
"source": [
"## Output Keywords and Summary as an HTML Table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B6UKPUJCGuxh"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from html import escape\n",
"from IPython.display import display, HTML\n",
"\n",
"# Columns shown in the table, in display order.\n",
"selected_columns = [\"text\", \"summary\", \"keywords\"]\n",
"df_selected = df[selected_columns].copy()  # work on a copy so df keeps the raw values\n",
"\n",
"\n",
"def _summary_to_bullets(summary):\n",
"    \"\"\"Render each non-empty '.'-separated piece of summary as an HTML list item.\"\"\"\n",
"    items = ''.join('<li>' + escape(piece.strip()) + '</li>' for piece in summary.split('.') if piece.strip())\n",
"    return '<ul style=\"list-style-type: disc; padding-left: 20px;\">' + items + '</ul>'\n",
"\n",
"\n",
"# The table is built with escape=False, so scraped content is HTML-escaped first\n",
"# and only the markup added here (<br>, <ul>, <li>) stays live.\n",
"df_selected['keywords'] = df_selected['keywords'].apply(lambda words: '<br>'.join(escape(w) for w in words))\n",
"df_selected['text'] = df_selected['text'].map(escape).str.replace('\\n', '<br>', regex=False)\n",
"df_selected['summary'] = df_selected['summary'].apply(_summary_to_bullets)\n",
"\n",
"# to_html() honours display.max_colwidth and would cut long cells (possibly in the\n",
"# middle of a tag), so lift the limit while rendering.\n",
"with pd.option_context('display.max_colwidth', None):\n",
"    html_table = df_selected.to_html(index=False, escape=False)\n",
"\n",
"# Apply CSS styles to the table cells and to the column labels\n",
"html_table = html_table.replace('<td>', '<td style=\"vertical-align:top; text-align:left;\">')\n",
"html_table = html_table.replace('<th>', '<th style=\"width:600px;text-align:left;\">')\n",
"\n",
"# to_html() already emits a complete <table>, so it is wrapped only in a\n",
"# scrollable <div>; a second enclosing <table> would be invalid markup.\n",
"html_table = f'<div style=\"overflow:auto; height:300px;\">{html_table}</div>'\n",
"\n",
"# Display the table using HTML\n",
"display(HTML(html_table))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u3e1AM7aYYGl"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from typing import List\n",
"\n",
"def format_html_table(dataframe: pd.DataFrame, source_url: str = \"\") -> str:\n",
"    \"\"\"Render each row of dataframe as titled HTML sections.\n",
"\n",
"    Args:\n",
"        dataframe: Frame with 'text', 'summary' and 'keywords' columns; their\n",
"            values are inserted into the markup as-is.\n",
"        source_url: Optional address to name in the heading.\n",
"\n",
"    Returns:\n",
"        An HTML string: an <h1> heading followed, for every row, by summary,\n",
"        keywords and text sections and a closing <hr>.\n",
"    \"\"\"\n",
"    from html import escape\n",
"\n",
"    # Only mention a source when one is given, so the heading never ends in a bare 'for'.\n",
"    heading = 'Trailhead Summary'\n",
"    if source_url:\n",
"        heading += ' for ' + escape(source_url)\n",
"    parts = ['<h1>' + heading + '</h1>']\n",
"\n",
"    for _, row in dataframe.iterrows():\n",
"        for section_name in ('summary', 'keywords', 'text'):\n",
"            parts.append(f'<h3>{section_name}</h3>\\n')\n",
"            parts.append(f'<p>{row[section_name]}</p>\\n')\n",
"        # Horizontal rule between rows\n",
"        parts.append('<hr>\\n')\n",
"\n",
"    return ''.join(parts)\n",
"\n",
"\n",
"# Render the same rows as stacked sections; this rebinds html_table from the previous cell.\n",
"html_table = format_html_table(df_selected)\n",
"display(HTML(html_table))\n"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"yjNx_cNJ7958",
"YFU6MJMx7dFD",
"4l4bVZ1yB_h0",
"oliNfcbZCDOQ",
"9H4aSVxTCIny",
"iRW_Tx5IAFp1",
"WDOxhzwVApeG",
"vNZ2CU8IA-rG",
"qz76v8QpA-9X",
"e8g-cGF0A_2V",
"BFmw2acVGo8r"
],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment