Created by @wshayes, October 29, 2020
Export (via web scraping) Intercom Help Articles
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Webscrape Intercom Help Articles and export them into markdown and HTML as a JSON data file.

This captures Collection info but not Sections. Images are downloaded into an images folder.
The images are renamed with the collection directory name (see the dir_map dict below) in
numerical order. You will need to dedupe duplicate images using another tool; renaming the
images avoids bad initial names and duplicate image filenames (a quick hack and not ideal -
feel free to improve as desired).
"""
import json
import time

import httpx
from bs4 import BeautifulSoup
from markdownify import markdownify as md

main_url = "https://help.biodati.com"
rootdir = "/Users/william/studio/dev/studio_help"

# Collection Title to directory name
dir_map = {
    "BioDati Studio Overview": "overview",
    "Knowledge": "knowledge",
    "Networks": "networks",
    "Projects": "projects",
    "Developer and API Information": "dev",
    "Administration": "admin",
}

pages = {}


def collect_content():
"""Scrape content from Intercom Articles"""
page = httpx.get(main_url)
soup = BeautifulSoup(page.content, "html.parser")
links = soup.find_all("a", class_="paper")
collection_links = [f"{main_url}{link['href']}" for link in links]
for clink in collection_links:
image_number = 0
time.sleep(0.5)
page = httpx.get(clink)
soup = BeautifulSoup(page.content, "html.parser")
collection_title = soup("h1")[0].string
print(f"\n\nProcessing Collection: {collection_title}")
collection_dir = dir_map[collection_title]
links = soup("a", class_="paper")
page_links = [f"{main_url}{link['href']}" for link in links]
for plink in page_links:
time.sleep(0.5)
article_page = httpx.get(plink)
article_soup = BeautifulSoup(article_page.content, "html.parser")
article_title = article_soup("h1")[0].string
print(f" Article: {article_title}")
article = article_soup("article")
article_text = article[0].prettify()
article_md = md(article_text, heading_style="ATX")
key = f"{collection_title}__{article_title}"
if key in pages:
print(
f"Collection {collection_dir} has duplicate Article Title: {article_title}"
)
pages[key] = {
"collection_title": collection_title,
"article_title": article_title,
"content_html": article_text,
"content_md": article_md,
"image_urls": [],
}
for image in article_soup.select("article img"):
image_url = image.get("src")
img_fn = image_url.split("/")[-1]
suffix = img_fn.split(".")[-1]
if len(suffix) > 4:
suffix = "png"
print(" Suffix", suffix, "FN", img_fn)
image_save_fn = f"{collection_dir}_{image_number}.{suffix}"
image_number += 1
pages[key]["image_urls"].append(
{"url": image_url, "filename": image_save_fn}
)
result = httpx.get(image_url)
with open(f"images/{image_save_fn}", "wb") as f:
f.write(result.content)
with open("pages.json", "w") as f:
json.dump(pages, f, indent=4)
def process_content():
with open("pages.json", "r") as f:
pages = json.load(f)
def main():
collect_content()
# process_content()
if __name__ == "__main__":
main()
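In the script above, process_content() is only a stub that reloads pages.json. One way it could be finished - a sketch under assumptions, not part of the original script - is to write each article's markdown into its collection directory under rootdir, reusing the dir_map names defined at the top. The helper name write_markdown_files and the title-to-filename slug rule are hypothetical:

import re
from pathlib import Path


def write_markdown_files(pages: dict, rootdir: str, dir_map: dict) -> None:
    """Write each article's markdown into <rootdir>/<collection dir>/ (sketch only)."""
    for page in pages.values():
        collection_dir = dir_map[page["collection_title"]]
        # Hypothetical convention: build a filesystem-safe filename from the article title.
        slug = re.sub(r"[^\w-]+", "_", page["article_title"] or "untitled").strip("_").lower()
        out_dir = Path(rootdir) / collection_dir
        out_dir.mkdir(parents=True, exist_ok=True)
        (out_dir / f"{slug}.md").write_text(page["content_md"])

process_content() could then call write_markdown_files(pages, rootdir, dir_map) after loading pages.json.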
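The docstring notes that the downloaded images still need to be deduplicated with another tool. A minimal sketch of that step, assuming the images/ folder produced by collect_content() and using a content hash to spot repeats (the dedupe_images name is hypothetical):

import hashlib
from pathlib import Path


def dedupe_images(image_dir: str = "images") -> None:
    """Delete files whose bytes match an earlier file, keeping the first copy seen."""
    seen = {}
    for path in sorted(Path(image_dir).iterdir()):
        if not path.is_file():
            continue
        digest = hashlib.sha256(path.read_bytes()).hexdigest()
        if digest in seen:
            print(f"Removing duplicate {path.name} (same content as {seen[digest].name})")
            path.unlink()
        else:
            seen[digest] = path

Note that pages.json would still reference the removed filenames, so the image_urls entries would need to be remapped to the surviving copies afterwards.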