@ssokolow
Created March 4, 2019 02:12
A one-off script for converting a specific dump of the SDL wiki into a Dash/Zeal docset... in case it's useful to someone
#!/usr/bin/env python
"""Quick script to generate a Dash/Zeal docset from the SDL 2 wiki.
Requirements:
- Python 2.x (3.x may work, but it's untested)
- LXML (for parsing the non-XML HTML used in the pages)
- Pillow (for converting favicon.ico into icon.png)
- http://www.libsdl.org/tmp/SDL-wiki.zip
"""
from __future__ import print_function
__author__ = "Stephan Sokolow (deitarion/SSokolow)"
__license__ = "MIT"
# ---=== Configuration Begins ===---
# Pages which would be undesirably excluded by categorical rules below
WHITELISTED_PAGES = [
    'FrontPage.html',
    'SDL_SaveDollarTemplate.html'
]

UNWANTED_PAGES = [
    # Home pages missing CategoryHomepage
    'Sam(20)Lantinga.html',
    'Spikerocks101.html',
    # Useless in offline docs
    'APIContributionStyleGuide.html',
    'Contributing.html',
    'error.log',
    'SDL(2d)gsoc*.html',  # Scratch pages
    'SDL*Template.html',  # Page Templates (Note: see whitelist)
    'SG*.html',           # Style Guides
    'Roadmap.html',
    'Test.html',
    'ToDo.html',
]

# Unwanted pages which also contain lists of links to unwanted pages
# (This throws out the system pages and most of the contributor home pages)
UNWANTED_GROUPS = [
    'AdminGroup.html',
    'AutoAdminGroup.html',
    'CategoryHomepage.html',
    'ContributorGroup.html',
    'EditorGroup.html',
    'SystemPages*Group.html',
    'Wiki(20)Help.html',
]

# Pages which shouldn't trigger a missing page warning if not found
# (eg. Stuff linked from retained pages which we intentionally stripped)
EXPECTED_DEADLINKS = [
    # Stuff we generate our own more helpful replacements for
    'WordIndex.html',
    'TitleIndex.html',
    'CategoryCategory.html',
]
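# The parenthesized hex in several patterns above ('Sam(20)Lantinga',
# 'SDL(2d)gsoc', 'Wiki(20)Help') appears to be MoinMoin's filesystem quoting
# of page names. A minimal decoder sketch, for reference only (handles ASCII
# escapes; not used by the pipeline):
def _unquote_moin(fname):
    """Turn e.g. 'Sam(20)Lantinga.html' into 'Sam Lantinga.html'."""
    import re
    return re.sub(
        r'\(([0-9a-fA-F]+)\)',
        lambda m: ''.join(chr(int(m.group(1)[i:i + 2], 16))
                          for i in range(0, len(m.group(1)), 2)),
        fname)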
# Metadata
DOCSET_ID = 'sdl2'
DOCSET_NAME = 'SDL 2'
START_PAGE = 'index.html'
BASE_URL = 'https://wiki.libsdl.org/'
SRC_DIR = 'SDL-wiki'
ICO_URL = 'https://www.libsdl.org/favicon.ico'
ZIP_URL = 'http://www.libsdl.org/tmp/SDL-wiki.zip'
ICO_FILE = "sdl2.ico"
ZIP_FILE = "%s.zip" % SRC_DIR
TAR_FILE = "%s.tar" % DOCSET_NAME.replace(' ', '_')
PLIST_TMPL = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleIdentifier</key>
<string>{id}</string>
<key>CFBundleName</key>
<string>{name}</string>
<key>DocSetPlatformFamily</key>
<string>{id}</string>
<key>isDashDocset</key>
<true/>
<key>dashIndexFilePath</key>
<string>{start_page}</string>
<key>DashDocSetFallbackURL</key>
<string>{base_url}</string>
</dict>
</plist>"""
# ---=== Code Begins ===---
import glob, os, shutil, sqlite3, tarfile
from urllib import url2pathname, urlretrieve
from urlparse import urlparse
# External dependencies
from lxml.html import parse # Needed for parsing non-XML HTML
from PIL import Image
def prep_path(url):
    """Resolve a URL to a local path, enforcing the deletion whitelist."""
    parts = urlparse(url)
    if parts.scheme or parts.netloc:
        return

    path = os.path.normcase(os.path.normpath(url2pathname(parts.path)))
    if path in WHITELISTED_PAGES:
        return
    return path
unwanted_files = []

def add_unwanted(page_fname):
    """Resolve a URL to a local path and mark it for deletion"""
    path = prep_path(page_fname)
    if not path:
        return

    if page_fname not in unwanted_files and os.path.exists(page_fname):
        unwanted_files.append(page_fname)
def delete_unwanted():
    """Apply and empty the deletion queue"""
    cwd = os.getcwd()
    for fname in unwanted_files:
        fpath = os.path.abspath(fname)
        if not fpath.startswith(cwd):
            print("Skipping for safety: %s" % fpath)
            continue

        if os.path.isdir(fpath):
            shutil.rmtree(fpath)
        elif os.path.isfile(fpath):
            os.remove(fpath)
    unwanted_files[:] = []
# ---=== Main Program Begins ===---
# Unpack SDL-wiki.zip if not already done
if not os.path.exists(SRC_DIR):
    if not os.path.exists(ZIP_FILE):
        print("Downloading %s..." % ZIP_URL)
        urlretrieve(ZIP_URL, ZIP_FILE)

    print("Unpacking %s..." % ZIP_FILE)
    import zipfile
    with zipfile.ZipFile(ZIP_FILE, 'r') as zobj:
        zobj.extractall()
os.chdir(SRC_DIR)
print("Removing unwanted pages...")
# Mark all unwanted boilerplate files for deletion
for glob_pat in UNWANTED_PAGES:
    for page_fname in glob.glob(glob_pat):
        add_unwanted(page_fname)

for glob_pat in UNWANTED_GROUPS:
    for group_fname in glob.glob(glob_pat):
        add_unwanted(group_fname)
        for node in parse(group_fname).findall('.//a'):
            add_unwanted(node.get('href', ''))
print("Searching for and removing mis-tagged CategoryHomepage pages...")
# Mark all home pages where CategoryHomepage was mis-applied somehow
# (Where they link to it, but it doesn't link to them)
for page in glob.glob('*.html'):
    for node in parse(page).findall('.//a'):
        if 'CategoryHomepage.html' in node.get('href', ''):
            add_unwanted(page)
# Delete all marked files
delete_unwanted()
print("Deleting orphaned attachments...")
remaining = os.listdir('.')
for fname in os.listdir('attachments'):
    if fname + '.html' not in remaining:
        add_unwanted(os.path.join('attachments', fname))
delete_unwanted()
print("Removing dead links and missing images...")
for page in glob.glob('*.html'):
    changed, root = False, parse(page)
    for node in root.findall('.//a'):
        link_url = prep_path(node.get('href', ''))
        if not link_url:
            continue

        if os.path.exists(link_url):
            continue
        elif link_url in EXPECTED_DEADLINKS or link_url in UNWANTED_PAGES:
            node.tag = 'span'
            changed = True
        else:
            print("WARNING: Missing page: %s" % link_url)

    # Remove dead <img> tags (external images resolve to None in prep_path
    # and are left alone)
    for node in root.findall('.//img'):
        img_path = prep_path(node.get('src', ''))
        if img_path and not os.path.exists(img_path):
            node.getparent().remove(node)
            changed = True

    if changed:
        root.write(page)
print("Setting up docset directory structure...")
os.chdir(os.pardir)
dsdir = "%s.docset" % DOCSET_NAME.replace(' ', '_')
cntdir = os.path.join(dsdir, "Contents")
resdir = os.path.join(cntdir, "Resources")
os.makedirs(resdir)
docdir = os.path.join(resdir, "Documents")
os.rename(SRC_DIR, docdir)
print("Generating Info.plist...")
with open(os.path.join(cntdir, "Info.plist"), 'w') as fobj:
    fobj.write(PLIST_TMPL.format(
        id=DOCSET_ID,
        name=DOCSET_NAME,
        start_page=START_PAGE,
        base_url=BASE_URL,
    ))
print("Generating index...")
conn = sqlite3.connect(os.path.join(resdir, "docSet.dsidx"))
conn.executescript("""
    CREATE TABLE searchIndex(
        id INTEGER PRIMARY KEY,
        name TEXT,
        type TEXT,
        path TEXT);
    CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);
""")
# Populate the index
for fname in os.listdir(docdir):
    fpath = os.path.join(docdir, fname)

    # Skip non-HTML files
    if not (os.path.isfile(fpath) and fname.endswith('.html')):
        continue

    # Parse the HTML and extract the title
    root = parse(fpath)
    entry_name = root.find('.//h1')
    if entry_name is not None:
        entry_name = entry_name.text
    else:  # Fail-safe for "replaced with..." pages
        entry_name = root.find('.//title').text

    # Infer a default type as well as we can
    if entry_name.startswith('SDL_'):
        entry_type = 'Function'
    else:
        entry_type = 'Guide'

    cats = ','.join(x.text or '' for x in root.findall('.//a'))
    for typename in ('Define', 'Enum', 'Struct'):
        if 'Category%s' % typename in cats:
            entry_type = typename
            break

    conn.execute("INSERT INTO searchIndex(name, type, path) "
                 "VALUES (?, ?, ?)", [entry_name, entry_type, fname])
conn.commit()
# Download the SDL favicon if not already done
if not os.path.exists(ICO_FILE):
    print("Downloading %s..." % ICO_URL)
    urlretrieve(ICO_URL, ICO_FILE)
print("Converting %s to icon.png..." % ICO_FILE)
Image.open(ICO_FILE).save(os.path.join(dsdir, 'icon.png'))
# TODO: https://kapeli.com/docsets#tableofcontents
# (Manually define what to traverse at the top levels so that the by-category
# traversal claims API pages first, then ignore links to pages that have
# already been visited in order to turn the directed graph into a tree.)
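# Toward the TODO above, a minimal sketch of the per-page half of Dash's
# table-of-contents support: dropping "dashAnchor" elements in front of the
# section headings so Dash/Zeal can show an in-page TOC. The anchor format
# comes from the URL above; the page-tree traversal described in the TODO is
# not attempted here, and this helper is not called anywhere in the script.
def add_dash_anchors(page_path):
    """Insert <a class="dashAnchor"> markers before each <h2>/<h3>."""
    from urllib import quote  # Python 2, matching the imports above
    from lxml import etree

    root = parse(page_path)
    changed = False
    for heading in root.iter('h2', 'h3'):
        title = heading.text_content().strip()
        if not title:
            continue
        anchor = etree.Element('a', {
            'name': '//apple_ref/cpp/Section/%s' % quote(title, safe=''),
            'class': 'dashAnchor',
        })
        heading.addprevious(anchor)
        changed = True

    if changed:
        root.write(page_path)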
print("Archiving docset as %s for sharing..." % TAR_FILE)
with tarfile.open(TAR_FILE, 'w:gz') as tobj:
    tobj.add(dsdir, filter=lambda x:
             None if x.name.split('/')[-1] == '.DS_Store' else x)
print("Done.")