@ssokolow
Created March 4, 2019 02:12
A one-off script for converting a specific dump of the SDL wiki into a Dash/Zeal docset... in case it's useful to someone
#!/usr/bin/env python
"""Quick script to generate a Dash/Zeal docset from the SDL 2 wiki.
Requirements:
- Python 2.x (3.x may work, but it's untested)
- LXML (for parsing the non-XML HTML used in the pages)
- Pillow (for converting favicon.ico into icon.png)
- http://www.libsdl.org/tmp/SDL-wiki.zip
"""
from __future__ import print_function
__author__ = "Stephan Sokolow (deitarion/SSokolow)"
__license__ = "MIT"
# ---=== Configuration Begins ===---
# Pages which would be undesirably excluded by categorical rules below
WHITELISTED_PAGES = [
    'FrontPage.html',
    'SDL_SaveDollarTemplate.html'
]

UNWANTED_PAGES = [
    # Home pages missing CategoryHomepage
    'Sam(20)Lantinga.html',
    'Spikerocks101.html',
    # Useless in offline docs
    'APIContributionStyleGuide.html',
    'Contributing.html',
    'error.log',
    'SDL(2d)gsoc*.html',  # Scratch pages
    'SDL*Template.html',  # Page Templates (Note: see whitelist)
    'SG*.html',           # Style Guides
    'Roadmap.html',
    'Test.html',
    'ToDo.html',
]

# Unwanted pages which also contain lists of links to unwanted pages
# (This throws out the system pages and most of the contributor home pages)
UNWANTED_GROUPS = [
    'AdminGroup.html',
    'AutoAdminGroup.html',
    'CategoryHomepage.html',
    'ContributorGroup.html',
    'EditorGroup.html',
    'SystemPages*Group.html',
    'Wiki(20)Help.html',
]

# Pages which shouldn't trigger a missing page warning if not found
# (eg. Stuff linked from retained pages which we intentionally stripped)
EXPECTED_DEADLINKS = [
    # Stuff we generate our own more helpful replacements for
    'WordIndex.html',
    'TitleIndex.html',
    'CategoryCategory.html',
]
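# The parenthesized hex in several patterns above ('Sam(20)Lantinga',
# 'SDL(2d)gsoc', 'Wiki(20)Help') appears to be MoinMoin's filesystem quoting
# of page names. A minimal decoder sketch, for reference only (handles ASCII
# escapes; not used by the pipeline):
def _unquote_moin(fname):
    """Turn e.g. 'Sam(20)Lantinga.html' into 'Sam Lantinga.html'."""
    import re
    return re.sub(
        r'\(([0-9a-fA-F]+)\)',
        lambda m: ''.join(chr(int(m.group(1)[i:i + 2], 16))
                          for i in range(0, len(m.group(1)), 2)),
        fname)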
# Metadata
DOCSET_ID = 'sdl2'
DOCSET_NAME = 'SDL 2'
START_PAGE = 'index.html'
BASE_URL = 'https://wiki.libsdl.org/'
SRC_DIR = 'SDL-wiki'
ICO_URL = 'https://www.libsdl.org/favicon.ico'
ZIP_URL = 'http://www.libsdl.org/tmp/SDL-wiki.zip'
ICO_FILE = "sdl2.ico"
ZIP_FILE = "%s.zip" % SRC_DIR
TAR_FILE = "%s.tar" % DOCSET_NAME.replace(' ', '_')
PLIST_TMPL = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleIdentifier</key>
<string>{id}</string>
<key>CFBundleName</key>
<string>{name}</string>
<key>DocSetPlatformFamily</key>
<string>{id}</string>
<key>isDashDocset</key>
<true/>
<key>dashIndexFilePath</key>
<string>{start_page}</string>
<key>DashDocSetFallbackURL</key>
<string>{base_url}</string>
</dict>
</plist>"""
# ---=== Code Begins ===---
import glob, os, shutil, sqlite3, tarfile
from urllib import url2pathname, urlretrieve
from urlparse import urlparse
# External dependencies
from lxml.html import parse # Needed for parsing non-XML HTML
from PIL import Image
def prep_path(url):
    """Resolve a URL to a local path, enforcing the deletion whitelist."""
    parts = urlparse(url)
    if parts.scheme or parts.netloc:
        return

    path = os.path.normcase(os.path.normpath(url2pathname(parts.path)))
    if path in WHITELISTED_PAGES:
        return
    return path
unwanted_files = []

def add_unwanted(page_fname):
    """Resolve a URL to a local path and mark it for deletion"""
    path = prep_path(page_fname)
    if not path:
        return

    if page_fname not in unwanted_files and os.path.exists(page_fname):
        unwanted_files.append(page_fname)
def delete_unwanted():
    """Apply and empty the deletion queue"""
    cwd = os.getcwd()
    for fname in unwanted_files:
        fpath = os.path.abspath(fname)
        if not fpath.startswith(cwd):
            print("Skipping for safety: %s" % fpath)
            continue

        if os.path.isdir(fpath):
            shutil.rmtree(fpath)
        elif os.path.isfile(fpath):
            os.remove(fpath)
    unwanted_files[:] = []
# ---=== Main Program Begins ===---
# Unpack SDL-wiki.zip if not already done
if not os.path.exists(SRC_DIR):
    if not os.path.exists(ZIP_FILE):
        print("Downloading %s..." % ZIP_URL)
        urlretrieve(ZIP_URL, ZIP_FILE)

    print("Unpacking %s..." % ZIP_FILE)
    import zipfile
    with zipfile.ZipFile(ZIP_FILE, 'r') as zobj:
        zobj.extractall()
os.chdir(SRC_DIR)
print("Removing unwanted pages...")
# Mark all unwanted boilerplate files for deletion
for glob_pat in UNWANTED_PAGES:
    for page_fname in glob.glob(glob_pat):
        add_unwanted(page_fname)

for glob_pat in UNWANTED_GROUPS:
    for group_fname in glob.glob(glob_pat):
        add_unwanted(group_fname)
        for node in parse(group_fname).findall('.//a'):
            add_unwanted(node.get('href', ''))
print("Searching for and removing mis-tagged CategoryHomepage pages...")
# Mark all home pages where CategoryHomepage was mis-applied somehow
# (Where they link to it, but it doesn't link to them)
for page in glob.glob('*.html'):
    for node in parse(page).findall('.//a'):
        if 'CategoryHomepage.html' in node.get('href', ''):
            add_unwanted(page)
# Delete all marked files
delete_unwanted()
print("Deleting orphaned attachments...")
remaining = os.listdir('.')
for fname in os.listdir('attachments'):
    if fname + '.html' not in remaining:
        add_unwanted(os.path.join('attachments', fname))
delete_unwanted()
print("Removing dead links and missing images...")
for page in glob.glob('*.html'):
    changed, root = False, parse(page)
    for node in root.findall('.//a'):
        link_url = prep_path(node.get('href', ''))
        if not link_url:
            continue

        if os.path.exists(link_url):
            continue
        elif link_url in EXPECTED_DEADLINKS or link_url in UNWANTED_PAGES:
            node.tag = 'span'
            changed = True
        else:
            print("WARNING: Missing page: %s" % link_url)

    # Remove dead <img> tags (external images resolve to None in prep_path
    # and are left alone)
    for node in root.findall('.//img'):
        img_path = prep_path(node.get('src', ''))
        if img_path and not os.path.exists(img_path):
            node.getparent().remove(node)
            changed = True

    if changed:
        root.write(page)
print("Setting up docset directory structure...")
os.chdir(os.pardir)
dsdir = "%s.docset" % DOCSET_NAME.replace(' ', '_')
cntdir = os.path.join(dsdir, "Contents")
resdir = os.path.join(cntdir, "Resources")
os.makedirs(resdir)
docdir = os.path.join(resdir, "Documents")
os.rename(SRC_DIR, docdir)
print("Generating Info.plist...")
with open(os.path.join(cntdir, "Info.plist"), 'w') as fobj:
    fobj.write(PLIST_TMPL.format(
        id=DOCSET_ID,
        name=DOCSET_NAME,
        start_page=START_PAGE,
        base_url=BASE_URL,
    ))
print("Generating index...")
conn = sqlite3.connect(os.path.join(resdir, "docSet.dsidx"))
conn.executescript("""
    CREATE TABLE searchIndex(
        id INTEGER PRIMARY KEY,
        name TEXT,
        type TEXT,
        path TEXT);
    CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);
""")
# Populate the index
for fname in os.listdir(docdir):
    fpath = os.path.join(docdir, fname)

    # Skip non-HTML files
    if not (os.path.isfile(fpath) and fname.endswith('.html')):
        continue

    # Parse the HTML and extract the title
    root = parse(fpath)
    entry_name = root.find('.//h1')
    if entry_name is not None:
        entry_name = entry_name.text
    else:  # Fail-safe for "replaced with..." pages
        entry_name = root.find('.//title').text

    # Infer a default type as well as we can
    if entry_name.startswith('SDL_'):
        entry_type = 'Function'
    else:
        entry_type = 'Guide'

    cats = ','.join(x.text or '' for x in root.findall('.//a'))
    for typename in ('Define', 'Enum', 'Struct'):
        if 'Category%s' % typename in cats:
            entry_type = typename
            break

    conn.execute("INSERT INTO searchIndex(name, type, path) "
                 "VALUES (?, ?, ?)", [entry_name, entry_type, fname])
conn.commit()
# Download the SDL favicon if not already done
if not os.path.exists(ICO_FILE):
    print("Downloading %s..." % ICO_URL)
    urlretrieve(ICO_URL, ICO_FILE)
print("Converting %s to icon.png..." % ICO_FILE)
Image.open(ICO_FILE).save(os.path.join(dsdir, 'icon.png'))
# TODO: https://kapeli.com/docsets#tableofcontents
# (Manually define what to traverse at the top levels so that the by-category
# traversal claims API pages first, then ignore links to pages that have
# already been visited in order to turn the directed graph into a tree.)
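# Toward the TODO above, a minimal sketch of the per-page half of Dash's
# table-of-contents support: dropping "dashAnchor" elements in front of the
# section headings so Dash/Zeal can show an in-page TOC. The anchor format
# comes from the URL above; the page-tree traversal described in the TODO is
# not attempted here, and this helper is not called anywhere in the script.
def add_dash_anchors(page_path):
    """Insert <a class="dashAnchor"> markers before each <h2>/<h3>."""
    from urllib import quote  # Python 2, matching the imports above
    from lxml import etree

    root = parse(page_path)
    changed = False
    for heading in root.iter('h2', 'h3'):
        title = heading.text_content().strip()
        if not title:
            continue
        anchor = etree.Element('a', {
            'name': '//apple_ref/cpp/Section/%s' % quote(title, safe=''),
            'class': 'dashAnchor',
        })
        heading.addprevious(anchor)
        changed = True

    if changed:
        root.write(page_path)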
print("Archiving docset as %s for sharing..." % TAR_FILE)
with tarfile.open(TAR_FILE, 'w:gz') as tobj:
    tobj.add(dsdir, filter=lambda x:
             None if x.name.split('/')[-1] == '.DS_Store' else x)
print("Done.")