Skip to content

Instantly share code, notes, and snippets.

@coliff
Last active November 2, 2017 13:02
Show Gist options
  • Save coliff/2aab68ba82d7fbd66007dae3c7dbc99b to your computer and use it in GitHub Desktop.
Save coliff/2aab68ba82d7fbd66007dae3c7dbc99b to your computer and use it in GitHub Desktop.
MkDocs customisation which minifies the search\search-index.json file for faster loading. The code below should be placed in `Lib\site-packages\mkdocs\contrib\legacy_search\search-index.py`.
# coding: utf-8
from __future__ import unicode_literals
import json
from mkdocs import utils
# HTMLParser lives in a different module on Python 3 and Python 2;
# try the Python 3 location first and fall back to the Python 2 one.
try:  # pragma: no cover
    from html.parser import HTMLParser  # noqa
except ImportError:  # pragma: no cover
    from HTMLParser import HTMLParser  # noqa
class SearchIndex(object):
    """
    Search index is a collection of pages and sections (heading
    tags and their following content are sections).
    """

    def __init__(self):
        # Each entry is a dict with 'title', 'text' and 'location' keys.
        self._entries = []

    def _find_toc_by_id(self, toc, id_):
        """
        Given a table of contents and HTML ID, iterate through
        and return the matched item in the TOC.

        The search is depth-first through each item's ``children``.
        Returns ``None`` when no item matches.
        """
        for toc_item in toc:
            # The first character of the TOC url is dropped before
            # comparing (presumably the leading '#').
            if toc_item.url[1:] == id_:
                return toc_item
            toc_item_r = self._find_toc_by_id(toc_item.children, id_)
            if toc_item_r is not None:
                return toc_item_r
        return None

    @staticmethod
    def _squeeze_whitespace(text):
        """
        Collapse every run of whitespace (spaces, newlines, tabs and
        non-breaking spaces) into a single space and trim both ends.
        """
        # The non-breaking space is replaced explicitly so it is
        # treated like any other blank before splitting.
        return ' '.join(text.replace(u'\u00a0', ' ').split())

    def _add_entry(self, title, text, loc):
        """
        A simple wrapper to add an entry and ensure the contents
        is UTF8 encoded.

        The text is whitespace-compressed first so the generated
        JSON stays as small as possible.
        """
        text = self._squeeze_whitespace(text)
        self._entries.append({
            'title': title,
            # Round-trip through UTF-8 via the mkdocs text type.
            'text': utils.text_type(text.encode('utf-8'), encoding='utf-8'),
            'location': loc
        })

    def add_entry_from_context(self, page):
        """
        Create a set of entries in the index for a page. One for
        the page itself and then one for each of its heading
        tags.
        """
        # Create the content parser and feed in the HTML for the
        # full page. This handles all the parsing and prepares
        # us to iterate through it.
        parser = ContentParser()
        parser.feed(page.content)
        parser.close()

        # Get the absolute URL for the page, this is then
        # prepended to the urls of the sections
        abs_url = page.abs_url

        # Create an entry for the full page.
        self._add_entry(
            title=page.title,
            text=self.strip_tags(page.content).rstrip('\n'),
            loc=abs_url
        )

        for section in parser.data:
            self.create_entry_for_section(section, page.toc, abs_url)

    def create_entry_for_section(self, section, toc, abs_url):
        """
        Given a section on the page, the table of contents and
        the absolute url for the page create an entry in the
        index.

        Sections whose id has no matching TOC item are skipped.
        """
        toc_item = self._find_toc_by_id(toc, section.id)
        if toc_item is not None:
            self._add_entry(
                title=toc_item.title,
                text=u" ".join(section.text),
                loc=abs_url + toc_item.url
            )

    def generate_search_index(self):
        """
        Serialise the collected entries to a JSON string of the form
        ``{"docs": [...]}``.

        Compact separators are used so no padding whitespace is
        emitted between items; the parsed JSON is unchanged.
        """
        page_dicts = {
            'docs': self._entries,
        }
        return json.dumps(page_dicts, sort_keys=True, separators=(',', ':'))

    def strip_tags(self, html):
        """strip html tags from data"""
        s = HTMLStripper()
        s.feed(html)
        # Without close() the parser may hold back trailing text
        # that it is still waiting to complete.
        s.close()
        return s.get_data()
class HTMLStripper(HTMLParser):
    """
    A simple HTML parser that stores all of the data within tags
    but ignores the tags themselves and thus strips them from the
    content.
    """

    def __init__(self, *args, **kwargs):
        # HTMLParser is an old-style class in Python 2, so
        # super() won't work here.
        HTMLParser.__init__(self, *args, **kwargs)

        # Text chunks in the order the parser reported them.
        self.data = []

    def handle_data(self, d):
        """
        Called for the text contents of each tag.
        """
        self.data.append(d)

    def get_data(self):
        """
        Return all collected text chunks joined by newlines.

        The parser is closed first so that any trailing text it is
        still buffering is flushed into ``self.data`` rather than
        silently dropped.
        """
        self.close()
        return '\n'.join(self.data)
class ContentSection(object):
    """
    Used by the ContentParser class to capture the information we
    need when it is parsing the HTML.
    """

    def __init__(self, text=None, id_=None, title=None):
        # A fresh list is created per instance when no text is given.
        self.text = text or []
        self.id = id_
        self.title = title

    def __eq__(self, other):
        """
        Two sections are equal when text, id and title all match.

        Comparison with any other type returns ``NotImplemented`` so
        Python falls back to its default handling instead of raising
        ``AttributeError``.
        """
        if not isinstance(other, ContentSection):
            return NotImplemented
        return all([
            self.text == other.text,
            self.id == other.id,
            self.title == other.title
        ])

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly
        # to keep the two operators consistent.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
class ContentParser(HTMLParser):
    """
    Given a block of HTML, group the content under the preceding
    heading tags which can then be used for creating an index
    for that section.
    """

    # The heading tags (h1 .. h6) that start a new section. Built once
    # instead of on every start/end tag.
    _HEADING_TAGS = frozenset("h%d" % level for level in range(1, 7))

    def __init__(self, *args, **kwargs):
        # HTMLParser is an old-style class in Python 2, so
        # super() won't work here.
        HTMLParser.__init__(self, *args, **kwargs)

        # Sections found so far, in document order.
        self.data = []
        # The section currently being filled; None until the first
        # heading is seen.
        self.section = None
        # True while the parser is between a heading's start and end tag.
        self.is_header_tag = False

    def handle_starttag(self, tag, attrs):
        """Called at the start of every HTML tag."""
        # We only care about the opening tag for headings.
        if tag not in self._HEADING_TAGS:
            return

        # We are dealing with a new header, create a new section
        # for it and assign the ID if it has one.
        self.is_header_tag = True
        self.section = ContentSection()
        self.data.append(self.section)

        for attr in attrs:
            if attr[0] == "id":
                self.section.id = attr[1]

    def handle_endtag(self, tag):
        """Called at the end of every HTML tag."""
        # We only care about the closing tag for headings.
        if tag not in self._HEADING_TAGS:
            return

        self.is_header_tag = False

    def handle_data(self, data):
        """
        Called for the text contents of each tag.
        """
        if self.section is None:
            # This means we have some content at the start of the
            # HTML before we reach a heading tag. We don't actually
            # care about that content as it will be added to the
            # overall page entry in the search. So just skip it.
            return

        # If this is a header, then the data is (part of) the title.
        # Otherwise it is content of something under that header
        # section.
        if self.is_header_tag:
            # A heading containing inline markup arrives as several
            # chunks; accumulate them rather than keeping only the last.
            if self.section.title is None:
                self.section.title = data
            else:
                self.section.title += data
        else:
            self.section.text.append(data.rstrip('\n'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment