Skip to content

Instantly share code, notes, and snippets.

@fnurl
Last active January 15, 2020 07:07
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fnurl/586dbdb7d313f1911580ae873d5ad213 to your computer and use it in GitHub Desktop.
Save fnurl/586dbdb7d313f1911580ae873d5ad213 to your computer and use it in GitHub Desktop.
A script that produces a JSON page index file for markdown files (extension `.md`) in a directory and its subdirectories (e.g. a Hugo site's (https://gohugo.io/) `content` directory) for use with Algolia Docsearch (https://github.com/algolia/docsearch).
import os
import sys
import yaml
import json
# Prefix that every generated page URL starts with.
base_url = "http://localhost:1313"

# Attribute mapping for docsearch: each key is a docsearch property, each
# value names the front matter property it is filled from.
#
# A list value (as used for 'tags') names several front matter
# taxonomies whose values are combined into the single docsearch
# property.
docsearch_mapping = {
    "content": "content",
    "url": "url",
    "tags": ["tags", "categories"],
}

# Default values for the docsearch `weight` property of every entry.
docsearch_weight = dict(position=1, level=10, page_rank=0)

# Name of the top hierarchy level. Sections (sub directories) are used
# for the additional levels below it.
base_level = "Hugo Site"
def parse_md(filepath):
    """Parse a markdown file, reading its optional YAML front matter.

    The front matter is the text between the first two lines that start
    with ``---``. Every other line is collected as the page content; this
    includes later ``---`` lines, which are markdown horizontal rules and
    not front matter delimiters.

    Args:
        filepath: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A dict with the front matter properties. The markdown body is
        stored under the key ``"content"`` unless the front matter already
        defines that key; in that case an error is written to stderr and
        the front matter value is kept. A file without front matter
        yields a dict that only contains ``"content"``.
    """
    yaml_lines = []
    content_lines = []
    # None: no delimiter seen yet, True: inside the front matter,
    # False: front matter has been closed.
    in_yaml = None
    with open(filepath, encoding="utf-8") as datafile:
        for line in datafile:
            # Only the first two delimiter lines toggle the state; once the
            # front matter is closed, '---' is ordinary content.
            if in_yaml is not False and line.startswith("---"):
                in_yaml = not in_yaml
                continue
            if in_yaml:
                yaml_lines.append(line)
            else:
                content_lines.append(line)

    yaml_string = "".join(yaml_lines)
    md_data = None
    if yaml_string.strip():
        # safe_load instead of load: front matter is plain data, and
        # yaml.load() without an explicit Loader can construct arbitrary
        # objects (and is rejected outright by PyYAML >= 6).
        md_data = yaml.safe_load(yaml_string)

    if md_data is None:
        # No front matter at all, or front matter without any properties.
        md_data = {}
    elif not isinstance(md_data, dict):
        sys.stderr.write("ERROR: Front matter of '" + filepath + "' is not a mapping\n")
        md_data = {}

    if "content" not in md_data:
        md_data["content"] = "".join(content_lines)
    else:
        sys.stderr.write("ERROR: Could not store content for '" + filepath + "'\n")
    return md_data
def create_index_list(walk_dir):
    """Create a list of docsearch index entries for the markdown files in walk_dir.

    Walks walk_dir and all of its sub directories and creates one entry
    for every file whose name ends in ``.md``. The module-level settings
    ``base_level``, ``base_url``, ``docsearch_mapping`` and
    ``docsearch_weight`` control the generated values.

    Args:
        walk_dir: Path of the directory to index.

    Returns:
        A list of dicts, one per markdown file. Each dict has the keys
        ``objectID``, ``url``, ``hierarchy``, ``hierarchy_complete``,
        ``hierarchy_radio``, ``type``, ``anchor`` and ``weight``, plus
        the properties configured in ``docsearch_mapping``.
    """
    # number of hierarchy levels (lvl0 .. lvl6) in a hierarchy structure
    max_levels = 7

    # used to store all indexed items (markdown files)
    index_list = []

    # give items an objectID so that the index file can be uploaded
    # to algolia again, overwriting previous index items
    object_id = 0

    for root, subdirs, files in os.walk(walk_dir):
        # os.walk yields entries in arbitrary order; sorting (in place for
        # the directories, so the walk itself follows it) keeps the
        # objectIDs reproducible between runs on the same tree.
        subdirs.sort()
        for filename in sorted(files):
            # index md files only
            if not filename.endswith(".md"):
                continue

            object_id += 1
            filepath = os.path.join(root, filename)

            # path of the file's directory relative to walk_dir; files
            # directly in walk_dir have no sub path components at all
            subpath = root[len(walk_dir):].strip(os.sep)
            subpaths = subpath.split(os.sep) if subpath else []

            # index.md have special URLs (the URL of their directory)
            if filename != "index.md":
                subpaths.append(filename[:-3])

            # hierarchy of the markdown file, top level first
            hierarchy_list = [base_level] + subpaths

            # construct the url of the markdown file
            url = base_url + "/" + "".join(part + "/" for part in subpaths)
            sys.stderr.write("Indexing '" + filepath + "' (" + url + ")\n")

            # get data from the file (front matter and content)
            filedata = parse_md(filepath)

            # create index entry
            indexed_item = {'objectID': object_id, 'url': url}

            # map filedata to docsearch structure
            for docsearch_key, filedata_key in docsearch_mapping.items():
                if isinstance(filedata_key, str):
                    # plain mappings, configured at the top of the script
                    if filedata_key in filedata:
                        indexed_item[docsearch_key] = filedata[filedata_key]
                elif isinstance(filedata_key, list):
                    # list mappings combine the values of several front
                    # matter properties into one list (used for "tags")
                    aggregated = []
                    for filedata_subkey in filedata_key:
                        value = filedata.get(filedata_subkey)
                        if value is None:
                            # property missing or left empty
                            continue
                        if isinstance(value, (list, tuple)):
                            aggregated.extend(value)
                        else:
                            # a single scalar value, e.g. `tags: foo`
                            aggregated.append(value)
                    indexed_item[docsearch_key] = aggregated

            # hierarchy and hierarchy_complete properties; levels beyond
            # the last available one are dropped
            hierarchy = create_empty_hierarchy()
            hierarchy_complete = create_empty_hierarchy()
            for level, name in enumerate(hierarchy_list[:max_levels]):
                hierarchy["lvl" + str(level)] = name
                # full path down to and including this level
                hierarchy_complete["lvl" + str(level)] = " > ".join(hierarchy_list[:level + 1])
            indexed_item["hierarchy"] = hierarchy
            indexed_item["hierarchy_complete"] = hierarchy_complete

            # hierarchy_radio and type: only the deepest level of the
            # hierarchy is set, using the same level index as `hierarchy`
            deepest = min(len(hierarchy_list), max_levels) - 1
            hierarchy_radio = create_empty_hierarchy()
            hierarchy_radio["lvl" + str(deepest)] = hierarchy_list[deepest]
            indexed_item["hierarchy_radio"] = hierarchy_radio
            indexed_item["type"] = "lvl" + str(deepest)

            # anchor and weight. anchors are not considered
            indexed_item["anchor"] = None
            # copy, so entries do not share one mutable dict
            indexed_item["weight"] = dict(docsearch_weight)

            index_list.append(indexed_item)

    sys.stderr.write("Done indexing .md files in '" + walk_dir + "'" + "\n")
    return index_list
def create_empty_hierarchy(levels=7):
    """Create an empty hierarchy structure (dict).

    Args:
        levels: Number of hierarchy levels to create. Defaults to 7,
            which gives the keys ``lvl0`` to ``lvl6``.

    Returns:
        A new dict that maps ``"lvl0"`` .. ``"lvl<levels - 1>"`` to None.
    """
    return {"lvl" + str(level_index): None for level_index in range(levels)}
if __name__ == '__main__':
    # Usage: <script> <directory>; the JSON index is written to stdout,
    # progress and error messages go to stderr.
    if len(sys.argv) != 2:
        sys.stderr.write("ERROR: Please supply a path to the dir to be indexed.\n")
        sys.exit(1)

    # os.walk silently yields nothing for a path that is not a directory,
    # which would produce an empty index instead of an error
    if not os.path.isdir(sys.argv[1]):
        sys.stderr.write("ERROR: '" + sys.argv[1] + "' is not a directory.\n")
        sys.exit(1)

    # gather index data
    index_list = create_index_list(sys.argv[1])

    # output the index as readable json to stdout. does not escape UTF-8 characters
    sys.stdout.write(json.dumps(index_list, ensure_ascii=False, indent=2))
    # end the output with a newline
    sys.stdout.write("\n")
@fnurl
Copy link
Author

fnurl commented Jul 1, 2016

If you use Hugo, you can set your site up by editing your config.yaml to contain your Algolia App ID, API key and the index you want to use as follows:

params:
  algoliaappid: <your appId>
  algoliaapikey: <your API key>
  algoliaindex: <your index>

Then use the following HTML in your template

docsearch script and css:

<script type="text/javascript" src="//cdn.jsdelivr.net/docsearch.js/1/docsearch.min.js"></script>

snippet for the search box:

<input type="text" id="search-input" />
  <script type="text/javascript">
    docsearch({
      apiKey: '{{ .Site.Params.algoliaapikey }}',
      indexName: '{{ .Site.Params.algoliaindex}}',
      inputSelector: '#search-input',
      appId: '{{ .Site.Params.algoliaappid }}',
      debug: true
    });
  </script>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment