Skip to content

Instantly share code, notes, and snippets.

@tanbro
Last active March 9, 2024 09:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tanbro/c4dd926f9a709755e68df4352fb48b15 to your computer and use it in GitHub Desktop.
Save tanbro/c4dd926f9a709755e68df4352fb48b15 to your computer and use it in GitHub Desktop.
Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines.
#!/usr/bin/env python
"""
Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines.
"""
import argparse
import fnmatch
import os
import sys
from functools import lru_cache
from itertools import chain
from pathlib import Path
from textwrap import dedent, shorten
from jinja2 import Template
from pathspec.gitignore import GitIgnoreSpec
from pygments import highlight
from pygments.filter import simplefilter
from pygments.formatters import HtmlFormatter, NullFormatter
from pygments.lexers import get_lexer_for_filename
from pygments.styles import get_all_styles
from pygments.token import Comment, String
from pygments.util import ClassNotFound
# Built-in Jinja2 page template, used when no custom template file is supplied.
# Context variables referenced by the template:
#   title      -- optional document title (rendered only when truthy)
#   style_defs -- CSS text placed verbatim inside the <style> element
#   highlights -- iterable of (file_name, lexer_name, highlight) triples;
#                 ``highlight`` is inserted as-is (it is already HTML), while
#                 the plain-text values are HTML-escaped with the ``e`` filter
#                 so names containing ``<`` or ``&`` cannot break the markup.
DEFAULT_HTML_TEMPLATE = Template(
    dedent(
        """
        <!DOCTYPE html>
        <html>
        <head>
        <title>{{ title|e if title }}</title>
        <meta charset="utf-8"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style type="text/css">
        {{ style_defs }}
        </style>
        </head>
        <body>
        {% for file_name, lexer_name, highlight in highlights %}
        <article>
        <h2>
        {{ file_name|e }}
        <small>({{ lexer_name|e }})</small>
        </h2>
        <article class="hll">
        {{ highlight }}
        </article>
        </article>
        {% endfor %}
        </body>
        </html>
        """
    ).strip()
)
_lines_dict = dict()
def set_args():
    """Declare the command-line interface, parse the arguments and return them.

    After parsing, ``dir`` falls back to ``"."`` and ``sub_dir`` to an empty
    list when they were not given on the command line.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # -- input / output locations --
    cli.add_argument(
        "--dir",
        "-d",
        type=str,
        help="Top directory to search code source files. (default: <CWD>)",
    )
    cli.add_argument(
        "--sub-dir",
        "-s",
        action="append",
        type=str,
        help="Scan for source files only in these sub-directories. Can be specified multiple times. (default: <CWD>)",
    )
    cli.add_argument(
        "output",
        metavar="OUTPUT",
        nargs="?",
        default="-",
        type=argparse.FileType("w", encoding="utf-8"),
        help="Write output HTML file here. (default: <STDOUT>)",
    )

    # -- exclusion rules --
    cli.add_argument(
        "--exclude",
        "-x",
        action="append",
        type=str,
        help="Exclude files match theses patterns in `glob` expression. Can be specified multiple times",
    )

    # -- boolean switches --
    null_formatter_help = (
        "Null formatter output the text unchanged without any formatting."
    )
    if sys.version_info >= (3, 9):
        # BooleanOptionalAction is only available from Python 3.9 on.
        cli.add_argument(
            "--gitignore",
            default=True,
            action=argparse.BooleanOptionalAction,
            help="Follow `.gitignore` files for excluding",
        )
        cli.add_argument(
            "--null-formatter",
            default=False,
            action=argparse.BooleanOptionalAction,
            help=null_formatter_help,
        )
    else:
        cli.add_argument(
            "--no-gitignore",
            action="store_true",
            help="Do NOT follow `.gitignore` files for excluding",
        )
        cli.add_argument(
            "--null-formatter",
            action="store_true",
            help=null_formatter_help,
        )

    # -- rendering options --
    cli.add_argument(
        "--style",
        type=str,
        choices=[*get_all_styles()],
        help="Syntax highlight style",
    )
    linenos_help = (
        "If set to 'table', output line numbers as a table with two cells, one containing the line numbers, the other the whole code. "  # noqa
        "This is copy-and-paste-friendly, but may cause alignment problems with some browsers or fonts. "
        " If set to 'inline', the line numbers will be integrated in the <pre> tag that contains the code"
    )
    cli.add_argument(
        "--linenos",
        type=str,
        choices=["table", "inline"],
        help=linenos_help,
    )
    cli.add_argument(
        "--template",
        type=argparse.FileType("r", encoding="utf-8"),
        help="A custom Jinja2 template file to render the output HTML file",
    )
    cli.add_argument("--title", "-t", type=str, help="Title of the HTML document")

    parsed = cli.parse_args()

    # Fill in the defaults that are not expressed through ``default=``.
    parsed.dir = parsed.dir or "."
    parsed.sub_dir = parsed.sub_dir or []
    return parsed
@simplefilter
def _filter_no_comment(self, lexer, stream, options):
    """Token filter that removes plain comments from the stream.

    Only tokens whose type is exactly ``Comment``, ``Comment.Multiline`` or
    ``Comment.Single`` (identity comparison) are dropped; the other comment
    sub-types (Hashbang, Preproc, PreprocFile, Special) are passed through.
    """
    unwanted = (Comment, Comment.Multiline, Comment.Single)
    for ttype, value in stream:
        if any(ttype is kind for kind in unwanted):
            continue
        yield ttype, value
@simplefilter
def _filter_no_docstring(self, lexer, stream, options):
    """Token filter that removes every ``String.Doc`` token from the stream."""
    for ttype, value in stream:
        if ttype is String.Doc:
            continue
        yield ttype, value
# One instance of each token filter; all of them get attached to every lexer.
_FILTERS = (
    _filter_no_comment(),  # type: ignore
    _filter_no_docstring(),  # type: ignore
)
@lru_cache
def make_git_ignore_spec(gitignore_file):
    """Read *gitignore_file* (UTF-8) and build a ``GitIgnoreSpec`` from its lines.

    The result is memoised by ``lru_cache``, keyed on the path passed in.
    """
    with open(gitignore_file, encoding="utf-8") as stream:
        spec = GitIgnoreSpec.from_lines(stream)
    return spec
def _is_git_ignored(pth, top_path):
    """Tell whether *pth* is matched by a ``.gitignore`` between it and *top_path*.

    Walks from the directory containing *pth* up to and including *top_path*.
    Each ``.gitignore`` found is matched against *pth* expressed relative to
    the directory that holds that ``.gitignore``, because gitignore patterns
    are relative to the location of the file that declares them.

    Both arguments must be resolved paths and *pth* must lie inside
    *top_path*; the caller checks this before calling.
    """
    for parent_dir in pth.parents:
        pth_gitignore = parent_dir.joinpath(".gitignore")
        if pth_gitignore.is_file():
            ignore_spec = make_git_ignore_spec(pth_gitignore)
            if ignore_spec.match_file(pth.relative_to(parent_dir).as_posix()):
                return True
        if parent_dir == top_path:
            # never look at .gitignore files above the top directory
            break
    return False


def main(args):
    """Collect the source files, strip comments/blank lines and write the result.

    Args:
        args: Parsed command-line namespace as returned by ``set_args()``.
            Attributes read here: ``dir``, ``sub_dir``, ``exclude``,
            ``gitignore`` (``no_gitignore`` before Python 3.9),
            ``null_formatter``, ``style``, ``linenos``, ``template``,
            ``title`` and ``output``.

    Raises:
        ValueError: If one of the sub-directories is not located strictly
            inside the top directory. This is checked before any output is
            written.

    Progress lines and the per-lexer summary are printed to stderr; the
    rendered document goes to ``args.output``.
    """
    formatter_options = dict(wrapcode=True)
    if args.style:
        formatter_options.update(style=args.style)
    if args.linenos:
        formatter_options.update(linenos=args.linenos)
    fmt_html = HtmlFormatter(**formatter_options)  # type: ignore
    fmt_null = NullFormatter()
    formatter = fmt_null if args.null_formatter else fmt_html

    # The option is spelled differently depending on the Python version
    # (see set_args); evaluate it once instead of once per file.
    use_gitignore = (
        args.gitignore if sys.version_info >= (3, 9) else not args.no_gitignore
    )

    top_path = Path(args.dir).resolve()
    sub_paths = [top_path.joinpath(pth).resolve() for pth in args.sub_dir or ()]
    # Real containment test: ordering operators on Path objects compare the
    # path components lexicographically and say nothing about nesting.
    if any(top_path not in pth.parents for pth in sub_paths):
        raise ValueError("Sub-directories must be located inside the top directory.")

    def _gen():
        """Yield ``(relative file name, lexer name, formatted code)`` per kept file."""
        lines_total = 0
        counter = 0
        if sub_paths:
            walker = chain.from_iterable(os.walk(pth) for pth in sub_paths)
        else:
            walker = os.walk(top_path)
        for dirpath, _, filenames in walker:
            for name in filenames:
                # ``pth`` is the resolved target, ``filename`` the path as
                # walked, relative to the top directory (used for display
                # and for the --exclude patterns).
                pth = Path(dirpath, name).resolve()
                filename = Path(
                    os.path.normpath(os.path.join(dirpath, name))
                ).relative_to(top_path)
                ################
                # excluding ...
                # ignore none-files
                if not pth.is_file():
                    continue
                # ignore symlinks whose target lies outside the top dir
                try:
                    subdir_parts = pth.parent.relative_to(top_path).parts
                except ValueError:
                    continue
                # ignore hidden files
                if pth.name.startswith("."):
                    continue
                # ignore files located below a hidden directory
                if any(str(part).startswith(".") for part in subdir_parts):
                    continue
                # exclude files from cmd args
                if args.exclude:
                    if any(fnmatch.fnmatch(str(filename), pat) for pat in args.exclude):
                        continue
                ################
                # git-ignore
                if use_gitignore and _is_git_ignored(pth, top_path):
                    continue
                ###################
                # read source file
                with pth.open("rb") as fp:
                    code = fp.read()
                # ignore empty files
                if not code:
                    continue
                try:
                    lexer = get_lexer_for_filename(filename, code)
                except ClassNotFound:
                    # ignore no-supported source files
                    continue
                else:
                    for filter_ in _FILTERS:
                        lexer.add_filter(filter_)
                # Re-assemble the filtered token stream and drop blank lines.
                lines = [
                    line
                    for line in "".join(
                        s for _, s in lexer.get_tokens(code)  # type: ignore
                    ).splitlines()
                    if line.strip()
                ]
                # ignore empty source files
                if not lines:
                    continue
                code = "\n".join(lines)
                counter += 1
                lines_cnt = len(lines)
                lines_total += lines_cnt
                _lines_dict[lexer.name] = _lines_dict.get(lexer.name, 0) + lines_cnt
                print(
                    f"[{counter:05d}] "
                    f"{shorten(str(filename), 88):88} "
                    f"{shorten(lexer.name, 24):24} "  # type: ignore
                    f"lines: {lines_cnt:3,d}/{lines_total:3,d}",
                    file=sys.stderr,
                )
                formatted = highlight(code, lexer, formatter)
                yield filename, lexer.name, formatted  # type: ignore

    if args.null_formatter:
        for filename, lexer_name, formatted in _gen():
            print(f"{filename} ({lexer_name})", file=args.output)
            print(formatted, file=args.output)
    else:
        context = dict(
            style_defs=fmt_html.get_style_defs(),
            highlights=_gen(),
            title=args.title,
        )
        tpl = Template(args.template.read()) if args.template else DEFAULT_HTML_TEMPLATE
        tpl.stream(**context).dump(args.output)

    # Per-lexer summary; printed only after the generator has been consumed.
    print(file=sys.stderr)
    print("=" * 79, file=sys.stderr)
    for k, v in _lines_dict.items():
        print(
            f"{shorten(k, 24):24} " f"lines: {v:3,d}",
            file=sys.stderr,
        )
    print("=" * 79, file=sys.stderr)
if __name__ == "__main__":
    # Use sys.exit rather than the ``exit`` helper: the latter is injected by
    # the ``site`` module and is missing when the interpreter runs without it
    # (e.g. ``python -S``).
    sys.exit(main(set_args()))
Jinja2
Pygments
pathspec
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment