tanbro/grab_source_files.py

## grab_source_files.py
#!/usr/bin/env python

"""
Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines.
"""

import argparse
import fnmatch
import os
import sys
from functools import lru_cache
from itertools import chain
from pathlib import Path
from textwrap import dedent, shorten

from jinja2 import Template
from pathspec.gitignore import GitIgnoreSpec
from pygments import highlight
from pygments.filter import simplefilter
from pygments.formatters import HtmlFormatter, NullFormatter
from pygments.lexers import get_lexer_for_filename
from pygments.styles import get_all_styles
from pygments.token import Comment, String
from pygments.util import ClassNotFound

# Built-in Jinja2 page template, used when no custom ``--template`` is given.
# Context variables consumed by the template:
#   title      - optional document title (nothing is rendered when it is falsy)
#   style_defs - CSS text placed inside the <style> element
#   highlights - iterable of (file_name, lexer_name, highlight) triples
# The document is properly terminated with </body> and </html>.
DEFAULT_HTML_TEMPLATE = Template(
    dedent(
        """
        <!DOCTYPE html>
        <html>
        <head>
            <title>{{ title if title }}</title>
            <meta charset="utf-8"/>
            <meta name="viewport" content="width=device-width, initial-scale=1">
            <style type="text/css">
            {{ style_defs }}
            </style>
        </head>
        <body>
            {% for file_name, lexer_name, highlight in highlights %}
            <article>
                <h2>
                    {{ file_name }}
                    <small>({{ lexer_name }})</small>
                </h2>
                <article class="hll">
                    {{ highlight }}
                </article>
            </article>
            {% endfor %}
        </body>
        </html>
        """
    ).strip()
)


_lines_dict = dict()


def set_args():
    """Declare the command-line interface, parse it and return the namespace.

    ``parse_args()`` is called without arguments, so the process command line
    is what gets parsed. After parsing, ``dir`` falls back to ``"."`` and
    ``sub_dir`` to an empty list when they were not supplied.

    The ``--gitignore`` / ``--null-formatter`` switches are declared with
    ``argparse.BooleanOptionalAction`` on Python >= 3.9; older interpreters get
    ``--no-gitignore`` and a plain ``store_true`` ``--null-formatter`` instead.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument

    # --- where to look / where to write ---
    add(
        "--dir",
        "-d",
        type=str,
        help="Top directory to search code source files. (default: <CWD>)",
    )
    add(
        "--sub-dir",
        "-s",
        type=str,
        action="append",
        help="Scan for source files only in these sub-directories. Can be specified multiple times. (default: <CWD>)",
    )
    add(
        "output",
        metavar="OUTPUT",
        nargs="?",
        type=argparse.FileType("w", encoding="utf-8"),
        default="-",
        help="Write output HTML file here. (default: <STDOUT>)",
    )

    # --- what to leave out ---
    add(
        "--exclude",
        "-x",
        type=str,
        action="append",
        help="Exclude files match theses patterns in `glob` expression. Can be specified multiple times",
    )

    # --- boolean switches (declaration style depends on the interpreter) ---
    if sys.version_info >= (3, 9):
        add(
            "--gitignore",
            action=argparse.BooleanOptionalAction,
            default=True,
            help="Follow `.gitignore` files for excluding",
        )
        add(
            "--null-formatter",
            action=argparse.BooleanOptionalAction,
            default=False,
            help="Null formatter output the text unchanged without any formatting.",
        )
    else:
        add(
            "--no-gitignore",
            action="store_true",
            help="Do NOT follow `.gitignore` files for excluding",
        )
        add(
            "--null-formatter",
            action="store_true",
            help="Null formatter output the text unchanged without any formatting.",
        )

    # --- rendering options ---
    add(
        "--style",
        type=str,
        choices=list(get_all_styles()),
        help="Syntax highlight style",
    )
    add(
        "--linenos",
        type=str,
        choices=["table", "inline"],
        help="If set to 'table', output line numbers as a table with two cells, one containing the line numbers, the other the whole code. "  # noqa
        "This is copy-and-paste-friendly, but may cause alignment problems with some browsers or fonts. "
        " If set to 'inline', the line numbers will be integrated in the <pre> tag that contains the code",
    )
    add(
        "--template",
        type=argparse.FileType("r", encoding="utf-8"),
        help="A custom Jinja2 template file to render the output HTML file",
    )
    add("--title", "-t", type=str, help="Title of the HTML document")

    parsed = parser.parse_args()

    # Normalise the optional locations so callers never see ``None``.
    parsed.dir = parsed.dir or "."
    parsed.sub_dir = parsed.sub_dir or []
    return parsed


@simplefilter
def _filter_no_comment(self, lexer, stream, options):
    """Token filter that drops generic, single-line and multi-line comments.

    Only tokens whose type is exactly ``Comment``, ``Comment.Multiline`` or
    ``Comment.Single`` are removed; every other token, including the remaining
    comment sub-types (hashbang, preprocessor, special), is passed through.
    """
    for ttype, value in stream:
        if (
            ttype is Comment
            or ttype is Comment.Multiline
            or ttype is Comment.Single
        ):
            continue
        yield ttype, value


@simplefilter
def _filter_no_docstring(self, lexer, stream, options):
    """Token filter that drops every token typed exactly ``String.Doc``."""
    for ttype, value in stream:
        if ttype is String.Doc:
            continue
        yield ttype, value


# Filter instances to attach to a lexer: comments first, then docstrings.
_FILTERS = (
    _filter_no_comment(),  # type: ignore
    _filter_no_docstring(),  # type: ignore
)


@lru_cache
def make_git_ignore_spec(gitignore_file):
    """Parse the given ``.gitignore`` file into a ``GitIgnoreSpec``.

    The file is read as UTF-8 and its lines are handed to
    ``GitIgnoreSpec.from_lines``. Results are memoized by ``lru_cache``, keyed
    on *gitignore_file*, so the argument has to be hashable.
    """
    with open(gitignore_file, encoding="utf-8") as stream:
        spec = GitIgnoreSpec.from_lines(stream)
    return spec


def _is_git_ignored(pth, top_path):
    """Tell whether *pth* is excluded by a ``.gitignore`` at or below *top_path*.

    Walks from the directory containing *pth* up to and including *top_path*.
    Every ``.gitignore`` found on the way is matched against the path of the
    file relative to the directory holding that ``.gitignore``, because that
    directory is the base its patterns are written against.

    The first file that reports a match wins; negation patterns in one
    ``.gitignore`` do not override a match coming from another one.
    """
    for parent_dir in pth.parents:
        pth_gitignore = parent_dir.joinpath(".gitignore")
        if pth_gitignore.is_file():
            ignore_spec = make_git_ignore_spec(pth_gitignore)
            if ignore_spec.match_file(pth.relative_to(parent_dir).as_posix()):
                return True
        if parent_dir == top_path:
            # never consult .gitignore files above the top directory
            break
    return False


def main(args):
    """Scan the source tree, strip comments/blank lines and write the report.

    Args:
        args: Parsed command-line namespace. The attributes read are ``dir``,
            ``sub_dir``, ``exclude``, ``gitignore`` (``no_gitignore`` on
            Python < 3.9), ``null_formatter``, ``style``, ``linenos``,
            ``template``, ``title`` and ``output``.

    Raises:
        ValueError: If an entry of ``args.sub_dir`` does not resolve to a
            location strictly inside ``args.dir``.

    Side effects:
        Writes the rendered document to ``args.output``, prints one progress
        line per kept file plus a per-lexer summary to ``sys.stderr`` and
        accumulates line counts in the module-level ``_lines_dict``.
    """
    formatter_options = dict(wrapcode=True)
    if args.style:
        formatter_options.update(style=args.style)
    if args.linenos:
        formatter_options.update(linenos=args.linenos)
    fmt_html = HtmlFormatter(**formatter_options)  # type: ignore
    fmt_null = NullFormatter()

    def _gen():
        """Yield ``(relative_path, lexer_name, formatted_code)`` per kept file."""
        lines_total = 0
        counter = 0
        top_path = Path(args.dir).resolve()
        if args.sub_dir:
            sub_paths = [top_path.joinpath(pth).resolve() for pth in args.sub_dir]
            # ``Path`` ordering (<, <=) compares the path parts
            # lexicographically and is not a containment test, so ancestry is
            # checked explicitly: every sub-directory must live strictly
            # inside the top directory.
            if any(top_path not in pth.parents for pth in sub_paths):
                raise ValueError(
                    "Sub-directories can not be smaller than or equal to top dir."
                )
            walker = chain.from_iterable(os.walk(pth) for pth in sub_paths)
        else:
            walker = os.walk(top_path)
        # The switch is declared under a different name on Python < 3.9.
        follow_gitignore = (
            args.gitignore if sys.version_info >= (3, 9) else not args.no_gitignore
        )
        for dirpath, _, names in walker:
            for name in names:
                # ``pth`` is the fully resolved file, ``filename`` its location
                # relative to the top directory (used for display/matching).
                pth = Path(dirpath, name).resolve()
                filename = Path(
                    os.path.normpath(os.path.join(dirpath, name))
                ).relative_to(top_path)
                ################
                # excluding ...
                # ignore non-files
                if not pth.is_file():
                    continue
                # ignore symlinks resolving outside the top directory
                try:
                    subdir_parts = pth.parent.relative_to(top_path).parts
                except ValueError:
                    continue
                # ignore hidden files
                if pth.name.startswith("."):
                    continue
                # ignore files located under hidden directories
                if any(str(part).startswith(".") for part in subdir_parts):
                    continue
                # exclude files from cmd args
                if args.exclude:
                    if any(fnmatch.fnmatch(str(filename), pat) for pat in args.exclude):
                        continue
                ################
                # git-ignore
                if follow_gitignore and _is_git_ignored(pth, top_path):
                    continue

                ###################
                # read source file
                with pth.open("rb") as fp:
                    code = fp.read()
                # ignore empty files
                if not code:
                    continue
                try:
                    lexer = get_lexer_for_filename(filename, code)
                except ClassNotFound:
                    # ignore unsupported source files
                    continue
                for filter_ in _FILTERS:
                    lexer.add_filter(filter_)
                # Re-assemble the filtered token stream and keep only the
                # lines that still carry non-whitespace content.
                lines = [
                    line
                    for line in "".join(
                        s for _, s in lexer.get_tokens(code)  # type: ignore
                    ).splitlines()
                    if line.strip()
                ]
                # ignore source files that end up empty
                if not lines:
                    continue
                code = "\n".join(lines)

                counter += 1
                lines_cnt = len(lines)
                lines_total += lines_cnt

                _lines_dict[lexer.name] = _lines_dict.get(lexer.name, 0) + lines_cnt

                print(
                    f"[{counter:05d}] "
                    f"{shorten(str(filename), 88):88} "
                    f"{shorten(lexer.name, 24):24} "  # type: ignore
                    f"lines: {lines_cnt:3,d}/{lines_total:3,d}",
                    file=sys.stderr,
                )

                formatted = highlight(
                    code, lexer, fmt_null if args.null_formatter else fmt_html
                )
                yield filename, lexer.name, formatted  # type: ignore

    if args.null_formatter:
        # Plain-text mode: a heading line followed by the stripped code.
        for filename, lexer_name, formatted in _gen():
            print(f"{filename} ({lexer_name})", file=args.output)
            print(formatted, file=args.output)
    else:
        # HTML mode: the generator is handed to the template and consumed
        # while the template is streamed to the output.
        context = dict(
            style_defs=fmt_html.get_style_defs(),
            highlights=_gen(),
            title=args.title,
        )
        tpl = Template(args.template.read()) if args.template else DEFAULT_HTML_TEMPLATE
        tpl.stream(**context).dump(args.output)

    # Per-lexer summary of the kept lines.
    print(file=sys.stderr)
    print("=" * 79, file=sys.stderr)
    for k, v in _lines_dict.items():
        print(
            f"{shorten(k, 24):24} " f"lines: {v:3,d}",
            file=sys.stderr,
        )
    print("=" * 79, file=sys.stderr)


if __name__ == "__main__":
    # Use ``sys.exit`` rather than the ``exit`` helper injected by ``site``:
    # that helper targets interactive sessions and does not exist when the
    # ``site`` module is not loaded.
    sys.exit(main(set_args()))

## requirements.txt
Jinja2
Pygments
pathspec
	#!/usr/bin/env python

	"""
	Export all source code files into a single HTML file with Pygments syntax highlight and remove all comments and empty lines.
	"""

	import argparse
	import fnmatch
	import os
	import sys
	from functools import lru_cache
	from itertools import chain
	from pathlib import Path
	from textwrap import dedent, shorten

	from jinja2 import Template
	from pathspec.gitignore import GitIgnoreSpec
	from pygments import highlight
	from pygments.filter import simplefilter
	from pygments.formatters import HtmlFormatter, NullFormatter
	from pygments.lexers import get_lexer_for_filename
	from pygments.styles import get_all_styles
	from pygments.token import Comment, String
	from pygments.util import ClassNotFound

	# Built-in Jinja2 page template, used when no custom ``--template`` is given.
	# Context variables consumed by the template:
	#   title      - optional document title (nothing is rendered when it is falsy)
	#   style_defs - CSS text placed inside the <style> element
	#   highlights - iterable of (file_name, lexer_name, highlight) triples
	# The document is properly terminated with </body> and </html>.
	DEFAULT_HTML_TEMPLATE = Template(
	    dedent(
	        """
	        <!DOCTYPE html>
	        <html>
	        <head>
	            <title>{{ title if title }}</title>
	            <meta charset="utf-8"/>
	            <meta name="viewport" content="width=device-width, initial-scale=1">
	            <style type="text/css">
	            {{ style_defs }}
	            </style>
	        </head>
	        <body>
	            {% for file_name, lexer_name, highlight in highlights %}
	            <article>
	                <h2>
	                    {{ file_name }}
	                    <small>({{ lexer_name }})</small>
	                </h2>
	                <article class="hll">
	                    {{ highlight }}
	                </article>
	            </article>
	            {% endfor %}
	        </body>
	        </html>
	        """
	    ).strip()
	)


	_lines_dict = dict()


	def set_args():
	    """Declare the command-line interface, parse it and return the namespace.

	    ``parse_args()`` is called without arguments, so the process command line
	    is what gets parsed. After parsing, ``dir`` falls back to ``"."`` and
	    ``sub_dir`` to an empty list when they were not supplied.

	    The ``--gitignore`` / ``--null-formatter`` switches are declared with
	    ``argparse.BooleanOptionalAction`` on Python >= 3.9; older interpreters get
	    ``--no-gitignore`` and a plain ``store_true`` ``--null-formatter`` instead.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    add = parser.add_argument

	    # --- where to look / where to write ---
	    add(
	        "--dir",
	        "-d",
	        type=str,
	        help="Top directory to search code source files. (default: <CWD>)",
	    )
	    add(
	        "--sub-dir",
	        "-s",
	        type=str,
	        action="append",
	        help="Scan for source files only in these sub-directories. Can be specified multiple times. (default: <CWD>)",
	    )
	    add(
	        "output",
	        metavar="OUTPUT",
	        nargs="?",
	        type=argparse.FileType("w", encoding="utf-8"),
	        default="-",
	        help="Write output HTML file here. (default: <STDOUT>)",
	    )

	    # --- what to leave out ---
	    add(
	        "--exclude",
	        "-x",
	        type=str,
	        action="append",
	        help="Exclude files match theses patterns in `glob` expression. Can be specified multiple times",
	    )

	    # --- boolean switches (declaration style depends on the interpreter) ---
	    if sys.version_info >= (3, 9):
	        add(
	            "--gitignore",
	            action=argparse.BooleanOptionalAction,
	            default=True,
	            help="Follow `.gitignore` files for excluding",
	        )
	        add(
	            "--null-formatter",
	            action=argparse.BooleanOptionalAction,
	            default=False,
	            help="Null formatter output the text unchanged without any formatting.",
	        )
	    else:
	        add(
	            "--no-gitignore",
	            action="store_true",
	            help="Do NOT follow `.gitignore` files for excluding",
	        )
	        add(
	            "--null-formatter",
	            action="store_true",
	            help="Null formatter output the text unchanged without any formatting.",
	        )

	    # --- rendering options ---
	    add(
	        "--style",
	        type=str,
	        choices=list(get_all_styles()),
	        help="Syntax highlight style",
	    )
	    add(
	        "--linenos",
	        type=str,
	        choices=["table", "inline"],
	        help="If set to 'table', output line numbers as a table with two cells, one containing the line numbers, the other the whole code. "  # noqa
	        "This is copy-and-paste-friendly, but may cause alignment problems with some browsers or fonts. "
	        " If set to 'inline', the line numbers will be integrated in the <pre> tag that contains the code",
	    )
	    add(
	        "--template",
	        type=argparse.FileType("r", encoding="utf-8"),
	        help="A custom Jinja2 template file to render the output HTML file",
	    )
	    add("--title", "-t", type=str, help="Title of the HTML document")

	    parsed = parser.parse_args()

	    # Normalise the optional locations so callers never see ``None``.
	    parsed.dir = parsed.dir or "."
	    parsed.sub_dir = parsed.sub_dir or []
	    return parsed


	@simplefilter
	def _filter_no_comment(self, lexer, stream, options):
	    """Token filter that drops generic, single-line and multi-line comments.

	    Only tokens whose type is exactly ``Comment``, ``Comment.Multiline`` or
	    ``Comment.Single`` are removed; every other token, including the remaining
	    comment sub-types (hashbang, preprocessor, special), is passed through.
	    """
	    for ttype, value in stream:
	        if (
	            ttype is Comment
	            or ttype is Comment.Multiline
	            or ttype is Comment.Single
	        ):
	            continue
	        yield ttype, value


	@simplefilter
	def _filter_no_docstring(self, lexer, stream, options):
	    """Token filter that drops every token typed exactly ``String.Doc``."""
	    for ttype, value in stream:
	        if ttype is String.Doc:
	            continue
	        yield ttype, value


	# Filter instances to attach to a lexer: comments first, then docstrings.
	_FILTERS = (
	    _filter_no_comment(),  # type: ignore
	    _filter_no_docstring(),  # type: ignore
	)


	@lru_cache
	def make_git_ignore_spec(gitignore_file):
	    """Parse the given ``.gitignore`` file into a ``GitIgnoreSpec``.

	    The file is read as UTF-8 and its lines are handed to
	    ``GitIgnoreSpec.from_lines``. Results are memoized by ``lru_cache``, keyed
	    on *gitignore_file*, so the argument has to be hashable.
	    """
	    with open(gitignore_file, encoding="utf-8") as stream:
	        spec = GitIgnoreSpec.from_lines(stream)
	    return spec


	def _is_git_ignored(pth, top_path):
	    """Tell whether *pth* is excluded by a ``.gitignore`` at or below *top_path*.

	    Walks from the directory containing *pth* up to and including *top_path*.
	    Every ``.gitignore`` found on the way is matched against the path of the
	    file relative to the directory holding that ``.gitignore``, because that
	    directory is the base its patterns are written against.

	    The first file that reports a match wins; negation patterns in one
	    ``.gitignore`` do not override a match coming from another one.
	    """
	    for parent_dir in pth.parents:
	        pth_gitignore = parent_dir.joinpath(".gitignore")
	        if pth_gitignore.is_file():
	            ignore_spec = make_git_ignore_spec(pth_gitignore)
	            if ignore_spec.match_file(pth.relative_to(parent_dir).as_posix()):
	                return True
	        if parent_dir == top_path:
	            # never consult .gitignore files above the top directory
	            break
	    return False


	def main(args):
	    """Scan the source tree, strip comments/blank lines and write the report.

	    Args:
	        args: Parsed command-line namespace. The attributes read are ``dir``,
	            ``sub_dir``, ``exclude``, ``gitignore`` (``no_gitignore`` on
	            Python < 3.9), ``null_formatter``, ``style``, ``linenos``,
	            ``template``, ``title`` and ``output``.

	    Raises:
	        ValueError: If an entry of ``args.sub_dir`` does not resolve to a
	            location strictly inside ``args.dir``.

	    Side effects:
	        Writes the rendered document to ``args.output``, prints one progress
	        line per kept file plus a per-lexer summary to ``sys.stderr`` and
	        accumulates line counts in the module-level ``_lines_dict``.
	    """
	    formatter_options = dict(wrapcode=True)
	    if args.style:
	        formatter_options.update(style=args.style)
	    if args.linenos:
	        formatter_options.update(linenos=args.linenos)
	    fmt_html = HtmlFormatter(**formatter_options)  # type: ignore
	    fmt_null = NullFormatter()

	    def _gen():
	        """Yield ``(relative_path, lexer_name, formatted_code)`` per kept file."""
	        lines_total = 0
	        counter = 0
	        top_path = Path(args.dir).resolve()
	        if args.sub_dir:
	            sub_paths = [top_path.joinpath(pth).resolve() for pth in args.sub_dir]
	            # ``Path`` ordering (<, <=) compares the path parts
	            # lexicographically and is not a containment test, so ancestry is
	            # checked explicitly: every sub-directory must live strictly
	            # inside the top directory.
	            if any(top_path not in pth.parents for pth in sub_paths):
	                raise ValueError(
	                    "Sub-directories can not be smaller than or equal to top dir."
	                )
	            walker = chain.from_iterable(os.walk(pth) for pth in sub_paths)
	        else:
	            walker = os.walk(top_path)
	        # The switch is declared under a different name on Python < 3.9.
	        follow_gitignore = (
	            args.gitignore if sys.version_info >= (3, 9) else not args.no_gitignore
	        )
	        for dirpath, _, names in walker:
	            for name in names:
	                # ``pth`` is the fully resolved file, ``filename`` its location
	                # relative to the top directory (used for display/matching).
	                pth = Path(dirpath, name).resolve()
	                filename = Path(
	                    os.path.normpath(os.path.join(dirpath, name))
	                ).relative_to(top_path)
	                ################
	                # excluding ...
	                # ignore non-files
	                if not pth.is_file():
	                    continue
	                # ignore symlinks resolving outside the top directory
	                try:
	                    subdir_parts = pth.parent.relative_to(top_path).parts
	                except ValueError:
	                    continue
	                # ignore hidden files
	                if pth.name.startswith("."):
	                    continue
	                # ignore files located under hidden directories
	                if any(str(part).startswith(".") for part in subdir_parts):
	                    continue
	                # exclude files from cmd args
	                if args.exclude:
	                    if any(fnmatch.fnmatch(str(filename), pat) for pat in args.exclude):
	                        continue
	                ################
	                # git-ignore
	                if follow_gitignore and _is_git_ignored(pth, top_path):
	                    continue

	                ###################
	                # read source file
	                with pth.open("rb") as fp:
	                    code = fp.read()
	                # ignore empty files
	                if not code:
	                    continue
	                try:
	                    lexer = get_lexer_for_filename(filename, code)
	                except ClassNotFound:
	                    # ignore unsupported source files
	                    continue
	                for filter_ in _FILTERS:
	                    lexer.add_filter(filter_)
	                # Re-assemble the filtered token stream and keep only the
	                # lines that still carry non-whitespace content.
	                lines = [
	                    line
	                    for line in "".join(
	                        s for _, s in lexer.get_tokens(code)  # type: ignore
	                    ).splitlines()
	                    if line.strip()
	                ]
	                # ignore source files that end up empty
	                if not lines:
	                    continue
	                code = "\n".join(lines)

	                counter += 1
	                lines_cnt = len(lines)
	                lines_total += lines_cnt

	                _lines_dict[lexer.name] = _lines_dict.get(lexer.name, 0) + lines_cnt

	                print(
	                    f"[{counter:05d}] "
	                    f"{shorten(str(filename), 88):88} "
	                    f"{shorten(lexer.name, 24):24} "  # type: ignore
	                    f"lines: {lines_cnt:3,d}/{lines_total:3,d}",
	                    file=sys.stderr,
	                )

	                formatted = highlight(
	                    code, lexer, fmt_null if args.null_formatter else fmt_html
	                )
	                yield filename, lexer.name, formatted  # type: ignore

	    if args.null_formatter:
	        # Plain-text mode: a heading line followed by the stripped code.
	        for filename, lexer_name, formatted in _gen():
	            print(f"{filename} ({lexer_name})", file=args.output)
	            print(formatted, file=args.output)
	    else:
	        # HTML mode: the generator is handed to the template and consumed
	        # while the template is streamed to the output.
	        context = dict(
	            style_defs=fmt_html.get_style_defs(),
	            highlights=_gen(),
	            title=args.title,
	        )
	        tpl = Template(args.template.read()) if args.template else DEFAULT_HTML_TEMPLATE
	        tpl.stream(**context).dump(args.output)

	    # Per-lexer summary of the kept lines.
	    print(file=sys.stderr)
	    print("=" * 79, file=sys.stderr)
	    for k, v in _lines_dict.items():
	        print(
	            f"{shorten(k, 24):24} " f"lines: {v:3,d}",
	            file=sys.stderr,
	        )
	    print("=" * 79, file=sys.stderr)


	if __name__ == "__main__":
	    # Use ``sys.exit`` rather than the ``exit`` helper injected by ``site``:
	    # that helper targets interactive sessions and does not exist when the
	    # ``site`` module is not loaded.
	    sys.exit(main(set_args()))