Skip to content

Instantly share code, notes, and snippets.

@Syncrossus
Last active December 6, 2019 16:13
Show Gist options
  • Save Syncrossus/46acf43baefaeba6eef1f346a71e0b95 to your computer and use it in GitHub Desktop.
Save Syncrossus/46acf43baefaeba6eef1f346a71e0b95 to your computer and use it in GitHub Desktop.
Determines the proportion of Python code that consists of comments and docstrings. To use, type `python comment_proportions.py file1.py [file2.py] [file3.py] [...]` in your terminal.
from pygments import highlight
from pygments.lexers import get_lexer_by_name
from pygments.formatters import BBCodeFormatter
import re
import sys
def get_comments(code):
    """ Extracts comments and docstrings from python code.

    Runs the pygments Python lexer over the code and keeps the text of
    every token classified as a comment or as a docstring. The token
    stream is inspected directly rather than rendering BBCode and
    matching hard-coded colour tags, so the result does not depend on
    the colours used by the installed pygments style.

    Args:
        - code (str) : the code to extract comments from
    Return:
        - comments (list<str>) : the comments and docstrings
                                 extracted from the code; all comments
                                 come first, followed by all docstrings
    """
    # Local import: only needs the pygments package this module
    # already depends on, and keeps the block self-contained.
    from pygments.token import Comment, String

    lexer = get_lexer_by_name("python", stripall=True)
    comments = []
    docstrings = []
    for token_type, text in lexer.get_tokens(code):
        # `in` on token types is a subtype test, so this also accepts
        # e.g. Comment.Single and Comment.Hashbang.
        if token_type in Comment:
            comments.append(text)
        elif token_type in String.Doc:
            docstrings.append(text)
    return comments + docstrings
def compute_comment_stats(file_list):
    """ Finds the total length, the length of comments, and the ratio
    of the two, for each file and in total.

    Lengths are character counts of the text as read in text mode
    (``len`` of the decoded string), even though the result keys are
    labelled "bytes". When a total length is zero (an empty file, or an
    empty file list) the ratio is reported as 0.0 instead of raising
    ZeroDivisionError.

    Args:
        - file_list (list<str>): the list of files to compute stats for
    Return:
        - stats (dict): a dict with filenames as keys and dicts of
                        statistics as values, plus a "total" entry
                        aggregating all files. Note that a file whose
                        name is literally "total" is overwritten by
                        the aggregate entry.
    """
    def ratio(part, whole):
        # Guard against empty input: nothing to measure means 0.0.
        return part / whole if whole else 0.0

    stats = {}
    total_len = 0
    total_comment_len = 0
    for path in file_list:
        with open(path, 'r') as f:
            source_code = f.read()
        source_len = len(source_code)
        comment_len = sum(len(comment) for comment in get_comments(source_code))
        total_len += source_len
        total_comment_len += comment_len
        stats[path] = {
            "comment bytes": comment_len,
            "total bytes": source_len,
            "ratio": ratio(comment_len, source_len),
        }
    stats["total"] = {
        "comment bytes": total_comment_len,
        "total bytes": total_len,
        "ratio": ratio(total_comment_len, total_len),
    }
    return stats
if __name__ == '__main__':
    # Every command-line argument after the script name is treated as
    # the path of a file to analyse.
    file_paths = sys.argv[1:]
    stats = compute_comment_stats(file_paths)
    print(stats)
@Syncrossus
Copy link
Author

This code is released under the WTFPL.

@Syncrossus
Copy link
Author

If you liked this, check out cloc; it's a fantastic counter for lines of code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment