Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created November 6, 2023 17:01
Show Gist options
  • Save pszemraj/60de22b09e9c91a16443519a1741e0c0 to your computer and use it in GitHub Desktop.
Save pszemraj/60de22b09e9c91a16443519a1741e0c0 to your computer and use it in GitHub Desktop.
Heuristics for a language-agnostic code readability index
import re
from itertools import chain
# Patterns are compiled once at import time; none of them depends on the input.
_IDENTIFIER_RE = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")
_COMMENT_MARKER_RE = re.compile(r"//|/\*|\*/|#")
_CONTROL_KEYWORD_RE = re.compile(
    r"\b(if|else|for|while|switch|case|default|continue|break)\b", re.I
)
_FUNCTION_KEYWORD_RE = re.compile(r"\b(def|function|func|sub)\b", re.I)
_LEADING_WHITESPACE_RE = re.compile(r"\s*")


def calculate_readability(
    code_string: str,
    *,
    max_line_length: int = 80,
    min_identifier_length: int = 2,
    max_identifier_length: int = 20,
    target_comment_density: float = 0.15,
) -> float:
    """Return a heuristic, language-agnostic readability score for source code.

    The score is the unweighted mean of five sub-scores, each mapped so that a
    larger value is better:

    1. Line length: share of lines no longer than ``max_line_length``.
    2. Identifier length: share of identifier-like tokens whose length lies in
       ``[min_identifier_length, max_identifier_length]``. No grammar is
       applied, so keywords and words inside strings/comments count as well.
    3. Comment density: closeness of the fraction of lines containing a
       comment marker (``//``, ``/*``, ``*/`` or ``#``) to
       ``target_comment_density``.
    4. Complexity: ``1 / (1 + c)`` where ``c`` is the number of control-flow
       keywords plus one, divided by the number of function keywords plus one.
    5. Indentation: share of consecutive non-blank line pairs whose leading
       whitespace differs by at most one character.

    Args:
        code_string: The source code to score.
        max_line_length: Lines longer than this many characters count as long.
        min_identifier_length: Identifiers shorter than this count as bad.
        max_identifier_length: Identifiers longer than this count as bad.
        target_comment_density: Fraction of comment-bearing lines that is
            treated as ideal.

    Returns:
        The aggregate score, a float between 0 and 1. Empty input is scored
        like a single blank line instead of raising ``ZeroDivisionError``.
    """
    # "".splitlines() is []; substituting one blank line keeps every per-line
    # ratio below well defined and matches the result for "\n".
    lines = code_string.splitlines() or [""]
    line_count = len(lines)
    text = " ".join(lines)

    # Heuristic 1: line length.
    long_lines = sum(1 for line in lines if len(line) > max_line_length)
    long_line_ratio = long_lines / line_count

    # Heuristic 2: identifier length.
    identifiers = _IDENTIFIER_RE.findall(text)
    bad_identifiers = sum(
        1
        for name in identifiers
        if not min_identifier_length <= len(name) <= max_identifier_length
    )
    bad_identifier_ratio = bad_identifiers / len(identifiers) if identifiers else 0.0

    # Heuristic 3: comment density, measured as distance from the target.
    comment_lines = sum(1 for line in lines if _COMMENT_MARKER_RE.search(line))
    comment_density_deviation = abs(
        comment_lines / line_count - target_comment_density
    )

    # Heuristic 4: cyclomatic complexity, normalized by the number of functions.
    # The +1 on both sides avoids a zero numerator and a zero denominator.
    control_count = len(_CONTROL_KEYWORD_RE.findall(text))
    function_count = len(_FUNCTION_KEYWORD_RE.findall(text))
    cyclomatic_complexity = (control_count + 1) / (function_count + 1)

    # Heuristic 5: indentation consistency over non-blank lines only.
    # NOTE(review): any change of more than one whitespace character counts as
    # inconsistent, so conventional 2- or 4-space indents are penalized too.
    indent_widths = [
        _LEADING_WHITESPACE_RE.match(line).end() for line in lines if line.strip()
    ]
    indent_jumps = sum(
        1
        for previous, current in zip(indent_widths, indent_widths[1:])
        if abs(current - previous) > 1
    )
    pair_count = len(indent_widths) - 1
    indentation_inconsistency_ratio = (
        indent_jumps / pair_count if pair_count > 0 else 0.0
    )

    # Map every heuristic onto a "higher is better" scale capped to [0, 1].
    normalized_scores = {
        "long_line_ratio": 1 - min(long_line_ratio, 1),
        "bad_identifier_ratio": 1 - min(bad_identifier_ratio, 1),
        "comment_density_deviation": 1 - min(comment_density_deviation, 1),
        "normalized_cyclomatic_complexity": 1 / (1 + cyclomatic_complexity),
        "indentation_inconsistency_ratio": 1 - min(indentation_inconsistency_ratio, 1),
    }

    # The aggregate score is the plain average of the normalized scores.
    return sum(normalized_scores.values()) / len(normalized_scores)
# Demo: score a tiny sample snippet and print its readability index.
# The sample is assembled from adjacent literals, one per source line.
code_example = (
    "def calculate_readability(code):\n"
    "# This function calculates readability\n"
    "avg_line_length = sum(len(line) for line in code) / len(code)\n"
    "return avg_line_length"
)
readability_score = calculate_readability(code_example)
print(readability_score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment