Skip to content

Instantly share code, notes, and snippets.

@AlDanial
Created December 4, 2021 20:38
Show Gist options
  • Save AlDanial/ef0ae8a83e699e76f4ab1692b0ed2c02 to your computer and use it in GitHub Desktop.
Program to compute histogram bins of code line counts by project folder using line counts from cloc.
#!/usr/bin/env python
# A solution to
# https://stackoverflow.com/questions/70182311/is-there-a-tool-that-shows-a-distribution-of-lines-of-code-per-file-of-a-folder
import sys
import os.path
import pandas as pd
def add_folder(df):
    """
    Return a Pandas dataframe with an additional 'folder' column
    containing each file's parent directory.

    Any column whose name contains the cloc banner text
    ('github.com/AlDanial/cloc') is removed from the result.  Rows whose
    'filename' is missing (e.g. a summary row, if the CSV has one) get a
    missing value in 'folder'.  The dataframe passed in is not modified;
    a new one is returned.

    Raises KeyError if df has no 'filename' column.
    """
    header = 'github.com/AlDanial/cloc'
    # regex=False: match the banner text literally.  With the default
    # regex matching, each '.' in the banner would match any character.
    is_banner = df.columns.str.contains(header, regex=False)
    df = df.drop(columns=df.columns[is_banner])
    # dropna() keeps os.path.dirname from being called on missing values;
    # the assignment re-aligns on the index, leaving those rows missing.
    df['folder'] = df['filename'].dropna().apply(os.path.dirname)
    return df
def bin_by_folder(df, bin_width=50):
    """
    Return, for each folder, a histogram of per-file 'code' line counts.

    df must have 'folder' and 'code' columns.  bin_width is the integer
    width of each histogram bin (default 50).  The bin edges always span
    at least the multiples of bin_width below 1000 and are extended
    upward when a file in some folder has more code lines than that, so
    large files are counted instead of falling outside every bin.

    The result is a Series of counts indexed by (folder, bin interval),
    sorted by index.

    Raises ValueError if bin_width is not positive.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be a positive integer")
    # Default top edge: the largest multiple of bin_width below 1000
    # (950 for the default width), but never less than one full bin.
    top = max(bin_width, (999 // bin_width) * bin_width)
    # Rows without a folder are ignored by the groupby below, so they
    # must not influence how far the bins reach.
    largest = df.loc[df['folder'].notna(), 'code'].max()
    if pd.notna(largest) and largest > top:
        # Ceiling division: round the largest count up to a bin edge.
        top = int(-(-largest // bin_width) * bin_width)
    edges = list(range(0, top + 1, bin_width))
    return df.groupby('folder')['code'].value_counts(bins=edges).sort_index()
def file_count_by_folder(df):
    """
    Return a dataframe indexed by 'folder' with a single 'file count'
    column: the number of non-null 'blank' entries in each folder.
    """
    per_folder = df.pivot_table(index=['folder'], aggfunc='count')
    # Every counted column holds the same per-folder tally; 'blank' is
    # the one used as the file count.
    only_blank = per_folder[['blank']]
    return only_blank.rename(columns={'blank': 'file count'})
def main():
    """
    Command-line entry point: read a cloc by-file CSV and print three
    per-folder reports (line-count sums, file counts, and a histogram of
    code lines per file), separated by dashed rules.

    Expects exactly one command-line argument, the CSV path.  Otherwise
    prints a usage message to stderr and raises SystemExit with status 1.
    """
    if len(sys.argv) != 2:
        # Usage errors go to stderr and exit non-zero so that calling
        # scripts can detect the failure.
        print(f"Usage: {sys.argv[0]} data.csv", file=sys.stderr)
        print(" where the .csv file is created with", file=sys.stderr)
        print(" cloc --by-file --csv --out data.csv my_code_base",
              file=sys.stderr)
        raise SystemExit(1)

    # Show complete tables instead of pandas' truncated display.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)

    df = add_folder(pd.read_csv(sys.argv[1]))

    # Sum only the numeric line-count columns; without values= the
    # aggregation would also be applied to the text columns.
    print(pd.pivot_table(df, index=['folder'],
                         values=['blank', 'comment', 'code'],
                         aggfunc='sum'))
    print('-' * 50)
    print(file_count_by_folder(df))
    print('-' * 50)
    print(bin_by_folder(df))
# Run the report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment