-
-
Save dpshelio/aa57ecd98a253b390d6c0156d27891af to your computer and use it in GitHub Desktop.
Plot the distribution of commit frequency for multiple submissions (WIP)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import re | |
import tempfile | |
import shutil | |
import shlex | |
import subprocess | |
import warnings | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
def get_dates_from_file(dates_file) -> pd.Series:
    """Count the commits made on each calendar day.

    Parameters
    ----------
    dates_file : path or file-like
        Headerless, single-column text with one timestamp per line, e.g.
        ``2021-11-20 14:03:12 +0000``.

    Returns
    -------
    pd.Series
        Number of timestamps per day, indexed by the day formatted as a
        ``YYYY-MM-DD`` string. The day is taken in the UTC offset written in
        each timestamp, not converted to a common time zone.
    """
    # Is this the best way? Should the index values be strings?
    timestamps = pd.read_csv(dates_file, header=None, names=["timestamp"])
    # Parse row by row instead of calling pd.to_datetime on the whole column:
    # a history that crosses a daylight-saving change mixes UTC offsets
    # (+0100 / +0000), which column-wise parsing warns about or rejects
    # unless everything is converted to UTC -- and that would shift commits
    # made around midnight onto a different day.
    timestamps["date"] = [
        pd.Timestamp(raw).strftime("%Y-%m-%d") for raw in timestamps["timestamp"]
    ]
    return timestamps.groupby("date").size()
# directory: extract submissions | |
def extract_submissions(directory_submissions):
    """Locate the submitted archives below a directory of participant folders.

    Parameters
    ----------
    directory_submissions : path-like
        Directory containing folders named like
        ``Participant_3548328_assignsubmission_file_``.

    Returns
    -------
    dict
        Maps the second ``_``-separated field of the folder name (``3548328``
        in the example above) to ``{'path': <Path to archive>,
        'filename': <archive file name>}``. Only archives named exactly
        ``<id>.tar.gz`` are kept, where ``<id>`` is five alphanumeric
        characters or eight digits.
    """
    # Compiled once rather than on every iteration.
    student_id = re.compile(r'([a-zA-Z0-9]{5}|[0-9]{8})\.tar\.gz')
    submission_ids = {}
    for submission in Path(directory_submissions).glob('Participant_*/*tar*'):
        # fullmatch, not match: a name such as "abcde.tar.gz.part" must be
        # rejected here, otherwise it fails later when it gets unpacked.
        if not student_id.fullmatch(submission.name):
            continue
        id_submission = submission.parent.name.split('_')[1]
        submission_ids[id_submission] = {
            'path': submission,
            'filename': submission.name,
        }
    return submission_ids
# for each submission folder: extract tar file | |
# for each tar file: expand and find the git directory | |
def extract_tar(tarpath, sub_id):
    """Write the commit timestamps of every git repository in an archive.

    The archive is unpacked into a temporary directory and every ``.git``
    entry found below it is treated as a repository. For each one, the output
    of ``git log --pretty=format:%ci`` is written to a file in the current
    working directory: ``<sub_id>_dates`` for the first repository, then
    ``<sub_id>_dates_01``, ``<sub_id>_dates_02``, ... for any further ones.

    Parameters
    ----------
    tarpath : path-like
        Archive to unpack (format inferred by ``shutil.unpack_archive``).
    sub_id : str
        Prefix used for the output file names.
    """
    command = ["git", "log", "--pretty=format:%ci"]
    with tempfile.TemporaryDirectory() as tmpsub:
        # NOTE(review): the archives are submitted by third parties; consider
        # passing filter="data" to unpack_archive where the running Python
        # supports it, to guard against path traversal.
        shutil.unpack_archive(tarpath, tmpsub)
        # Sorted so the numbering of the output files is reproducible.
        git_dirs = sorted(Path(tmpsub).glob('**/.git'))
        # enumerate provides the counter; previously it was never incremented,
        # so several repositories in one archive overwrote the same file.
        for git_counter, git_dir in enumerate(git_dirs):
            location = git_dir.parent
            git_dates = subprocess.run(command, capture_output=True,
                                       cwd=location)
            if git_dates.returncode != 0:
                warnings.warn(
                    f"git log failed for submission {sub_id} in "
                    f"{location.relative_to(tmpsub)}"
                )
            output = f"{sub_id}_dates"
            if git_counter != 0:
                output += f"_{git_counter:02d}"
            with open(output, 'bw') as out:
                out.write(git_dates.stdout)
                out.write(b'\n')  # not needed, but looks nicer when checking the files.
# extract the dates into a dataframe. | |
def merge_dates(datespath=Path('.'), plot=True, deadline="2022-01-01"):
    """Combine the per-submission commit counts into one table.

    Every file matching ``*_dates*`` in ``datespath`` is read with
    ``get_dates_from_file`` and becomes one column, named after the part of
    the file name before the first ``_``. A file whose column cannot be
    joined (for instance because that name is already present) is skipped
    with a warning.

    Parameters
    ----------
    datespath : path-like
        Directory holding the ``*_dates*`` files.
    plot : bool
        When true and there is data, save a heatmap to ``heatmap.png`` in the
        current working directory.
    deadline : str
        Day to mark with a red line, in the same ``YYYY-MM-DD`` form as the
        table's index.

    Returns
    -------
    pd.DataFrame
        Commits per day (rows) and per submission (columns); days without
        commits hold 0.
    """
    all_numbers = pd.DataFrame()
    # Sorted so the column order does not depend on directory listing order.
    for submission in sorted(Path(datespath).glob('*_dates*')):
        commits_per_day = get_dates_from_file(submission)
        commits_per_day.name = submission.name.split('_')[0]
        try:
            all_numbers = all_numbers.join(commits_per_day, how="outer", sort=True)
        except Exception:  # best effort, but let KeyboardInterrupt through
            warnings.warn(f"Could not merge data for student {submission}")
    # This will leave NaNs for dates where no data existed, we'd prefer 0
    all_numbers = all_numbers.fillna(0).astype(int)
    if plot:
        if all_numbers.empty:
            warnings.warn("No commit data found, heatmap not written")
        else:
            _plot_heatmap(all_numbers, deadline)
    return all_numbers


def _plot_heatmap(all_numbers, deadline):
    """Save ``heatmap.png``: one row per day, one column per submission."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.imshow(all_numbers.to_numpy())
    dates = list(all_numbers.index)
    # Keep only the automatic ticks that sit exactly on a row, so that every
    # label names the row it is drawn next to.
    locs = [loc for loc in ax.get_yticks()
            if 0 <= loc < len(dates) and float(loc).is_integer()]
    ax.tick_params(left=False)
    ax.set_yticks(locs)
    ax.set_yticklabels([dates[int(loc)] for loc in locs], rotation=35)
    if deadline in dates:
        deadline_row = dates.index(deadline)
    else:
        # Nobody committed on the deadline day: draw the line on the boundary
        # between the last earlier day and the first later one.
        deadline_row = sum(day < deadline for day in dates) - 0.5
    # Row/column i of the image covers i - 0.5 .. i + 0.5.
    ax.hlines(deadline_row, -0.5, all_numbers.shape[1] - 0.5, colors='red')
    fig.savefig("heatmap.png")
    plt.close(fig)  # release the figure
if __name__ == "__main__":
    # Imported here because it is only needed when run as a script.
    import argparse

    parser = argparse.ArgumentParser(
        description="Plot a heatmap of commits per day for a set of submissions."
    )
    parser.add_argument(
        "submissions",
        nargs="?",
        default='./submissions/coursework_01/02_20211130',
        help="directory containing the Participant_* folders "
             "(default: %(default)s)",
    )
    parser.add_argument(
        "--deadline",
        default="2021-11-23",
        help="deadline to mark on the heatmap, as YYYY-MM-DD "
             "(default: %(default)s)",
    )
    args = parser.parse_args()

    submissions = extract_submissions(args.submissions)
    for submission, properties in submissions.items():
        extract_tar(properties['path'], submission)
    # The date files are written to the current working directory.
    merge_dates(Path('.'), deadline=args.deadline)
    # TODO Read data from moodle csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment