dpshelio/analyse_grades.py

## analyse_grades.py
from pathlib import Path
import re
import tempfile
import shutil
import shlex
import subprocess
import warnings

import matplotlib.pyplot as plt
import pandas as pd


def get_dates_from_file(dates_file) -> pd.Series:
    """Count how many timestamps in *dates_file* fall on each calendar day.

    The file is read as a header-less CSV whose single column holds one
    timestamp per row.

    Args:
        dates_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A Series named by its index ``date``: the index holds ``YYYY-MM-DD``
        strings and the values are the number of rows seen for that day.
    """
    # NOTE: days are kept as strings (not dates) -- open question from the
    # original author whether that is the best index type.
    def _day_label(moment):
        return f"{moment:%Y-%m-%d}"

    frame = pd.read_csv(dates_file, header=None, names=["timestamp"])
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame["date"] = frame["timestamp"].apply(_day_label)
    return frame.groupby("date").size()


# directory: extract submissions
def extract_submissions(directory_submissions):
    """Map each participant folder's id to the archive stored inside it.

    Folders are expected to be named like
    ``Participant_3548328_assignsubmission_file_``; the id is the second
    underscore-separated field of that name.  Only archives whose file name
    starts with five alphanumeric characters or eight digits followed by
    ``.tar.gz`` are kept.

    Args:
        directory_submissions: Directory containing the ``Participant_*``
            folders.

    Returns:
        Dict keyed by participant id; each value is a dict with the archive's
        ``'path'`` (a ``Path``) and its ``'filename'``.  When a folder holds
        several matching archives, the last one yielded by the glob wins.
    """
    archive_name = re.compile(r'([a-zA-Z0-9]{5}|[0-9]{8})\.tar\.gz')
    candidates = Path(directory_submissions).glob('Participant_*/*tar*')
    return {
        archive.parent.name.split('_')[1]: {'path': archive, 'filename': archive.name}
        for archive in candidates
        if archive_name.match(archive.name)
    }


# for each submission folder: extract tar file
# for each tar file: expand and find the git directory
def extract_tar(tarpath, sub_id):
    """Dump the commit timestamps of every git repository inside an archive.

    The archive is unpacked into a temporary directory, every ``.git`` entry
    below it is located and ``git log --pretty=format:%ci`` is run in the
    directory that contains it.  The raw output is written, in the current
    working directory, to ``<sub_id>_dates`` for the first repository and to
    ``<sub_id>_dates_01``, ``<sub_id>_dates_02``, ... for any further ones.

    Args:
        tarpath: Archive to unpack (any format ``shutil.unpack_archive``
            recognises).
        sub_id: Identifier used as the prefix of the output file names.

    Warns:
        UserWarning: when ``git log`` fails for a repository (for instance one
            without commits); that repository is skipped.
    """
    command = ["git", "log", "--pretty=format:%ci"]
    with tempfile.TemporaryDirectory() as tmpsub:
        # NOTE(review): the archive content is not inspected before unpacking;
        # consider ``filter="data"`` (Python 3.12+) if submissions are untrusted.
        shutil.unpack_archive(tarpath, tmpsub)
        git_counter = 0
        # Sorted so that the numbering of the output files is reproducible.
        for git_dir in sorted(Path(tmpsub).glob('**/.git')):
            git_dates = subprocess.run(command, capture_output=True,
                                       cwd=git_dir.parent)
            if git_dates.returncode != 0:
                reason = git_dates.stderr.decode(errors="replace").strip()
                warnings.warn(f"git log failed for submission {sub_id}: {reason}")
                continue
            output = f"{sub_id}_dates"
            if git_counter != 0:
                output += f"_{git_counter:02d}"
            with open(output, 'bw') as out:
                out.write(git_dates.stdout)
                out.write(b'\n')  # not needed, but looks nicer when checking the files.
            # Without this increment every repository overwrote the same file.
            git_counter += 1

# extract the dates into a dataframe.
def merge_dates(datespath=Path('.'), plot=True, deadline="2022-01-01"):
    """Combine the per-submission commit counts into one table and plot it.

    Every file matching ``*_dates*`` in *datespath* is read with
    ``get_dates_from_file``; its counts become a column named after the part
    of the file name before the first underscore.

    Args:
        datespath: Directory that holds the ``*_dates*`` files.
        plot: When true, save a heat map of the table as ``heatmap.png`` with
            a red horizontal line marking the deadline.
        deadline: Deadline day as a ``YYYY-MM-DD`` string.  If no submission
            has commits on that day the line is drawn between the
            neighbouring rows instead of raising.

    Returns:
        DataFrame with one row per day and one column per submission holding
        integer commit counts (0 where a submission has no commits).

    Warns:
        UserWarning: when a file cannot be merged, or when there is nothing
            to plot.
    """
    all_numbers = pd.DataFrame()
    # Sorted so that the column order does not depend on directory listing order.
    for submission in sorted(Path(datespath).glob('*_dates*')):
        commits_per_day = get_dates_from_file(submission)
        commits_per_day.name = submission.name.split('_')[0]
        try:
            all_numbers = all_numbers.join(commits_per_day, how="outer", sort=True)
        except Exception:
            # Best effort: skip this file but let KeyboardInterrupt/SystemExit
            # through, which the previous bare ``except`` swallowed.
            warnings.warn(f"Could not merge data for student {submission}")
    # This will leave NaNs for dates where no data existed, we'd prefer 0
    all_numbers = all_numbers.fillna(0).astype(int).sort_index()
    if not plot:
        return all_numbers
    if all_numbers.empty:
        warnings.warn("No commit data found; heatmap.png was not created")
        return all_numbers

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Rows (y axis) are days, columns (x axis) are submissions.
    ax.imshow(all_numbers.to_numpy())
    locs = ax.get_yticks()
    n_days = len(all_numbers.index)
    # Replace the numeric row positions by the matching day; ticks that fall
    # outside the table get an empty label.
    labels = [all_numbers.index[int(loc)] if 0 <= loc < n_days else "" for loc in locs]
    ax.set(yticklabels=[])
    ax.tick_params(left=False)
    ax.set_yticks(locs)
    ax.set_yticklabels(labels, rotation=35)

    # The index is sorted, so a binary search gives the deadline's row, or the
    # row it would be inserted before when nobody committed on that day.
    position = int(all_numbers.index.searchsorted(deadline))
    if position < n_days and all_numbers.index[position] == deadline:
        value = position
    else:
        value = position - 0.5  # boundary between the surrounding rows
    ax.hlines(value, 0, all_numbers.shape[1], colors='red')

    fig.savefig("heatmap.png")
    plt.close(fig)  # release the figure so repeated calls do not accumulate
    return all_numbers

if __name__ == "__main__":
    # Only the command-line entry point needs argparse, so import it here.
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract git commit dates from submissions and plot them.")
    parser.add_argument(
        "submissions", nargs="?",
        default='./submissions/coursework_01/02_20211130',
        help="directory holding the Participant_* folders (default: %(default)s)")
    parser.add_argument(
        "--deadline", default="2021-11-23",
        help="deadline day as YYYY-MM-DD (default: %(default)s)")
    args = parser.parse_args()

    # 1. locate the archives, 2. dump each one's commit dates into the current
    # directory, 3. merge those dumps and draw the heat map.
    submissions = extract_submissions(args.submissions)
    for submission, properties in submissions.items():
        extract_tar(properties['path'], submission)
    merge_dates(Path('.'), deadline=args.deadline)

    # TODO Read data from moodle csv
	from pathlib import Path
	import re
	import tempfile
	import shutil
	import shlex
	import subprocess
	import warnings

	import matplotlib.pyplot as plt
	import pandas as pd


	def get_dates_from_file(dates_file) -> pd.Series:
		"""Count how many timestamps in *dates_file* fall on each calendar day.

		The file is read as a header-less CSV whose single column holds one
		timestamp per row.

		Args:
			dates_file: Path (or file-like object) accepted by ``pd.read_csv``.

		Returns:
			A Series whose index (named ``date``) holds ``YYYY-MM-DD`` strings
			and whose values are the number of rows seen for that day.
		"""
		# NOTE: days are kept as strings (not dates) -- open question from the
		# original author whether that is the best index type.
		timestamps = pd.read_csv(dates_file, header=None, names=["timestamp"])
		timestamps["timestamp"] = pd.to_datetime(timestamps["timestamp"])
		timestamps["date"] = timestamps["timestamp"].apply(lambda x: f"{x:%Y-%m-%d}")
		return timestamps.groupby("date").size()


	# directory: extract submissions
	def extract_submissions(directory_submissions):
		"""Map each participant folder's id to the archive stored inside it.

		Folders are expected to be named like
		``Participant_3548328_assignsubmission_file_``; the id is the second
		underscore-separated field of that name.  Only archives whose file name
		starts with five alphanumeric characters or eight digits followed by
		``.tar.gz`` are kept.

		Args:
			directory_submissions: Directory containing the ``Participant_*``
				folders.

		Returns:
			Dict keyed by participant id; each value is a dict with the
			archive's ``'path'`` (a ``Path``) and its ``'filename'``.
		"""
		# The pattern is loop-invariant, so compile it once.  The alternation
		# must be a bare ``|``: an escaped ``\|`` only matches a literal pipe.
		student_id = re.compile(r'([a-zA-Z0-9]{5}|[0-9]{8})\.tar\.gz')
		# Wildcards are required on both path components: the folder name
		# carries the id after ``Participant_`` and the archive name is free.
		submissions = Path(directory_submissions).glob('Participant_*/*tar*')
		submission_ids = {}
		for submission in submissions:
			if not student_id.match(submission.name):
				continue
			id_submission = submission.parent.name.split('_')[1]
			submission_ids[id_submission] = {'path': submission, 'filename': submission.name}
		return submission_ids


	# for each submission folder: extract tar file
	# for each tar file: expand and find the git directory
	def extract_tar(tarpath, sub_id):
		"""Dump the commit timestamps of every git repository inside an archive.

		The archive is unpacked into a temporary directory, every ``.git`` entry
		below it is located and ``git log --pretty=format:%ci`` is run in the
		directory that contains it.  The raw output is written, in the current
		working directory, to ``<sub_id>_dates`` for the first repository and to
		``<sub_id>_dates_01``, ``<sub_id>_dates_02``, ... for any further ones.

		Args:
			tarpath: Archive to unpack (any format ``shutil.unpack_archive``
				recognises).
			sub_id: Identifier used as the prefix of the output file names.

		Warns:
			UserWarning: when ``git log`` fails for a repository (for instance
				one without commits); that repository is skipped.
		"""
		command = ["git", "log", "--pretty=format:%ci"]
		with tempfile.TemporaryDirectory() as tmpsub:
			# NOTE(review): the archive content is not inspected before unpacking;
			# consider ``filter="data"`` (Python 3.12+) if submissions are untrusted.
			shutil.unpack_archive(tarpath, tmpsub)
			git_counter = 0
			# Sorted so that the numbering of the output files is reproducible.
			for git_dir in sorted(Path(tmpsub).glob('**/.git')):
				git_dates = subprocess.run(command, capture_output=True,
										   cwd=git_dir.parent)
				if git_dates.returncode != 0:
					reason = git_dates.stderr.decode(errors="replace").strip()
					warnings.warn(f"git log failed for submission {sub_id}: {reason}")
					continue
				output = f"{sub_id}_dates"
				if git_counter != 0:
					output += f"_{git_counter:02d}"
				with open(output, 'bw') as out:
					out.write(git_dates.stdout)
					out.write(b'\n')  # not needed, but looks nicer when checking the files.
				# Without this increment every repository overwrote the same file.
				git_counter += 1

	# extract the dates into a dataframe.
	def merge_dates(datespath=Path('.'), plot=True, deadline="2022-01-01"):
		"""Combine the per-submission commit counts into one table and plot it.

		Every file matching ``*_dates*`` in *datespath* is read with
		``get_dates_from_file``; its counts become a column named after the part
		of the file name before the first underscore.

		Args:
			datespath: Directory that holds the ``*_dates*`` files.
			plot: When true, save a heat map of the table as ``heatmap.png``
				with a red horizontal line marking the deadline.
			deadline: Deadline day as a ``YYYY-MM-DD`` string.  If no submission
				has commits on that day the line is drawn between the
				neighbouring rows instead of raising.

		Returns:
			DataFrame with one row per day and one column per submission
			holding integer commit counts (0 where a submission has none).

		Warns:
			UserWarning: when a file cannot be merged, or when there is
				nothing to plot.
		"""
		all_numbers = pd.DataFrame()
		# The wildcards matter: a bare '_dates' pattern only matches a file with
		# exactly that name, which has no id in front of the underscore.
		for submission in sorted(Path(datespath).glob('*_dates*')):
			commits_per_day = get_dates_from_file(submission)
			commits_per_day.name = submission.name.split('_')[0]
			try:
				all_numbers = all_numbers.join(commits_per_day, how="outer", sort=True)
			except Exception:
				# Best effort: skip this file but let KeyboardInterrupt/SystemExit
				# through, which the previous bare ``except`` swallowed.
				warnings.warn(f"Could not merge data for student {submission}")
		# This will leave NaNs for dates where no data existed, we'd prefer 0
		all_numbers = all_numbers.fillna(0).astype(int).sort_index()
		if not plot:
			return all_numbers
		if all_numbers.empty:
			warnings.warn("No commit data found; heatmap.png was not created")
			return all_numbers

		fig = plt.figure()
		ax = fig.add_subplot(111)
		# Rows (y axis) are days, columns (x axis) are submissions.
		ax.imshow(all_numbers.to_numpy())
		locs = ax.get_yticks()
		n_days = len(all_numbers.index)
		# Replace the numeric row positions by the matching day; ticks that fall
		# outside the table get an empty label.
		labels = [all_numbers.index[int(loc)] if 0 <= loc < n_days else "" for loc in locs]
		ax.set(yticklabels=[])
		ax.tick_params(left=False)
		ax.set_yticks(locs)
		ax.set_yticklabels(labels, rotation=35)

		# The index is sorted, so a binary search gives the deadline's row, or
		# the row it would be inserted before when nobody committed that day.
		position = int(all_numbers.index.searchsorted(deadline))
		if position < n_days and all_numbers.index[position] == deadline:
			value = position
		else:
			value = position - 0.5  # boundary between the surrounding rows
		ax.hlines(value, 0, all_numbers.shape[1], colors='red')

		fig.savefig("heatmap.png")
		plt.close(fig)  # release the figure so repeated calls do not accumulate
		return all_numbers

	if __name__ == "__main__":
		# Only the command-line entry point needs argparse, so import it here.
		import argparse

		parser = argparse.ArgumentParser(
			description="Extract git commit dates from submissions and plot them.")
		parser.add_argument(
			"submissions", nargs="?",
			default='./submissions/coursework_01/02_20211130',
			help="directory holding the Participant_* folders (default: %(default)s)")
		parser.add_argument(
			"--deadline", default="2021-11-23",
			help="deadline day as YYYY-MM-DD (default: %(default)s)")
		args = parser.parse_args()

		# 1. locate the archives, 2. dump each one's commit dates into the
		# current directory, 3. merge those dumps and draw the heat map.
		submissions = extract_submissions(args.submissions)
		for submission, properties in submissions.items():
			extract_tar(properties['path'], submission)
		merge_dates(Path('.'), deadline=args.deadline)

		# TODO Read data from moodle csv