
@he7d3r
Last active December 26, 2019 12:08
Generate statistics from Moodle logs

Usage

  1. Open the Moodle course of interest and go to Administration > Course administration > Reports > Logs.
  2. Click on "Get these logs".
  3. Download the table data as comma-separated values (e.g. input1.csv). The script will use this as one of its input files.
  4. Create a file "videos.csv" with a column "title" (containing the titles exactly as they appear in the logs) and a column "length", with the length (in minutes) of each video.
  5. Run the script, passing the names of the CSV files to be used as input and output:
$ python process-moodle-logs.py --logs input1.csv --videos videos.csv --stats output.csv --aggregated output_agg.csv
  6. Check out the two resulting files:
  • output.csv: the full statistics for each student (date, title and length of the videos watched):
    • Time: when the video was watched
    • User: name of the student
    • Context: title of the video watched
    • length: length of the video (in minutes)
  • output_agg.csv: the aggregated statistics for each student:
    • User: name of the student
    • Classes: number of classes equivalent to the number of minutes of video watched
    • Minutes: total number of minutes of video watched
    • % Minutes: proportion (in %) of the total number of minutes of video watched
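For reference, videos.csv could look like the sketch below (the titles are hypothetical; they must match the video titles that appear in the logs' Context column, without the "Videoaula: " prefix):

```csv
title,length
Aula 1,12.5
Aula 2,33.0
```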
#!/usr/bin/python3
import argparse
from math import ceil

import pandas as pd


def process(input_files, videos, output_file, aggregated):
    prefixes = ["URL: Videoaula: ", "Arquivo: Videoaula: ", "File: Videoaula: "]
    lessons = pd.read_csv(videos, index_col=0)
    total_length = lessons['length'].sum()
    minutes_per_class = 100
    fieldnames = ["Time", "User", "Affected", "Context", "Component", "Event",
                  "Description", "Origin", "IP"]
    logs = pd.concat((pd.read_csv(file, names=fieldnames) for file in input_files),
                     ignore_index=True)
    re_prefixes = '.*(?:' + '|'.join(prefixes) + ')'
    # Restrict to video lessons' logs (copy to avoid SettingWithCopyWarning below)
    logs = logs[logs.Context.str.contains(re_prefixes)].copy()
    # Remove unnecessary prefixes
    logs.Context = logs.Context.str.replace(re_prefixes, '', regex=True)
    # Keep only the most recent log of each (user, video) pair
    # (the most recent entries appear first in the files)
    logs = logs.drop_duplicates(subset=['User', 'Context'], keep='first')
    logs = logs[['Time', 'User', 'Context']]
    # Add the length column from videos.csv
    logs = logs.join(lessons, on='Context', how='inner')
    logs = logs.reset_index(drop=True)
    logs = logs.sort_values(by=['User', 'length'], ascending=[True, False])
    # Sum only the numeric length column per user
    agg_logs = logs.groupby('User')[['length']].sum()
    agg_logs['Classes'] = agg_logs['length'].divide(minutes_per_class).apply(ceil)
    agg_logs['Minutes'] = agg_logs['length'].apply(ceil)
    agg_logs['% Minutes'] = agg_logs['Minutes'].divide(total_length).multiply(100).apply(ceil)
    # Export results to two CSV files (full and aggregated data)
    logs.to_csv(output_file, index=False)
    col_names = ["Classes", "Minutes", "% Minutes"]
    agg_logs[col_names].to_csv(aggregated, index=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get statistics from Moodle logs.')
    parser.add_argument('-l', '--logs', nargs='+',
                        help='names of the csv log files downloaded from Moodle')
    parser.add_argument('-v', '--videos',
                        help='name of the csv file with the title and length of each video')
    parser.add_argument('-s', '--stats',
                        help='name of the file where the full output will be saved')
    parser.add_argument('-a', '--aggregated',
                        help='name of the file for the aggregated statistics')
    args = parser.parse_args()
    process(args.logs, args.videos, args.stats, args.aggregated)
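The aggregation step above can be checked in isolation. This sketch applies the same arithmetic (Classes = ceil(minutes / 100), % Minutes relative to the total video length) to toy data; the user names and lengths are made up for illustration:

```python
import pandas as pd
from math import ceil

# Hypothetical per-view minutes, one row per video watched
logs = pd.DataFrame({
    "User": ["Alice", "Alice", "Bob"],
    "length": [90.0, 30.0, 45.0],
})
total_length = 200.0     # assumed total minutes of all videos
minutes_per_class = 100  # same constant the script uses

# Same aggregation as in process(): sum minutes per user, then derive columns
agg = logs.groupby("User")[["length"]].sum()
agg["Classes"] = agg["length"].divide(minutes_per_class).apply(ceil)
agg["Minutes"] = agg["length"].apply(ceil)
agg["% Minutes"] = agg["Minutes"].divide(total_length).multiply(100).apply(ceil)

print(agg[["Classes", "Minutes", "% Minutes"]])
# Alice watched 120 min -> 2 classes, 60% of the total;
# Bob watched 45 min -> 1 class, 23% (ceil of 22.5)
```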