Last active
May 16, 2021 19:59
-
-
Save Caellwyn/85eb123dcbf86afff0a05e17d545859e to your computer and use it in GitHub Desktop.
process_trace_data.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "process_trace_data.ipynb", | |
"private_outputs": true, | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyN34JC9GrOgjuigsIRmqxK9", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Caellwyn/85eb123dcbf86afff0a05e17d545859e/process_trace_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fmF2T3A9dP6R" | |
}, | |
"source": [ | |
"import requests\n", | |
"import io\n", | |
"import pandas as pd\n", | |
"import zipfile\n", | |
"from IPython.display import clear_output, display\n", | |
"from sys import getsizeof\n", | |
"\n", | |
"def import_student_vle():\n", | |
" \n", | |
" \"\"\"\n", | |
" Loads necessary table from anonymisedData.zip\n", | |
" \"\"\"\n", | |
" \n", | |
" url = 'https://github.com/Caellwyn/ou_student_predictions/raw/main/content/anonymisedData.zip'\n", | |
"\n", | |
" #download and unzip the data\n", | |
" OULAD = requests.get(url, stream=True)\n", | |
" zf = zipfile.ZipFile(io.BytesIO(OULAD.content))\n", | |
" del OULAD\n", | |
" student_vle = pd.read_csv(zf.open('studentVle.csv'))\n", | |
" del zf\n", | |
" #combine module, presentation, and student id columns into one registration column\n", | |
" #representing one course registration, or one student in one course.\n", | |
" student_vle['registration'] = student_vle['code_module'] \\\n", | |
" + student_vle['code_presentation'] \\\n", | |
" + student_vle['id_student'].astype(str)\n", | |
" student_vle = student_vle.drop(columns=['code_module','code_presentation','id_student'])\n", | |
" student_vle = student_vle.rename(columns={'sum_click':'sum_clicks'})\n", | |
" #Change the order of the columns for easy reading\n", | |
" student_vle = student_vle[['registration','id_site','date','sum_clicks']]\n", | |
" \n", | |
" #final table is a list of each student interaction with each activity in each course and the \n", | |
" #number of times a student clicked on that activity. \n", | |
" #It contains over 10 million records.\n", | |
" return student_vle\n", | |
"\n", | |
"def group_by_day(student_vle):\n", | |
" \n", | |
" \"\"\"\n", | |
" Groups activity records by registration and by day, rather than by activity. Table\n", | |
" will by organized by registration, then chronologically. Function sums the total clicks a student and \n", | |
" counts the number of activities completed for each student registration for each day.\n", | |
" \"\"\"\n", | |
" \n", | |
" #group by registration and day\n", | |
" vle_group = student_vle.groupby(by = ['registration', 'date'])\n", | |
"\n", | |
" #sum up the clicks each day\n", | |
" sum_clicks = vle_group.sum().reset_index()\n", | |
" sum_clicks = sum_clicks.drop(columns=['id_site'])\n", | |
" sum_clicks = sum_clicks.sort_values(by=['registration','date'])\n", | |
" \n", | |
" #clean up data.\n", | |
" # sum_clicks = sum_clicks.fillna(0)\n", | |
" # sum_clicks = sum_clicks.drop_duplicates(keep='first')\n", | |
" return sum_clicks\n", | |
"\n", | |
"def create_time_series(sum_clicks):\n", | |
" \n", | |
" \"\"\"\n", | |
" Takes the merged table of activities and returns a new table with one row per student registration\n", | |
" and columns for numbers of activities, clicks, and activities * clicks for each day of the course\n", | |
" up to the prediction window\n", | |
" \"\"\"\n", | |
" \n", | |
" #Create a new dataframe to hold our timeseries data.\n", | |
" time_series = pd.DataFrame()\n", | |
" \n", | |
" #Creates one record for each student registration\n", | |
" time_series['registration'] = sum_clicks['registration'].unique()\n", | |
" \n", | |
" start_date = sum_clicks['date'].min()\n", | |
" end_date = sum_clicks['date'].max()\n", | |
" date_range = range(start_date, end_date)\n", | |
" \n", | |
" counter = len(date_range)\n", | |
" \n", | |
" #for each day in the course, create a window filtered show data from only that date.\n", | |
" for date in date_range:\n", | |
" #create a window filtered by one day by passing a boolean expression.\n", | |
" single_date_df = sum_clicks[sum_clicks['date'] == date][['registration',\n", | |
" 'sum_clicks']]\n", | |
" #rename columns to describe date and data.\n", | |
" single_date_df.columns = ['registration'] + [f'clicks_on_day_{date}']\n", | |
" \n", | |
" #merge the dataframe for this day onto the time series dataframe.\n", | |
" time_series = time_series.merge(single_date_df, \n", | |
" how='left', \n", | |
" on='registration',\n", | |
" validate = 'm:1')\n", | |
"\n", | |
" print('Days remaining to be processed: ', counter)\n", | |
" clear_output(wait=True)\n", | |
" counter -= 1\n", | |
" \n", | |
" #fill missing data with zeros. Missing data represents no clicks that day.\n", | |
" time_series = time_series.fillna(0)\n", | |
" time_series = time_series.set_index('registration', drop=True)\n", | |
"\n", | |
" return time_series\n", | |
"\n", | |
"\n", | |
"student_vle = import_student_vle()\n", | |
"sum_clicks = group_by_day(student_vle)\n", | |
"\n", | |
"#save some memory\n", | |
"del student_vle\n", | |
"\n", | |
"time_series = create_time_series(sum_clicks)\n", | |
"\n", | |
"#save some memory\n", | |
"del sum_clicks\n", | |
"\n", | |
"#compress the dataframe\n", | |
"sparse_time_series = time_series.astype(pd.SparseDtype(\"int\", 0))\n", | |
"\n", | |
"print('The size of the uncompressed time series dataframe is: ', getsizeof(time_series), 'Bytes')\n", | |
"print('The size of the sparse time series dataframe is: ', getsizeof(sparse_time_series), 'Bytes')\n", | |
"\n", | |
"display(sparse_time_series)\n", | |
"\n", | |
"# uncomment the line below if you want to save the sparse dataset locally as a csv file\n", | |
"\n", | |
"# sparse_time_series.to_csv('sparse_student_time_series.csv')" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HtyUt_Q3vayv" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment