{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "process_trace_data.ipynb",
"private_outputs": true,
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyN34JC9GrOgjuigsIRmqxK9",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Caellwyn/85eb123dcbf86afff0a05e17d545859e/process_trace_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "fmF2T3A9dP6R"
},
"source": [
"import requests\n",
"import io\n",
"import pandas as pd\n",
"import zipfile\n",
"from IPython.display import clear_output, display\n",
"from sys import getsizeof\n",
"\n",
"def import_student_vle():\n",
" \n",
" \"\"\"\n",
" Loads necessary table from anonymisedData.zip\n",
" \"\"\"\n",
" \n",
" url = 'https://github.com/Caellwyn/ou_student_predictions/raw/main/content/anonymisedData.zip'\n",
"\n",
" #download and unzip the data\n",
" OULAD = requests.get(url, stream=True)\n",
" zf = zipfile.ZipFile(io.BytesIO(OULAD.content))\n",
" del OULAD\n",
" student_vle = pd.read_csv(zf.open('studentVle.csv'))\n",
" del zf\n",
" #combine module, presentation, and student id columns into one registration column\n",
" #representing one course registration, or one student in one course.\n",
" student_vle['registration'] = student_vle['code_module'] \\\n",
" + student_vle['code_presentation'] \\\n",
" + student_vle['id_student'].astype(str)\n",
" student_vle = student_vle.drop(columns=['code_module','code_presentation','id_student'])\n",
" student_vle = student_vle.rename(columns={'sum_click':'sum_clicks'})\n",
" #Change the order of the columns for easy reading\n",
" student_vle = student_vle[['registration','id_site','date','sum_clicks']]\n",
" \n",
" #final table is a list of each student interaction with each activity in each course and the \n",
" #number of times a student clicked on that activity. \n",
" #It contains over 10 million records.\n",
" return student_vle\n",
"\n",
"def group_by_day(student_vle):\n",
" \n",
" \"\"\"\n",
" Groups activity records by registration and by day, rather than by activity. Table\n",
" will by organized by registration, then chronologically. Function sums the total clicks a student and \n",
" counts the number of activities completed for each student registration for each day.\n",
" \"\"\"\n",
" \n",
" #group by registration and day\n",
" vle_group = student_vle.groupby(by = ['registration', 'date'])\n",
"\n",
" #sum up the clicks each day\n",
" sum_clicks = vle_group.sum().reset_index()\n",
" sum_clicks = sum_clicks.drop(columns=['id_site'])\n",
" sum_clicks = sum_clicks.sort_values(by=['registration','date'])\n",
" \n",
" #clean up data.\n",
" # sum_clicks = sum_clicks.fillna(0)\n",
" # sum_clicks = sum_clicks.drop_duplicates(keep='first')\n",
" return sum_clicks\n",
"\n",
"def create_time_series(sum_clicks):\n",
" \n",
" \"\"\"\n",
" Takes the merged table of activities and returns a new table with one row per student registration\n",
" and columns for numbers of activities, clicks, and activities * clicks for each day of the course\n",
" up to the prediction window\n",
" \"\"\"\n",
" \n",
" #Create a new dataframe to hold our timeseries data.\n",
" time_series = pd.DataFrame()\n",
" \n",
" #Creates one record for each student registration\n",
" time_series['registration'] = sum_clicks['registration'].unique()\n",
" \n",
" start_date = sum_clicks['date'].min()\n",
" end_date = sum_clicks['date'].max()\n",
" date_range = range(start_date, end_date)\n",
" \n",
" counter = len(date_range)\n",
" \n",
" #for each day in the course, create a window filtered show data from only that date.\n",
" for date in date_range:\n",
" #create a window filtered by one day by passing a boolean expression.\n",
" single_date_df = sum_clicks[sum_clicks['date'] == date][['registration',\n",
" 'sum_clicks']]\n",
" #rename columns to describe date and data.\n",
" single_date_df.columns = ['registration'] + [f'clicks_on_day_{date}']\n",
" \n",
" #merge the dataframe for this day onto the time series dataframe.\n",
" time_series = time_series.merge(single_date_df, \n",
" how='left', \n",
" on='registration',\n",
" validate = 'm:1')\n",
"\n",
" print('Days remaining to be processed: ', counter)\n",
" clear_output(wait=True)\n",
" counter -= 1\n",
" \n",
" #fill missing data with zeros. Missing data represents no clicks that day.\n",
" time_series = time_series.fillna(0)\n",
" time_series = time_series.set_index('registration', drop=True)\n",
"\n",
" return time_series\n",
"\n",
"\n",
"student_vle = import_student_vle()\n",
"sum_clicks = group_by_day(student_vle)\n",
"\n",
"#save some memory\n",
"del student_vle\n",
"\n",
"time_series = create_time_series(sum_clicks)\n",
"\n",
"#save some memory\n",
"del sum_clicks\n",
"\n",
"#compress the dataframe\n",
"sparse_time_series = time_series.astype(pd.SparseDtype(\"int\", 0))\n",
"\n",
"print('The size of the uncompressed time series dataframe is: ', getsizeof(time_series), 'Bytes')\n",
"print('The size of the sparse time series dataframe is: ', getsizeof(sparse_time_series), 'Bytes')\n",
"\n",
"display(sparse_time_series)\n",
"\n",
"# uncomment the line below if you want to save the sparse dataset locally as a csv file\n",
"\n",
"# sparse_time_series.to_csv('sparse_student_time_series.csv')"
],
"execution_count": null,
"outputs": []
},
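{
"cell_type": "markdown",
"metadata": {
"id": "reload_sketch_md"
},
"source": [
"If you uncomment the `to_csv` line above, the cell below is a minimal sketch of how the saved file could be loaded back. It assumes `sparse_student_time_series.csv` exists in the working directory; `pd.read_csv` returns a dense frame, so the sparse dtype is reapplied to recover the memory savings."
]
},
{
"cell_type": "code",
"metadata": {
"id": "reload_sketch"
},
"source": [
"import pandas as pd\n",
"\n",
"#minimal sketch, assuming the csv was saved by the cell above\n",
"#read_csv returns a dense frame, so reapply the sparse integer dtype\n",
"reloaded = pd.read_csv('sparse_student_time_series.csv', index_col='registration')\n",
"reloaded = reloaded.astype(pd.SparseDtype('int', 0))\n",
"reloaded.info()"
],
"execution_count": null,
"outputs": []
},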
{
"cell_type": "code",
"metadata": {
"id": "HtyUt_Q3vayv"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}