Caellwyn/From Trace Data to a Wide Format Time Series Table.ipynb

## From Trace Data to a Wide Format Time Series Table.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "process_trace_data.ipynb",
      "private_outputs": true,
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyN34JC9GrOgjuigsIRmqxK9",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/Caellwyn/85eb123dcbf86afff0a05e17d545859e/process_trace_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fmF2T3A9dP6R"
      },
      "source": [
        "import requests\n",
        "import io\n",
        "import pandas as pd\n",
        "import zipfile\n",
        "from IPython.display import clear_output, display\n",
        "from sys import getsizeof\n",
        "\n",
        "def import_student_vle():\n",
        "    \n",
        "    \"\"\"\n",
        "    Loads necessary table from anonymisedData.zip\n",
        "    \"\"\"\n",
        "    \n",
        "    url = 'https://github.com/Caellwyn/ou_student_predictions/raw/main/content/anonymisedData.zip'\n",
        "\n",
        "    #download and unzip the data\n",
        "    OULAD = requests.get(url, stream=True)\n",
        "    zf = zipfile.ZipFile(io.BytesIO(OULAD.content))\n",
        "    del OULAD\n",
        "    student_vle = pd.read_csv(zf.open('studentVle.csv'))\n",
        "    del zf\n",
        "    #combine module, presentation, and student id columns into one registration column\n",
        "    #representing one course registration, or one student in one course.\n",
        "    student_vle['registration'] = student_vle['code_module'] \\\n",
        "                                    + student_vle['code_presentation'] \\\n",
        "                                    + student_vle['id_student'].astype(str)\n",
        "    student_vle = student_vle.drop(columns=['code_module','code_presentation','id_student'])\n",
        "    student_vle = student_vle.rename(columns={'sum_click':'sum_clicks'})\n",
        "    #Change the order of the columns for easy reading\n",
        "    student_vle = student_vle[['registration','id_site','date','sum_clicks']]\n",
        "    \n",
        "    #final table is a list of each student interaction with each activity in each course and the \n",
        "    #number of times a student clicked on that activity.  \n",
        "    #It contains over 10 million records.\n",
        "    return student_vle\n",
        "\n",
        "def group_by_day(student_vle):\n",
        "    \n",
        "    \"\"\"\n",
        "    Groups activity records by registration and by day, rather than by activity. Table\n",
        "    will by organized by registration, then chronologically.  Function sums the total clicks a student and \n",
        "    counts the number of activities completed for each student registration for each day.\n",
        "    \"\"\"\n",
        "    \n",
        "    #group by registration and day\n",
        "    vle_group = student_vle.groupby(by = ['registration', 'date'])\n",
        "\n",
        "    #sum up the clicks each day\n",
        "    sum_clicks = vle_group.sum().reset_index()\n",
        "    sum_clicks = sum_clicks.drop(columns=['id_site'])\n",
        "    sum_clicks = sum_clicks.sort_values(by=['registration','date'])\n",
        "    \n",
        "    #clean up data.\n",
        "    # sum_clicks = sum_clicks.fillna(0)\n",
        "    # sum_clicks = sum_clicks.drop_duplicates(keep='first')\n",
        "    return sum_clicks\n",
        "\n",
        "def create_time_series(sum_clicks):\n",
        "    \n",
        "    \"\"\"\n",
        "    Takes the merged table of activities and returns a new table with one row per student registration\n",
        "    and columns for numbers of activities, clicks, and activities * clicks for each day of the course\n",
        "    up to the prediction window\n",
        "    \"\"\"\n",
        "    \n",
        "    #Create a new dataframe to hold our timeseries data.\n",
        "    time_series = pd.DataFrame()\n",
        "    \n",
        "    #Creates one record for each student registration\n",
        "    time_series['registration'] = sum_clicks['registration'].unique()\n",
        "    \n",
        "    start_date = sum_clicks['date'].min()\n",
        "    end_date = sum_clicks['date'].max()\n",
        "    date_range = range(start_date, end_date)\n",
        "    \n",
        "    counter = len(date_range)\n",
        "    \n",
        "    #for each day in the course, create a window filtered show data from only that date.\n",
        "    for date in date_range:\n",
        "        #create a window filtered by one day by passing a boolean expression.\n",
        "        single_date_df = sum_clicks[sum_clicks['date'] == date][['registration',\n",
        "                                                                 'sum_clicks']]\n",
        "        #rename columns to describe date and data.\n",
        "        single_date_df.columns = ['registration'] + [f'clicks_on_day_{date}']\n",
        "        \n",
        "        #merge the dataframe for this day onto the time series dataframe.\n",
        "        time_series = time_series.merge(single_date_df, \n",
        "                                        how='left', \n",
        "                                        on='registration',\n",
        "                                        validate = 'm:1')\n",
        "\n",
        "        print('Days remaining to be processed: ', counter)\n",
        "        clear_output(wait=True)\n",
        "        counter -= 1\n",
        "    \n",
        "    #fill missing data with zeros.  Missing data represents no clicks that day.\n",
        "    time_series = time_series.fillna(0)\n",
        "    time_series = time_series.set_index('registration', drop=True)\n",
        "\n",
        "    return time_series\n",
        "\n",
        "\n",
        "student_vle = import_student_vle()\n",
        "sum_clicks = group_by_day(student_vle)\n",
        "\n",
        "#save some memory\n",
        "del student_vle\n",
        "\n",
        "time_series = create_time_series(sum_clicks)\n",
        "\n",
        "#save some memory\n",
        "del sum_clicks\n",
        "\n",
        "#compress the dataframe\n",
        "sparse_time_series = time_series.astype(pd.SparseDtype(\"int\", 0))\n",
        "\n",
        "print('The size of the uncompressed time series dataframe is: ', getsizeof(time_series), 'Bytes')\n",
        "print('The size of the sparse time series dataframe is: ', getsizeof(sparse_time_series), 'Bytes')\n",
        "\n",
        "display(sparse_time_series)\n",
        "\n",
        "# uncomment the line below if you want to save the sparse dataset locally as a csv file\n",
        "\n",
        "# sparse_time_series.to_csv('sparse_student_time_series.csv')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HtyUt_Q3vayv"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "process_trace_data.ipynb",
	"private_outputs": true,
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyN34JC9GrOgjuigsIRmqxK9",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/Caellwyn/85eb123dcbf86afff0a05e17d545859e/process_trace_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "fmF2T3A9dP6R"
	},
	"source": [
	"import requests\n",
	"import io\n",
	"import pandas as pd\n",
	"import zipfile\n",
	"from IPython.display import clear_output, display\n",
	"from sys import getsizeof\n",
	"\n",
	"def import_student_vle():\n",
	" \n",
	" \"\"\"\n",
	" Loads necessary table from anonymisedData.zip\n",
	" \"\"\"\n",
	" \n",
	" url = 'https://github.com/Caellwyn/ou_student_predictions/raw/main/content/anonymisedData.zip'\n",
	"\n",
	" #download and unzip the data\n",
	" OULAD = requests.get(url, stream=True)\n",
	" zf = zipfile.ZipFile(io.BytesIO(OULAD.content))\n",
	" del OULAD\n",
	" student_vle = pd.read_csv(zf.open('studentVle.csv'))\n",
	" del zf\n",
	" #combine module, presentation, and student id columns into one registration column\n",
	" #representing one course registration, or one student in one course.\n",
	" student_vle['registration'] = student_vle['code_module'] \\\n",
	" + student_vle['code_presentation'] \\\n",
	" + student_vle['id_student'].astype(str)\n",
	" student_vle = student_vle.drop(columns=['code_module','code_presentation','id_student'])\n",
	" student_vle = student_vle.rename(columns={'sum_click':'sum_clicks'})\n",
	" #Change the order of the columns for easy reading\n",
	" student_vle = student_vle[['registration','id_site','date','sum_clicks']]\n",
	" \n",
	" #final table is a list of each student interaction with each activity in each course and the \n",
	" #number of times a student clicked on that activity. \n",
	" #It contains over 10 million records.\n",
	" return student_vle\n",
	"\n",
	"def group_by_day(student_vle):\n",
	" \n",
	" \"\"\"\n",
	" Groups activity records by registration and by day, rather than by activity. Table\n",
	" will by organized by registration, then chronologically. Function sums the total clicks a student and \n",
	" counts the number of activities completed for each student registration for each day.\n",
	" \"\"\"\n",
	" \n",
	" #group by registration and day\n",
	" vle_group = student_vle.groupby(by = ['registration', 'date'])\n",
	"\n",
	" #sum up the clicks each day\n",
	" sum_clicks = vle_group.sum().reset_index()\n",
	" sum_clicks = sum_clicks.drop(columns=['id_site'])\n",
	" sum_clicks = sum_clicks.sort_values(by=['registration','date'])\n",
	" \n",
	" #clean up data.\n",
	" # sum_clicks = sum_clicks.fillna(0)\n",
	" # sum_clicks = sum_clicks.drop_duplicates(keep='first')\n",
	" return sum_clicks\n",
	"\n",
	"def create_time_series(sum_clicks):\n",
	" \n",
	" \"\"\"\n",
	" Takes the merged table of activities and returns a new table with one row per student registration\n",
	" and columns for numbers of activities, clicks, and activities * clicks for each day of the course\n",
	" up to the prediction window\n",
	" \"\"\"\n",
	" \n",
	" #Create a new dataframe to hold our timeseries data.\n",
	" time_series = pd.DataFrame()\n",
	" \n",
	" #Creates one record for each student registration\n",
	" time_series['registration'] = sum_clicks['registration'].unique()\n",
	" \n",
	" start_date = sum_clicks['date'].min()\n",
	" end_date = sum_clicks['date'].max()\n",
	" date_range = range(start_date, end_date)\n",
	" \n",
	" counter = len(date_range)\n",
	" \n",
	" #for each day in the course, create a window filtered show data from only that date.\n",
	" for date in date_range:\n",
	" #create a window filtered by one day by passing a boolean expression.\n",
	" single_date_df = sum_clicks[sum_clicks['date'] == date][['registration',\n",
	" 'sum_clicks']]\n",
	" #rename columns to describe date and data.\n",
	" single_date_df.columns = ['registration'] + [f'clicks_on_day_{date}']\n",
	" \n",
	" #merge the dataframe for this day onto the time series dataframe.\n",
	" time_series = time_series.merge(single_date_df, \n",
	" how='left', \n",
	" on='registration',\n",
	" validate = 'm:1')\n",
	"\n",
	" print('Days remaining to be processed: ', counter)\n",
	" clear_output(wait=True)\n",
	" counter -= 1\n",
	" \n",
	" #fill missing data with zeros. Missing data represents no clicks that day.\n",
	" time_series = time_series.fillna(0)\n",
	" time_series = time_series.set_index('registration', drop=True)\n",
	"\n",
	" return time_series\n",
	"\n",
	"\n",
	"student_vle = import_student_vle()\n",
	"sum_clicks = group_by_day(student_vle)\n",
	"\n",
	"#save some memory\n",
	"del student_vle\n",
	"\n",
	"time_series = create_time_series(sum_clicks)\n",
	"\n",
	"#save some memory\n",
	"del sum_clicks\n",
	"\n",
	"#compress the dataframe\n",
	"sparse_time_series = time_series.astype(pd.SparseDtype(\"int\", 0))\n",
	"\n",
	"print('The size of the uncompressed time series dataframe is: ', getsizeof(time_series), 'Bytes')\n",
	"print('The size of the sparse time series dataframe is: ', getsizeof(sparse_time_series), 'Bytes')\n",
	"\n",
	"display(sparse_time_series)\n",
	"\n",
	"# uncomment the line below if you want to save the sparse dataset locally as a csv file\n",
	"\n",
	"# sparse_time_series.to_csv('sparse_student_time_series.csv')"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "HtyUt_Q3vayv"
	},
	"source": [
	""
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}