# coding: utf-8
# ## Generate Stub Attribution Dataset
# In[18]:
# Portions of code are taken from [1].
# [1] https://github.com/mozilla/mozilla-reports/blob/master/etl/churn.kp/orig_src/Churn.ipynb
import random
import datetime
import logging
from pyspark.sql import Row, Window
import pyspark.sql.functions as F
from pyspark.sql.types import *
from collections import OrderedDict
from moztelemetry.standards import snap_to_beginning_of_week

STUB_ATTRIBUTION_FIELDS = [
    "client_id",
    "timestamp",
    "submission_date_s3",
    "subsession_start_date",
    "profile_creation_date",
    "source",
    "medium",
    "campaign",
    "content",
]

def generate_test_df(start_date, n_clients, n_weeks):
    # this data should be deterministic
    random.seed(42)

    # These labels are applied to the clients using a triangular distribution. This
    # should suffice for demonstrating the different campaigns. A future improvement
    # is to use a gaussian distribution, which would be better at simulating a long
    # tail of rare permutations of dimensions.
    #
    # Labels are chosen to be somewhat representative of the production labels. However,
    # if the label set needs to be broader, it might be useful to generate the label
    # category and append a numerical id, as is done for the clients.
    source_labels = ["google", "homepage", "yahoo", "bing"]
    medium_labels = ["referral", "organic", "cpc"]
    campaign_labels = [None, "campaign_1", "campaign_2"]
    content_labels = [None, "content_1", "content_2", None, "content_3"]

    attrib_dict = {
        "source": source_labels,
        "medium": medium_labels,
        "campaign": campaign_labels,
        "content": content_labels
    }

    # Generate our fake clients
    clients = [{'client_id': 'client_{:02d}'.format(x)} for x in range(n_clients)]

    # assign the attributes to each client
    for client in clients:
        for attrib, labels in attrib_dict.items():
            # clamp the sampled index so a draw at the upper bound cannot overflow the list
            idx = min(int(random.triangular(0, len(labels))), len(labels) - 1)
            client[attrib] = labels[idx]

    # Assign each client to a week start, distributed evenly across the weeks
    # profile_creation_date: days since epoch
    new_per_week = n_clients // n_weeks
    days_per_week = 7

    loss_rate = 0.25
    loss_delta = 0.1

    date_start = datetime.datetime.strptime(start_date, "%Y%m%d")
    epoch = datetime.datetime.utcfromtimestamp(0)
    date_offset = (date_start - epoch).days

    data = []

    for week_start in range(n_weeks):
        days_since_epoch = date_offset + (week_start * days_per_week)
        id_offset = week_start * new_per_week

        # create a new cohort of users
        for relative_id in range(new_per_week):
            uid = id_offset + relative_id
            clients[uid]['profile_creation_date'] = days_since_epoch

        # Provide client activity per week. A certain percentage of users will drop
        # off every week, removed at random from the cohort. This should let us
        # observe a general trend in usage. Each user should also send a significant
        # number of duplicate requests, to test that the process is resilient
        # against overcounting.
        # subsession_start_date, submission_date_s3, and timestamp (ns) all encode
        # the same arrival time.
        cohort_size = new_per_week
        for future_week in range(week_start, n_weeks):
            # cohort_size models user drop-off
            future_days = date_offset + (future_week * days_per_week)
            for relative_id in range(cohort_size):
                # randomly ping 1-3 times for duplicates
                for _ in range(random.randint(1, 3)):
                    uid = id_offset + relative_id
                    arrival = (
                        epoch +
                        datetime.timedelta(
                            future_days + random.randint(0, days_per_week - 1))
                    )

                    ping = clients[uid].copy()
                    submission_dict = {
                        "subsession_start_date": arrival.strftime("%Y-%m-%d"),
                        "submission_date_s3": arrival.strftime("%Y%m%d"),
                        "timestamp": (arrival - epoch).total_seconds() * (10 ** 9)  # nanoseconds
                    }
                    ping.update(submission_dict)
                    data.append(ping)

            # loss rate increases with cohort start date
            cohort_size -= int(cohort_size * (loss_rate + (loss_delta * week_start)))

    return (
        sc.parallelize(data)
        .map(lambda d: Row(**d))
        .toDF()
        .select(STUB_ATTRIBUTION_FIELDS)
    )

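# A minimal usage sketch (not part of the original notebook): generate a small
# stub dataset and sanity-check the label distribution. It assumes the same
# SparkContext `sc` used above; uncomment to run interactively.
# sample_df = generate_test_df("20160103", 20, 2)
# sample_df.groupBy("source", "medium").count().show()
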
# In[57]:

def get_newest_per_client(df):
    window_spec = Window.partitionBy(df['client_id']).orderBy(df['timestamp'].desc())
    rownum_by_timestamp = F.row_number().over(window_spec)
    selectable_by_client = df.select(
        rownum_by_timestamp.alias('row_number'),
        *df.columns
    )
    return (selectable_by_client
            .filter(selectable_by_client['row_number'] == 1)
            .select(STUB_ATTRIBUTION_FIELDS))

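# A small illustration (added, not from the original notebook) of how the
# row_number window dedupes pings: given two pings for the same client_id, only
# the one with the largest timestamp survives. The field values below are made up.
# _dupes = sc.parallelize([
#     Row(client_id="client_00", timestamp=1.0e9, submission_date_s3="20160103",
#         subsession_start_date="2016-01-03", profile_creation_date=16801,
#         source="google", medium="organic", campaign="campaign_1", content="content_1"),
#     Row(client_id="client_00", timestamp=2.0e9, submission_date_s3="20160104",
#         subsession_start_date="2016-01-04", profile_creation_date=16801,
#         source="google", medium="organic", campaign="campaign_1", content="content_1"),
# ]).toDF()
# get_newest_per_client(_dupes).show()  # keeps only the 20160104 ping
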
def fmt(d, date_format="%Y%m%d"):
    return datetime.datetime.strftime(d, date_format)


def get_week_num(creation, today):
    if creation is None or today is None:
        return None

    diff = (today.date() - creation).days
    if diff < 0:
        # Creation date is in the future. Bad data :(
        return -1
    # The initial week is week zero.
    return int(diff / 7)

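# Quick examples (added for illustration): week numbers are relative to the
# profile creation date, and the creation week counts as week zero.
assert get_week_num(datetime.date(2016, 1, 3), datetime.datetime(2016, 1, 5)) == 0
assert get_week_num(datetime.date(2016, 1, 3), datetime.datetime(2016, 1, 12)) == 1
assert get_week_num(datetime.date(2016, 1, 10), datetime.datetime(2016, 1, 3)) == -1
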
def daynum_to_date(daynum):
    """Convert a number of days to a date. If it's out of range, default to a max date.
    :param daynum: A number of days since Jan 1, 1970
    """
    if daynum is None:
        return None
    if daynum < 0:
        return None
    daycount = int(daynum)
    if daycount > 1000000:
        # Some time in the 48th century, clearly bogus.
        daycount = 1000000
    return datetime.date(1970, 1, 1) + datetime.timedelta(daycount)

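# Quick examples (added for illustration): profile_creation_date is a day count
# since the Unix epoch, so daynum 16803 corresponds to 2016-01-03.
assert daynum_to_date(16803) == datetime.date(2016, 1, 3)
assert daynum_to_date(-1) is None
assert daynum_to_date(2 * 10 ** 6) == datetime.date(1970, 1, 1) + datetime.timedelta(1000000)
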
def get_current_week(profile_creation_date, subsession_start_date, submission_date_s3):
    pcd = daynum_to_date(profile_creation_date)
    client_date = None
    if subsession_start_date is not None:
        try:
            client_date = (datetime
                           .datetime
                           .strptime(subsession_start_date[0:10], "%Y-%m-%d"))
        except ValueError:
            # Bogus format
            return 'unknown'
        except TypeError:
            # String contains null bytes or other weirdness. Example:
            # TypeError: must be string without null bytes, not unicode
            return 'unknown'
    if client_date is None:
        # Fall back to submission date
        client_date = datetime.datetime.strptime(submission_date_s3, "%Y%m%d")
    return get_week_num(pcd, client_date)

current_week_udf = F.udf(get_current_week, StringType())

# Note: defined in a functional way, but perhaps not the most pythonic (or readable)
pcd_to_aquisition_udf = (
    F.udf(lambda pcd: (
        datetime.datetime.strftime(
            snap_to_beginning_of_week(
                daynum_to_date(pcd),
                "Sunday"),
            "%Y-%m-%d")),
        StringType()))

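# An illustrative check (added, not in the original notebook): a profile created
# on 2016-01-03 (daynum 16803) with a subsession on 2016-01-12 falls in week 1.
assert get_current_week(16803, "2016-01-12", "20160112") == 1
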
def compute_week(df, week_start):
    week_start_date = datetime.datetime.strptime(week_start, "%Y%m%d")
    week_end_date = week_start_date + datetime.timedelta(6)
    week_start = fmt(week_start_date)
    week_end = fmt(week_end_date)

    # Verify that the start date is a Sunday
    if week_start_date.weekday() != 6:
        print("Week start date {} is not a Sunday".format(week_start))
        return

    print("Starting week from {} to {} at {}"
          .format(week_start, week_end, datetime.datetime.utcnow()))

    # the subsession_start_date field has a different form than submission_date_s3,
    # so it needs to be formatted with hyphens.
    week_end_slop = fmt(week_end_date + datetime.timedelta(10))
    week_end_excl = fmt(week_end_date + datetime.timedelta(1), date_format="%Y-%m-%d")
    week_start_hyphenated = fmt(week_start_date, date_format="%Y-%m-%d")

    current_week = (
        df.filter(df['submission_date_s3'] >= week_start)
          .filter(df['submission_date_s3'] <= week_end_slop)
          .filter(df['subsession_start_date'] >= week_start_hyphenated)
          .filter(df['subsession_start_date'] < week_end_excl)
    )
    newest_per_client = (
        get_newest_per_client(current_week)
        .select(
            F.col('*'),
            pcd_to_aquisition_udf(
                F.col('profile_creation_date')
            ).alias('aquisition_period'),
            current_week_udf(
                F.col('profile_creation_date'),
                F.col('subsession_start_date'),
                F.col('submission_date_s3')
            ).alias('current_week')))

    return newest_per_client

# In[59]:

S3_ATTRIBUTION_BUCKET = 'net-mozaws-prod-us-west-2-pipeline-analysis'
S3_ATTRIBUTION_PREFIX = 'amiyaguchi/stub/v1'

# generate five weeks of stub data starting from 2016-01-03
n_weeks = 5
df = generate_test_df("20160103", 100, n_weeks)

start_date = datetime.datetime.strptime("20160103", "%Y%m%d")
for week in range(n_weeks):
    delta = datetime.timedelta(7) * week
    week_start = fmt(start_date + delta)
    week_df = compute_week(df, week_start)
    if week == 0:
        week_df.printSchema()
    s3_path = "s3://{}/{}/{}={}".format(S3_ATTRIBUTION_BUCKET,
                                        S3_ATTRIBUTION_PREFIX,
                                        'week_start', week_start)
    logging.info("Writing dataframe to %s", s3_path)
    week_df.write.parquet(s3_path, mode="overwrite")
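
# A hedged sketch (not part of the original notebook) of reading one partition of
# the stub dataset back for inspection. It assumes a `sqlContext` is available
# alongside `sc`, as is typical in the environment this notebook was written for.
# stub = sqlContext.read.parquet(
#     "s3://{}/{}/week_start=20160103".format(S3_ATTRIBUTION_BUCKET,
#                                             S3_ATTRIBUTION_PREFIX))
# stub.groupBy("aquisition_period", "current_week").count().show()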