huonw/flaky tests.ipynb

## flaky tests.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Analysing test failures on Buildkite, using JUnit XML artifacts\n",
    "\n",
    "This notebook helps one explore test failures on Buildkite CI, by looking at failures in JUnit XML files.\n",
    "\n",
    "It tries to be moderately generic, but has only been used under the following conditions:\n",
    "\n",
    "- a token with read access to everything exists in `~/.buildkite/read_token`\n",
    "- test results are recorded in JUnit XML files and uploaded as artifacts matching the glob `junit-*.xml` (see `JUNIT_RE` below)\n",
    "- the JUnit XML files use `pytest`'s particular format\n",
    "\n",
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "org = \"stellar\"\n",
    "pipeline = \"stellargraph-public\"\n",
    "JUNIT_RE = re.compile(r\"^junit-.*\\.xml$\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Definitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install pybuildkite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pybuildkite.buildkite import Buildkite\n",
    "import os\n",
    "import requests\n",
    "import xml.etree.ElementTree as ET\n",
    "import re\n",
    "import pandas as pd\n",
    "import warnings\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "buildkite = Buildkite()\n",
    "with open(os.path.expanduser(\"~/.buildkite/read_token\")) as tok_file:\n",
    "    token = tok_file.read().strip()\n",
    "    buildkite.set_access_token(token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_builds(count):\n",
    "    builds = {}\n",
    "    next_page = 1\n",
    "    while next_page and len(builds) < count:\n",
    "        print(f\"downloading page {next_page} (found {len(builds)}/{count} builds)\")\n",
    "        resp = buildkite.builds().list_all_for_pipeline(org, pipeline, page=next_page, with_pagination=True)\n",
    "        for build in resp.body:\n",
    "            builds[build[\"number\"]] = build\n",
    "        next_page = resp.next_page\n",
    "    return builds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "def junit_xml_artifacts(build_number):\n",
    "    artifacts = buildkite.artifacts().list_artifacts_for_build(org, pipeline, build_number)\n",
    "    return [\n",
    "        art\n",
    "        for art in artifacts\n",
    "        if JUNIT_RE.match(art[\"path\"])\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_artifact(art):\n",
    "    return requests.get(art[\"download_url\"], headers={\"Authorization\": f\"Bearer {token}\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check(elem, expected, context):\n",
    "    if elem.tag != expected:\n",
    "        warnings.warn(f\"found tag {elem.tag!r}, expected {expected!r} (context: {context})\")\n",
    "        return False\n",
    "    \n",
    "    return True\n",
    "\n",
    "def find_failures(xml_string, context):\n",
    "    testsuites = ET.fromstring(xml_string)        \n",
    "    if not check(testsuites, \"testsuites\", context):\n",
    "        return\n",
    "\n",
    "    for testsuite in testsuites:\n",
    "        if not check(testsuite, \"testsuite\", context):\n",
    "            continue\n",
    "        \n",
    "        for testcase in testsuite:\n",
    "            if not check(testcase, \"testcase\", context):\n",
    "                continue\n",
    "\n",
    "            has_failure = any(x.tag == \"failure\" for x in testcase)\n",
    "            if has_failure:\n",
    "                yield testcase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarise_testcase(testcase):\n",
    "    return (testcase.attrib[\"classname\"], testcase.attrib[\"name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "def failures_for_build(build_number):\n",
    "    return [\n",
    "        (build_number,) + summarise_testcase(testcase)\n",
    "        for art in junit_xml_artifacts(build_number)\n",
    "        for testcase in find_failures(download_artifact(art).text, f\"build number {build_number}\")\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "downloading page 1 (found 0/1000 builds)\n",
      "downloading page 2 (found 100/1000 builds)\n",
      "downloading page 3 (found 200/1000 builds)\n",
      "downloading page 4 (found 300/1000 builds)\n",
      "downloading page 5 (found 400/1000 builds)\n",
      "downloading page 6 (found 500/1000 builds)\n",
      "downloading page 7 (found 600/1000 builds)\n",
      "downloading page 8 (found 700/1000 builds)\n",
      "downloading page 9 (found 800/1000 builds)\n",
      "downloading page 10 (found 900/1000 builds)\n"
     ]
    }
   ],
   "source": [
    "most_recent_builds = get_builds(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys([4746, 4745, 4744, 4743, 4742, 4741, 4740, 4739, 4738, 4737, 4736, 4735, 4734, 4733, 4732, 4731, 4730, 4729, 4728, 4727, 4726, 4725, 4724, 4723, 4722, 4721, 4720, 4719, 4718, 4717, 4716, 4715, 4714, 4713, 4712, 4711, 4710, 4709, 4708, 4707, 4706, 4705, 4704, 4703, 4702, 4701, 4700, 4699, 4698, 4697, 4696, 4695, 4694, 4693, 4692, 4691, 4690, 4689, 4688, 4687, 4686, 4685, 4684, 4683, 4682, 4681, 4680, 4679, 4678, 4677, 4676, 4675, 4674, 4673, 4672, 4671, 4670, 4669, 4668, 4667, 4666, 4665, 4664, 4663, 4662, 4661, 4660, 4659, 4658, 4657, 4656, 4655, 4654, 4653, 4652, 4651, 4650, 4649, 4648, 4647, 4646, 4645, 4644, 4643, 4642, 4641, 4640, 4639, 4638, 4637, 4636, 4635, 4634, 4633, 4632, 4631, 4630, 4629, 4628, 4627, 4626, 4625, 4624, 4623, 4622, 4621, 4620, 4619, 4618, 4617, 4616, 4615, 4614, 4613, 4612, 4611, 4610, 4609, 4608, 4607, 4606, 4605, 4604, 4603, 4602, 4601, 4600, 4599, 4598, 4597, 4596, 4595, 4594, 4593, 4592, 4591, 4590, 4589, 4588, 4587, 4586, 4585, 4584, 4583, 4582, 4581, 4580, 4579, 4578, 4577, 4576, 4575, 4574, 4573, 4572, 4571, 4570, 4569, 4568, 4567, 4566, 4565, 4564, 4563, 4562, 4561, 4560, 4559, 4558, 4557, 4556, 4555, 4554, 4553, 4552, 4551, 4550, 4549, 4548, 4547, 4546, 4545, 4544, 4543, 4542, 4541, 4540, 4539, 4538, 4537, 4536, 4535, 4534, 4533, 4532, 4531, 4530, 4529, 4528, 4527, 4526, 4525, 4524, 4523, 4522, 4521, 4520, 4519, 4518, 4517, 4516, 4515, 4514, 4513, 4512, 4511, 4510, 4509, 4508, 4507, 4506, 4505, 4504, 4503, 4502, 4501, 4500, 4499, 4498, 4497, 4496, 4495, 4494, 4493, 4492, 4491, 4490, 4489, 4488, 4487, 4486, 4485, 4484, 4483, 4482, 4481, 4480, 4479, 4478, 4477, 4476, 4475, 4474, 4473, 4472, 4471, 4470, 4469, 4468, 4467, 4466, 4465, 4464, 4463, 4462, 4461, 4460, 4459, 4458, 4457, 4456, 4455, 4454, 4453, 4452, 4451, 4450, 4449, 4448, 4447, 4446, 4445, 4444, 4443, 4442, 4441, 4440, 4439, 4438, 4437, 4436, 4435, 4434, 4433, 4432, 4431, 4430, 4429, 4428, 4427, 4426, 4425, 4424, 4423, 4422, 4421, 4420, 4419, 4418, 4417, 
4416, 4415, 4414, 4413, 4412, 4411, 4410, 4409, 4408, 4407, 4406, 4405, 4404, 4403, 4402, 4401, 4400, 4399, 4398, 4397, 4396, 4395, 4394, 4393, 4392, 4391, 4390, 4389, 4388, 4387, 4386, 4385, 4384, 4383, 4382, 4381, 4380, 4379, 4378, 4377, 4376, 4375, 4374, 4373, 4372, 4371, 4370, 4369, 4368, 4367, 4366, 4365, 4364, 4363, 4362, 4361, 4360, 4359, 4358, 4357, 4356, 4355, 4354, 4353, 4352, 4351, 4350, 4349, 4348, 4347, 4346, 4345, 4344, 4343, 4342, 4341, 4340, 4339, 4338, 4337, 4336, 4335, 4334, 4333, 4332, 4331, 4330, 4329, 4328, 4327, 4326, 4325, 4324, 4323, 4322, 4321, 4320, 4319, 4318, 4317, 4316, 4315, 4314, 4313, 4312, 4311, 4310, 4309, 4308, 4307, 4306, 4305, 4304, 4303, 4302, 4301, 4300, 4299, 4298, 4297, 4296, 4295, 4294, 4293, 4292, 4291, 4290, 4289, 4288, 4287, 4286, 4285, 4284, 4283, 4282, 4281, 4280, 4279, 4278, 4277, 4276, 4275, 4274, 4273, 4272, 4271, 4270, 4269, 4268, 4267, 4266, 4265, 4264, 4263, 4262, 4261, 4260, 4259, 4258, 4257, 4256, 4255, 4254, 4253, 4252, 4251, 4250, 4249, 4248, 4247, 4246, 4245, 4244, 4243, 4242, 4241, 4240, 4239, 4238, 4237, 4236, 4235, 4234, 4233, 4232, 4231, 4230, 4229, 4228, 4227, 4226, 4225, 4224, 4223, 4222, 4221, 4220, 4219, 4218, 4217, 4216, 4215, 4214, 4213, 4212, 4211, 4210, 4209, 4208, 4207, 4206, 4205, 4204, 4203, 4202, 4201, 4200, 4199, 4198, 4197, 4196, 4195, 4194, 4193, 4192, 4191, 4190, 4189, 4188, 4187, 4186, 4185, 4184, 4183, 4182, 4181, 4180, 4179, 4178, 4177, 4176, 4175, 4174, 4173, 4172, 4171, 4170, 4169, 4168, 4167, 4166, 4165, 4164, 4163, 4162, 4161, 4160, 4159, 4158, 4157, 4156, 4155, 4154, 4153, 4152, 4151, 4150, 4149, 4148, 4147, 4146, 4145, 4144, 4143, 4142, 4141, 4140, 4139, 4138, 4137, 4136, 4135, 4134, 4133, 4132, 4131, 4130, 4129, 4128, 4127, 4126, 4125, 4124, 4123, 4122, 4121, 4120, 4119, 4118, 4117, 4116, 4115, 4114, 4113, 4112, 4111, 4110, 4109, 4108, 4107, 4106, 4105, 4104, 4103, 4102, 4101, 4100, 4099, 4098, 4097, 4096, 4095, 4094, 4093, 4092, 4091, 4090, 4089, 4088, 4087, 4086, 4085, 4084, 
4083, 4082, 4081, 4080, 4079, 4078, 4077, 4076, 4075, 4074, 4073, 4072, 4071, 4070, 4069, 4068, 4067, 4066, 4065, 4064, 4063, 4062, 4061, 4060, 4059, 4058, 4057, 4056, 4055, 4054, 4053, 4052, 4051, 4050, 4049, 4048, 4047, 4046, 4045, 4044, 4043, 4042, 4041, 4040, 4039, 4038, 4037, 4036, 4035, 4034, 4033, 4032, 4031, 4030, 4029, 4028, 4027, 4026, 4025, 4024, 4023, 4022, 4021, 4020, 4019, 4018, 4017, 4016, 4015, 4014, 4013, 4012, 4011, 4010, 4009, 4008, 4007, 4006, 4005, 4004, 4003, 4002, 4001, 4000, 3999, 3998, 3997, 3996, 3995, 3994, 3993, 3992, 3991, 3990, 3989, 3988, 3987, 3986, 3985, 3984, 3983, 3982, 3981, 3980, 3979, 3978, 3977, 3976, 3975, 3974, 3973, 3972, 3971, 3970, 3969, 3968, 3967, 3966, 3965, 3964, 3963, 3962, 3961, 3960, 3959, 3958, 3957, 3956, 3955, 3954, 3953, 3952, 3951, 3950, 3949, 3948, 3947, 3946, 3945, 3944, 3943, 3942, 3941, 3940, 3939, 3938, 3937, 3936, 3935, 3934, 3933, 3932, 3931, 3930, 3929, 3928, 3927, 3926, 3925, 3924, 3923, 3922, 3921, 3920, 3919, 3918, 3917, 3916, 3915, 3914, 3913, 3912, 3911, 3910, 3909, 3908, 3907, 3906, 3905, 3904, 3903, 3902, 3901, 3900, 3899, 3898, 3897, 3896, 3895, 3894, 3893, 3892, 3891, 3890, 3889, 3888, 3887, 3886, 3885, 3884, 3883, 3882, 3881, 3880, 3879, 3878, 3877, 3876, 3875, 3874, 3873, 3872, 3871, 3870, 3869, 3868, 3867, 3866, 3865, 3864, 3863, 3862, 3861, 3860, 3859, 3858, 3857, 3856, 3855, 3854, 3853, 3852, 3851, 3850, 3849, 3848, 3847, 3846, 3845, 3844, 3843, 3842, 3841, 3840, 3839, 3838, 3837, 3836, 3835, 3834, 3833, 3832, 3831, 3830, 3829, 3828, 3827, 3826, 3825, 3824, 3823, 3822, 3821, 3820, 3819, 3818, 3817, 3816, 3815, 3814, 3813, 3812, 3811, 3810, 3809, 3808, 3807, 3806, 3805, 3804, 3803, 3802, 3801, 3800, 3799, 3798, 3797, 3796, 3795, 3794, 3793, 3792, 3791, 3790, 3789, 3788, 3787, 3786, 3785, 3784, 3783, 3782, 3781, 3780, 3779, 3778, 3777, 3776, 3775, 3774, 3773, 3772, 3771, 3770, 3769, 3768, 3767, 3766, 3765, 3764, 3763, 3762, 3761, 3760, 3759, 3758, 3757, 3756, 3755, 3754, 3753, 3752, 3751, 
3750, 3749, 3748, 3747])"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "most_recent_builds.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "pool = ThreadPoolExecutor(max_workers=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/wil9dw/.pyenv/versions/3.6.9/lib/python3.6/site-packages/ipykernel_launcher.py:3: UserWarning: found tag 'Error', expected 'testsuites' (context: build number 4598)\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n",
      "/Users/wil9dw/.pyenv/versions/3.6.9/lib/python3.6/site-packages/ipykernel_launcher.py:3: UserWarning: found tag 'Error', expected 'testsuites' (context: build number 4604)\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 52s, sys: 21.9 s, total: 3min 14s\n",
      "Wall time: 14min 22s\n"
     ]
    }
   ],
   "source": [
    "%%time # this takes a while\n",
    "raw_failures = list(pool.map(failures_for_build, most_recent_builds.keys()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DataFrame creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "builds_df_all = pd.DataFrame.from_dict(most_recent_builds, orient=\"index\")\n",
    "builds_df = pd.get_dummies(builds_df_all[[\"state\", \"branch\", \"commit\"]], columns=[\"state\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 291,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>build</th>\n",
       "      <th>classname</th>\n",
       "      <th>name</th>\n",
       "      <th>branch</th>\n",
       "      <th>commit</th>\n",
       "      <th>state_canceled</th>\n",
       "      <th>state_failed</th>\n",
       "      <th>state_passed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4744</td>\n",
       "      <td>tests.layer.test_knowledge_graph</td>\n",
       "      <td>test_model_rankings[RotH]</td>\n",
       "      <td>feature/1569-probabilities</td>\n",
       "      <td>aca59902b6813fa144ddf5223f92c843543ec3b5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4743</td>\n",
       "      <td>tests.layer.test_knowledge_graph</td>\n",
       "      <td>test_model_rankings[RotH]</td>\n",
       "      <td>feature/1569-probabilities</td>\n",
       "      <td>aca59902b6813fa144ddf5223f92c843543ec3b5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4740</td>\n",
       "      <td>tests.test_aaa_on_gpu</td>\n",
       "      <td>test_on_gpu_when_requested</td>\n",
       "      <td>testing-branch-for-scheduled-builds-DO_NOT_DELETE</td>\n",
       "      <td>7deaa1daf2e07f03614f2de2c855b6138800f2b9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4740</td>\n",
       "      <td>tests.test_aaa_on_gpu</td>\n",
       "      <td>test_on_gpu_when_requested</td>\n",
       "      <td>testing-branch-for-scheduled-builds-DO_NOT_DELETE</td>\n",
       "      <td>7deaa1daf2e07f03614f2de2c855b6138800f2b9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4740</td>\n",
       "      <td>tests.test_aaa_on_gpu</td>\n",
       "      <td>test_on_gpu_when_requested</td>\n",
       "      <td>testing-branch-for-scheduled-builds-DO_NOT_DELETE</td>\n",
       "      <td>7deaa1daf2e07f03614f2de2c855b6138800f2b9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   build                         classname                        name  \\\n",
       "0   4744  tests.layer.test_knowledge_graph   test_model_rankings[RotH]   \n",
       "1   4743  tests.layer.test_knowledge_graph   test_model_rankings[RotH]   \n",
       "2   4740             tests.test_aaa_on_gpu  test_on_gpu_when_requested   \n",
       "3   4740             tests.test_aaa_on_gpu  test_on_gpu_when_requested   \n",
       "4   4740             tests.test_aaa_on_gpu  test_on_gpu_when_requested   \n",
       "\n",
       "                                              branch  \\\n",
       "0                         feature/1569-probabilities   \n",
       "1                         feature/1569-probabilities   \n",
       "2  testing-branch-for-scheduled-builds-DO_NOT_DELETE   \n",
       "3  testing-branch-for-scheduled-builds-DO_NOT_DELETE   \n",
       "4  testing-branch-for-scheduled-builds-DO_NOT_DELETE   \n",
       "\n",
       "                                     commit  state_canceled  state_failed  \\\n",
       "0  aca59902b6813fa144ddf5223f92c843543ec3b5               0             0   \n",
       "1  aca59902b6813fa144ddf5223f92c843543ec3b5               0             1   \n",
       "2  7deaa1daf2e07f03614f2de2c855b6138800f2b9               0             1   \n",
       "3  7deaa1daf2e07f03614f2de2c855b6138800f2b9               0             1   \n",
       "4  7deaa1daf2e07f03614f2de2c855b6138800f2b9               0             1   \n",
       "\n",
       "   state_passed  \n",
       "0             1  \n",
       "1             0  \n",
       "2             0  \n",
       "3             0  \n",
       "4             0  "
      ]
     },
     "execution_count": 291,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failures = pd.DataFrame([f for fs in raw_failures for f in fs], columns=(\"build\", \"classname\", \"name\"))\n",
    "failures = failures.join(builds_df, \"build\")\n",
    "failures.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analysis\n",
    "\n",
    "### Descriptive stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>passed</th>\n",
       "      <td>591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>failed</th>\n",
       "      <td>387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>canceled</th>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          state\n",
       "passed      591\n",
       "failed      387\n",
       "canceled     22"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# build states\n",
    "builds_df_all[\"state\"].value_counts().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "167"
      ]
     },
     "execution_count": 190,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# builds that failed with test failures\n",
    "len(failures.build.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>build</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4716</th>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4717</th>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4053</th>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4278</th>\n",
       "      <td>78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3826</th>\n",
       "      <td>86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3824</th>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4032</th>\n",
       "      <td>168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4322</th>\n",
       "      <td>213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4284</th>\n",
       "      <td>285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4282</th>\n",
       "      <td>351</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         0\n",
       "build     \n",
       "4716    66\n",
       "4717    66\n",
       "4053    73\n",
       "4278    78\n",
       "3826    86\n",
       "3824    95\n",
       "4032   168\n",
       "4322   213\n",
       "4284   285\n",
       "4282   351"
      ]
     },
     "execution_count": 193,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# highest number of test failures per build\n",
    "failures.groupby(\"build\").size().sort_values().to_frame().tail(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Flaky tests - failures in passed builds\n",
    "\n",
    "We can find flaky tests by looking for test failures within builds that passed. This means that someone retried the build and it eventually worked."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>builds</th>\n",
       "      <th>state_failed</th>\n",
       "      <th>state_passed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>classname</th>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"5\" valign=\"top\">tests.core.test_convert</th>\n",
       "      <th>test_columnar_convert_invalid_input</th>\n",
       "      <td>[4137, 4137, 4137, 4135, 4135]</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_columnar_convert_ndarray</th>\n",
       "      <td>[4486, 4486, 4486, 4485, 4485, 4485, 4284, 428...</td>\n",
       "      <td>23</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_columnar_convert_rowframe</th>\n",
       "      <td>[4486, 4486, 4486, 4485, 4485, 4485, 4284, 428...</td>\n",
       "      <td>18</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_columnar_convert_rowframe_ndarray_invalid</th>\n",
       "      <td>[4284, 4284, 4284, 4283, 4283, 4283, 4282, 428...</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_convert_edges_type_column[False]</th>\n",
       "      <td>[4248, 4248, 4248, 4247, 4247, 4247]</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                   builds  \\\n",
       "classname               name                                                                                                \n",
       "tests.core.test_convert test_columnar_convert_invalid_input                                [4137, 4137, 4137, 4135, 4135]   \n",
       "                        test_columnar_convert_ndarray                   [4486, 4486, 4486, 4485, 4485, 4485, 4284, 428...   \n",
       "                        test_columnar_convert_rowframe                  [4486, 4486, 4486, 4485, 4485, 4485, 4284, 428...   \n",
       "                        test_columnar_convert_rowframe_ndarray_invalid  [4284, 4284, 4284, 4283, 4283, 4283, 4282, 428...   \n",
       "                        test_convert_edges_type_column[False]                        [4248, 4248, 4248, 4247, 4247, 4247]   \n",
       "\n",
       "                                                                        state_failed  \\\n",
       "classname               name                                                           \n",
       "tests.core.test_convert test_columnar_convert_invalid_input                        5   \n",
       "                        test_columnar_convert_ndarray                             23   \n",
       "                        test_columnar_convert_rowframe                            18   \n",
       "                        test_columnar_convert_rowframe_ndarray_invalid            12   \n",
       "                        test_convert_edges_type_column[False]                      6   \n",
       "\n",
       "                                                                        state_passed  \n",
       "classname               name                                                          \n",
       "tests.core.test_convert test_columnar_convert_invalid_input                        0  \n",
       "                        test_columnar_convert_ndarray                              0  \n",
       "                        test_columnar_convert_rowframe                             0  \n",
       "                        test_columnar_convert_rowframe_ndarray_invalid             0  \n",
       "                        test_convert_edges_type_column[False]                      0  "
      ]
     },
     "execution_count": 226,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failures_by_state = failures.groupby([\"classname\", \"name\"]).agg(builds=(\"build\", list), state_failed=(\"state_failed\", \"sum\"), state_passed=(\"state_passed\", \"sum\"))\n",
    "failures_by_state.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "state_failed    2600\n",
       "state_passed      64\n",
       "dtype: int64"
      ]
     },
     "execution_count": 227,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failures_by_state[[\"state_failed\", \"state_passed\"]].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>builds</th>\n",
       "      <th>state_failed</th>\n",
       "      <th>state_passed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>classname</th>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">tests.core.test_utils</th>\n",
       "      <th>test_normalize_adj</th>\n",
       "      <td>[4237, 4237, 4148, 3967, 3837]</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_normalized_laplacian</th>\n",
       "      <td>[4256, 4256]</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">tests.data.test_edge_splitter.TestEdgeSplitterHeterogeneous</th>\n",
       "      <th>test_split_data_by_edge_type</th>\n",
       "      <td>[4679, 4670, 4670, 4626, 4626, 4579, 4579, 451...</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_split_data_by_edge_type_and_attribute</th>\n",
       "      <td>[4679, 4611, 4471, 4167, 3898, 3865, 3862, 377...</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">tests.layer.test_knowledge_graph</th>\n",
       "      <th>test_model_rankings[RotH]</th>\n",
       "      <td>[4744, 4743, 4737, 4737, 4737, 4737, 4732]</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_model_rankings[RotatE]</th>\n",
       "      <td>[4683, 4683, 4284, 4284, 4284, 4282, 4282, 4282]</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_rotate</th>\n",
       "      <td>[4499, 4353, 4353, 4185, 4185, 4113, 4113, 411...</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tests.mapper.test_node_mappers</th>\n",
       "      <th>test_nodemapper_isolated_nodes</th>\n",
       "      <td>[4735, 4630, 4384, 4342, 4330, 4284, 4284, 428...</td>\n",
       "      <td>13</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">tests.reproducibility.test_graphsage</th>\n",
       "      <th>test_nai[False]</th>\n",
       "      <td>[4691, 4284, 4284, 4284, 4282, 4282, 4282, 4245]</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_nai[True]</th>\n",
       "      <td>[4691, 4649, 4642, 4343, 4343, 4306, 4306, 428...</td>\n",
       "      <td>15</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_unsupervised[False]</th>\n",
       "      <td>[4730, 4730, 4728, 4728, 4726, 4706, 4691, 469...</td>\n",
       "      <td>31</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_unsupervised[True]</th>\n",
       "      <td>[4730, 4728, 4691, 4690, 4690, 4688, 4683, 468...</td>\n",
       "      <td>28</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                          builds  \\\n",
       "classname                                          name                                                                                            \n",
       "tests.core.test_utils                              test_normalize_adj                                             [4237, 4237, 4148, 3967, 3837]   \n",
       "                                                   test_normalized_laplacian                                                        [4256, 4256]   \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                [4679, 4670, 4670, 4626, 4626, 4579, 4579, 451...   \n",
       "                                                   test_split_data_by_edge_type_and_attribute  [4679, 4611, 4471, 4167, 3898, 3865, 3862, 377...   \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                          [4744, 4743, 4737, 4737, 4737, 4737, 4732]   \n",
       "                                                   test_model_rankings[RotatE]                  [4683, 4683, 4284, 4284, 4284, 4282, 4282, 4282]   \n",
       "                                                   test_rotate                                 [4499, 4353, 4353, 4185, 4185, 4113, 4113, 411...   \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes              [4735, 4630, 4384, 4342, 4330, 4284, 4284, 428...   \n",
       "tests.reproducibility.test_graphsage               test_nai[False]                              [4691, 4284, 4284, 4284, 4282, 4282, 4282, 4245]   \n",
       "                                                   test_nai[True]                              [4691, 4649, 4642, 4343, 4343, 4306, 4306, 428...   \n",
       "                                                   test_unsupervised[False]                    [4730, 4730, 4728, 4728, 4726, 4706, 4691, 469...   \n",
       "                                                   test_unsupervised[True]                     [4730, 4728, 4691, 4690, 4690, 4688, 4683, 468...   \n",
       "\n",
       "                                                                                               state_failed  \\\n",
       "classname                                          name                                                       \n",
       "tests.core.test_utils                              test_normalize_adj                                     3   \n",
       "                                                   test_normalized_laplacian                              0   \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                          11   \n",
       "                                                   test_split_data_by_edge_type_and_attribute             4   \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                              5   \n",
       "                                                   test_model_rankings[RotatE]                            6   \n",
       "                                                   test_rotate                                            5   \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes                        13   \n",
       "tests.reproducibility.test_graphsage               test_nai[False]                                        7   \n",
       "                                                   test_nai[True]                                        15   \n",
       "                                                   test_unsupervised[False]                              31   \n",
       "                                                   test_unsupervised[True]                               28   \n",
       "\n",
       "                                                                                               state_passed  \n",
       "classname                                          name                                                      \n",
       "tests.core.test_utils                              test_normalize_adj                                     2  \n",
       "                                                   test_normalized_laplacian                              2  \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                           4  \n",
       "                                                   test_split_data_by_edge_type_and_attribute             5  \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                              2  \n",
       "                                                   test_model_rankings[RotatE]                            2  \n",
       "                                                   test_rotate                                            4  \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes                         4  \n",
       "tests.reproducibility.test_graphsage               test_nai[False]                                        1  \n",
       "                                                   test_nai[True]                                        10  \n",
       "                                                   test_unsupervised[False]                              17  \n",
       "                                                   test_unsupervised[True]                               11  "
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "flaky = failures_by_state[failures_by_state.state_passed > 0]\n",
    "flaky"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "- `tests.core.test_utils` `test_normalize_adj` (total = 5, in failed builds = 3, in successful builds = 2)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3837\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3967\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4148\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4237 (2 times)\n",
      "\n",
      "- `tests.core.test_utils` `test_normalized_laplacian` (total = 2, in failed builds = 0, in successful builds = 2)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4256 (2 times)\n",
      "\n",
      "- `tests.data.test_edge_splitter.TestEdgeSplitterHeterogeneous` `test_split_data_by_edge_type` (total = 15, in failed builds = 11, in successful builds = 4)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3824\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4275 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4283\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4343 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4495\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4513\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4579 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4626 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4670 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4679\n",
      "\n",
      "- `tests.data.test_edge_splitter.TestEdgeSplitterHeterogeneous` `test_split_data_by_edge_type_and_attribute` (total = 9, in failed builds = 4, in successful builds = 5)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3773 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3862\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3865\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3898\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4167\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4471\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4611\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4679\n",
      "\n",
      "- `tests.layer.test_knowledge_graph` `test_model_rankings[RotH]` (total = 7, in failed builds = 5, in successful builds = 2)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4732\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4737 (4 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4743\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4744\n",
      "\n",
      "- `tests.layer.test_knowledge_graph` `test_model_rankings[RotatE]` (total = 8, in failed builds = 6, in successful builds = 2)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4683 (2 times)\n",
      "\n",
      "- `tests.layer.test_knowledge_graph` `test_rotate` (total = 9, in failed builds = 5, in successful builds = 4)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4112 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4113 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4185 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4353 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4499\n",
      "\n",
      "- `tests.mapper.test_node_mappers` `test_nodemapper_isolated_nodes` (total = 17, in failed builds = 13, in successful builds = 4)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3917 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4053 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4206 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4330\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4342\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4384\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4630\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4735\n",
      "\n",
      "- `tests.reproducibility.test_graphsage` `test_nai[False]` (total = 8, in failed builds = 7, in successful builds = 1)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4245\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4691\n",
      "\n",
      "- `tests.reproducibility.test_graphsage` `test_nai[True]` (total = 25, in failed builds = 15, in successful builds = 10)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3858\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3889\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3898 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3915\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3972\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3973\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3986 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4016\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4206 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4306 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4343 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4642\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4649\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4691\n",
      "\n",
      "- `tests.reproducibility.test_graphsage` `test_unsupervised[False]` (total = 48, in failed builds = 31, in successful builds = 17)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4051\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4058\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4060\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4065\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4128\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4129\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4132\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4135\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4140\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4152\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4162\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4193\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4198\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4637 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4639 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4642 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4648 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4649 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4652 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4654 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4683 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4688\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4690 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4691 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4706\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4726\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4728 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4730 (2 times)\n",
      "\n",
      "- `tests.reproducibility.test_graphsage` `test_unsupervised[True]` (total = 39, in failed builds = 28, in successful builds = 11)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/3931\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4051\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4054\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4058\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4060\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4128\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4129 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4132 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4140\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4152\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4198\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4282 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4284 (3 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4346\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4394\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4620 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4637 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4639\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4648 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4649 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4652\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4683 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4688\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4690 (2 times)\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4691\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4728\n",
      "  - https://buildkite.com/stellar/stellargraph-public/builds/4730\n"
     ]
    }
   ],
   "source": [
    "# markdown output for copying into (GitHub) issues:\n",
    "for ((classname, name), builds, fail, success) in flaky.itertuples():\n",
    "    print()\n",
    "    print(f\"- `{classname}` `{name}` (total = {fail + success}, in failed builds = {fail}, in successful builds = {success})\")\n",
    "    for b, count in pd.Series(builds).value_counts().sort_index().iteritems():\n",
    "        extra = \"\" if count == 1 else f\" ({count} times)\"\n",
    "        print(f\"  - https://buildkite.com/stellar/stellargraph-public/builds/{b}{extra}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Flaky tests - failure and pass on a single commit\n",
    "\n",
    "An alternative way to find a flaky test is to look for multiple builds of a single commit, where some builds failed and some passed, and then find the tests that failed within those builds. This works best if there aren't too many configuration differences between the different builds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [],
   "source": [
    "builds_by_commit = builds_df.reset_index().groupby(\"commit\").agg(\n",
    "    builds=(\"index\", list),\n",
    "    state_passed=(\"state_passed\", \"sum\"), \n",
    "    state_failed=(\"state_failed\", \"sum\")\n",
    ")\n",
    "passed_and_failed = builds_by_commit[(builds_by_commit.state_passed > 0) & (builds_by_commit.state_failed > 0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 293,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>builds</th>\n",
       "      <th>state_passed</th>\n",
       "      <th>state_failed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>commit</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0deb275def782953fd5c1efbd5ad61a66b961204</th>\n",
       "      <td>[4612, 4611]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13eb0d1ad94e4dc442754196de691f22810d13e7</th>\n",
       "      <td>[4570, 4569]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1f19ff356cef296cef0b57b4126147383b0431aa</th>\n",
       "      <td>[4150, 4148]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>231b64e0ad77c27a9063cf461411438bad31d1f6</th>\n",
       "      <td>[4649, 4648]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>450000411f834577d6ff0fd26e4a9eee8c5192fa</th>\n",
       "      <td>[3967, 3966]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4884deeb7a1c8872eed787716cc2381596e3d7d2</th>\n",
       "      <td>[4546, 4537, 4532, 4531]</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6333e1024f83e859ab1952f9317670844792d1ee</th>\n",
       "      <td>[4501, 4495, 4491]</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7774f8f1df0e394289bd6cd3de761efd78f830c4</th>\n",
       "      <td>[4653, 4652]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7af1707b7bf24060ef4d9f20f209a333a9e46f34</th>\n",
       "      <td>[4499, 4498]</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7deaa1daf2e07f03614f2de2c855b6138800f2b9</th>\n",
       "      <td>[4740, 4739, 4698, 4697, 4685, 4684, 4661, 458...</td>\n",
       "      <td>31</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                     builds  \\\n",
       "commit                                                                                        \n",
       "0deb275def782953fd5c1efbd5ad61a66b961204                                       [4612, 4611]   \n",
       "13eb0d1ad94e4dc442754196de691f22810d13e7                                       [4570, 4569]   \n",
       "1f19ff356cef296cef0b57b4126147383b0431aa                                       [4150, 4148]   \n",
       "231b64e0ad77c27a9063cf461411438bad31d1f6                                       [4649, 4648]   \n",
       "450000411f834577d6ff0fd26e4a9eee8c5192fa                                       [3967, 3966]   \n",
       "4884deeb7a1c8872eed787716cc2381596e3d7d2                           [4546, 4537, 4532, 4531]   \n",
       "6333e1024f83e859ab1952f9317670844792d1ee                                 [4501, 4495, 4491]   \n",
       "7774f8f1df0e394289bd6cd3de761efd78f830c4                                       [4653, 4652]   \n",
       "7af1707b7bf24060ef4d9f20f209a333a9e46f34                                       [4499, 4498]   \n",
       "7deaa1daf2e07f03614f2de2c855b6138800f2b9  [4740, 4739, 4698, 4697, 4685, 4684, 4661, 458...   \n",
       "\n",
       "                                          state_passed  state_failed  \n",
       "commit                                                                \n",
       "0deb275def782953fd5c1efbd5ad61a66b961204             1             1  \n",
       "13eb0d1ad94e4dc442754196de691f22810d13e7             1             1  \n",
       "1f19ff356cef296cef0b57b4126147383b0431aa             1             1  \n",
       "231b64e0ad77c27a9063cf461411438bad31d1f6             1             1  \n",
       "450000411f834577d6ff0fd26e4a9eee8c5192fa             1             1  \n",
       "4884deeb7a1c8872eed787716cc2381596e3d7d2             2             1  \n",
       "6333e1024f83e859ab1952f9317670844792d1ee             1             2  \n",
       "7774f8f1df0e394289bd6cd3de761efd78f830c4             1             1  \n",
       "7af1707b7bf24060ef4d9f20f209a333a9e46f34             1             1  \n",
       "7deaa1daf2e07f03614f2de2c855b6138800f2b9            31             7  "
      ]
     },
     "execution_count": 293,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "passed_and_failed.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>builds</th>\n",
       "      <th>state_failed</th>\n",
       "      <th>state_passed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>classname</th>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>tests.core.test_utils</th>\n",
       "      <th>test_normalize_adj</th>\n",
       "      <td>[4148, 3967]</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">tests.data.test_edge_splitter.TestEdgeSplitterHeterogeneous</th>\n",
       "      <th>test_split_data_by_edge_type</th>\n",
       "      <td>[4495, 4343, 4343]</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_split_data_by_edge_type_and_attribute</th>\n",
       "      <td>[4611]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tests.datasets.test_datasets</th>\n",
       "      <th>test_dataset_download[METR_LA]</th>\n",
       "      <td>[4339, 4339]</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">tests.layer.test_knowledge_graph</th>\n",
       "      <th>test_model_rankings[RotH]</th>\n",
       "      <td>[4744, 4743, 4737, 4737, 4737, 4737]</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_rotate</th>\n",
       "      <td>[4499]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tests.mapper.test_node_mappers</th>\n",
       "      <th>test_nodemapper_isolated_nodes</th>\n",
       "      <td>[4630, 4384, 4342]</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">tests.reproducibility.test_graphsage</th>\n",
       "      <th>test_link_prediction[False]</th>\n",
       "      <td>[4726, 4649, 4495, 4062, 4062]</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_nai[True]</th>\n",
       "      <td>[4649, 4343, 4343]</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_unsupervised[False]</th>\n",
       "      <td>[4726, 4652, 4652, 4649, 4649, 4648, 4648]</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>test_unsupervised[True]</th>\n",
       "      <td>[4652, 4649, 4649, 4648, 4648, 3931]</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tests.test_aaa_on_gpu</th>\n",
       "      <th>test_on_gpu_when_requested</th>\n",
       "      <td>[4740, 4740, 4740]</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tests.utils.test_hyperbolic</th>\n",
       "      <th>test_poincare_ball_distance_self</th>\n",
       "      <td>[4740]</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                   builds  \\\n",
       "classname                                          name                                                                                     \n",
       "tests.core.test_utils                              test_normalize_adj                                                        [4148, 3967]   \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                                        [4495, 4343, 4343]   \n",
       "                                                   test_split_data_by_edge_type_and_attribute                                      [4611]   \n",
       "tests.datasets.test_datasets                       test_dataset_download[METR_LA]                                            [4339, 4339]   \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                         [4744, 4743, 4737, 4737, 4737, 4737]   \n",
       "                                                   test_rotate                                                                     [4499]   \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes                                      [4630, 4384, 4342]   \n",
       "tests.reproducibility.test_graphsage               test_link_prediction[False]                             [4726, 4649, 4495, 4062, 4062]   \n",
       "                                                   test_nai[True]                                                      [4649, 4343, 4343]   \n",
       "                                                   test_unsupervised[False]                    [4726, 4652, 4652, 4649, 4649, 4648, 4648]   \n",
       "                                                   test_unsupervised[True]                           [4652, 4649, 4649, 4648, 4648, 3931]   \n",
       "tests.test_aaa_on_gpu                              test_on_gpu_when_requested                                          [4740, 4740, 4740]   \n",
       "tests.utils.test_hyperbolic                        test_poincare_ball_distance_self                                                [4740]   \n",
       "\n",
       "                                                                                               state_failed  \\\n",
       "classname                                          name                                                       \n",
       "tests.core.test_utils                              test_normalize_adj                                     2   \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                           3   \n",
       "                                                   test_split_data_by_edge_type_and_attribute             1   \n",
       "tests.datasets.test_datasets                       test_dataset_download[METR_LA]                         2   \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                              5   \n",
       "                                                   test_rotate                                            1   \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes                         2   \n",
       "tests.reproducibility.test_graphsage               test_link_prediction[False]                            5   \n",
       "                                                   test_nai[True]                                         3   \n",
       "                                                   test_unsupervised[False]                               5   \n",
       "                                                   test_unsupervised[True]                                4   \n",
       "tests.test_aaa_on_gpu                              test_on_gpu_when_requested                             3   \n",
       "tests.utils.test_hyperbolic                        test_poincare_ball_distance_self                       1   \n",
       "\n",
       "                                                                                               state_passed  \n",
       "classname                                          name                                                      \n",
       "tests.core.test_utils                              test_normalize_adj                                     0  \n",
       "tests.data.test_edge_splitter.TestEdgeSplitterH... test_split_data_by_edge_type                           0  \n",
       "                                                   test_split_data_by_edge_type_and_attribute             0  \n",
       "tests.datasets.test_datasets                       test_dataset_download[METR_LA]                         0  \n",
       "tests.layer.test_knowledge_graph                   test_model_rankings[RotH]                              1  \n",
       "                                                   test_rotate                                            0  \n",
       "tests.mapper.test_node_mappers                     test_nodemapper_isolated_nodes                         1  \n",
       "tests.reproducibility.test_graphsage               test_link_prediction[False]                            0  \n",
       "                                                   test_nai[True]                                         0  \n",
       "                                                   test_unsupervised[False]                               2  \n",
       "                                                   test_unsupervised[True]                                2  \n",
       "tests.test_aaa_on_gpu                              test_on_gpu_when_requested                             0  \n",
       "tests.utils.test_hyperbolic                        test_poincare_ball_distance_self                       0  "
      ]
     },
     "execution_count": 294,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failures_of_pass_fail = failures[failures.commit.isin(passed_and_failed.index)]\n",
    "failures_of_pass_fail.groupby([\"classname\", \"name\"]).agg(builds=(\"build\", list), state_failed=(\"state_failed\", \"sum\"), state_passed=(\"state_passed\", \"sum\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}