nvictus/01-simplify_vcf_info.ipynb

## 01-simplify_vcf_info.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "from io import BytesIO\n",
    "\n",
    "import numpy as np\n",
    "import oxbow as ox\n",
    "import pandas as pd\n",
    "import polars as pl\n",
    "import pyarrow as pa\n",
    "import pysam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This dictionary maps the VCF-derived input types to numpy/pandas dtypes\n",
    "TYPE_MAP = {\n",
    "    \"Integer\": \"int64\",\n",
    "    \"Float\": \"float64\",\n",
    "    \"String\": \"object\",\n",
    "    \"Flag\": \"bool\",\n",
    "}\n",
    "\n",
    "\n",
    "def read_vcf_as_pandas(vcf_path: str) -> pd.DataFrame:\n",
    "    ipc = ox.read_vcf(vcf_path)\n",
    "    return pa.ipc.open_file(BytesIO(ipc)).read_pandas()\n",
    "\n",
    "\n",
    "def read_vcf_as_polars(vcf_path: str) -> pl.DataFrame:\n",
    "    ipc = ox.read_vcf(vcf_path)\n",
    "    return pl.read_ipc(ipc)\n",
    "\n",
    "\n",
    "def read_info_schema(vcf_path: str) -> pd.DataFrame:\n",
    "    \"\"\" \n",
    "    Read the schema of the INFO column of a VCF.\n",
    "\n",
    "    This currently uses pysam.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    vcf_path : str\n",
    "        Path to bgzipped vcf, with tabix indexed .tbi file with similar name \n",
    "        in same folder.\n",
    "    \n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        Dataframe with [name, number, type] columns\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Possible values for `type` are: \"Integer\", \"Float\", \"String\", \"Flag\".\n",
    "\n",
    "    Possible values for `number` are:\n",
    "        - An integer (e.g. 0, 1, 2, 3, 4, etc.) - for fields where the number\n",
    "          of values per VCF record is fixed. 0 means the field is a \"Flag\".\n",
    "        - A string (\"A\", \"G\", \"R\") - for fields where the number of\n",
    "          values per VCF record is determined by the number of alts, the total\n",
    "          number of alleles, or the number of genotypes, respectively.\n",
    "        - A dot (\".\") - for fields where the number of values per VCF record\n",
    "          varies, is unknown, or is unbounded.\n",
    "    \"\"\"\n",
    "    with pysam.VariantFile(vcf_path) as f:\n",
    "        return pd.DataFrame(\n",
    "            [(obj.name, obj.number, obj.type) for obj in f.header.info.values()],\n",
    "            columns=[\"name\", \"number\", \"type\"],\n",
    "        )\n",
    "\n",
    "\n",
    "def _parse_into_info_record(\n",
    "    pairs: list[str], \n",
    "    fields: set[str],\n",
    "    dtype_map: pd.Series, \n",
    "    arity_map: pd.Series\n",
    ") -> dict:\n",
    "    \"\"\"\n",
    "    Parse a sequence of key-value INFO strings into a dictionary.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pairs : list[str]\n",
    "        List of `{key}={value}` pairs in the INFO column of a VCF. Note that\n",
    "        Flag fields will be represented as `{key}` only and variadic values\n",
    "        will be separated by commas.\n",
    "    fields : set[str]\n",
    "        Set of fields to extract from the INFO column.\n",
    "    dtype_map : pd.Series\n",
    "        Series mapping INFO field names to a corresponding numpy dtype.\n",
    "    arity_map : pd.Series\n",
    "        Series mapping INFO field names to a corresponding number or \"arity\".\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Dictionary mapping INFO field names to their values.\n",
    "    \"\"\"\n",
    "    record = {}\n",
    "    for pair in pairs:\n",
    "        key, *value = pair.split(\"=\")\n",
    "        if key in fields:\n",
    "            arity = arity_map[key]\n",
    "            dtype = dtype_map[key]\n",
    "            if arity == 0:\n",
    "                record[key] = True\n",
    "            elif arity == 1:\n",
    "                record[key] = np.array(value[0], dtype=dtype)\n",
    "            else:\n",
    "                record[key] = np.array(value[0].split(\",\"), dtype=dtype)\n",
    "    return record\n",
    " \n",
    "\n",
    "def extract_info(\n",
    "    info: pd.Series, \n",
    "    schema: pd.DataFrame, \n",
    "    fields: list[str] | None = None\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Extracts fields from INFO column of a VCF.\n",
    "        \n",
    "    Parameters\n",
    "    ----------\n",
    "    info : pd.Series\n",
    "        The info column of a VCF dataframe.\n",
    "    schema : pd.DataFrame\n",
    "        Dataframe with [name, number, type] columns describing the INFO fields\n",
    "        of a VCF.\n",
    "    fields : list[str], optional\n",
    "        List of fields to extract from the INFO column. If None, all fields\n",
    "        will be extracted.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        Dataframe with columns corresponding to the requested fields.\n",
    "    \"\"\"\n",
    "    if fields is None:\n",
    "        fields = list(schema[\"name\"])\n",
    "    else:\n",
    "        names_available = set(schema[\"name\"])\n",
    "        for field in fields:\n",
    "            if field not in names_available:\n",
    "                raise ValueError(f\"Field '{field}' not found in INFO schema.\")\n",
    "    \n",
    "    # Create series mapping field names to dtypes and arities\n",
    "    dtype_map = schema.set_index(\"name\")[\"type\"].map(TYPE_MAP)\n",
    "    arity_map = schema.set_index(\"name\")[\"number\"]\n",
    "\n",
    "    # Split the key-value pairs in the info column into record dictionaries\n",
    "    records = (\n",
    "        info\n",
    "        .str\n",
    "        .split(\";\")\n",
    "        .apply(\n",
    "            _parse_into_info_record, \n",
    "            fields=set(fields),\n",
    "            dtype_map=dtype_map, \n",
    "            arity_map=arity_map\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Convert to dataframe\n",
    "    info_df = pd.DataFrame.from_records(records).convert_dtypes()\n",
    "\n",
    "    # Include columns for requested fields that are missing from INFO records \n",
    "    # but still declared in the schema\n",
    "    for field in fields:\n",
    "        if field not in info_df.columns:\n",
    "            info_df[field] = pd.NA\n",
    "\n",
    "    # Reorder columns as requested\n",
    "    info_df = info_df[fields]\n",
    "    return info_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>number</th>\n",
       "      <th>type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CONFLICT</td>\n",
       "      <td>.</td>\n",
       "      <td>String</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AC</td>\n",
       "      <td>A</td>\n",
       "      <td>Integer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AF</td>\n",
       "      <td>A</td>\n",
       "      <td>Float</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NS</td>\n",
       "      <td>1</td>\n",
       "      <td>Integer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AN</td>\n",
       "      <td>1</td>\n",
       "      <td>Integer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>LV</td>\n",
       "      <td>1</td>\n",
       "      <td>Integer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>PS</td>\n",
       "      <td>1</td>\n",
       "      <td>String</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>AT</td>\n",
       "      <td>R</td>\n",
       "      <td>String</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       name number     type\n",
       "0  CONFLICT      .   String\n",
       "1        AC      A  Integer\n",
       "2        AF      A    Float\n",
       "3        NS      1  Integer\n",
       "4        AN      1  Integer\n",
       "5        LV      1  Integer\n",
       "6        PS      1   String\n",
       "7        AT      R   String"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vcf_path = \"DRR.vcf.gz\"\n",
    "df = read_vcf_as_pandas(vcf_path)\n",
    "schema = read_info_schema(vcf_path)\n",
    "schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chrom</th>\n",
       "      <th>pos</th>\n",
       "      <th>id</th>\n",
       "      <th>ref</th>\n",
       "      <th>alt</th>\n",
       "      <th>qual</th>\n",
       "      <th>filter</th>\n",
       "      <th>format</th>\n",
       "      <th>AC</th>\n",
       "      <th>LV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>genome</td>\n",
       "      <td>18</td>\n",
       "      <td>&gt;8&gt;11</td>\n",
       "      <td>G</td>\n",
       "      <td>T</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[12]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>genome</td>\n",
       "      <td>26</td>\n",
       "      <td>&gt;12&gt;16</td>\n",
       "      <td>T</td>\n",
       "      <td>G,C</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[2, 5]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>genome</td>\n",
       "      <td>28</td>\n",
       "      <td>&gt;16&gt;19</td>\n",
       "      <td>A</td>\n",
       "      <td>G</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[6]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>genome</td>\n",
       "      <td>30</td>\n",
       "      <td>&gt;19&gt;22</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[54]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>genome</td>\n",
       "      <td>35</td>\n",
       "      <td>&gt;22&gt;27</td>\n",
       "      <td>GG</td>\n",
       "      <td>AG,GT</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[2, 4]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>482</th>\n",
       "      <td>genome</td>\n",
       "      <td>9544</td>\n",
       "      <td>&gt;2561&gt;2564</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[2]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>genome</td>\n",
       "      <td>9592</td>\n",
       "      <td>&gt;2566&gt;2569</td>\n",
       "      <td>C</td>\n",
       "      <td>T</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[2]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>genome</td>\n",
       "      <td>9598</td>\n",
       "      <td>&gt;2569&gt;2572</td>\n",
       "      <td>G</td>\n",
       "      <td>A</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[20]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>genome</td>\n",
       "      <td>9634</td>\n",
       "      <td>&gt;2574&gt;2577</td>\n",
       "      <td>T</td>\n",
       "      <td>C</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[10]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>486</th>\n",
       "      <td>genome</td>\n",
       "      <td>9676</td>\n",
       "      <td>&gt;2581&gt;2584</td>\n",
       "      <td>A</td>\n",
       "      <td>G</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GT:AD</td>\n",
       "      <td>[83]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>487 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      chrom   pos          id ref    alt  qual filter format      AC LV\n",
       "0    genome    18       >8>11   G      T  60.0    NaN  GT:AD    [12]  0\n",
       "1    genome    26      >12>16   T    G,C  60.0    NaN  GT:AD  [2, 5]  0\n",
       "2    genome    28      >16>19   A      G  60.0    NaN  GT:AD     [6]  0\n",
       "3    genome    30      >19>22   G      A  60.0    NaN  GT:AD    [54]  0\n",
       "4    genome    35      >22>27  GG  AG,GT  60.0    NaN  GT:AD  [2, 4]  0\n",
       "..      ...   ...         ...  ..    ...   ...    ...    ...     ... ..\n",
       "482  genome  9544  >2561>2564   T      C  60.0    NaN  GT:AD     [2]  0\n",
       "483  genome  9592  >2566>2569   C      T  60.0    NaN  GT:AD     [2]  0\n",
       "484  genome  9598  >2569>2572   G      A  60.0    NaN  GT:AD    [20]  0\n",
       "485  genome  9634  >2574>2577   T      C  60.0    NaN  GT:AD    [10]  0\n",
       "486  genome  9676  >2581>2584   A      G  60.0    NaN  GT:AD    [83]  0\n",
       "\n",
       "[487 rows x 10 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "info = extract_info(df[\"info\"], schema, [\"AC\", \"LV\"])\n",
    "out = df.drop(columns=[\"info\"]).assign(**info)\n",
    "out = out.fillna(pd.NA)\n",
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vcfcodeathon2023",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from __future__ import annotations\n",
	"from io import BytesIO\n",
	"\n",
	"import numpy as np\n",
	"import oxbow as ox\n",
	"import pandas as pd\n",
	"import polars as pl\n",
	"import pyarrow as pa\n",
	"import pysam"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# This dictionary maps the VCF-derived input types to numpy/pandas dtypes\n",
	"TYPE_MAP = {\n",
	" \"Integer\": \"int64\",\n",
	" \"Float\": \"float64\",\n",
	" \"String\": \"object\",\n",
	" \"Flag\": \"bool\",\n",
	"}\n",
	"\n",
	"\n",
	"def read_vcf_as_pandas(vcf_path: str) -> pd.DataFrame:\n",
	" ipc = ox.read_vcf(vcf_path)\n",
	" return pa.ipc.open_file(BytesIO(ipc)).read_pandas()\n",
	"\n",
	"\n",
	"def read_vcf_as_polars(vcf_path: str) -> pl.DataFrame:\n",
	" ipc = ox.read_vcf(vcf_path)\n",
	" return pl.read_ipc(ipc)\n",
	"\n",
	"\n",
	"def read_info_schema(vcf_path: str) -> pd.DataFrame:\n",
	" \"\"\" \n",
	" Read the schema of the INFO column of a VCF.\n",
	"\n",
	" This currently uses pysam.\n",
	"\n",
	" Parameters\n",
	" ----------\n",
	" vcf_path : str\n",
	" Path to bgzipped vcf, with tabix indexed .tbi file with similar name \n",
	" in same folder.\n",
	" \n",
	" Returns\n",
	" -------\n",
	" pd.DataFrame\n",
	" Dataframe with [name, number, type] columns\n",
	"\n",
	" Notes\n",
	" -----\n",
	" Possible values for `type` are: \"Integer\", \"Float\", \"String\", \"Flag\".\n",
	"\n",
	" Possible values for `number` are:\n",
	" - An integer (e.g. 0, 1, 2, 3, 4, etc.) - for fields where the number\n",
	" of values per VCF record is fixed. 0 means the field is a \"Flag\".\n",
	" - A string (\"A\", \"G\", \"R\") - for fields where the number of\n",
	" values per VCF record is determined by the number of alts, the total\n",
	" number of alleles, or the number of genotypes, respectively.\n",
	" - A dot (\".\") - for fields where the number of values per VCF record\n",
	" varies, is unknown, or is unbounded.\n",
	" \"\"\"\n",
	" with pysam.VariantFile(vcf_path) as f:\n",
	" return pd.DataFrame(\n",
	" [(obj.name, obj.number, obj.type) for obj in f.header.info.values()],\n",
	" columns=[\"name\", \"number\", \"type\"],\n",
	" )\n",
	"\n",
	"\n",
	"def _parse_into_info_record(\n",
	" pairs: list[str], \n",
	" fields: set[str],\n",
	" dtype_map: pd.Series, \n",
	" arity_map: pd.Series\n",
	") -> dict:\n",
	" \"\"\"\n",
	" Parse a sequence of key-value INFO strings into a dictionary.\n",
	"\n",
	" Parameters\n",
	" ----------\n",
	" pairs : list[str]\n",
	" List of `{key}={value}` pairs in the INFO column of a VCF. Note that\n",
	" Flag fields will be represented as `{key}` only and variadic values\n",
	" will be separated by commas.\n",
	" fields : set[str]\n",
	" Set of fields to extract from the INFO column.\n",
	" dtype_map : pd.Series\n",
	" Series mapping INFO field names to a corresponding numpy dtype.\n",
	" arity_map : pd.Series\n",
	" Series mapping INFO field names to a corresponding number or \"arity\".\n",
	"\n",
	" Returns\n",
	" -------\n",
	" dict\n",
	" Dictionary mapping INFO field names to their values.\n",
	" \"\"\"\n",
	" record = {}\n",
	" for pair in pairs:\n",
	" key, *value = pair.split(\"=\")\n",
	" if key in fields:\n",
	" arity = arity_map[key]\n",
	" dtype = dtype_map[key]\n",
	" if arity == 0:\n",
	" record[key] = True\n",
	" elif arity == 1:\n",
	" record[key] = np.array(value[0], dtype=dtype)\n",
	" else:\n",
	" record[key] = np.array(value[0].split(\",\"), dtype=dtype)\n",
	" return record\n",
	" \n",
	"\n",
	"def extract_info(\n",
	" info: pd.Series, \n",
	" schema: pd.DataFrame, \n",
	" fields: list[str] \| None = None\n",
	") -> pd.DataFrame:\n",
	" \"\"\"\n",
	" Extracts fields from INFO column of a VCF.\n",
	" \n",
	" Parameters\n",
	" ----------\n",
	" info : pd.Series\n",
	" The info column of a VCF dataframe.\n",
	" schema : pd.DataFrame\n",
	" Dataframe with [name, number, type] columns describing the INFO fields\n",
	" of a VCF.\n",
	" fields : list[str], optional\n",
	" List of fields to extract from the INFO column. If None, all fields\n",
	" will be extracted.\n",
	"\n",
	" Returns\n",
	" -------\n",
	" pd.DataFrame\n",
	" Dataframe with columns corresponding to the requested fields.\n",
	" \"\"\"\n",
	" if fields is None:\n",
	" fields = list(schema[\"name\"])\n",
	" else:\n",
	" names_available = set(schema[\"name\"])\n",
	" for field in fields:\n",
	" if field not in names_available:\n",
	" raise ValueError(f\"Field '{field}' not found in INFO schema.\")\n",
	" \n",
	" # Create series mapping field names to dtypes and arities\n",
	" dtype_map = schema.set_index(\"name\")[\"type\"].map(TYPE_MAP)\n",
	" arity_map = schema.set_index(\"name\")[\"number\"]\n",
	"\n",
	" # Split the key-value pairs in the info column into record dictionaries\n",
	" records = (\n",
	" info\n",
	" .str\n",
	" .split(\";\")\n",
	" .apply(\n",
	" _parse_into_info_record, \n",
	" fields=set(fields),\n",
	" dtype_map=dtype_map, \n",
	" arity_map=arity_map\n",
	" )\n",
	" )\n",
	"\n",
	" # Convert to dataframe\n",
	" info_df = pd.DataFrame.from_records(records).convert_dtypes()\n",
	"\n",
	" # Include columns for requested fields that are missing from INFO records \n",
	" # but still declared in the schema\n",
	" for field in fields:\n",
	" if field not in info_df.columns:\n",
	" info_df[field] = pd.NA\n",
	"\n",
	" # Reorder columns as requested\n",
	" info_df = info_df[fields]\n",
	" return info_df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>name</th>\n",
	" <th>number</th>\n",
	" <th>type</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>CONFLICT</td>\n",
	" <td>.</td>\n",
	" <td>String</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>AC</td>\n",
	" <td>A</td>\n",
	" <td>Integer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>AF</td>\n",
	" <td>A</td>\n",
	" <td>Float</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>NS</td>\n",
	" <td>1</td>\n",
	" <td>Integer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>AN</td>\n",
	" <td>1</td>\n",
	" <td>Integer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>LV</td>\n",
	" <td>1</td>\n",
	" <td>Integer</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>PS</td>\n",
	" <td>1</td>\n",
	" <td>String</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>AT</td>\n",
	" <td>R</td>\n",
	" <td>String</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" name number type\n",
	"0 CONFLICT . String\n",
	"1 AC A Integer\n",
	"2 AF A Float\n",
	"3 NS 1 Integer\n",
	"4 AN 1 Integer\n",
	"5 LV 1 Integer\n",
	"6 PS 1 String\n",
	"7 AT R String"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"vcf_path = \"DRR.vcf.gz\"\n",
	"df = read_vcf_as_pandas(vcf_path)\n",
	"schema = read_info_schema(vcf_path)\n",
	"schema"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>chrom</th>\n",
	" <th>pos</th>\n",
	" <th>id</th>\n",
	" <th>ref</th>\n",
	" <th>alt</th>\n",
	" <th>qual</th>\n",
	" <th>filter</th>\n",
	" <th>format</th>\n",
	" <th>AC</th>\n",
	" <th>LV</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>genome</td>\n",
	" <td>18</td>\n",
	" <td>>8>11</td>\n",
	" <td>G</td>\n",
	" <td>T</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[12]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>genome</td>\n",
	" <td>26</td>\n",
	" <td>>12>16</td>\n",
	" <td>T</td>\n",
	" <td>G,C</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[2, 5]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>genome</td>\n",
	" <td>28</td>\n",
	" <td>>16>19</td>\n",
	" <td>A</td>\n",
	" <td>G</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[6]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>genome</td>\n",
	" <td>30</td>\n",
	" <td>>19>22</td>\n",
	" <td>G</td>\n",
	" <td>A</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[54]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>genome</td>\n",
	" <td>35</td>\n",
	" <td>>22>27</td>\n",
	" <td>GG</td>\n",
	" <td>AG,GT</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[2, 4]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>482</th>\n",
	" <td>genome</td>\n",
	" <td>9544</td>\n",
	" <td>>2561>2564</td>\n",
	" <td>T</td>\n",
	" <td>C</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[2]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>483</th>\n",
	" <td>genome</td>\n",
	" <td>9592</td>\n",
	" <td>>2566>2569</td>\n",
	" <td>C</td>\n",
	" <td>T</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[2]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>484</th>\n",
	" <td>genome</td>\n",
	" <td>9598</td>\n",
	" <td>>2569>2572</td>\n",
	" <td>G</td>\n",
	" <td>A</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[20]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>485</th>\n",
	" <td>genome</td>\n",
	" <td>9634</td>\n",
	" <td>>2574>2577</td>\n",
	" <td>T</td>\n",
	" <td>C</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[10]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>486</th>\n",
	" <td>genome</td>\n",
	" <td>9676</td>\n",
	" <td>>2581>2584</td>\n",
	" <td>A</td>\n",
	" <td>G</td>\n",
	" <td>60.0</td>\n",
	" <td>NaN</td>\n",
	" <td>GT:AD</td>\n",
	" <td>[83]</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>487 rows × 10 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" chrom pos id ref alt qual filter format AC LV\n",
	"0 genome 18 >8>11 G T 60.0 NaN GT:AD [12] 0\n",
	"1 genome 26 >12>16 T G,C 60.0 NaN GT:AD [2, 5] 0\n",
	"2 genome 28 >16>19 A G 60.0 NaN GT:AD [6] 0\n",
	"3 genome 30 >19>22 G A 60.0 NaN GT:AD [54] 0\n",
	"4 genome 35 >22>27 GG AG,GT 60.0 NaN GT:AD [2, 4] 0\n",
	".. ... ... ... .. ... ... ... ... ... ..\n",
	"482 genome 9544 >2561>2564 T C 60.0 NaN GT:AD [2] 0\n",
	"483 genome 9592 >2566>2569 C T 60.0 NaN GT:AD [2] 0\n",
	"484 genome 9598 >2569>2572 G A 60.0 NaN GT:AD [20] 0\n",
	"485 genome 9634 >2574>2577 T C 60.0 NaN GT:AD [10] 0\n",
	"486 genome 9676 >2581>2584 A G 60.0 NaN GT:AD [83] 0\n",
	"\n",
	"[487 rows x 10 columns]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"info = extract_info(df[\"info\"], schema, [\"AC\", \"LV\"])\n",
	"out = df.drop(columns=[\"info\"]).assign(**info)\n",
	"out = out.fillna(pd.NA)\n",
	"out"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "vcfcodeathon2023",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.4"
	},
	"orig_nbformat": 4
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}