Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active August 31, 2023 19:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nvictus/c72a07df052560f24c75576636892a16 to your computer and use it in GitHub Desktop.
Save nvictus/c72a07df052560f24c75576636892a16 to your computer and use it in GitHub Desktop.
Extract info fields from a VCF dataframe
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"from io import BytesIO\n",
"\n",
"import numpy as np\n",
"import oxbow as ox\n",
"import pandas as pd\n",
"import polars as pl\n",
"import pyarrow as pa\n",
"import pysam"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# This dictionary maps the VCF-derived input types to numpy/pandas dtypes\n",
"TYPE_MAP = {\n",
" \"Integer\": \"int64\",\n",
" \"Float\": \"float64\",\n",
" \"String\": \"object\",\n",
" \"Flag\": \"bool\",\n",
"}\n",
"\n",
"\n",
"def read_vcf_as_pandas(vcf_path: str) -> pd.DataFrame:\n",
" ipc = ox.read_vcf(vcf_path)\n",
" return pa.ipc.open_file(BytesIO(ipc)).read_pandas()\n",
"\n",
"\n",
"def read_vcf_as_polars(vcf_path: str) -> pl.DataFrame:\n",
" ipc = ox.read_vcf(vcf_path)\n",
" return pl.read_ipc(ipc)\n",
"\n",
"\n",
"def read_info_schema(vcf_path: str) -> pd.DataFrame:\n",
" \"\"\" \n",
" Read the schema of the INFO column of a VCF.\n",
"\n",
" This currently uses pysam.\n",
"\n",
" Parameters\n",
" ----------\n",
" vcf_path : str\n",
" Path to bgzipped vcf, with tabix indexed .tbi file with similar name \n",
" in same folder.\n",
" \n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" Dataframe with [name, number, type] columns\n",
"\n",
" Notes\n",
" -----\n",
" Possible values for `type` are: \"Integer\", \"Float\", \"String\", \"Flag\".\n",
"\n",
" Possible values for `number` are:\n",
" - An integer (e.g. 0, 1, 2, 3, 4, etc.) - for fields where the number\n",
" of values per VCF record is fixed. 0 means the field is a \"Flag\".\n",
" - A string (\"A\", \"G\", \"R\") - for fields where the number of\n",
" values per VCF record is determined by the number of alts, the total\n",
" number of alleles, or the number of genotypes, respectively.\n",
" - A dot (\".\") - for fields where the number of values per VCF record\n",
" varies, is unknown, or is unbounded.\n",
" \"\"\"\n",
" with pysam.VariantFile(vcf_path) as f:\n",
" return pd.DataFrame(\n",
" [(obj.name, obj.number, obj.type) for obj in f.header.info.values()],\n",
" columns=[\"name\", \"number\", \"type\"],\n",
" )\n",
"\n",
"\n",
"def _parse_into_info_record(\n",
" pairs: list[str], \n",
" fields: set[str],\n",
" dtype_map: pd.Series, \n",
" arity_map: pd.Series\n",
") -> dict:\n",
" \"\"\"\n",
" Parse a sequence of key-value INFO strings into a dictionary.\n",
"\n",
" Parameters\n",
" ----------\n",
" pairs : list[str]\n",
" List of `{key}={value}` pairs in the INFO column of a VCF. Note that\n",
" Flag fields will be represented as `{key}` only and variadic values\n",
" will be separated by commas.\n",
" fields : set[str]\n",
" Set of fields to extract from the INFO column.\n",
" dtype_map : pd.Series\n",
" Series mapping INFO field names to a corresponding numpy dtype.\n",
" arity_map : pd.Series\n",
" Series mapping INFO field names to a corresponding number or \"arity\".\n",
"\n",
" Returns\n",
" -------\n",
" dict\n",
" Dictionary mapping INFO field names to their values.\n",
" \"\"\"\n",
" record = {}\n",
" for pair in pairs:\n",
" key, *value = pair.split(\"=\")\n",
" if key in fields:\n",
" arity = arity_map[key]\n",
" dtype = dtype_map[key]\n",
" if arity == 0:\n",
" record[key] = True\n",
" elif arity == 1:\n",
" record[key] = np.array(value[0], dtype=dtype)\n",
" else:\n",
" record[key] = np.array(value[0].split(\",\"), dtype=dtype)\n",
" return record\n",
" \n",
"\n",
"def extract_info(\n",
" info: pd.Series, \n",
" schema: pd.DataFrame, \n",
" fields: list[str] | None = None\n",
") -> pd.DataFrame:\n",
" \"\"\"\n",
" Extracts fields from INFO column of a VCF.\n",
" \n",
" Parameters\n",
" ----------\n",
" info : pd.Series\n",
" The info column of a VCF dataframe.\n",
" schema : pd.DataFrame\n",
" Dataframe with [name, number, type] columns describing the INFO fields\n",
" of a VCF.\n",
" fields : list[str], optional\n",
" List of fields to extract from the INFO column. If None, all fields\n",
" will be extracted.\n",
"\n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" Dataframe with columns corresponding to the requested fields.\n",
" \"\"\"\n",
" if fields is None:\n",
" fields = list(schema[\"name\"])\n",
" else:\n",
" names_available = set(schema[\"name\"])\n",
" for field in fields:\n",
" if field not in names_available:\n",
" raise ValueError(f\"Field '{field}' not found in INFO schema.\")\n",
" \n",
" # Create series mapping field names to dtypes and arities\n",
" dtype_map = schema.set_index(\"name\")[\"type\"].map(TYPE_MAP)\n",
" arity_map = schema.set_index(\"name\")[\"number\"]\n",
"\n",
" # Split the key-value pairs in the info column into record dictionaries\n",
" records = (\n",
" info\n",
" .str\n",
" .split(\";\")\n",
" .apply(\n",
" _parse_into_info_record, \n",
" fields=set(fields),\n",
" dtype_map=dtype_map, \n",
" arity_map=arity_map\n",
" )\n",
" )\n",
"\n",
" # Convert to dataframe\n",
" info_df = pd.DataFrame.from_records(records).convert_dtypes()\n",
"\n",
" # Include columns for requested fields that are missing from INFO records \n",
" # but still declared in the schema\n",
" for field in fields:\n",
" if field not in info_df.columns:\n",
" info_df[field] = pd.NA\n",
"\n",
" # Reorder columns as requested\n",
" info_df = info_df[fields]\n",
" return info_df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>number</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CONFLICT</td>\n",
" <td>.</td>\n",
" <td>String</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AC</td>\n",
" <td>A</td>\n",
" <td>Integer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AF</td>\n",
" <td>A</td>\n",
" <td>Float</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NS</td>\n",
" <td>1</td>\n",
" <td>Integer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AN</td>\n",
" <td>1</td>\n",
" <td>Integer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>LV</td>\n",
" <td>1</td>\n",
" <td>Integer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>PS</td>\n",
" <td>1</td>\n",
" <td>String</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>AT</td>\n",
" <td>R</td>\n",
" <td>String</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name number type\n",
"0 CONFLICT . String\n",
"1 AC A Integer\n",
"2 AF A Float\n",
"3 NS 1 Integer\n",
"4 AN 1 Integer\n",
"5 LV 1 Integer\n",
"6 PS 1 String\n",
"7 AT R String"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vcf_path = \"DRR.vcf.gz\"\n",
"df = read_vcf_as_pandas(vcf_path)\n",
"schema = read_info_schema(vcf_path)\n",
"schema"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>chrom</th>\n",
" <th>pos</th>\n",
" <th>id</th>\n",
" <th>ref</th>\n",
" <th>alt</th>\n",
" <th>qual</th>\n",
" <th>filter</th>\n",
" <th>format</th>\n",
" <th>AC</th>\n",
" <th>LV</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>genome</td>\n",
" <td>18</td>\n",
" <td>&gt;8&gt;11</td>\n",
" <td>G</td>\n",
" <td>T</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[12]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>genome</td>\n",
" <td>26</td>\n",
" <td>&gt;12&gt;16</td>\n",
" <td>T</td>\n",
" <td>G,C</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[2, 5]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>genome</td>\n",
" <td>28</td>\n",
" <td>&gt;16&gt;19</td>\n",
" <td>A</td>\n",
" <td>G</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[6]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>genome</td>\n",
" <td>30</td>\n",
" <td>&gt;19&gt;22</td>\n",
" <td>G</td>\n",
" <td>A</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[54]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>genome</td>\n",
" <td>35</td>\n",
" <td>&gt;22&gt;27</td>\n",
" <td>GG</td>\n",
" <td>AG,GT</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[2, 4]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>482</th>\n",
" <td>genome</td>\n",
" <td>9544</td>\n",
" <td>&gt;2561&gt;2564</td>\n",
" <td>T</td>\n",
" <td>C</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[2]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>483</th>\n",
" <td>genome</td>\n",
" <td>9592</td>\n",
" <td>&gt;2566&gt;2569</td>\n",
" <td>C</td>\n",
" <td>T</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[2]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>484</th>\n",
" <td>genome</td>\n",
" <td>9598</td>\n",
" <td>&gt;2569&gt;2572</td>\n",
" <td>G</td>\n",
" <td>A</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[20]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>genome</td>\n",
" <td>9634</td>\n",
" <td>&gt;2574&gt;2577</td>\n",
" <td>T</td>\n",
" <td>C</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[10]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>486</th>\n",
" <td>genome</td>\n",
" <td>9676</td>\n",
" <td>&gt;2581&gt;2584</td>\n",
" <td>A</td>\n",
" <td>G</td>\n",
" <td>60.0</td>\n",
" <td>NaN</td>\n",
" <td>GT:AD</td>\n",
" <td>[83]</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>487 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" chrom pos id ref alt qual filter format AC LV\n",
"0 genome 18 >8>11 G T 60.0 NaN GT:AD [12] 0\n",
"1 genome 26 >12>16 T G,C 60.0 NaN GT:AD [2, 5] 0\n",
"2 genome 28 >16>19 A G 60.0 NaN GT:AD [6] 0\n",
"3 genome 30 >19>22 G A 60.0 NaN GT:AD [54] 0\n",
"4 genome 35 >22>27 GG AG,GT 60.0 NaN GT:AD [2, 4] 0\n",
".. ... ... ... .. ... ... ... ... ... ..\n",
"482 genome 9544 >2561>2564 T C 60.0 NaN GT:AD [2] 0\n",
"483 genome 9592 >2566>2569 C T 60.0 NaN GT:AD [2] 0\n",
"484 genome 9598 >2569>2572 G A 60.0 NaN GT:AD [20] 0\n",
"485 genome 9634 >2574>2577 T C 60.0 NaN GT:AD [10] 0\n",
"486 genome 9676 >2581>2584 A G 60.0 NaN GT:AD [83] 0\n",
"\n",
"[487 rows x 10 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"info = extract_info(df[\"info\"], schema, [\"AC\", \"LV\"])\n",
"out = df.drop(columns=[\"info\"]).assign(**info)\n",
"out = out.fillna(pd.NA)\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "vcfcodeathon2023",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment