{
"cells": [
{
"cell_type": "markdown",
"id": "ced26f74",
"metadata": {},
"source": [
"## Convert extarcellular electrophysiology NWB dataset to BIDS\n",
"\n",
"This is a notebook that prototypes a conversion of a collection of NWB files to the BIDS format with the extension outlined in [BEP 32](https://docs.google.com/document/d/1oG-C8T-dWPqfVzL2W8HO3elWK8NIh2cOCPssRGv23n0/edit). \n",
"\n",
"The general strategy is: \n",
"1. Iterate over all NWB files and extract the relevant metadata from each file, building a mapping from filepath to metadata.\n",
"2. Build the BIDS structure step-by-step:\n",
" 1. root\n",
" 2. participants\n",
" 3. sessions\n",
" 4. contacts, probes, and channels\n",
" \n",
"This strategy of metadata extraction and then BIDS formation is ideal because opening the NWB files is by far the most time-intensive step, and this approach ensures that this is done only once per file.\n",
"\n",
"\n",
"Metadata is extracted entirely from the contents of the NWB files, so it will not matter what the source directory structure is or what the original filenames are. "
]
},
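{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"For orientation, the cells below should produce a skeleton roughly like this (a sketch; the actual `sub-` and `ses-` labels depend on the dataset):\n",
"\n",
"```\n",
"bids_output/\n",
"    participants.tsv\n",
"    participants.json\n",
"    sub-<label>/\n",
"        sessions.tsv\n",
"        sessions.json\n",
"        ses-<label>/\n",
"            ephys/\n",
"                contacts.tsv\n",
"                probes.tsv\n",
"                channels.tsv\n",
"```"
]
},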
{
"cell_type": "code",
"execution_count": 124,
"id": "17ec0d1c",
"metadata": {},
"outputs": [],
"source": [
"from pynwb import NWBHDF5IO\n",
"from pynwb.ecephys import ElectricalSeries\n",
"from glob import glob\n",
"import os\n",
"import json\n",
"import pandas as pd\n",
"\n",
"\n",
"def extract_metadata(filepath: str) -> dict:\n",
"\n",
" with NWBHDF5IO(filepath, load_namespaces=True) as io:\n",
" nwbfile = io.read()\n",
"\n",
" subject = nwbfile.subject\n",
"\n",
" probes = set([x.device for x in nwbfile.electrodes[\"group\"][:]])\n",
"\n",
"\n",
" ess = [\n",
" x for x in nwbfile.objects.values()\n",
" if isinstance(x, ElectricalSeries)\n",
" ]\n",
"\n",
" metadata = {\n",
" \"general_ephys\": {\n",
" \"InstitutionName\": nwbfile.institution,\n",
"\n",
" },\n",
" \"participant\": {\n",
" \"participant_id\": \"sub-\" + subject.subject_id,\n",
" \"species\": subject.species,\n",
" \"strain\": subject.strain,\n",
" \"birthday\": subject.date_of_birth,\n",
" \"age\": subject.age,\n",
" \"sex\": subject.sex,\n",
" },\n",
" \"session\": {\n",
" \"session_id\": \"ses-\" + nwbfile.session_id,\n",
" \"number_of_trials\": len(nwbfile.trials) if nwbfile.trials else None,\n",
" \"comments\": nwbfile.session_description,\n",
" },\n",
" \"probes\": [\n",
" {\n",
" \"probe_id\": probe.name,\n",
" \"type\": \"unknown\",\n",
" \"description\": probe.description,\n",
" \"manufacturer\": probe.manufacturer,\n",
" }\n",
" for probe in probes\n",
" ],\n",
" \"contacts\": [\n",
" {\n",
" \"contact_id\": contact.index[0],\n",
" \"probe_id\": contact.group.iloc[0].device.name,\n",
" \"impedance\": contact[\"imp\"].iloc[0] if contact[\"imp\"].iloc[0] > 0 else None,\n",
" \"location\": contact[\"location\"].iloc[0] if contact[\"location\"].iloc[0] not in (\"unknown\",) else None,\n",
" }\n",
" for contact in nwbfile.electrodes\n",
" ],\n",
" \"channels\": [\n",
" {\n",
" \"channel_id\": contact.index[0],\n",
" \"contact_id\": contact.index[0],\n",
" \"type\": \"EXT\",\n",
" \"unit\": \"V\",\n",
" \"sampling_frequency\": ess[0].rate,\n",
" \"gain\": ess[0].conversion,\n",
" }\n",
" for contact in nwbfile.electrodes\n",
" ]\n",
" }\n",
" \n",
" return metadata\n",
"\n",
"def unique_list_of_dicts(data):\n",
" # Convert to set of tuples\n",
" unique_data = set(tuple(d.items()) for d in data)\n",
" \n",
" # Convert back to list of dictionaries\n",
" unique_list_of_dicts = [dict(t) for t in unique_data]\n",
" \n",
" return unique_list_of_dicts\n",
"\n",
"\n",
"def drop_false_cols(df):\n",
" for col in df.columns:\n",
" if not any(df[col][:]):\n",
" df.drop(columns=[col], inplace=True)\n",
"\n",
"\n"
]
},
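{
"cell_type": "markdown",
"id": "b2c3d4e5",
"metadata": {},
"source": [
"A tiny illustration of the two helpers above on toy data (the toy values are made up for this demo):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d4e5f6",
"metadata": {},
"outputs": [],
"source": [
"# de-duplication keeps one copy of each distinct dict (order is not preserved)\n",
"print(unique_list_of_dicts([{\"a\": 1}, {\"a\": 1}, {\"a\": 2}]))\n",
"\n",
"# a column holding no truthy values (here, all None) is dropped in place\n",
"demo_df = pd.DataFrame({\"keep\": [1, 2], \"drop\": [None, None]})\n",
"drop_false_cols(demo_df)\n",
"print(demo_df.columns.tolist())"
]
},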
{
"cell_type": "code",
"execution_count": 125,
"id": "d72c7090",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bendichter/opt/miniconda3/lib/python3.9/site-packages/hdmf/spec/namespace.py:531: UserWarning: Ignoring cached namespace 'hdmf-common' version 1.3.0 because version 1.7.0 is already loaded.\n",
" warn(\"Ignoring cached namespace '%s' version %s because version %s is already loaded.\"\n",
"/Users/bendichter/opt/miniconda3/lib/python3.9/site-packages/hdmf/spec/namespace.py:531: UserWarning: Ignoring cached namespace 'core' version 2.2.5 because version 2.6.0-alpha is already loaded.\n",
" warn(\"Ignoring cached namespace '%s' version %s because version %s is already loaded.\"\n"
]
}
],
"source": [
"path = \"/Volumes/Extreme Pro/neural_data/dandisets/000044\"\n",
"\n",
"nwb_files = glob(path + \"/sub-*/*.nwb\")\n",
"\n",
"all_metadata = {x: extract_metadata(x) for x in nwb_files}"
]
},
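{
"cell_type": "markdown",
"id": "d4e5f6a7",
"metadata": {},
"source": [
"As a quick sanity check, preview the metadata extracted from one arbitrary file; `default=str` lets non-JSON types such as dates serialize:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5f6a7b8",
"metadata": {},
"outputs": [],
"source": [
"# preview the metadata extracted from one (arbitrary) file\n",
"example_path = next(iter(all_metadata))\n",
"print(json.dumps(all_metadata[example_path], indent=2, default=str))"
]
},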
{
"cell_type": "code",
"execution_count": 126,
"id": "ffa56cef",
"metadata": {},
"outputs": [],
"source": [
"# root\n",
"\n",
"out_path = \"bids_output\"\n",
"\n",
"os.mkdir(out_path)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"id": "6cbfb45c",
"metadata": {},
"outputs": [],
"source": [
"# participants\n",
"\n",
"# create particiant table\n",
"\n",
"participants = unique_list_of_dicts(\n",
" [x[\"participant\"] for x in all_metadata.values()]\n",
")\n",
"\n",
"df = pd.DataFrame(participants)\n",
"\n",
"drop_false_cols(df)\n",
" \n",
"df.to_csv(os.path.join(out_path, \"participants.tsv\"), sep=\"\\t\", index=False)\n",
"\n",
"\n",
"# create particiant json\n",
"default_participants_json = {\n",
" \"participant_id\": {\"Description\": \"Unique identifier of the participant\"},\n",
" \"species\": {\"Description\": \"The binomial species name from the NCBI Taxonomy\"},\n",
" \"strain\": {\"Description\": \"Identifier of the strain\"},\n",
" \"birthdate\": {\"Description\": \"Day of birth of the participant in ISO8601 format\"},\n",
" \"age\": {\"Description\": \"Age of the participant at time of recording\", \"Units\": \"days\"},\n",
" \"sex\": {\"Description\": \"Sex of participant\"},\n",
"}\n",
"\n",
"participants_json = {k: v for k, v in default_participants_json.items() if k in df.columns}\n",
"\n",
"with open(os.path.join(out_path, \"participants.json\"), \"w\") as json_file:\n",
" json.dump(participants_json, json_file, indent=4)\n",
" "
]
},
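{
"cell_type": "markdown",
"id": "f6a7b8c9",
"metadata": {},
"source": [
"To verify what was written, read the participants table back with pandas:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7b8c9d0",
"metadata": {},
"outputs": [],
"source": [
"# round-trip check on the participants table\n",
"pd.read_csv(os.path.join(out_path, \"participants.tsv\"), sep=\"\\t\")"
]
},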
{
"cell_type": "code",
"execution_count": 128,
"id": "3d6889d9",
"metadata": {},
"outputs": [],
"source": [
"# sessions\n",
"\n",
"default_session_json = {\n",
" \"session_quality\": {\n",
" \"LongName\": \"General quality of the session\",\n",
" \"Description\": \"Quality of the session\",\n",
" \"Levels\": {\n",
" \"Bad\": \"Bad quality, should not be considered for further analysis\",\n",
" \"ok\": \"Ok quality, can be considered for further analysis with care\",\n",
" \"good\": \"Good quality, should be used for analysis\",\n",
" \"Excellent\": \"Excellent quality, extraordinarily good session\",\n",
" }\n",
" },\n",
" \"data_quality\": {\n",
" \"LongName\": \"Quality of the recorded signals\",\n",
" \"Description\": \"Quality of the recorded signals\",\n",
" \"Levels\": {\n",
" \"Bad\": \"Bad quality, should not be considered for further analysis\",\n",
" \"ok\": \"Ok quality, can be considered for further analysis with care\",\n",
" \"good\": \"Good quality, should be used for analysis\",\n",
" \"Excellent\": \"Excellent quality, extraordinarily good session\",\n",
" },\n",
" },\n",
" \"number_of_trials\": {\n",
" \"LongName\": \"Number of trials in this session\",\n",
" \"Description\": \"Count of attempted trials in the session (integer)\",\n",
" },\n",
" \"comment\": {\n",
" \"LongName\": \"General comments\",\n",
" \"Description\": \"General comments by the experimenter on the session\",\n",
" },\n",
"}\n",
"\n",
"for participant in participants:\n",
" participant_id = participant[\"participant_id\"]\n",
" \n",
" os.mkdir(os.path.join(out_path, participant_id))\n",
" \n",
" for metadata in all_metadata.values():\n",
" sessions = [\n",
" x[\"session\"] for x in all_metadata.values() if\n",
" x[\"participant\"][\"participant_id\"] == participant_id\n",
" ]\n",
" \n",
" df = pd.DataFrame(sessions)\n",
" drop_false_cols(df)\n",
" \n",
" df.to_csv(os.path.join(out_path, participant_id, \"sessions.tsv\"), sep=\"\\t\", index=False)\n",
" \n",
" session_json = {k: v for k, v in default_session_json.items() if k in df.columns}\n",
" \n",
" with open(os.path.join(out_path, participant_id, \"sessions.json\"), \"w\") as json_file:\n",
" json.dump(session_json, json_file, indent=4)"
]
},
{
"cell_type": "code",
"execution_count": 129,
"id": "34232c6e",
"metadata": {},
"outputs": [],
"source": [
"# contacts, probes, and channels\n",
"\n",
"for metadata in all_metadata.values():\n",
"\n",
" session_id = metadata[\"session\"][\"session_id\"]\n",
" participant_id = metadata[\"participant\"][\"participant_id\"]\n",
"\n",
" os.mkdir(os.path.join(out_path, participant_id, session_id))\n",
" os.mkdir(os.path.join(out_path, participant_id, session_id, \"ephys\"))\n",
" \n",
" for var in (\"contacts\", \"probes\", \"channels\"):\n",
" df = pd.DataFrame(metadata[var])\n",
" drop_false_cols(df)\n",
" df.to_csv(os.path.join(out_path, participant_id, session_id, \"ephys\", var + \".tsv\"), sep=\"\\t\", index=False)\n"
]
},
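{
"cell_type": "markdown",
"id": "b8c9d0e1",
"metadata": {},
"source": [
"As a final check, walk `bids_output` and print the generated skeleton. This is a generic directory listing, not part of the BIDS specification:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9d0e1f2",
"metadata": {},
"outputs": [],
"source": [
"# print the generated BIDS skeleton as an indented tree\n",
"for root, dirs, files in os.walk(out_path):\n",
"    rel = os.path.relpath(root, out_path)\n",
"    depth = 0 if rel == \".\" else rel.count(os.sep) + 1\n",
"    print(\"    \" * depth + os.path.basename(root) + \"/\")\n",
"    for f in sorted(files):\n",
"        print(\"    \" * (depth + 1) + f)"
]
}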
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}