"""
Script to get initial metadata for BDCat studies from Gen3 Graph and by scraping dbGaP.
Available tags:
Program
- TOPMed
- COVID 19
Study Registration
- dbGaP
- BIOLINCC
Disease Type
- Heart
- Lung
- Blood
Data Type
- DCC Harmonized
- Clinical phenotype
- Genotype
- Imaging
"""
import asyncio
import copy
import csv
import logging
import sys
from datetime import date

import requests
from bs4 import BeautifulSoup
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission
from gen3.tools import metadata
from gen3.tools.metadata.ingest_manifest import manifest_row_parsers

COMMONS = "https://preprod.gen3.biodatacatalyst.nhlbi.nih.gov/"
API_KEY_FILEPATH = "/home/buffy/Documents/bdcat_preprod_creds.json"

COMMONS_TO_SUBMIT_METADATA = "https://staging.gen3.biodatacatalyst.nhlbi.nih.gov/"
API_KEY_FILEPATH_FOR_METADATA = "/home/buffy/Documents/bdcat_staging_creds.json"

OUTPUT_FILE = "discovery_metadata.tsv"
DBGAP_WEBSITE = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="

logging.basicConfig(filename="generate_discovery_metadata.log", level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

def generate_discovery_metadata():
    auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
    submission = Gen3Submission(COMMONS, auth_provider=auth)

    print("getting currently submitted project/study data...")
    query_txt = """
    {
        project(first:0) {
            project_id
            code
            name
            studies(first:0) {
                study_id
                dbgap_phs
                dbgap_consent
                dbgap_version
                dbgap_accession
                dbgap_consent_text
                dbgap_participant_set
                authz
                full_name
                short_name
                study_description
                _subjects_count
            }
        }
    }
    """
    raw_results = submission.query(query_txt).get("data", {}).get("project", [])

    results = []
    fields = set()
    for raw_result in raw_results:
        studies = raw_result.get("studies")
        study_data = {}
        if len(studies) != 1:
            logging.warning(
                f"expected 1:1 project:study, got {studies} from {raw_result}"
            )
        else:
            study_data = studies[0]
        del raw_result["studies"]

        result = copy.deepcopy(raw_result)
        result.update(study_data)

        # serialize authz with double quotes so the value is valid JSON in the TSV
        if "authz" in result:
            result["authz"] = str(result["authz"]).replace("'", '"')

        result["tags"] = _determine_tags_from_study_info(result)
        result["study_description"] = _get_study_description(result)

        # don't include studies with no subjects for now; this effectively removes
        # any projects that were created but have no data submitted
        if result.get("_subjects_count"):
            results.append(result)
            fields = fields | set(result.keys())

    # the "with" block closes the file for us, so no explicit close() is needed
    with open(OUTPUT_FILE, "w+", encoding="utf-8") as output_file:
        logging.info(f"writing headers to {OUTPUT_FILE}: {fields}")
        output_writer = csv.DictWriter(
            output_file,
            delimiter="\t",
            fieldnames=fields,
            extrasaction="ignore",
        )
        output_writer.writeheader()

        for row in results:
            output_writer.writerow(row)

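# Illustrative sketch of one output TSV row (values here are placeholders; the
# real columns are the graph query fields above plus the derived "tags" and the
# scraped "study_description"):
#
#   project_id          topmed-<code>
#   dbgap_accession     phs<XXXXXX>
#   tags                [{"name": "TOPMed", "category": "Program"}, ...]
#   _subjects_count     1234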
def submit_to_metadata_service(commons, api_key):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    auth = Gen3Auth(commons, refresh_file=api_key)

    # must provide a str to namespace the metadata from the file in a block in
    # the metadata service
    metadata_source = "gen3_discovery"

    def _custom_get_guid_for_row(commons_url, row, lock):
        """
        Given a row from the manifest, return the unique ID to use for the metadata object.

        Args:
            commons_url (str): root domain for the commons where the mds lives
            row (dict): column_name:row_value
            lock (asyncio.Semaphore): semaphore used to limit the amount of concurrent
                http connections when making a call to an external service

        Returns:
            str: unique ID
        """
        return row.get("name")

    # override default unique ID parsing behavior
    manifest_row_parsers["guid_for_row"] = _custom_get_guid_for_row

    loop.run_until_complete(
        metadata.async_ingest_metadata_manifest(
            commons,
            manifest_file=OUTPUT_FILE,
            metadata_source=metadata_source,
            auth=auth,
            metadata_type="discovery_metadata",
        )
    )

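# Illustrative sketch of the resulting metadata service record for a manifest
# row whose "name" column is "<study name>" (the exact record shape is
# determined by the gen3 SDK's ingest, not by this script):
#
#   GUID: "<study name>"
#   {
#       "_guid_type": "discovery_metadata",
#       "gen3_discovery": { ...all columns from that study's TSV row... }
#   }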
def _get_study_description(study):
    dbgap_phs = study.get("dbgap_phs", "") or ""
    dbgap_version = study.get("dbgap_version", "") or ""
    dbgap_participant_set = study.get("dbgap_participant_set", "") or ""
    # dbGaP accessions follow the format: phs<study>.v<version>.p<participant set>
    dbgap_study = f"{dbgap_phs}.{dbgap_version}.{dbgap_participant_set}"

    study_description = study.get("study_description")

    # ".." means all three parts were missing, so there's nothing to scrape
    if dbgap_study != "..":
        url = DBGAP_WEBSITE + dbgap_study
        logging.debug(f"scraping {url}")
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        report = soup.find("dl", class_="report")
        if report:
            study_description_start = report.find("dt")

            # sometimes the study description isn't the first "dt" tag
            if "Study Description" not in study_description_start.getText():
                study_description_start = study_description_start.find_next_sibling(
                    "dt"
                )

            study_description = study_description_start.find_next_sibling("dd") or ""
            if study_description:
                links = study_description.find(id="important-links")
                if links:
                    links.decompose()
                study_description = (
                    study_description.getText().strip()
                    + f"\n\nNOTE: This text was scraped from https://www.ncbi.nlm.nih.gov/ on {date.today()} and may not include exact formatting or images."
                )

    logging.debug(f"{study_description}")
    return study_description

def _determine_tags_from_study_info(study):
    tags = []
    project_id = study.get("project_id") or ""
    dbgap_accession = study.get("dbgap_accession") or ""

    if project_id.startswith("parent"):
        tags.append(_get_tag("Parent", "Program"))
        tags.append(_get_tag("DCC Harmonized", "Data Type"))
        tags.append(_get_tag("Clinical Phenotype", "Data Type"))

    if project_id.startswith("topmed"):
        tags.append(_get_tag("TOPMed", "Program"))
        tags.append(_get_tag("Genotype", "Data Type"))
        if _is_topmed_study_geno_and_pheno(study.get("code", "")):
            tags.append(_get_tag("Clinical Phenotype", "Data Type"))

    if project_id.startswith("COVID"):
        tags.append(_get_tag("COVID 19", "Program"))

    if dbgap_accession.startswith("phs"):
        tags.append(_get_tag("dbGaP", "Study Registration"))

    # serialize with double quotes so the value is valid JSON in the TSV
    return str(tags).replace("'", '"')

def _get_tag(name, category):
    return {"name": name, "category": category}

def _is_topmed_study_geno_and_pheno(study):
    # whether the topmed study has both genomic and phenotype data (instead of
    # having a parent study with pheno and a topmed study with geno separately)
    #
    # determined from https://docs.google.com/spreadsheets/d/1iVOmZVu_IzsVMdefH-1Rgf8zrjqvnZOUEA2dxS5iRjc/edit#gid=698119570
    # filtered to "program"=="topmed" and "parent_study_accession"==""
    return study in [
        "SAGE_DS-LD-IRB-COL",
        "Amish_HMB-IRB-MDS",
        "CRA_DS-ASTHMA-IRB-MDS-RD",
        "VAFAR_HMB-IRB",
        "PARTNERS_HMB",
        "WGHS_HMB",
        "BAGS_GRU-IRB",
        "Sarcoidosis_DS-SAR-IRB",
        "HyperGEN_GRU-IRB",
        "HyperGEN_DS-CVD-IRB-RD",
        "THRV_DS-CVD-IRB-COL-NPU-RD",
        "miRhythm_GRU",
        "AustralianFamilialAF_HMB-NPU-MDS",
        "pharmHU_HMB",
        "pharmHU_DS-SCD-RD",
        "pharmHU_DS-SCD",
        "SAPPHIRE_asthma_DS-ASTHMA-IRB-COL",
        "REDS-III_Brazil_SCD_GRU-IRB-PUB-NPU",
        "Walk_PHaSST_SCD_HMB-IRB-PUB-COL-NPU-MDS-GSO",
        "Walk_PHaSST_SCD_DS-SCD-IRB-PUB-COL-NPU-MDS-RD",
        "MLOF_HMB-PUB",
        "AFLMU_HMB-IRB-PUB-COL-NPU-MDS",
        "MPP_HMB-NPU-MDS",
        "INSPIRE_AF_DS-MULTIPLE_DISEASES-MDS",
        "DECAF_GRU",
        "GENAF_HMB-NPU",
        "JHU_AF_HMB-NPU-MDS",
        "ChildrensHS_GAP_GRU",
        "ChildrensHS_IGERA_GRU",
        "ChildrensHS_MetaAir_GRU",
        "CHIRAH_DS-ASTHMA-IRB-COL",
        "EGCUT_GRU",
        "IPF_DS-PUL-ILD-IRB-NPU",
        "IPF_DS-LD-IRB-NPU",
        "IPF_DS-PFIB-IRB-NPU",
        "IPF_HMB-IRB-NPU",
        "IPF_DS-ILD-IRB-NPU",
        "OMG_SCD_DS-SCD-IRB-PUB-COL-MDS-RD",
        "BioVU_AF_HMB-GSO",
        "LTRC_HMB-MDS",
        "PUSH_SCD_DS-SCD-IRB-PUB-COL",
        "GGAF_GRU",
        "PIMA_DS-ASTHMA-IRB-COL",
        "CARE_BADGER_DS-ASTHMA-IRB-COL",
        "CARE_TREXA_DS-ASTHMA-IRB-COL",
    ]

if __name__ == "__main__":
    generate_discovery_metadata()
    submit_to_metadata_service(
        COMMONS_TO_SUBMIT_METADATA, API_KEY_FILEPATH_FOR_METADATA
    )
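
# Usage sketch (assumes this file is saved as generate_discovery_metadata.py
# and that the API key files referenced above contain valid Gen3 credentials):
#
#   pip install gen3 requests beautifulsoup4
#   python generate_discovery_metadata.py
#
# This writes discovery_metadata.tsv locally from the preprod graph, then
# ingests it into the metadata service on the staging commons.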