anngvu/example_geo_metadata_parsing.R

## example_geo_metadata_parsing.R
# Convert GEO metadata to NF annotation templates
# NOTE: This was run while the dataset was still under embargo.
# If/when the data is public, use the GEOquery pkg to avoid XML parsing...

library(xml2)
library(glue)

# Accession table read from template_GEO.csv. The loop below uses its `acc`
# column (accession ID) and its `token` column (passed as the `token` query
# parameter of the download URL).
geo <- read.csv("template_GEO.csv")

# Retrieve xml files: one "<acc>.xml" per row, saved in the working directory.
# `xml_files` collects the local file names for the parsing step.
# seq_len() is used so that an empty table downloads nothing (1:nrow(geo)
# would iterate over c(1, 0)); the result vector is preallocated, not grown.
xml_files <- character(nrow(geo))
for (i in seq_len(nrow(geo))) {
  acc <- geo$acc[i]
  token <- geo$token[i]
  destfile <- glue("{acc}.xml")
  download.file(
    glue("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={acc}&targ=gsm&form=xml&view=full&token={token}"),
    destfile = destfile
  )
  xml_files[i] <- destfile
}

# Note that sample characteristic meta can vary between datasets
#
# Collect the distinct `tag` attribute values found on the
# ./Channel/Characteristics descendants of the given sample node(s).
#
# @param s An xml2 node or nodeset; the XPath is evaluated relative to it.
# @return Character vector of unique tag values, without NA entries.
sampleCharAttrs <- function(s) {
  characteristics <- xml_find_all(s, "./Channel/Characteristics")
  # xml_attr() reads one named attribute per node. The second argument of
  # xml_attrs() is a namespace map, not an attribute name, so it is not the
  # right call for fetching `tag`.
  tags <- xml_attr(characteristics, "tag")
  # Nodes without a `tag` attribute come back as NA; drop them so they are not
  # later treated as a characteristic name.
  unique(tags[!is.na(tags)])
}


# Parse every downloaded XML file into one data frame per accession:
# fixed metadata columns first, then one column per sample characteristic tag
# found in that file. Results are stored in `parsed`, keyed by the file name
# without its ".xml" extension.
parsed <- list()
for (xf in xml_files) {
  # Strip namespaces so the XPath expressions below can use bare element names
  x <- xml_ns_strip(read_xml(xf))
  s <- xml_find_all(x, "//MINiML/Sample")
  meta <- list(
    sample_id_geo = xml_attr(xml_find_first(s, "."), "iid"),
    sampleID = xml_text(xml_find_first(s, "./Title"), trim = TRUE),
    species = xml_text(xml_find_first(s, "./Channel/Organism"), trim = TRUE),
    sample_info_1 = xml_text(xml_find_first(s, "./Channel/Source"), trim = TRUE),
    assay = xml_text(xml_find_first(s, "./Library-Strategy"), trim = TRUE),
    platform = xml_text(xml_find_first(s, "./Instrument-Model/Predefined"), trim = TRUE),
    sra = xml_attr(xml_find_first(s, "./Relation[@type='SRA']"), "target")
  )
  # Previously a length-filtered copy of `meta` was computed, immediately
  # overwritten, and the resulting frame never used. Build the frame once and
  # actually use it below.
  meta_df <- data.frame(meta)
  # Parse characteristics separately
  sample_tags <- sampleCharAttrs(s)
  cat("Sample characteristics available:", paste(sample_tags, collapse = ", "), "\n")
  sample_chars <- lapply(sample_tags, function(tag) {
    xml_text(xml_find_first(s, glue("./Channel/Characteristics[@tag='{tag}']")), trim = TRUE)
  })
  names(sample_chars) <- sample_tags
  # data.frame() makes the tag names syntactic (e.g. spaces become dots)
  sample_chars <- data.frame(sample_chars)
  # Anchor and escape the extension: the bare pattern ".xml" is a regex that
  # matches any character followed by "xml", anywhere in the name.
  # When a file has no characteristics, keep the metadata alone instead of
  # combining it with an empty frame.
  parsed[[sub("\\.xml$", "", xf)]] <- if (ncol(sample_chars) > 0) {
    cbind(meta_df, sample_chars)
  } else {
    meta_df
  }
}


# Mapping has to be done somewhat manually.
# Each lookup is a named character vector keyed by the value parsed from the
# XML (Library-Strategy / Instrument-Model) and yielding the manifest value.
assay_map <- c(
  "ATAC-seq" = "ATACseq",
  "ChIP-Seq" = "ChIPSeq",
  "RNA-Seq" = "rnaSeq",
  "Bisulfite-Seq" = "bisulfiteSeq",
  "OTHER" = "CUT&RUN"
)
# consider changing HiSeq 2500 to Illumina HiSeq 2500 in data model
platform_map <- c("Illumina HiSeq 2500" = "HiSeq 2500")
assay_data_type <- c(
  "ATAC-seq" = "chromatinActivity",
  "ChIP-Seq" = "chromatinActivity",
  "RNA-Seq" = "geneExpression",
  "Bisulfite-Seq" = "chromatinActivity",
  "OTHER" = "chromatinActivity"
)

# Template columns come from the CSV (all read as character); the constant
# fields shared by every manifest are set here once.
ext_template <- as.list(read.csv("GenomicsAssayTemplateExtended.csv", colClasses = "character"))
ext_template[["Component"]] <- "GenomicsAssayTemplateExtended"
ext_template[["resourceType"]] <- "experimentalData"
ext_template[["dataSubtype"]] <- "raw"
ext_template[["fileFormat"]] <- "fastq"

# One copy of the template per parsed accession, keyed by accession name
filled <- setNames(rep(list(ext_template), length(parsed)), names(parsed))

# These attributes can be translated pretty consistently
#
# Fill the generic manifest fields of one accession from its parsed metadata.
#
# @param x Name of the accession to fill (element name in both lists).
# @param filled Named list of manifest templates (one list per accession).
# @param parsed Named list of parsed metadata (one data frame per accession).
# @return `filled`, with element `x` updated; other elements are untouched.
#
# Reads the global lookups `assay_map` and `assay_data_type`.
easy_fill <- function(x, filled, parsed) {
  src <- parsed[[x]]
  manifest <- filled[[x]]
  manifest$assay <- assay_map[src$assay]
  # Platform is copied as-is: platform_map is no longer applied because the
  # data model is going to be changed instead.
  manifest$platform <- src$platform
  manifest$dataType <- assay_data_type[src$assay]
  manifest$specimenID <- src$sampleID
  manifest$species <- src$species
  # not sure where to stick SRA link -- put in comments for now
  manifest$comments <- src$sra
  filled[[x]] <- manifest
  filled
}

# GSE179699: generic fields first, then the dataset-specific values.
filled <- easy_fill("GSE179699", filled, parsed)
filled[["GSE179699"]] <- local({
  manifest <- filled[["GSE179699"]]
  # Per-sample flag: `suz12` characteristic equals "Knockout"
  is_knockout <- parsed$GSE179699$suz12 == "Knockout"
  manifest$modelSystemName <- "M3 MPNST"
  manifest$tissue <- "tumor"
  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
  # Tumor"; this value omits "Nerve" -- confirm against the data model.
  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Perturbation fields are only populated for knockout samples
  manifest$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  manifest$experimentalCondition <- parsed$GSE179699$treatment
  manifest
})

# GSE206527: generic fields first, then the dataset-specific values.
filled <- easy_fill("GSE206527", filled, parsed)
filled[["GSE206527"]] <- local({
  manifest <- filled[["GSE206527"]]
  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
  # Tumor"; this value omits "Nerve" -- confirm against the data model.
  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Shorten the parsed tissue label "MPNST tumor" to "tumor"
  manifest$tissue <- gsub("MPNST tumor", "tumor", parsed$GSE206527$tissue)
  manifest
})

# GSE179703: generic fields first, then the dataset-specific values.
filled <- easy_fill("GSE179703", filled, parsed)
filled[["GSE179703"]] <- local({
  manifest <- filled[["GSE179703"]]
  # Per-sample flag: `eed` characteristic equals "Knockout"
  is_knockout <- parsed$GSE179703$eed == "Knockout"
  manifest$tissue <- "tumor"
  # Model system name is taken from the parsed `strain` characteristic
  manifest$modelSystemName <- parsed$GSE179703$strain
  # Perturbation fields are only populated for knockout samples
  manifest$genePerturbed <- ifelse(is_knockout, "EED", "")
  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  manifest
})


# GSE202555: generic fields first, then the dataset-specific values.
filled <- easy_fill("GSE202555", filled, parsed)
filled[["GSE202555"]] <- local({
  manifest <- filled[["GSE202555"]]
  # Per-sample flag: `suz12.status` characteristic equals "Knockout"
  is_knockout <- parsed$GSE202555$suz12.status == "Knockout"
  manifest$tissue <- "tumor"
  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
  # Tumor"; this value omits "Nerve" -- confirm against the data model.
  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Hard-coded rather than read from parsed$GSE202555$cell.line
  manifest$modelSystemName <- "M3 MPNST"
  # Perturbation fields are only populated for knockout samples
  manifest$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  manifest
})

# GSE179587: generic fields first, then the dataset-specific values.
filled <- easy_fill("GSE179587", filled, parsed)
filled[["GSE179587"]] <- local({
  manifest <- filled[["GSE179587"]]
  manifest$tissue <- "tumor"
  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
  # Tumor"; this value omits "Nerve" -- confirm against the data model.
  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
  manifest$modelSystemName <- "M3 MPNST"
  manifest$experimentalCondition <- parsed$GSE179587$treatment
  manifest
})

# Convert each filled template to a data frame and write one manifest CSV per
# accession to manifests/<accession>.csv (no row names).
filled <- lapply(filled, as.data.frame)
# write.csv() does not create missing directories, so make sure the output
# folder exists before writing.
if (!dir.exists("manifests")) {
  dir.create("manifests", recursive = TRUE)
}
for (m in names(filled)) {
  write.csv(filled[[m]], file.path("manifests", paste0(m, ".csv")), row.names = FALSE)
}
	# Convert GEO metadata to NF annotation templates
	# NOTE: This was run while the dataset was still under embargo.
	# If/when the data is public, use the GEOquery pkg to avoid XML parsing...

	library(xml2)
	library(glue)

	# Accession table read from template_GEO.csv. The loop below uses its `acc`
	# column (accession ID) and its `token` column (passed as the `token` query
	# parameter of the download URL).
	geo <- read.csv("template_GEO.csv")

	# Retrieve xml files: one "<acc>.xml" per row, saved in the working directory.
	# `xml_files` collects the local file names for the parsing step.
	# seq_len() is used so that an empty table downloads nothing (1:nrow(geo)
	# would iterate over c(1, 0)); the result vector is preallocated, not grown.
	xml_files <- character(nrow(geo))
	for (i in seq_len(nrow(geo))) {
	  acc <- geo$acc[i]
	  token <- geo$token[i]
	  destfile <- glue("{acc}.xml")
	  download.file(
	    glue("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={acc}&targ=gsm&form=xml&view=full&token={token}"),
	    destfile = destfile
	  )
	  xml_files[i] <- destfile
	}

	# Note that sample characteristic meta can vary between datasets
	#
	# Collect the distinct `tag` attribute values found on the
	# ./Channel/Characteristics descendants of the given sample node(s).
	#
	# @param s An xml2 node or nodeset; the XPath is evaluated relative to it.
	# @return Character vector of unique tag values, without NA entries.
	sampleCharAttrs <- function(s) {
	  characteristics <- xml_find_all(s, "./Channel/Characteristics")
	  # xml_attr() reads one named attribute per node. The second argument of
	  # xml_attrs() is a namespace map, not an attribute name, so it is not the
	  # right call for fetching `tag`.
	  tags <- xml_attr(characteristics, "tag")
	  # Nodes without a `tag` attribute come back as NA; drop them so they are
	  # not later treated as a characteristic name.
	  unique(tags[!is.na(tags)])
	}


	# Parse every downloaded XML file into one data frame per accession:
	# fixed metadata columns first, then one column per sample characteristic tag
	# found in that file. Results are stored in `parsed`, keyed by the file name
	# without its ".xml" extension.
	parsed <- list()
	for (xf in xml_files) {
	  # Strip namespaces so the XPath expressions below can use bare element names
	  x <- xml_ns_strip(read_xml(xf))
	  s <- xml_find_all(x, "//MINiML/Sample")
	  meta <- list(
	    sample_id_geo = xml_attr(xml_find_first(s, "."), "iid"),
	    sampleID = xml_text(xml_find_first(s, "./Title"), trim = TRUE),
	    species = xml_text(xml_find_first(s, "./Channel/Organism"), trim = TRUE),
	    sample_info_1 = xml_text(xml_find_first(s, "./Channel/Source"), trim = TRUE),
	    assay = xml_text(xml_find_first(s, "./Library-Strategy"), trim = TRUE),
	    platform = xml_text(xml_find_first(s, "./Instrument-Model/Predefined"), trim = TRUE),
	    sra = xml_attr(xml_find_first(s, "./Relation[@type='SRA']"), "target")
	  )
	  # Previously a length-filtered copy of `meta` was computed, immediately
	  # overwritten, and the resulting frame never used. Build the frame once and
	  # actually use it below.
	  meta_df <- data.frame(meta)
	  # Parse characteristics separately
	  sample_tags <- sampleCharAttrs(s)
	  cat("Sample characteristics available:", paste(sample_tags, collapse = ", "), "\n")
	  sample_chars <- lapply(sample_tags, function(tag) {
	    xml_text(xml_find_first(s, glue("./Channel/Characteristics[@tag='{tag}']")), trim = TRUE)
	  })
	  names(sample_chars) <- sample_tags
	  # data.frame() makes the tag names syntactic (e.g. spaces become dots)
	  sample_chars <- data.frame(sample_chars)
	  # Anchor and escape the extension: the bare pattern ".xml" is a regex that
	  # matches any character followed by "xml", anywhere in the name.
	  # When a file has no characteristics, keep the metadata alone instead of
	  # combining it with an empty frame.
	  parsed[[sub("\\.xml$", "", xf)]] <- if (ncol(sample_chars) > 0) {
	    cbind(meta_df, sample_chars)
	  } else {
	    meta_df
	  }
	}



	# Mapping has to be done somewhat manually.
	# Each lookup is a named character vector keyed by the value parsed from the
	# XML (Library-Strategy / Instrument-Model) and yielding the manifest value.
	assay_map <- c(
	  "ATAC-seq" = "ATACseq",
	  "ChIP-Seq" = "ChIPSeq",
	  "RNA-Seq" = "rnaSeq",
	  "Bisulfite-Seq" = "bisulfiteSeq",
	  "OTHER" = "CUT&RUN"
	)
	# consider changing HiSeq 2500 to Illumina HiSeq 2500 in data model
	platform_map <- c("Illumina HiSeq 2500" = "HiSeq 2500")
	assay_data_type <- c(
	  "ATAC-seq" = "chromatinActivity",
	  "ChIP-Seq" = "chromatinActivity",
	  "RNA-Seq" = "geneExpression",
	  "Bisulfite-Seq" = "chromatinActivity",
	  "OTHER" = "chromatinActivity"
	)

	# Template columns come from the CSV (all read as character); the constant
	# fields shared by every manifest are set here once.
	ext_template <- as.list(read.csv("GenomicsAssayTemplateExtended.csv", colClasses = "character"))
	ext_template[["Component"]] <- "GenomicsAssayTemplateExtended"
	ext_template[["resourceType"]] <- "experimentalData"
	ext_template[["dataSubtype"]] <- "raw"
	ext_template[["fileFormat"]] <- "fastq"

	# One copy of the template per parsed accession, keyed by accession name
	filled <- setNames(rep(list(ext_template), length(parsed)), names(parsed))

	# These attributes can be translated pretty consistently
	#
	# Fill the generic manifest fields of one accession from its parsed metadata.
	#
	# @param x Name of the accession to fill (element name in both lists).
	# @param filled Named list of manifest templates (one list per accession).
	# @param parsed Named list of parsed metadata (one data frame per accession).
	# @return `filled`, with element `x` updated; other elements are untouched.
	#
	# Reads the global lookups `assay_map` and `assay_data_type`.
	easy_fill <- function(x, filled, parsed) {
	  src <- parsed[[x]]
	  manifest <- filled[[x]]
	  manifest$assay <- assay_map[src$assay]
	  # Platform is copied as-is: platform_map is no longer applied because the
	  # data model is going to be changed instead.
	  manifest$platform <- src$platform
	  manifest$dataType <- assay_data_type[src$assay]
	  manifest$specimenID <- src$sampleID
	  manifest$species <- src$species
	  # not sure where to stick SRA link -- put in comments for now
	  manifest$comments <- src$sra
	  filled[[x]] <- manifest
	  filled
	}

	# GSE179699: generic fields first, then the dataset-specific values.
	filled <- easy_fill("GSE179699", filled, parsed)
	filled[["GSE179699"]] <- local({
	  manifest <- filled[["GSE179699"]]
	  # Per-sample flag: `suz12` characteristic equals "Knockout"
	  is_knockout <- parsed$GSE179699$suz12 == "Knockout"
	  manifest$modelSystemName <- "M3 MPNST"
	  manifest$tissue <- "tumor"
	  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
	  # Tumor"; this value omits "Nerve" -- confirm against the data model.
	  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
	  # Perturbation fields are only populated for knockout samples
	  manifest$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
	  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
	  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
	  manifest$experimentalCondition <- parsed$GSE179699$treatment
	  manifest
	})

	# GSE206527: generic fields first, then the dataset-specific values.
	filled <- easy_fill("GSE206527", filled, parsed)
	filled[["GSE206527"]] <- local({
	  manifest <- filled[["GSE206527"]]
	  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
	  # Tumor"; this value omits "Nerve" -- confirm against the data model.
	  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
	  # Shorten the parsed tissue label "MPNST tumor" to "tumor"
	  manifest$tissue <- gsub("MPNST tumor", "tumor", parsed$GSE206527$tissue)
	  manifest
	})

	# GSE179703: generic fields first, then the dataset-specific values.
	filled <- easy_fill("GSE179703", filled, parsed)
	filled[["GSE179703"]] <- local({
	  manifest <- filled[["GSE179703"]]
	  # Per-sample flag: `eed` characteristic equals "Knockout"
	  is_knockout <- parsed$GSE179703$eed == "Knockout"
	  manifest$tissue <- "tumor"
	  # Model system name is taken from the parsed `strain` characteristic
	  manifest$modelSystemName <- parsed$GSE179703$strain
	  # Perturbation fields are only populated for knockout samples
	  manifest$genePerturbed <- ifelse(is_knockout, "EED", "")
	  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
	  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
	  manifest
	})


	# GSE202555: generic fields first, then the dataset-specific values.
	filled <- easy_fill("GSE202555", filled, parsed)
	filled[["GSE202555"]] <- local({
	  manifest <- filled[["GSE202555"]]
	  # Per-sample flag: `suz12.status` characteristic equals "Knockout"
	  is_knockout <- parsed$GSE202555$suz12.status == "Knockout"
	  manifest$tissue <- "tumor"
	  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
	  # Tumor"; this value omits "Nerve" -- confirm against the data model.
	  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
	  # Hard-coded rather than read from parsed$GSE202555$cell.line
	  manifest$modelSystemName <- "M3 MPNST"
	  # Perturbation fields are only populated for knockout samples
	  manifest$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
	  manifest$genePerturbationType <- ifelse(is_knockout, "knockout", "")
	  manifest$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
	  manifest
	})

	# GSE179587: generic fields first, then the dataset-specific values.
	filled <- easy_fill("GSE179587", filled, parsed)
	filled[["GSE179587"]] <- local({
	  manifest <- filled[["GSE179587"]]
	  manifest$tissue <- "tumor"
	  # NOTE(review): MPNST usually expands to "Malignant Peripheral Nerve Sheath
	  # Tumor"; this value omits "Nerve" -- confirm against the data model.
	  manifest$tumorType <- "Malignant Peripheral Sheath Tumor"
	  manifest$modelSystemName <- "M3 MPNST"
	  manifest$experimentalCondition <- parsed$GSE179587$treatment
	  manifest
	})

	# Convert each filled template to a data frame and write one manifest CSV per
	# accession to manifests/<accession>.csv (no row names).
	filled <- lapply(filled, as.data.frame)
	# write.csv() does not create missing directories, so make sure the output
	# folder exists before writing.
	if (!dir.exists("manifests")) {
	  dir.create("manifests", recursive = TRUE)
	}
	for (m in names(filled)) {
	  write.csv(filled[[m]], file.path("manifests", paste0(m, ".csv")), row.names = FALSE)
	}