Last active
July 3, 2025 21:25
-
-
Save anngvu/fbf3b3ce737f69bb5f66f94c8b696d63 to your computer and use it in GitHub Desktop.
Example for GEO metadata XML parsing. XML meta can be retrieved via API.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert GEO metadata to NF annotation templates | |
# NOTE: This was run while the dataset was still under embargo.
# If/when the data is public, use the GEOquery package to avoid XML parsing.
library(xml2) | |
library(glue) | |
# Table of datasets to fetch; the loop below reads its `acc` and `token`
# columns.
geo <- read.csv("template_GEO.csv")

# Retrieve xml files
# One XML file is downloaded per row of `geo` and saved as "<acc>.xml" in the
# working directory. The result vector is preallocated instead of being grown
# with c(), and seq_len() keeps the loop from running when `geo` has no rows
# (1:nrow(geo) would iterate over c(1, 0) in that case).
xml_files <- character(nrow(geo))
for (i in seq_len(nrow(geo))) {
  acc <- geo$acc[i]
  token <- geo$token[i]
  destfile <- as.character(glue("{acc}.xml"))
  download.file(
    glue("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={acc}&targ=gsm&form=xml&view=full&token={token}"),
    destfile = destfile
  )
  xml_files[i] <- destfile
}
# Note that sample characteristic meta can vary between datasets
#
# Collect the distinct `tag` attribute values found on the
# ./Channel/Characteristics children of the given sample node(s).
#
#   s: an xml2 node or nodeset to search below.
#
# Returns a character vector of unique tag values; Characteristics elements
# without a `tag` attribute are ignored.
#
# xml_attr() is used rather than xml_attrs(): the latter takes a namespace map
# as its second argument and returns every attribute of each node, so values
# of unrelated attributes could end up in the tag list.
sampleCharAttrs <- function(s) {
  tags <- xml_attr(xml_find_all(s, "./Channel/Characteristics"), "tag")
  unique(tags[!is.na(tags)])
}
# Parse every downloaded file into one data frame per dataset: a fixed set of
# sample-level fields plus whatever Characteristics tags that file provides.
parsed <- list()
for (xf in xml_files) {
  # Strip namespaces so the XPath expressions below can use bare element names
  x <- xml_ns_strip(read_xml(xf))
  s <- xml_find_all(x, "//MINiML/Sample")
  # Each field is looked up relative to every Sample node in `s`
  meta <- list(
    sample_id_geo = xml_attr(xml_find_first(s, "."), "iid"),
    sampleID = xml_text(xml_find_first(s, "./Title"), trim = TRUE),
    species = xml_text(xml_find_first(s, "./Channel/Organism"), trim = TRUE),
    sample_info_1 = xml_text(xml_find_first(s, "./Channel/Source"), trim = TRUE),
    assay = xml_text(xml_find_first(s, "./Library-Strategy"), trim = TRUE),
    platform = xml_text(xml_find_first(s, "./Instrument-Model/Predefined"), trim = TRUE),
    sra = xml_attr(xml_find_first(s, "./Relation[@type='SRA']"), "target")
  )
  # Drop empty fields, then build the data frame from the filtered list
  meta_df <- data.frame(meta[lengths(meta) != 0])
  # Parse characteristics separately
  sample_tags <- sampleCharAttrs(s)
  cat("Sample characteristics available:", paste(sample_tags, collapse = ", "), "\n")
  sample_chars <- lapply(sample_tags, function(tag) {
    xml_text(
      xml_find_first(s, glue("./Channel/Characteristics[@tag='{tag}']")),
      trim = TRUE
    )
  })
  names(sample_chars) <- sample_tags
  # data.frame() turns the tag names into syntactic column names
  # (e.g. a space becomes a dot)
  sample_chars <- data.frame(sample_chars)
  # Key the result by the file name without its ".xml" suffix; the pattern is
  # escaped and anchored so only a literal trailing extension is removed
  parsed[[sub("\\.xml$", "", xf)]] <- cbind(meta_df, sample_chars)
}
# Mapping has to be done somewhat manually
# Each lookup vector is named by the value found in the GEO metadata and holds
# the value to write into the template.

# GEO assay value -> template `assay`
assay_map <- c(
  "ATAC-seq" = "ATACseq",
  "ChIP-Seq" = "ChIPSeq",
  "RNA-Seq" = "rnaSeq",
  "Bisulfite-Seq" = "bisulfiteSeq",
  "OTHER" = "CUT&RUN"
)

# GEO platform value -> template `platform`
# consider changing HiSeq 2500 to Illumina HiSeq 2500 in data model
platform_map <- c("Illumina HiSeq 2500" = "HiSeq 2500")

# GEO assay value -> template `dataType`
assay_data_type <- c(
  "ATAC-seq" = "chromatinActivity",
  "ChIP-Seq" = "chromatinActivity",
  "RNA-Seq" = "geneExpression",
  "Bisulfite-Seq" = "chromatinActivity",
  "OTHER" = "chromatinActivity"
)
# Read the extended genomics template (every column as character) into a list
# and set the values that are the same for every manifest.
ext_template <- modifyList(
  as.list(read.csv("GenomicsAssayTemplateExtended.csv", colClasses = "character")),
  list(
    Component = "GenomicsAssayTemplateExtended",
    resourceType = "experimentalData",
    dataSubtype = "raw",
    fileFormat = "fastq"
  )
)
# One copy of the pre-filled template per parsed dataset, under the same names
filled <- setNames(rep(list(ext_template), length(parsed)), names(parsed))
# These attributes can be translated pretty consistently
#
# Fill the template entry of one dataset from its parsed GEO metadata.
#
#   x:      name used to index both `filled` and `parsed`
#   filled: named list of template entries
#   parsed: named list of parsed metadata
#
# Returns `filled` with entry `x` updated. The lookup vectors `assay_map` and
# `assay_data_type` are read from the enclosing environment.
easy_fill <- function(x, filled, parsed) {
  geo_meta <- parsed[[x]]
  entry <- filled[[x]]
  geo_assay <- geo_meta$assay

  entry$assay <- assay_map[geo_assay]
  # platform is copied unmapped: platform_map is no longer applied because the
  # model will be changed anyway
  entry$platform <- geo_meta$platform
  entry$dataType <- assay_data_type[geo_assay]
  entry$specimenID <- geo_meta$sampleID
  entry$species <- geo_meta$species
  # not sure where to stick SRA link -- put in comments for now
  entry$comments <- geo_meta$sra

  filled[[x]] <- entry
  filled
}
# GSE179699
# Apply the generic fill, then set the dataset-specific fields.
filled <- easy_fill("GSE179699", filled, parsed)
filled$GSE179699 <- local({
  entry <- filled$GSE179699
  geo_meta <- parsed$GSE179699
  # Per-sample flag: is the parsed `suz12` value "Knockout"?
  is_knockout <- geo_meta$suz12 == "Knockout"

  entry$modelSystemName <- "M3 MPNST"
  entry$tissue <- "tumor"
  # NOTE(review): MPNST normally expands to "Malignant Peripheral Nerve Sheath
  # Tumor" -- confirm this value against the data model
  entry$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Perturbation fields are left empty for samples that are not knockouts
  entry$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
  entry$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  entry$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  entry$experimentalCondition <- geo_meta$treatment
  entry
})
# GSE206527
# Apply the generic fill, then set the dataset-specific fields.
filled <- easy_fill("GSE206527", filled, parsed)
filled$GSE206527 <- local({
  entry <- filled$GSE206527
  # NOTE(review): MPNST normally expands to "Malignant Peripheral Nerve Sheath
  # Tumor" -- confirm this value against the data model
  entry$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Shorten "MPNST tumor" to "tumor"; other tissue values pass through as-is
  entry$tissue <- gsub(
    pattern = "MPNST tumor",
    replacement = "tumor",
    x = parsed$GSE206527$tissue
  )
  entry
})
# GSE179703
# Apply the generic fill, then set the dataset-specific fields.
filled <- easy_fill("GSE179703", filled, parsed)
filled$GSE179703 <- local({
  entry <- filled$GSE179703
  geo_meta <- parsed$GSE179703
  # Per-sample flag: is the parsed `eed` value "Knockout"?
  is_knockout <- geo_meta$eed == "Knockout"

  entry$tissue <- "tumor"
  # The model system name comes from the parsed `strain` value
  entry$modelSystemName <- geo_meta$strain
  # Perturbation fields are left empty for samples that are not knockouts
  entry$genePerturbed <- ifelse(is_knockout, "EED", "")
  entry$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  entry$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  entry
})
# GSE202555
# Apply the generic fill, then set the dataset-specific fields.
filled <- easy_fill("GSE202555", filled, parsed)
filled$GSE202555 <- local({
  entry <- filled$GSE202555
  # Per-sample flag: is the parsed `suz12.status` value "Knockout"?
  is_knockout <- parsed$GSE202555$suz12.status == "Knockout"

  entry$tissue <- "tumor"
  # NOTE(review): MPNST normally expands to "Malignant Peripheral Nerve Sheath
  # Tumor" -- confirm this value against the data model
  entry$tumorType <- "Malignant Peripheral Sheath Tumor"
  # Hard-coded; the alternative source would be parsed$GSE202555$cell.line
  entry$modelSystemName <- "M3 MPNST"
  # Perturbation fields are left empty for samples that are not knockouts
  entry$genePerturbed <- ifelse(is_knockout, "SUZ12", "")
  entry$genePerturbationType <- ifelse(is_knockout, "knockout", "")
  entry$genePerturbationTechnology <- ifelse(is_knockout, "CRISPR", "")
  entry
})
# GSE179587
# Apply the generic fill, then set the dataset-specific fields.
filled <- easy_fill("GSE179587", filled, parsed)
filled$GSE179587 <- local({
  entry <- filled$GSE179587
  entry$tissue <- "tumor"
  # NOTE(review): MPNST normally expands to "Malignant Peripheral Nerve Sheath
  # Tumor" -- confirm this value against the data model
  entry$tumorType <- "Malignant Peripheral Sheath Tumor"
  entry$modelSystemName <- "M3 MPNST"
  # The experimental condition is taken from the parsed `treatment` value
  entry$experimentalCondition <- parsed$GSE179587$treatment
  entry
})
# Convert each filled template to a data frame and write one manifest per
# dataset to manifests/<name>.csv.
filled <- lapply(filled, as.data.frame)
# write.csv() does not create missing directories, so make sure the output
# directory exists (silently does nothing when it is already there)
dir.create("manifests", showWarnings = FALSE)
for (m in names(filled)) {
  write.csv(
    filled[[m]],
    file.path("manifests", paste0(m, ".csv")),
    row.names = FALSE
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment