Skip to content

Instantly share code, notes, and snippets.

@anngvu
Last active July 3, 2025 21:25
Show Gist options
  • Save anngvu/fbf3b3ce737f69bb5f66f94c8b696d63 to your computer and use it in GitHub Desktop.
Save anngvu/fbf3b3ce737f69bb5f66f94c8b696d63 to your computer and use it in GitHub Desktop.
Example for GEO metadata XML parsing. XML meta can be retrieved via API.
# Convert GEO metadata to NF annotation templates
# NOTE: This was run while the dataset was still under embargo.
# If/when the data is public, use the GEOquery package to avoid XML parsing.
library(xml2)
library(glue)
# Table of datasets to fetch; the loop below reads its `acc` (GEO accession)
# and `token` (reviewer access token) columns.
geo <- read.csv("template_GEO.csv")
# Retrieve xml files: one "<acc>.xml" per row of `geo`, saved in the working
# directory. `xml_files` collects the local file names in row order.
xml_files <- character(nrow(geo))
# seq_len() keeps the loop from running at all when `geo` has no rows
# (1:nrow(geo) would iterate over 1 and 0).
for (i in seq_len(nrow(geo))) {
  acc <- geo$acc[i]
  token <- geo$token[i]
  destfile <- glue("{acc}.xml")
  url <- glue(
    "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi",
    "?acc={acc}&targ=gsm&form=xml&view=full&token={token}"
  )
  download.file(url, destfile = destfile)
  xml_files[i] <- destfile
}
# Note that sample characteristic meta can vary between datasets
#
# Return the distinct values of the `tag` attribute found on the
# ./Channel/Characteristics elements below the sample node(s) `s`.
# Elements without a `tag` attribute are ignored; the result is a character
# vector (length zero when no tagged characteristics exist).
sampleCharAttrs <- function(s) {
  # xml_attr() extracts one named attribute per node; the second positional
  # argument of xml_attrs() is the namespace map, not an attribute name.
  tags <- xml_attr(xml_find_all(s, "./Channel/Characteristics"), "tag")
  unique(tags[!is.na(tags)])
}
# Parse every downloaded MINiML file into a data frame with one row per
# <Sample> node. Results are stored in `parsed`, keyed by the file name with
# its trailing ".xml" removed.
parsed <- list()
for (xf in xml_files) {
  # Strip namespaces so the XPath queries below can use bare element names
  x <- xml_ns_strip(read_xml(xf))
  s <- xml_find_all(x, "//MINiML/Sample")
  if (length(s) == 0) {
    warning("No <Sample> nodes found in ", xf, call. = FALSE)
  }
  # Each extractor is applied to the whole node set `s`, so every field is a
  # vector with one entry per sample.
  meta <- list(
    sample_id_geo = xml_attr(xml_find_first(s, "."), "iid"),
    sampleID = xml_text(xml_find_first(s, "./Title"), trim = TRUE),
    species = xml_text(xml_find_first(s, "./Channel/Organism"), trim = TRUE),
    sample_info_1 = xml_text(xml_find_first(s, "./Channel/Source"), trim = TRUE),
    assay = xml_text(xml_find_first(s, "./Library-Strategy"), trim = TRUE),
    platform = xml_text(
      xml_find_first(s, "./Instrument-Model/Predefined"),
      trim = TRUE
    ),
    sra = xml_attr(xml_find_first(s, "./Relation[@type='SRA']"), "target")
  )
  meta_df <- data.frame(meta)
  # Parse characteristics separately
  sample_tags <- sampleCharAttrs(s)
  cat("Sample characteristics available:", paste(sample_tags, collapse = ", "), "\n")
  sample_chars <- lapply(sample_tags, function(tag) {
    xml_text(
      xml_find_first(s, glue("./Channel/Characteristics[@tag='{tag}']")),
      trim = TRUE
    )
  })
  names(sample_chars) <- sample_tags
  # data.frame() makes the tag names syntactic column names; the
  # dataset-specific code later in this script uses the converted names
  # (e.g. `suz12.status`).
  sample_chars <- data.frame(sample_chars)
  # Anchor and escape the extension so only a trailing ".xml" is removed
  parsed[[sub("\\.xml$", "", xf)]] <- cbind(meta_df, sample_chars)
}
# Mapping has to be done somewhat manually.
# Lookup vectors: names are the values parsed from GEO, elements are the
# values written to the manifest.
assay_map <- c(
  "ATAC-seq" = "ATACseq",
  "ChIP-Seq" = "ChIPSeq",
  "RNA-Seq" = "rnaSeq",
  "Bisulfite-Seq" = "bisulfiteSeq",
  "OTHER" = "CUT&RUN"
)
# consider changing HiSeq 2500 to Illumina HiSeq 2500 in data model
platform_map <- c("Illumina HiSeq 2500" = "HiSeq 2500")
assay_data_type <- c(
  "ATAC-seq" = "chromatinActivity",
  "ChIP-Seq" = "chromatinActivity",
  "RNA-Seq" = "geneExpression",
  "Bisulfite-Seq" = "chromatinActivity",
  "OTHER" = "chromatinActivity"
)
# Load the manifest template as a list of character columns, then set the
# fields that are constant across all datasets handled here.
ext_template <- modifyList(
  as.list(read.csv("GenomicsAssayTemplateExtended.csv", colClasses = "character")),
  list(
    Component = "GenomicsAssayTemplateExtended",
    resourceType = "experimentalData",
    dataSubtype = "raw",
    fileFormat = "fastq"
  )
)
# One copy of the template per parsed dataset, carrying the same names
filled <- setNames(rep(list(ext_template), length(parsed)), names(parsed))
# These attributes can be translated pretty consistently.
#
# Copy the fields that translate the same way for every dataset from
# `parsed[[x]]` into `filled[[x]]`.
#   x      - name of the dataset entry in both `filled` and `parsed`
#   filled - named list of per-dataset manifest templates
#   parsed - named list of per-dataset parsed GEO metadata
# Returns `filled` with entry `x` updated. Uses the script-level lookup
# vectors `assay_map` and `assay_data_type`.
easy_fill <- function(x, filled, parsed) {
  src <- parsed[[x]]
  entry <- filled[[x]]
  geo_assay <- src$assay

  entry$assay <- assay_map[geo_assay]
  # platform is copied as-is; the platform_map lookup is no longer needed
  # because the model will be changed anyway
  entry$platform <- src$platform
  entry$dataType <- assay_data_type[geo_assay]
  entry$specimenID <- src$sampleID
  entry$species <- src$species
  # not sure where to stick SRA link -- put in comments for now
  entry$comments <- src$sra

  filled[[x]] <- entry
  filled
}
# Dataset-specific fields. Each section first applies easy_fill() and then
# sets values by hand, either as constants or derived from that dataset's
# parsed sample characteristics.
mpnst_tumor_type <- "Malignant Peripheral Sheath Tumor"

# GSE179699
filled <- easy_fill("GSE179699", filled, parsed)
# TRUE for samples whose `suz12` characteristic is "Knockout"
gse179699_ko <- parsed$GSE179699$suz12 == "Knockout"
filled$GSE179699$modelSystemName <- "M3 MPNST"
filled$GSE179699$tissue <- "tumor"
filled$GSE179699$tumorType <- mpnst_tumor_type
filled$GSE179699$genePerturbed <- ifelse(gse179699_ko, "SUZ12", "")
filled$GSE179699$genePerturbationType <- ifelse(gse179699_ko, "knockout", "")
filled$GSE179699$genePerturbationTechnology <- ifelse(gse179699_ko, "CRISPR", "")
filled$GSE179699$experimentalCondition <- parsed$GSE179699$treatment

# GSE206527
filled <- easy_fill("GSE206527", filled, parsed)
filled$GSE206527$tumorType <- mpnst_tumor_type
filled$GSE206527$tissue <- gsub(
  pattern = "MPNST tumor",
  replacement = "tumor",
  x = parsed$GSE206527$tissue
)

# GSE179703
filled <- easy_fill("GSE179703", filled, parsed)
# TRUE for samples whose `eed` characteristic is "Knockout"
gse179703_ko <- parsed$GSE179703$eed == "Knockout"
filled$GSE179703$tissue <- "tumor"
filled$GSE179703$modelSystemName <- parsed$GSE179703$strain
filled$GSE179703$genePerturbed <- ifelse(gse179703_ko, "EED", "")
filled$GSE179703$genePerturbationType <- ifelse(gse179703_ko, "knockout", "")
filled$GSE179703$genePerturbationTechnology <- ifelse(gse179703_ko, "CRISPR", "")

# GSE202555
filled <- easy_fill("GSE202555", filled, parsed)
# TRUE for samples whose `suz12.status` characteristic is "Knockout"
gse202555_ko <- parsed$GSE202555$suz12.status == "Knockout"
filled$GSE202555$tissue <- "tumor"
filled$GSE202555$tumorType <- mpnst_tumor_type
filled$GSE202555$modelSystemName <- "M3 MPNST" # parsed$GSE202555$cell.line
filled$GSE202555$genePerturbed <- ifelse(gse202555_ko, "SUZ12", "")
filled$GSE202555$genePerturbationType <- ifelse(gse202555_ko, "knockout", "")
filled$GSE202555$genePerturbationTechnology <- ifelse(gse202555_ko, "CRISPR", "")

# GSE179587
filled <- easy_fill("GSE179587", filled, parsed)
filled$GSE179587$tissue <- "tumor"
filled$GSE179587$tumorType <- mpnst_tumor_type
filled$GSE179587$modelSystemName <- "M3 MPNST"
filled$GSE179587$experimentalCondition <- parsed$GSE179587$treatment
#
# Convert each filled template to a data frame and write one manifest CSV
# per dataset to manifests/<name>.csv.
filled <- lapply(filled, as.data.frame)
# write.csv() does not create missing directories, so make sure the output
# directory exists first (no warning if it is already there).
dir.create("manifests", showWarnings = FALSE, recursive = TRUE)
for (m in names(filled)) {
  write.csv(
    filled[[m]],
    file.path("manifests", paste0(m, ".csv")),
    row.names = FALSE
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment