Skip to content

Instantly share code, notes, and snippets.

@dwinston
Forked from jeffbaumes/denormalize.py
Created July 20, 2023 20:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dwinston/b16922ff304c59b3c8ba675b9069b61a to your computer and use it in GitHub Desktop.
Save dwinston/b16922ff304c59b3c8ba675b9069b61a to your computer and use it in GitHub Desktop.
from pymongo import MongoClient
# Open a MongoDB connection.  No URI, host, or port is passed, so pymongo's
# defaults apply (documented by pymongo as localhost:27017).
client = MongoClient()
# Opening stages of the biosample denormalization pipeline.  Each stage is
# given a name so the assembled list at the bottom reads as an outline.

# Parse the raw collection-date string and store the result next to it as
# collection_date.has_date_value.
_parse_collection_date = {
    "$set": {
        "collection_date.has_date_value": {
            "$dateFromString": {
                "dateString": "$collection_date.has_raw_value",
            },
        },
    },
}

# Join study_set documents whose "id" equals the local "part_of.0" path;
# matches are stored in the "study" array.
_join_study = {
    "$lookup": {
        "from": "study_set",
        "localField": "part_of.0",
        "foreignField": "id",
        "as": "study",
    },
}

# Join omics_processing_set documents whose "has_input.0" equals this
# document's "id"; matches are stored in the "omics_processing" array.
_join_omics_processing = {
    "$lookup": {
        "from": "omics_processing_set",
        "localField": "id",
        "foreignField": "has_input.0",
        "as": "omics_processing",
    },
}

# Build a sorted "multiomics" array from the joined omics types.  The set
# difference both de-duplicates the omics types and drops "Lipidomics".
_collect_multiomics = {
    "$set": {
        "multiomics": {
            "$sortArray": {
                "input": {
                    "$setDifference": [
                        "$omics_processing.omics_type.has_raw_value",
                        ["Lipidomics"],
                    ],
                },
                "sortBy": 1,
            },
        },
    },
}

aggregation = [
    _parse_collection_date,
    _join_study,
    _join_omics_processing,
    _collect_multiomics,
]
# Maps each activity kind to the type label stamped onto its data objects.
#
# The key doubles as the stem of the source collection name
# (f"{key}_set") and as the name of the temporary array the joined
# activities are stored under; the value is written to each joined data
# object's "activity_type" field.  Insertion order is significant: it fixes
# the order in which the per-kind arrays are later concatenated.
activity_types = {
    "mags_activity": "nmdc:MAGsAnalysisActivity",
    "metabolomics_analysis_activity": "nmdc:MetabolomicsAnalysisActivity",
    "metagenome_annotation_activity": "nmdc:MetagenomeAnnotation",
    "metagenome_assembly": "nmdc:MetagenomeAssembly",
    "metaproteomics_analysis_activity": "nmdc:MetaProteomicAnalysis",
    "metatranscriptome_activity": "nmdc:metaT",
    "nom_analysis_activity": "nmdc:NomAnalysisActivity",
}
# For every activity kind, append two $lookup stages:
#   1. join the activities in "<kind>_set" whose "was_informed_by" matches one
#      of this sample's omics_processing ids, stored under "<kind>";
#   2. join the data objects those activities list in "has_output", stored
#      under "<kind>_data_object".
# The loop body must be indented under the `for`; without it the file does not
# parse, and only the two stages below belong to the loop.
for activity_type, type_label in activity_types.items():
    aggregation.extend([
        {
            "$lookup": {
                "from": f"{activity_type}_set",
                "localField": "omics_processing.id",
                "foreignField": "was_informed_by",
                "as": activity_type,
            },
        },
        {
            "$lookup": {
                "from": "data_object_set",
                "localField": f"{activity_type}.has_output",
                "foreignField": "id",
                "as": f"{activity_type}_data_object",
                "pipeline": [
                    # Tag each joined data object with the type label of the
                    # activity kind it was joined through.
                    {"$set": {"activity_type": type_label}},
                ],
            },
        },
    ])
def _annotation_lookup(source, local_field, foreign_field, target):
    """Build a ``$lookup`` stage that joins a gene-function aggregation.

    Documents from ``source`` whose ``foreign_field`` matches ``local_field``
    are stored under ``target``.  The sub-pipeline copies ``gene_function_id``
    to ``id`` and the foreign key to ``activity_id``, then removes ``_id`` and
    the two original fields.
    """
    return {
        "$lookup": {
            "from": source,
            "localField": local_field,
            "foreignField": foreign_field,
            "as": target,
            "pipeline": [
                {
                    "$set": {
                        "id": "$gene_function_id",
                        "activity_id": "$" + foreign_field,
                    },
                },
                {"$unset": ["_id", foreign_field, "gene_function_id"]},
            ],
        },
    }


aggregation.extend([
    # Metagenome annotations, keyed by the joined annotation activities.
    _annotation_lookup(
        "functional_annotation_agg",
        "metagenome_annotation_activity.id",
        "metagenome_annotation_id",
        "metagenome_annotation",
    ),
    # Metaproteomics annotations, keyed by the joined analysis activities.
    _annotation_lookup(
        "metap_gene_function_aggregation",
        "metaproteomics_analysis_activity.id",
        "metaproteomic_analysis_id",
        "metaproteomics_annotation",
    ),
    # Merge both annotation arrays into a single gene_function array ...
    {
        "$set": {
            "gene_function": {
                "$concatArrays": [
                    "$metagenome_annotation",
                    "$metaproteomics_annotation",
                ],
            },
        },
    },
    # ... and drop the two temporaries.
    {"$unset": ["metagenome_annotation", "metaproteomics_annotation"]},
    # Concatenate every per-kind activity array into one "activity" array.
    {
        "$set": {
            "activity": {
                "$concatArrays": ["$" + kind for kind in activity_types],
            },
        },
    },
    # Strip the very large has_peptide_quantifications field from each
    # activity by setting it to $$REMOVE.
    {
        "$set": {
            "activity": {
                "$map": {
                    "input": "$activity",
                    "as": "d",
                    "in": {
                        "$setField": {
                            "field": "has_peptide_quantifications",
                            "value": "$$REMOVE",
                            "input": "$$d",
                        },
                    },
                },
            },
        },
    },
    # The per-kind arrays are redundant now that "activity" holds them all.
    {"$unset": list(activity_types)},
    # Record how many omics_processing records each sample has, so results
    # can be sorted by it.
    {"$set": {"omics_processing_count": {"$size": "$omics_processing"}}},
])
# Names of the per-kind data-object arrays ("<kind>_data_object"), in
# activity_types order.  Computed once and used both to merge and to drop them.
_data_object_fields = [f"{kind}_data_object" for kind in activity_types]

aggregation.extend([
    # Concatenate every per-kind data-object array into one "data_object" array.
    {
        "$set": {
            "data_object": {
                "$concatArrays": [f"${field}" for field in _data_object_fields],
            },
        },
    },
    # Drop the per-kind arrays now that they have been merged.
    {"$unset": _data_object_fields},
    # Write the pipeline's result to the "denormalized" collection.
    {"$out": "denormalized"},
])
# Run the assembled pipeline against the biosample_set collection of the nmdc
# database.  The pipeline ends with a $out stage, so its output is written to
# the "denormalized" collection.
q = client["nmdc"]["biosample_set"].aggregate(aggregation)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment