Last active
April 5, 2019 20:29
-
-
Save jhpoelen/9cac3897fc930b17fa6dcd0cacfc5ec2 to your computer and use it in GitHub Desktop.
IVMOOC 2019 GloBI Kingdom To Kingdom Interactions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Derives kingdom-to-kingdom interaction records from two tab-separated GloBI snapshot files.
// Meant to be fed to spark-shell on stdin, which supplies the `spark` session.
//
// Inputs  (working directory): taxonCache.tsv.bz2, interactions.tsv.bz2
// Outputs (directories of part files): taxaIdToKingdom.tsv, idIteractionsWithNamespace.tsv,
//                                      kingdom2kingdom.tsv, kingdom2kingdom.distinct.tsv
//
// Multi-line expressions end each line with `.` or sit inside braces so that the
// REPL keeps reading instead of evaluating a half-finished statement.
import spark.implicits._

// Only taxon ids whose prefix (text before the first ':') is in this set are kept.
val supportedIdPrefixes = Set("GBIF", "ITIS", "WORMS", "INAT_TAXON")

// Text before the first ':' of an id. Unlike `split(":").head`, this cannot throw
// on an id consisting solely of ':' characters (Java split yields an empty array there).
def idPrefix(id: String): String = id.takeWhile(_ != ':')

// True when the id is a resolved taxon ("no:match" marks an unresolved one) from a supported provider.
def isUsableId(id: String): Boolean = id != "no:match" && supportedIdPrefixes.contains(idPrefix(id))

// Splits a '|'-separated field into trimmed entries.
def splitPipes(value: String): Array[String] = value.split("\\|").map(_.trim)

// Pairs each entry of `pathNames` with the entry of `path` at the same position and
// concatenates the `path` entries whose `pathNames` entry is exactly "kingdom".
// Returns "" when no entry is labelled "kingdom".
def kingdomOf(pathNames: String, path: String): String = {
  splitPipes(pathNames).zip(splitPipes(path)).collect { case ("kingdom", name) => name }.mkString
}

// Reads a tab-separated file. A fresh reader is built per call because
// DataFrameReader.option mutates the reader it is called on.
def readTsv(path: String, hasHeader: Boolean): org.apache.spark.sql.DataFrame = {
  spark.read.option("delimiter", """\t""").option("header", hasHeader.toString).csv(path)
}

// Writes a dataset as tab-separated part files under `path` (fails if `path` already exists).
def writeTsv[T](ds: org.apache.spark.sql.Dataset[T], path: String): Unit = {
  ds.write.option("delimiter", """\t""").csv(path)
}

// Step 1: taxon id -> kingdom name.
val taxa = readTsv("taxonCache.tsv.bz2", hasHeader = true)
taxa.printSchema
// Same file as `taxa`; the name is kept for later statements without opening the file again.
val taxonCache = taxa
val taxonIdsPaths = taxonCache.select("id", "pathNames", "path").as[(String, String, String)].
  filter(r => r._1 != null && r._2 != null && r._3 != null)
val taxaIdToKingdom = taxonIdsPaths.
  map(r => (r._1, kingdomOf(r._2, r._3))).
  filter(r => r._2.nonEmpty && r._2 != "incertae sedis" && supportedIdPrefixes.contains(idPrefix(r._1)))
writeTsv(taxaIdToKingdom, "taxaIdToKingdom.tsv")

// Step 2: one row per (source id, interaction type, target id, namespace) combination,
// expanding the '|'-separated id lists and dropping ids that are not usable.
val interactions = readTsv("interactions.tsv.bz2", hasHeader = true)
val interactionIds = interactions.select("sourceTaxonIds", "interactionTypeName", "targetTaxonIds", "sourceNamespace")
val flatIdsDistinctByDataset = interactionIds.as[(String, String, String, String)].
  filter(r => r._1 != null && r._3 != null).
  flatMap { r =>
    val sourceIds = splitPipes(r._1).filter(isUsableId).toSeq
    val targetIds = splitPipes(r._3).filter(isUsableId).toSeq
    for (sourceId <- sourceIds; targetId <- targetIds) yield (sourceId, r._2, targetId, r._4)
  }
writeTsv(flatIdsDistinctByDataset, "idIteractionsWithNamespace.tsv")

// Step 3: replace both ids with their kingdom, reading back the files written above.
val interactionsIds = readTsv("idIteractionsWithNamespace.tsv", hasHeader = false)
val taxonIdKingdom = readTsv("taxaIdToKingdom.tsv", hasHeader = false)
val idLookup = taxonIdKingdom.as[(String, String)].distinct.collect.toMap
// Broadcast the map so it is shipped to each executor once rather than serialized into every task.
val idLookupBroadcast = spark.sparkContext.broadcast(idLookup)
// Ids missing from the lookup become None, which the CSV writer emits as an empty field.
val mappedInteractions = interactionsIds.as[(String, String, String, String)].
  map(r => (idLookupBroadcast.value.get(r._1), r._2, idLookupBroadcast.value.get(r._3), r._4))
writeTsv(mappedInteractions, "kingdom2kingdom.tsv")
writeTsv(mappedInteractions.distinct, "kingdom2kingdom.distinct.tsv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Downloads the GloBI interaction and taxon-cache snapshots, recompresses them as
# bzip2, runs calculateKingdomToKingdomInteractions.scala through spark-shell, and
# concatenates the distinct kingdom-to-kingdom part files into a single text file.

# Abort on the first failing command, on unset variables, and when any stage of a
# pipeline fails, so a broken download never reaches the Spark job.
set -euo pipefail

# --fail makes curl exit non-zero on an HTTP error instead of piping the error page to gunzip.
curl --fail -L "https://depot.globalbioticinteractions.org/snapshot/target/data/tsv/interactions.tsv.gz" | gunzip | bzip2 > interactions.tsv.bz2
curl --fail -L "https://depot.globalbioticinteractions.org/snapshot/target/data/taxa/taxonCache.tsv.gz" | gunzip | bzip2 > taxonCache.tsv.bz2

# Feed the script on stdin; a missing script file now fails before spark-shell starts.
spark-shell --conf spark.sql.caseSensitive=true --driver-memory 4G --executor-memory 6G < calculateKingdomToKingdomInteractions.scala

# Concatenate only the data part files, skipping Spark's _SUCCESS marker.
cat kingdom2kingdom.distinct.tsv/part-* > kingdom2kingdom.tsv.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment