#!/bin/bash
#
# This script generates stats using Global Biotic Interactions
# https://data.globalbioticinteractions.org/interactions.tsv.gz
# and unix tools like cat, gunzip, grep and wc.
#
# It expects interactions.tsv.gz in the current working directory and writes
# its intermediate .tsv files there as well.
#

# -x traces every command to stderr, -e aborts on the first failing command.
# pipefail is intentionally NOT enabled: this script pipes a decompressor into
# `head -n1` (the decompressor can then be ended by SIGPIPE) and computes
# counts with `grep ... | wc -l` (grep exits 1 when it selects no line); both
# would abort the script under pipefail.
set -xe

# note that the url below is aspirational and does not resolve (yet): the
# current interactions.tsv.gz can be found at https://globalbioticinteractions.org/data.
# curl "https://data.globalbioticinteractions.org/interactions.tsv.gz" > interactions.tsv.gz

# Extended-regex alternation of the bee family names used to select records.
BEES="(Andrenidae|Apidae|Colletidae|Halictidae|Megachilidae|Melittidae|Stenotritidae)"

# Extended-regex alternation of the names used to recognise plant records.
PLANTS="(Plantae|Viridiplantae)"

# calculate sha256 hash
#
# The file is redirected into sha256sum instead of being piped through cat:
# when interactions.tsv.gz is missing the redirection fails (and `set -e`
# stops the script) rather than silently recording the digest of empty input.
# Because sha256sum reads stdin, the captured value is the hex digest
# followed by the "  -" file marker that sha256sum prints for stdin.
SHA256_OF_INTERACTIONS_DATA=$(sha256sum < interactions.tsv.gz)

# calculate number of records that involve bee families and plants
#
# The header line is dropped (tail -n +2). Both grep filters run against the
# whole line, so a record is kept when some column matches $BEES and some
# column matches $PLANTS. Under `set -e` the script stops here if no record
# survives, because the last grep then exits with status 1.
gunzip -c < interactions.tsv.gz \
  | tail -n +2 \
  | grep -E "$BEES" \
  | grep -E "$PLANTS" \
  > interactions-bee-plants.tsv

# wc reads stdin, so only the line count (no file name) is captured.
NUMBER_OF_BEE_PLANT_INTERACTION_RECORDS=$(wc -l < interactions-bee-plants.tsv)

# calculate the number of distinct plant taxa that are associated with bees
#
# Columns 1-34 hold the source organism fields and columns 37-70 the target
# organism fields, so every bee-plant record is split into a source half and
# a target half with the same column layout.
cut -f1-34 < interactions-bee-plants.tsv > sources.tsv
cut -f37-70 < interactions-bee-plants.tsv > targets.tsv

# Keep the halves that match $PLANTS, reduce each to its first two columns
# (taxon id and taxon ids) and count the unique combinations.
NUMBER_OF_DISTINCT_BEE_ASSOCIATED_PLANT_TAXA=$(cat sources.tsv targets.tsv \
  | grep -E "$PLANTS" \
  | cut -f1,2 \
  | sort \
  | uniq \
  | wc -l)

# Extended-regex alternation of the interaction type identifiers that this
# script treats as parasitic (RO_* terms; presumably OBO Relations Ontology
# ids as found in the interactionTypeId column - confirm before editing).
PARASITIC_INTERACTION_TYPES="(RO_0002208|RO_0002209|RO_0002227|RO_0002228|RO_0002235|RO_0002236|RO_0002237|RO_0002444|RO_0002445|RO_0002453|RO_0002454|RO_0002556|RO_0002557|RO_0002632|RO_0002633|RO_0002634|RO_0002635|RO_0002636|RO_0002637|RO_0002638|RO_0002639|RO_0002640|RO_0002641|RO_0008503|RO_0008504)"

# select the records that match a bee family and a parasitic interaction type
#
# The header line is dropped; both filters are applied to the whole line.
gunzip -c < interactions.tsv.gz \
  | tail -n +2 \
  | grep -E "$BEES" \
  | grep -E "$PARASITIC_INTERACTION_TYPES" \
  > interactions-bee-parasitic.tsv

# split every record into its source half (columns 1-34) and its target half
# (columns 37-70)
cut -f1-34 < interactions-bee-parasitic.tsv > sources-parasitic.tsv
cut -f37-70 < interactions-bee-parasitic.tsv > targets-parasitic.tsv

# Drop the halves that match a bee family; what remains are the non-bee
# participants. The two files are concatenated with cat (rather than passed
# to grep as two file arguments) so that grep does not prefix each line with
# a file name.
cat sources-parasitic.tsv targets-parasitic.tsv \
  | grep -v -E "$BEES" \
  > parasitic-bee-associates.tsv

# count the parasitic bee associates whose line does not / does mention
# "Arthropoda"
#
# `grep ... | wc -l` is used on purpose instead of `grep -c`: grep exits with
# status 1 when it selects no line, which would abort the script under
# `set -e` for a perfectly valid count of 0. With wc as the last pipeline
# stage the count is captured and the assignment succeeds.
NUMBER_OF_NON_ARTHROPOD_BEE_INTERACTION_RECORDS=$(grep -v "Arthropoda" < parasitic-bee-associates.tsv \
  | wc -l)

NUMBER_OF_ARTHROPOD_BEE_INTERACTION_RECORDS=$(grep "Arthropoda" < parasitic-bee-associates.tsv \
  | wc -l)

# print header fields and associated number
#
# The first line of the data file is split on tabs into one field name per
# line and numbered with `cat -n`. `gunzip -c` reading stdin is used instead
# of `zcat FILE` so that the same decompressor is used as in the rest of the
# script; some zcat implementations (e.g. on macOS) expect a .Z suffix.
gunzip -c < interactions.tsv.gz \
  | head -n 1 \
  | tr '\t' '\n' \
  | cat -n
# 1 sourceTaxonId
# 2 sourceTaxonIds
# 3 sourceTaxonName
# 4 sourceTaxonRank
# 5 sourceTaxonPathNames
# 6 sourceTaxonPathIds
# 7 sourceTaxonPathRankNames
# 8 sourceTaxonSpeciesName
# 9 sourceTaxonSpeciesId
# 10 sourceTaxonGenusName
# 11 sourceTaxonGenusId
# 12 sourceTaxonFamilyName
# 13 sourceTaxonFamilyId
# 14 sourceTaxonOrderName
# 15 sourceTaxonOrderId
# 16 sourceTaxonClassName
# 17 sourceTaxonClassId
# 18 sourceTaxonPhylumName
# 19 sourceTaxonPhylumId
# 20 sourceTaxonKingdomName
# 21 sourceTaxonKingdomId
# 22 sourceId
# 23 sourceOccurrenceId
# 24 sourceCatalogNumber
# 25 sourceBasisOfRecordId
# 26 sourceBasisOfRecordName
# 27 sourceLifeStageId
# 28 sourceLifeStageName
# 29 sourceBodyPartId
# 30 sourceBodyPartName
# 31 sourcePhysiologicalStateId
# 32 sourcePhysiologicalStateName
# 33 sourceSexId
# 34 sourceSexName
# 35 interactionTypeName
# 36 interactionTypeId
# 37 targetTaxonId
# 38 targetTaxonIds
# 39 targetTaxonName
# 40 targetTaxonRank
# 41 targetTaxonPathNames
# 42 targetTaxonPathIds
# 43 targetTaxonPathRankNames
# 44 targetTaxonSpeciesName
# 45 targetTaxonSpeciesId
# 46 targetTaxonGenusName
# 47 targetTaxonGenusId
# 48 targetTaxonFamilyName
# 49 targetTaxonFamilyId
# 50 targetTaxonOrderName
# 51 targetTaxonOrderId
# 52 targetTaxonClassName
# 53 targetTaxonClassId
# 54 targetTaxonPhylumName
# 55 targetTaxonPhylumId
# 56 targetTaxonKingdomName
# 57 targetTaxonKingdomId
# 58 targetId
# 59 targetOccurrenceId
# 60 targetCatalogNumber
# 61 targetBasisOfRecordId
# 62 targetBasisOfRecordName
# 63 targetLifeStageId
# 64 targetLifeStageName
# 65 targetBodyPartId
# 66 targetBodyPartName
# 67 targetPhysiologicalStateId
# 68 targetPhysiologicalStateName
# 69 targetSexId
# 70 targetSexName
# 71 decimalLatitude
# 72 decimalLongitude
# 73 localityId
# 74 localityName
# 75 eventDateUnixEpoch
# 76 argumentTypeId
# 77 referenceCitation
# 78 referenceDoi
# 79 referenceUrl
# 80 sourceCitation
# 81 sourceNamespace
# 82 sourceArchiveURI
# 83 sourceDOI
# 84 sourceLastSeenAtUnixEpoch
#