Skip to content

Instantly share code, notes, and snippets.

@vjrj
Last active November 25, 2019 10:39
Show Gist options
  • Save vjrj/2b077a2bb786227a517ca03f7e30cf4c to your computer and use it in GitHub Desktop.
Save vjrj/2b077a2bb786227a517ca03f7e30cf4c to your computer and use it in GitHub Desktop.
Verification of mappings in LA biocache-store after load
#!/bin/bash
#
# Verifies, for one or more biocache-store data resources, that the
# institutionCode / collectionCode found in the loaded occurrence data are
# known to the collectory lookup service.
#
# Adapt urls and put this in your /usr/local/bin (for instance) and call it with your biocache-store like:
# check-mappings-in-ocurrences.sh dr100 dr102
# check-mappings-in-ocurrences.sh dr100,dr102
#
# Optional environment overrides (the defaults are the original hard-coded values):
#   BIOCACHE_LOAD_DIR  directory with one sub-directory per data resource
#                      (default: /data/biocache-load)
#   COLLECTORY_URL     base url of the collectory (default: https://colecciones.gbif.es)
#   DEBUG              set to 1 to also print the detected column indexes and values
#
# Depends on jq (apt install jq) and https://pypi.org/project/xq/
# NOTE(review): the filters passed to xq below use jq syntax over XML; that is
# the xq shipped with the PyPI `yq` package - confirm which xq is installed.
#

#######################################
# Prints the "@index" attribute of the <field> whose "@term" equals a term.
# Arguments: $1 - path to meta.xml
#            $2 - archive section to search: "core" or "extension"
#            $3 - full term URI
# Outputs:   the index without surrounding quotes; nothing if not found
#######################################
meta_field_index() {
  local meta_file=$1 section=$2 term=$3
  local filter
  printf -v filter '.archive.%s.field[] | select(."@term"=="%s") | ."@index"' \
    "$section" "$term"
  xq "$filter" < "$meta_file" | sed 's/"//g'
}

#######################################
# Prints the sorted distinct values of one column of a data file, skipping
# the first (header) line.
# Fields are split on TAB only, so a value containing spaces stays in one
# piece (the "code with spaces" checks in check_resource rely on that).
# NOTE(review): assumes occurrence.txt is tab-delimited - confirm against the
# fieldsTerminatedBy attribute of your meta.xml.
# Arguments: $1 - data file
#            $2 - 1-based column number
#######################################
distinct_column_values() {
  local data_file=$1 column=$2
  awk -F'\t' -v col="$column" 'NR > 1 { print $col }' < "$data_file" \
    | LC_ALL=C sort -u
}

#######################################
# Prints "<label>: <values on one line>", truncated to 60 characters.
# Arguments: $1 - label
#            $2 - newline separated values
#######################################
debug_values() {
  local flat
  flat=$(printf '%s' "$2" | tr '\n' ' ')
  printf '%s: %s\n' "$1" "$flat" \
    | awk -v len=60 '{ if (length($0) > len) print substr($0, 1, len-3) "..."; else print; }'
}

#######################################
# Checks the institution/collection mapping of a single data resource and
# reports the result on stdout as an INFO / WARN / ERROR line.
# Globals:   BIOCACHE_LOAD_DIR, COLLECTORY_URL, DEBUG (all read, all optional)
# Arguments: $1 - data resource uid, e.g. dr100
# Returns:   0 always; problems are reported, not raised
#######################################
check_resource() {
  local dr=$1
  local base_dir=${BIOCACHE_LOAD_DIR:-/data/biocache-load}
  local collectory=${COLLECTORY_URL:-https://colecciones.gbif.es}
  local debug=${DEBUG:-0}
  local dir=$base_dir/$dr
  local meta=$dir/meta.xml
  local data=$dir/occurrence.txt
  local inst_term=http://rs.tdwg.org/dwc/terms/institutionCode
  local coll_term=http://rs.tdwg.org/dwc/terms/collectionCode

  if [[ ! -d $dir ]]; then
    echo "WARN: $dir does not exists. This was ingested in other machine?"
    return 0
  fi

  # Both files are read below; fail with a clear message instead of letting
  # xq/awk complain about a missing input.
  local required
  for required in "$meta" "$data"; do
    if [[ ! -f $required ]]; then
      echo "ERROR: $required not found"
      return 0
    fi
  done

  # Look in the core definition first and fall back to the extension.
  local inst_idx coll_idx
  inst_idx=$(meta_field_index "$meta" core "$inst_term")
  coll_idx=$(meta_field_index "$meta" core "$coll_term")
  if [[ -z $inst_idx ]]; then
    inst_idx=$(meta_field_index "$meta" extension "$inst_term")
  fi
  if [[ -z $coll_idx ]]; then
    coll_idx=$(meta_field_index "$meta" extension "$coll_term")
  fi
  if [[ -z $inst_idx ]]; then
    echo "ERROR: Cannot find the institution for this resource"
    return 0
  fi
  if [[ -z $coll_idx ]]; then
    echo "ERROR: Cannot find the collection for this resource"
    return 0
  fi

  # The index read from meta.xml is shifted by one to address awk fields,
  # which are numbered from 1.
  inst_idx=$((inst_idx + 1))
  coll_idx=$((coll_idx + 1))
  if [[ $debug == 1 ]]; then
    echo "institutionCode index: $inst_idx"
    echo "collectionCode index: $coll_idx"
  fi

  # Scan the data file once per column and derive every count from the result.
  local inst coll
  inst=$(distinct_column_values "$data" "$inst_idx")
  coll=$(distinct_column_values "$data" "$coll_idx")
  local inst_lines inst_words coll_lines coll_words
  inst_lines=$(wc -l <<<"$inst")
  inst_words=$(wc -w <<<"$inst")
  coll_lines=$(wc -l <<<"$coll")
  coll_words=$(wc -w <<<"$coll")
  if [[ $debug == 1 ]]; then
    debug_values "institutionCode" "$inst"
    debug_values "collectionCode" "$coll"
  fi

  # Numeric comparisons: (( )) compares integers, [[ a > b ]] compares strings.
  if ((inst_lines > 1)); then
    echo "ERROR: Multiple institutions in this resource"
    return 0
  fi
  if ((coll_lines > 1)); then
    echo "WARN: This resource has multiple collections codes. I'm a dummy simple script, it's this ok?"
    return 0
  fi
  # A single distinct value made of several words is a code with spaces.
  if ((inst_words > 1)); then
    echo "WARN: This resource a institution code with spaces. Is this ok?"
    return 0
  fi
  if ((coll_words > 1)); then
    echo "WARN: This resource a collection code with spaces. Is this ok?"
    return 0
  fi

  local url=${collectory%/}/lookup/inst/$inst/coll/$coll
  local lookup_error
  # The lookup answers with JSON; ".error" is null when the pair is known.
  lookup_error=$(curl -s "$url" | jq -r .error)
  if [[ $lookup_error != "null" ]]; then
    echo "ERROR: $lookup_error"
    echo "$url"
  else
    echo "INFO: $dr mappings, institution: $inst, collection: $coll are OK"
  fi
}

#######################################
# Entry point: checks every data resource named on the command line.
# Arguments: any number of data resource uids; each argument may itself be a
#            comma separated list (dr100,dr102).
# Returns:   0 (with no arguments only a usage hint is printed to stderr)
#######################################
main() {
  if (($# == 0)); then
    printf 'Usage: %s dr100 [dr102 ...] | dr100,dr102\n' "${0##*/}" >&2
    return 0
  fi
  local arg dr
  local -a uids
  for arg in "$@"; do
    IFS=', ' read -r -a uids <<<"$arg"
    for dr in "${uids[@]}"; do
      # "dr1,,dr2" yields an empty element; ignore it.
      [[ -n $dr ]] || continue
      check_resource "$dr"
    done
  done
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment