Last active
November 25, 2019 10:39
-
-
Save vjrj/2b077a2bb786227a517ca03f7e30cf4c to your computer and use it in GitHub Desktop.
Verification of mappings in LA biocache-store after load
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Verify that the institutionCode / collectionCode pair used by the occurrences
# of one or more loaded data resources is known to the collectory lookup service.
#
# Adapt urls and put this in your /usr/local/bin (for instance) and call it with your biocache-store like:
#   check-mappings-in-ocurrences.sh dr100 dr102
#   check-mappings-in-ocurrences.sh dr100,dr102
#
# Optional environment overrides:
#   BIOCACHE_LOAD_DIR  directory with one sub-directory per data resource
#                      (default: /data/biocache-load)
#   COLLECTORY_URL     base URL of the collectory (default: https://colecciones.gbif.es)
#   DEBUG              set to 1 to print the resource, column indexes and codes found
#
# Depends on curl, jq (apt install jq) and https://pypi.org/project/xq/
#

BIOCACHE_LOAD_DIR=${BIOCACHE_LOAD_DIR:-/data/biocache-load}
COLLECTORY_URL=${COLLECTORY_URL:-https://colecciones.gbif.es}
DEBUG=${DEBUG:-0}

readonly INSTITUTION_TERM="http://rs.tdwg.org/dwc/terms/institutionCode"
readonly COLLECTION_TERM="http://rs.tdwg.org/dwc/terms/collectionCode"

#######################################
# Print "<label><value>" on a single line, shortened to 60 characters,
# but only when DEBUG is 1.
# Arguments: $1 - label, $2 - value (newlines are flattened to spaces)
#######################################
debug_log() {
  [[ $DEBUG == 1 ]] || return 0
  local line="$1$2"
  line=${line//$'\n'/ }
  printf '%s\n' "$line" \
    | awk -v len=60 '{ if (length($0) > len) print substr($0, 1, len-3) "..."; else print; }'
}

#######################################
# Look up the "@index" of a Darwin Core term in meta.xml, trying the core
# section first and the extension section second.
# Arguments: $1 - path to meta.xml, $2 - term URI
# Outputs:   the index exactly as written in meta.xml (first match only)
# Returns:   0 when a numeric index was found, 1 otherwise
#######################################
field_index() {
  local meta=$1 term=$2 section filter idx
  for section in core extension; do
    # "[]?" keeps jq quiet when the section has no field list to iterate.
    filter=".archive.${section}.field[]? | select(.\"@term\"==\"${term}\") | .\"@index\""
    idx=$(xq "$filter" < "$meta" | tr -d '"' | head -n 1)
    # Only accept a plain number: the caller feeds it to arithmetic.
    if [[ $idx =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$idx"
      return 0
    fi
  done
  return 1
}

#######################################
# List the distinct values of one column of a delimited text file,
# skipping the first (header) line.
# Arguments: $1 - file path, $2 - 1-based column number
# Outputs:   one distinct value per line, sorted
#######################################
distinct_values() {
  local file=$1 col=$2
  # NOTE(review): the field separator is assumed to be a tab. The published
  # copy of this script shows a blank separator, but the "code with spaces"
  # warnings are only reachable when a field may contain spaces. Confirm
  # against fieldsTerminatedBy in meta.xml.
  tail -n +2 < "$file" | awk -F'\t' -v col="$col" '{ print $col }' | sort -u
}

# Succeeds when $1 spans more than one line.
has_many_lines() {
  (( $(printf '%s\n' "$1" | wc -l) > 1 ))
}

# Succeeds when $1 holds more than one whitespace-separated word.
has_many_words() {
  (( $(printf '%s\n' "$1" | wc -w) > 1 ))
}

# Percent-encode $1 so it can be used as a single URL path segment.
url_encode() {
  jq -rn --arg v "$1" '$v | @uri'
}

#######################################
# Check one data resource and report the result on stdout.
# Globals:   BIOCACHE_LOAD_DIR, COLLECTORY_URL (read)
# Arguments: $1 - data resource uid (name of the sub-directory)
# Returns:   always 0; problems are reported as WARN/ERROR lines
#######################################
check_resource() {
  local dr=$1
  local dir="${BIOCACHE_LOAD_DIR}/${dr}"
  local meta="${dir}/meta.xml"
  local occurrences="${dir}/occurrence.txt"
  local inst_idx coll_idx inst coll url out

  debug_log "resource: " "$dr"

  if [[ ! -d $dir ]]; then
    echo "WARN: $dir does not exists. This was ingested in other machine?"
    return 0
  fi
  if [[ ! -f $meta ]]; then
    echo "ERROR: $meta does not exist"
    return 0
  fi
  if [[ ! -f $occurrences ]]; then
    echo "ERROR: $occurrences does not exist"
    return 0
  fi

  if ! inst_idx=$(field_index "$meta" "$INSTITUTION_TERM"); then
    echo "ERROR: Cannot find the institution for this resource"
    return 0
  fi
  if ! coll_idx=$(field_index "$meta" "$COLLECTION_TERM"); then
    echo "ERROR: Cannot find the collection for this resource"
    return 0
  fi

  # The index is incremented because awk columns start at 1; "10#" stops a
  # leading zero from being parsed as octal.
  inst_idx=$((10#$inst_idx + 1))
  coll_idx=$((10#$coll_idx + 1))
  debug_log "institutionCode index: " "$inst_idx"
  debug_log "collectionCode index: " "$coll_idx"

  # One pass over the (possibly large) occurrence file per column.
  inst=$(distinct_values "$occurrences" "$inst_idx")
  coll=$(distinct_values "$occurrences" "$coll_idx")
  debug_log "institutionCode: " "$inst"
  debug_log "collectionCode: " "$coll"

  if has_many_lines "$inst"; then
    echo "ERROR: Multiple institutions in this resource"
    return 0
  fi
  if has_many_lines "$coll"; then
    echo "WARN: This resource has multiple collections codes. I'm a dummy simple script, it's this ok?"
    return 0
  fi
  if has_many_words "$inst"; then
    echo "WARN: This resource a institution code with spaces. Is this ok?"
    return 0
  fi
  if has_many_words "$coll"; then
    echo "WARN: This resource a collection code with spaces. Is this ok?"
    return 0
  fi

  url="${COLLECTORY_URL}/lookup/inst/$(url_encode "$inst")/coll/$(url_encode "$coll")"
  out=$(curl -s "$url" | jq -r '.error')
  # An empty result means curl failed or the body was not JSON.
  [[ -n $out ]] || out="no valid JSON response from the collectory"

  if [[ $out != "null" ]]; then
    echo "ERROR: $out"
    echo "$url"
  else
    echo "INFO: $dr mappings, institution: $inst, collection: $coll are OK"
  fi
  return 0
}

#######################################
# Entry point: every argument may hold one uid or a comma-separated list.
# Arguments: data resource uids
# Returns:   2 when called without arguments, 0 otherwise
#######################################
main() {
  if (( $# == 0 )); then
    echo "Usage: ${0##*/} <dataResourceUid>[,<dataResourceUid>...] [...]" >&2
    return 2
  fi

  local arg dr
  local -a parts=() resources=()
  for arg in "$@"; do
    # Commas become blanks so that read splits the list into an array.
    read -r -a parts <<<"${arg//,/ }"
    resources+=("${parts[@]}")
  done

  for dr in "${resources[@]}"; do
    check_resource "$dr"
  done
  return 0
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment