Created
May 8, 2015 10:39
-
-
Save ianfieldhouse/20e32f7262108f457803 to your computer and use it in GitHub Desktop.
check geonetwork csw harvest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Checks a GeoNetwork CSW harvest: pages through GetRecords responses and
# counts the unique dc:identifier lines that come back.

SCRIPT_VERSION=1.0.5

# Service URL and CSW query parameters.
HOST='http://scotsdi.edina.ac.uk'
ENDPOINT='/geonetwork/srv/eng/csw'
SERVICE='CSW'
REQUEST='GetRecords'
CONSTRAINT_LANGUAGE='CQL_TEXT'
TYPE_NAMES='csw%3ARecord' # "csw:Record", already percent-encoded
RESULT_TYPE='results'
VERSION='2.0.2'
ESN='brief'

# Default page size; it is also the step between successive start positions.
OFFSET=20

# Query string shared by every request. Built in pieces so that no line
# continuation has to live inside the quoted string.
BASE_URL="${HOST}${ENDPOINT}?service=${SERVICE}&request=${REQUEST}"
BASE_URL+="&constraintLanguage=${CONSTRAINT_LANGUAGE}&typeNames=${TYPE_NAMES}"
BASE_URL+="&resultType=${RESULT_TYPE}&version=${VERSION}"
#######################################
# Print a short usage summary and abort.
# Arguments: none (reads $0 for the script name)
# Outputs:   usage text on stderr
# Returns:   never returns; exits the script with status 1
#######################################
usage() {
  # %s keeps any backslashes in $0 literal, unlike `echo -e`.
  printf 'Usage: %s [-o <int|default=20>]\ne.g.:\n%s\n%s -o 50\n' \
    "$0" "$0" "$0" >&2
  exit 1
}
#######################################
# Parse command-line options.
#   -o <int>  page size / step; must be a positive decimal integer
#   -v        print the script version and exit 0
# Any other option, or -o without a value, prints usage and exits.
# Globals:   OFFSET (written), SCRIPT_VERSION (read), OPTIND (written)
# Arguments: the script's command-line arguments
#######################################
parse_args() {
  local opt
  OPTIND=1
  while getopts ':o:v' opt; do
    case "$opt" in
      o)
        # A zero offset would never advance the paging loop, and a leading
        # zero would be read as octal by shell arithmetic, so reject both.
        if [[ ! "$OPTARG" =~ ^[1-9][0-9]*$ ]]; then
          printf 'Invalid offset: %s (expected a positive integer)\n' \
            "$OPTARG" >&2
          usage
        fi
        OFFSET=$OPTARG
        ;;
      v)
        echo "version $SCRIPT_VERSION"
        exit 0
        ;;
      *)
        usage
        ;;
    esac
  done
}

parse_args "$@"
shift $((OPTIND - 1))
echo "$SERVICE harvest from $HOST$ENDPOINT with offset of $OFFSET"

# Temporary file to hold identifiers. mktemp gives an unpredictable name, and
# the EXIT trap removes the file on every exit path, including early failures.
FILE=$(mktemp "${TMPDIR:-/tmp}/identifiers.XXXXXX") || {
  echo 'Unable to create temporary file' >&2
  exit 1
}
trap 'rm -f -- "$FILE"' EXIT

# Determine total number of records to harvest. The URL is quoted so that its
# "?" and "&" characters are never subject to globbing or word splitting.
NUM_RECORDS=$(curl -s "$BASE_URL" \
  | sed -n 's/.*numberOfRecordsMatched="\([0-9]*\)".*/\1/p' \
  | head -n 1)

# Without a numeric total the paging loop below cannot run, so stop here.
if [[ ! "$NUM_RECORDS" =~ ^[0-9]+$ ]]; then
  echo "Unable to determine number of records from $HOST$ENDPOINT" >&2
  exit 1
fi
NUM_RECORDS=$((10#$NUM_RECORDS)) # force base 10 in case of leading zeros

printf '\nNumber of records to harvest: %s\n\n' "$NUM_RECORDS"

# Save record identifiers to the temporary file, one page per request.
RECORDS_LEFT=$NUM_RECORDS
for ((i = 1; i <= NUM_RECORDS; i += OFFSET)); do
  # The final page may hold fewer than OFFSET records.
  if ((RECORDS_LEFT > OFFSET)); then
    NEXT_RECORD_BATCH=$OFFSET
  else
    NEXT_RECORD_BATCH=$RECORDS_LEFT
  fi
  echo "Harvesting $NEXT_RECORD_BATCH records from $i"

  CURL_URL="$BASE_URL&esn=$ESN&maxrecords=$OFFSET&startposition=$i"
  curl -s "$CURL_URL" | grep dc:identifier >>"$FILE"
  # PIPESTATUS[0] is curl's own status; keep going, but say a page was lost.
  if ((PIPESTATUS[0] != 0)); then
    echo "Warning: request failed for records starting at $i" >&2
  fi

  RECORDS_LEFT=$((RECORDS_LEFT - OFFSET))
done

# Inform user of number of unique identifiers. The arithmetic expansion
# strips any padding that some wc implementations put around the count.
unique_count=$(sort -u -- "$FILE" | wc -l)
printf '\nNumber of unique identifiers: %s\n' "$((unique_count))"

# The temporary file is removed by the EXIT trap installed above.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment