Jorrit Poelen jhpoelen

## wikidata-taxa.sh
#!/bin/bash
#
# Streams Wikidata taxon items (more precisely: every dump line that mentions
# https://www.wikidata.org/wiki/Q16521) from the latest data dump to stdout
# as line JSON (one JSON object per line).
#
# Requirements: curl, bunzip2, grep, sed
#

# Report a failure of any pipeline stage (e.g. a failed download), not only
# the exit status of the final sed.
set -o pipefail

# --fail keeps an HTTP error page from being piped into bunzip2;
# --show-error still prints the reason although --silent hides progress.
curl --silent --show-error --fail \
  "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2" \
  | bunzip2 \
  | grep -E 'Q16521[^0-9]' \
  | sed 's/,$//' # drop the trailing comma that ends a dump line

## ls-openaccess-doi.sh
# Prints the lines of `preston ls` (for the anchor and remotes below) that
# contain "hasVersion" and do not contain "csv".
#
# Requirements: preston - https://github.com/bio-guoda/preston

# Content hash handed to `preston ls --anchor`.
OPEN_ALEX_VERSION='hash://sha256/f19011fe72234cb22f4326fcc95752647a11628cdaa0d24f7d55033b06cb5653'

# Comma-separated remotes handed to `preston ls --remote`.
# No trailing backslash here: a line continuation after the assignment would
# glue the next non-empty line onto it.
REMOTES='https://biokic6.rc.asu.edu/preston/openalex,https://linker.bio'

preston ls \
  --anchor "${OPEN_ALEX_VERSION}" \
  --remote "${REMOTES}" \
  | grep hasVersion \
  | grep -v csv

## find-openalex-bumbus-pubs.sh
#!/bin/bash
#
# Find publications (aka "works") that mention "Bombus" (bumblebees) in a versioned copy of OpenAlex.
#
# Requirements:
#   preston - https://github.com/bio-guoda/preston
#   grep (comes with linux distro)
#   gunzip (comes with linux distro)
#   jq - https://jqlang.github.io/jq/
#   mlr - https://miller.readthedocs.io/

## edge-lists-by-repository.sh
#!/bin/bash
#
# Downloads datasets.tsv, turns the first field of every line into the URL of
# that entry's indexed-interactions.tsv review file, downloads each of those
# URLs, and runs DiscretePowerLawfitter.sh once per downloaded line (the words
# of the line become its arguments). All output is collected in
# angel-review-of-globi-datasets.tsv.
#
curl "https://depot.globalbioticinteractions.org/snapshot/target/data/tsv/datasets.tsv" |
  awk '{ printf "https://depot.globalbioticinteractions.org/reviews/%s/indexed-interactions.tsv\n", $1 }' |
  xargs -L1 curl |
  xargs -L1 DiscretePowerLawfitter.sh \
    > angel-review-of-globi-datasets.tsv


## gist:92e1cf7c647cd495e6d3d3b20d71f2de
#!/bin/bash
#
# related to https://github.com/Big-Bee-Network/UCSB-IZC00012194
#
# Takes the "hasVersion" lines of `preston ls`, runs `preston grep` over them
# for "UCSB-IZC00012194" followed by "body size", and prints the first
# tab-separated field of every resulting log line that contains "value".
#

preston ls |
  grep hasVersion |
  preston grep 'UCSB-IZC00012194.*body\ssize' --log tsv |
  grep value |
  cut -f1

## streaming-query.sh
#!/bin/bash
#
# prerequisites
# * preston https://github.com/bio-guoda/preston
# * pv pipeviewer https://linux.die.net/man/1/pv
# * mlr https://miller.readthedocs.io/en/6.7.0/
#
# executed/tested on 22.04.1-Ubuntu
#

## compile-data.sh
#!/bin/bash
#
# 2020-10-15
#
# This script is a way to select pollination and flower-visit records from
# one of the data products provided via https://globalbioticinteractions.org/data .
#
# This particular example uses a July 2020 data publication.
#
# For more recent data, see https://globalbioticinteractions.org/data .

## README.md

      
              3 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                jhpoelen
                / README.md
            
            
              Last active
              September 25, 2020 18:43
            
          
    Script used for counting records / taxa
The attached big-bee-globi-stats.log was generated on 2020-09-25 using the latest interactions.tsv.gz, which has the following checksum:
$ sha256sum interactions.tsv.gz 
436f1249dc71bc948483bac0d6f13c667e9d69456ef727037637516468e9d29d

To reproduce:

  
## count_geese.R

# Registers the 2018 eBird data archive (dwca-1.0.zip) with contentid and
# returns whatever contentid::register() yields for that location.
#
# The original eBird location,
#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
# unfortunately no longer works, so the 2018 copy that was republished via
# Zenodo is registered instead.
prepare_ebird_2018_id <- function() {
  ebird_data_location <- "https://zenodo.org/record/3858251/files/dwca-1.0.zip"

  ebird_data_id <- contentid::register(ebird_data_location)
  ebird_data_id
}

## create-checklist-cluster.sh
#!/bin/bash
#
#

WKT_STRING="POLYGON ((-72.77293810620904 -33.196074154826235, -72.77293810620904 6.59516197881252, -28.12450060620904 6.59516197881252, -28.12450060620904 -33.196074154826235, -72.77293810620904 -33.196074154826235))"

spark-submit \
  --master mesos://zk://mesos01:2181,mesos02:2181,mesos03:2181/mesos \
  --driver-memory 4G \
  --conf spark.sql.caseSensitive=true \
	#!/bin/bash
	#
	# Streams Wikidata taxon items (more precisely: every dump line that mentions
	# https://www.wikidata.org/wiki/Q16521) from the latest data dump to stdout
	# as line JSON (one JSON object per line).
	#
	# Requirements: curl, bunzip2, grep, sed
	#

	# Report a failure of any pipeline stage (e.g. a failed download), not only
	# the exit status of the final sed.
	set -o pipefail

	# The stages are joined with real pipes; an escaped "\|" would be handed to
	# curl as a literal argument instead of connecting the commands.
	# --fail keeps an HTTP error page from being piped into bunzip2;
	# --show-error still prints the reason although --silent hides progress.
	curl --silent --show-error --fail \
	  "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2" \
	  | bunzip2 \
	  | grep -E 'Q16521[^0-9]' \
	  | sed 's/,$//' # drop the trailing comma that ends a dump line
	# Prints the lines of `preston ls` (for the anchor and remotes below) that
	# contain "hasVersion" and do not contain "csv".
	#
	# Requirements: preston - https://github.com/bio-guoda/preston

	# Content hash handed to `preston ls --anchor`.
	OPEN_ALEX_VERSION='hash://sha256/f19011fe72234cb22f4326fcc95752647a11628cdaa0d24f7d55033b06cb5653'

	# Comma-separated remotes handed to `preston ls --remote`.
	# No trailing backslash here: a line continuation after the assignment would
	# glue the next non-empty line onto it.
	REMOTES='https://biokic6.rc.asu.edu/preston/openalex,https://linker.bio'

	# The stages are joined with real pipes; an escaped "\|" would be passed to
	# preston as a literal argument.
	preston ls \
	  --anchor "${OPEN_ALEX_VERSION}" \
	  --remote "${REMOTES}" \
	  | grep hasVersion \
	  | grep -v csv
	#!/bin/bash
	#
	# Find publications (aka "works") that mention "Bombus" (bumblebees) in a versioned copy of OpenAlex.
	#
	# Requirements:
	# preston - https://github.com/bio-guoda/preston
	# grep (comes with linux distro)
	# gunzip (comes with linux distro)
	# jq - https://jqlang.github.io/jq/
	# mlr - https://miller.readthedocs.io/
	#!/bin/bash
	#
	# Downloads datasets.tsv, turns the first field of every line into the URL of
	# that entry's indexed-interactions.tsv review file, downloads each of those
	# URLs, and runs DiscretePowerLawfitter.sh once per downloaded line (the words
	# of the line become its arguments). All output is collected in
	# angel-review-of-globi-datasets.tsv.
	#
	# The stages are joined with real pipes; an escaped "\|" would be passed to
	# the first curl as a literal argument.
	curl "https://depot.globalbioticinteractions.org/snapshot/target/data/tsv/datasets.tsv" \
	  | awk '{ print "https://depot.globalbioticinteractions.org/reviews/" $1 "/indexed-interactions.tsv" }' \
	  | xargs -L1 curl \
	  | xargs -L1 DiscretePowerLawfitter.sh \
	  > angel-review-of-globi-datasets.tsv
	#!/bin/bash
	#
	# related to https://github.com/Big-Bee-Network/UCSB-IZC00012194
	#
	# Takes the "hasVersion" lines of `preston ls`, runs `preston grep` over them
	# for "UCSB-IZC00012194" followed by "body size", and prints the first
	# tab-separated field of every resulting log line that contains "value".
	#

	# The stages are joined with real pipes; an escaped "\|" would be passed to
	# `preston ls` as a literal argument.
	preston ls \
	  | grep hasVersion \
	  | preston grep 'UCSB-IZC00012194.*body\ssize' --log tsv \
	  | grep value \
	  | cut -f1
	#!/bin/bash
	#
	# prerequisites
	# * preston https://github.com/bio-guoda/preston
	# * pv pipeviewer https://linux.die.net/man/1/pv
	# * mlr https://miller.readthedocs.io/en/6.7.0/
	#
	# executed/tested on 22.04.1-Ubuntu
	#
	#!/bin/bash
	#
	# 2020-10-15
	#
	# This script is a way to select pollination and flower-visit records from
	# one of the data products provided via https://globalbioticinteractions.org/data .
	#
	# This particular example uses a July 2020 data publication.
	#
	# For more recent data, see https://globalbioticinteractions.org/data .

	# Registers the 2018 eBird data archive (dwca-1.0.zip) with contentid and
	# returns whatever contentid::register() yields for that location.
	#
	# The original eBird location,
	#   http://ebirddata.ornith.cornell.edu/downloads/gbiff/dwca-1.0.zip
	# unfortunately no longer works, so the 2018 copy that was republished via
	# Zenodo is registered instead.
	prepare_ebird_2018_id <- function() {
	  ebird_data_location <- "https://zenodo.org/record/3858251/files/dwca-1.0.zip"

	  ebird_data_id <- contentid::register(ebird_data_location)
	  ebird_data_id
	}
	#!/bin/bash
	#
	#

	WKT_STRING="POLYGON ((-72.77293810620904 -33.196074154826235, -72.77293810620904 6.59516197881252, -28.12450060620904 6.59516197881252, -28.12450060620904 -33.196074154826235, -72.77293810620904 -33.196074154826235))"

	spark-submit \
	--master mesos://zk://mesos01:2181,mesos02:2181,mesos03:2181/mesos \
	--driver-memory 4G \
	--conf spark.sql.caseSensitive=true \