David Shorthouse dshorthouse

## dina_fetch_duplicate_images.rb
#!/usr/bin/env ruby
# encoding: utf-8

require 'csv'
require 'dina'

Dina.config = {
    authorization_url: 'https://dina.biodiversity.agr.gc.ca/auth',
    endpoint_url: 'https://dina.biodiversity.agr.gc.ca/api',
    server_name: 'dina-prod',

## bionomia_expeditions.rb

# Q1312945 Expedition

# Q-prefixed identifiers to resolve to local users (presumably Wikidata QIDs,
# given the "Q1312945 Expedition" note above -- confirm).
# NOTE(review): "Q96384" appears twice in this list -- confirm that is intended.
qids = %w[
  Q108669 Q63760 Q62747 Q104839 Q96384 Q96384
  Q85444 Q101823 Q347529 Q43881351 Q95248572
]

# Look up each identifier and collect the ids of the users that were found.
# A lookup that raises a StandardError is treated exactly like a lookup that
# returns nil: that identifier contributes nothing to the result.
user_ids = qids.map { |qid|
  user = begin
    User.find_by_identifier(qid)
  rescue StandardError
    nil
  end
  user.nil? ? nil : user.id
}.compact

## open_alex_taxonomists_with_orcid.rb
#!/usr/bin/env ruby
# encoding: utf-8

require 'csv'
require 'rest_client'
require 'json'

# OpenAlex "works" endpoint with a fixed filter (concept C58642233, has_orcid,
# publication year 2023) and 50 results per page. The string deliberately ends
# in "page=" -- presumably the caller appends the page number; confirm in get_data.
BASE_URL = "https://api.openalex.org/works" \
           "?filter=concepts.id:C58642233,has_orcid:true,publication_year:2023" \
           "&per_page=50&page="

def get_data(page:)

## orcid_checksum_check.rb
# encoding: utf-8
class String
  def is_orcid?
    /(\d{4}-){3}\d{3}[0-9X]{1}$/.match?(self)
  end

  def valid_orcid?
    parts = self.scan(/[0-9X]/)
    mod = parts[0..14].map(&:to_i)
                      .inject { |sum, n| (sum + n)*2 }

## twitter_from_orcid.rb
# Install via command-line as 'gem install sparql-client'
require 'sparql/client'

# HTTP headers sent with every SPARQL request; only a User-Agent is set.
headers = { 'User-Agent' => 'Ruby-Sparql-Client/1.0' }

# Shared client for the Wikidata query service, with a 120 read timeout
# (unit as defined by SPARQL::Client -- presumably seconds; confirm).
@sparql = SPARQL::Client.new(
  "https://query.wikidata.org/sparql",
  headers: headers,
  read_timeout: 120
)

# A SPARQL query to find an item and an optional Twitter handle
def wikidata_by_orcid_query(orcid)
  %Q(
    SELECT ?item ?itemLabel ?twitter

## zookeys_orcid_scrape.rb
#!/usr/bin/env ruby
# encoding: utf-8

require 'rest_client'
require 'csv'
require 'nokogiri'
require 'colorize'

page_range = 0..50

## zootaxa_orcid_scrape.rb
#!/usr/bin/env ruby
# encoding: utf-8

require 'rest_client'
require 'csv'
require 'nokogiri'
require 'colorize'

page_range = 1..10

## ApacheSparkPlayground.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                dshorthouse
                / ApacheSparkPlayground.md
            
            
              Last active
              May 28, 2020 16:57
            
          
    Apache Spark in the Playground


"My data files are too large."


"I have many different files and I need to join them together."

Why/What Is Apache Spark?

https://spark.apache.org/

Apache Spark is a fast and general-purpose cluster computing system.


## All_the_GBIF_Mrs.csv
GBIF URL,recordedBy,eventDate,year,country,countryCode,GBIF Dataset
https://gbif.org/occurrence/2433942,"Mrs. C. Pease, Miss E. Butler",,1903,Jamaica,JM,https://gbif.org/dataset/40d2de00-0c6e-11dd-84d2-b8a03c50a862
https://gbif.org/occurrence/29404620,U.Mizushima (Mrs.),1954-09-06T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
https://gbif.org/occurrence/29408002,U.Mizushima (Mrs.),1954-09-19T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
https://gbif.org/occurrence/29426346,U.Mizushima (Mrs.),1952-08-01T01:00Z,1952,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
https://gbif.org/occurrence/29429161,U.Mizushima (Mrs.),1954-09-29T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
https://gbif.org/occurrence/29451087,U.Mizushima (Mrs.),1955-03-04T01:00Z,1955,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
https://gbif.org/occurrence/29451907,Mrs. Fay A. Mac Fadden,1926-08-16T01:00Z,1926,,

## 2020-03-17 Orphaned Specimens Bloodhound
{:user_id=>11771, :name=>"Cyrus Pringle", :orphaned=>716},
{:user_id=>14743, :name=>"Gerdt Guenther Hatschbach", :orphaned=>594},
{:user_id=>191, :name=>"Volker Framenau", :orphaned=>586},
{:user_id=>35074, :name=>"Martti Rautanen", :orphaned=>454},
{:user_id=>10182, :name=>"Georg August Zenker", :orphaned=>381},
{:user_id=>12169, :name=>"Paul Sintenis", :orphaned=>349},
{:user_id=>9829, :name=>"Joseph Friedrich Nicolaus Bornmüller", :orphaned=>302},
{:user_id=>10487, :name=>"José Arechavaleta", :orphaned=>250},
{:user_id=>11937, :name=>"Theodor Kotschy", :orphaned=>233},
{:user_id=>11853, :name=>"Ynes Mexia", :orphaned=>150},
	#!/usr/bin/env ruby
	# encoding: utf-8

	require 'csv'
	require 'dina'

	Dina.config = {
	authorization_url: 'https://dina.biodiversity.agr.gc.ca/auth',
	endpoint_url: 'https://dina.biodiversity.agr.gc.ca/api',
	server_name: 'dina-prod',

	# Q1312945 Expedition

	# Q-prefixed identifiers to resolve to local users (presumably Wikidata QIDs,
	# given the "Q1312945 Expedition" note above -- confirm).
	# NOTE(review): "Q96384" appears twice in this list -- confirm that is intended.
	qids = ["Q108669", "Q63760", "Q62747", "Q104839", "Q96384", "Q96384", "Q85444", "Q101823", "Q347529", "Q43881351", "Q95248572"]

	# Map each identifier to the id of the matching user and drop the misses.
	# The block parameter must be delimited by plain pipes; backslash-escaped
	# pipes are not valid Ruby. A lookup that raises a StandardError counts as
	# "not found", matching the behaviour of a `rescue nil` modifier.
	user_ids = qids.map do |q|
	  u = begin
	    User.find_by_identifier(q)
	  rescue StandardError
	    nil
	  end
	  u&.id
	end.compact
	# encoding: utf-8
	class String
	def is_orcid?
	/(\d{4}-){3}\d{3}[0-9X]{1}$/.match?(self)
	end

	def valid_orcid?
	parts = self.scan(/[0-9X]/)
	mod = parts[0..14].map(&:to_i)
	.inject { \|sum, n\| (sum + n)*2 }
	# Install via command-line as 'gem install sparql-client'
	require 'sparql/client'

	headers = { 'User-Agent' => 'Ruby-Sparql-Client/1.0' }
	@sparql = SPARQL::Client.new("https://query.wikidata.org/sparql", headers: headers, read_timeout: 120)

	# A SPARQL query to find an item and an optional Twitter handle
	def wikidata_by_orcid_query(orcid)
	%Q(
	SELECT ?item ?itemLabel ?twitter
	#!/usr/bin/env ruby
	# encoding: utf-8

	require 'rest_client'
	require 'csv'
	require 'nokogiri'
	require 'colorize'

	page_range = 0..50
	GBIF URL,recordedBy,eventDate,year,country,countryCode,GBIF Dataset
	https://gbif.org/occurrence/2433942,"Mrs. C. Pease, Miss E. Butler",,1903,Jamaica,JM,https://gbif.org/dataset/40d2de00-0c6e-11dd-84d2-b8a03c50a862
	https://gbif.org/occurrence/29404620,U.Mizushima (Mrs.),1954-09-06T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
	https://gbif.org/occurrence/29408002,U.Mizushima (Mrs.),1954-09-19T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
	https://gbif.org/occurrence/29426346,U.Mizushima (Mrs.),1952-08-01T01:00Z,1952,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
	https://gbif.org/occurrence/29429161,U.Mizushima (Mrs.),1954-09-29T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
	https://gbif.org/occurrence/29451087,U.Mizushima (Mrs.),1955-03-04T01:00Z,1955,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a
	https://gbif.org/occurrence/29451907,Mrs. Fay A. Mac Fadden,1926-08-16T01:00Z,1926,,
	{:user_id=>11771, :name=>"Cyrus Pringle", :orphaned=>716},
	{:user_id=>14743, :name=>"Gerdt Guenther Hatschbach", :orphaned=>594},
	{:user_id=>191, :name=>"Volker Framenau", :orphaned=>586},
	{:user_id=>35074, :name=>"Martti Rautanen", :orphaned=>454},
	{:user_id=>10182, :name=>"Georg August Zenker", :orphaned=>381},
	{:user_id=>12169, :name=>"Paul Sintenis", :orphaned=>349},
	{:user_id=>9829, :name=>"Joseph Friedrich Nicolaus Bornmüller", :orphaned=>302},
	{:user_id=>10487, :name=>"José Arechavaleta", :orphaned=>250},
	{:user_id=>11937, :name=>"Theodor Kotschy", :orphaned=>233},
	{:user_id=>11853, :name=>"Ynes Mexia", :orphaned=>150},