"My data files are too large."
"I have many different files and I need to join them together."
- a fast and general-purpose cluster computing system
# Q1312945 Expedition
# Identifiers of the people to look up (presumably Wikidata QIDs — confirm).
# NOTE(review): "Q96384" is listed twice, so if it resolves to a user that
# id will appear twice in user_ids; confirm whether that is intended.
qids = ["Q108669", "Q63760", "Q62747", "Q104839", "Q96384", "Q96384", "Q85444", "Q101823", "Q347529", "Q43881351", "Q95248572"]

# Resolve every identifier to a user id. Lookups are best-effort: an
# identifier whose lookup raises or returns nil is dropped from the result
# instead of aborting the whole run.
user_ids = qids.map do |qid|
  user = begin
    User.find_by_identifier(qid)
  rescue StandardError
    nil
  end
  user&.id
end.compact
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'csv' | |
require 'rest_client' | |
require 'json' | |
BASE_URL = "https://api.openalex.org/works?filter=concepts.id:C58642233,has_orcid:true,publication_year:2023&per_page=50&page=" | |
def get_data(page:) |
# encoding: utf-8 | |
class String | |
def is_orcid? | |
/(\d{4}-){3}\d{3}[0-9X]{1}$/.match?(self) | |
end | |
def valid_orcid? | |
parts = self.scan(/[0-9X]/) | |
mod = parts[0..14].map(&:to_i) | |
.inject { |sum, n| (sum + n)*2 } |
# Install via command-line as 'gem install sparql-client' | |
require 'sparql/client' | |
headers = { 'User-Agent' => 'Ruby-Sparql-Client/1.0' } | |
@sparql = SPARQL::Client.new("https://query.wikidata.org/sparql", headers: headers, read_timeout: 120) | |
# A SPARQL query to find an item and an optional Twitter handle | |
def wikidata_by_orcid_query(orcid) | |
%Q( | |
SELECT ?item ?itemLabel ?twitter |
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'rest_client' | |
require 'csv' | |
require 'nokogiri' | |
require 'colorize' | |
page_range = 0..50 |
#!/usr/bin/env ruby | |
# encoding: utf-8 | |
require 'rest_client' | |
require 'csv' | |
require 'nokogiri' | |
require 'colorize' | |
page_range = 1..10 |
"My data files are too large."
"I have many different files and I need to join them together."
GBIF URL,recordedBy,eventDate,year,country,countryCode,GBIF Dataset | |
https://gbif.org/occurrence/2433942,"Mrs. C. Pease, Miss E. Butler",,1903,Jamaica,JM,https://gbif.org/dataset/40d2de00-0c6e-11dd-84d2-b8a03c50a862 | |
https://gbif.org/occurrence/29404620,U.Mizushima (Mrs.),1954-09-06T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29408002,U.Mizushima (Mrs.),1954-09-19T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29426346,U.Mizushima (Mrs.),1952-08-01T01:00Z,1952,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29429161,U.Mizushima (Mrs.),1954-09-29T01:00Z,1954,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29451087,U.Mizushima (Mrs.),1955-03-04T01:00Z,1955,,JP,https://gbif.org/dataset/8346c3a8-f762-11e1-a439-00145eb45e9a | |
https://gbif.org/occurrence/29451907,Mrs. Fay A. Mac Fadden,1926-08-16T01:00Z,1926,, |
{:user_id=>11771, :name=>"Cyrus Pringle", :orphaned=>716}, | |
{:user_id=>14743, :name=>"Gerdt Guenther Hatschbach", :orphaned=>594}, | |
{:user_id=>191, :name=>"Volker Framenau", :orphaned=>586}, | |
{:user_id=>35074, :name=>"Martti Rautanen", :orphaned=>454}, | |
{:user_id=>10182, :name=>"Georg August Zenker", :orphaned=>381}, | |
{:user_id=>12169, :name=>"Paul Sintenis", :orphaned=>349}, | |
{:user_id=>9829, :name=>"Joseph Friedrich Nicolaus Bornmüller", :orphaned=>302}, | |
{:user_id=>10487, :name=>"José Arechavaleta", :orphaned=>250}, | |
{:user_id=>11937, :name=>"Theodor Kotschy", :orphaned=>233}, | |
{:user_id=>11853, :name=>"Ynes Mexia", :orphaned=>150}, |
This is a quick test of a modified version of the Bloodhound Spark script, to check that it runs on the GBIF Cloudera cluster (CDH 5.16.2).
From the gateway, grab the file from HDFS (skip HTTP for speed), unzip (15-20 mins) and upload to HDFS:
hdfs dfs -getmerge /occurrence-download/prod-downloads/0002504-181003121212138.zip /mnt/auto/misc/bloodhound/data.zip
unzip /mnt/auto/misc/bloodhound/data.zip -d /mnt/auto/misc/bloodhound/data
hdfs dfs -rm /tmp/verbatim.txt
hdfs dfs -rm /tmp/occurrence.txt