Skip to content

Instantly share code, notes, and snippets.

@darkoromanov
Last active March 15, 2019 14:46
Show Gist options
  • Save darkoromanov/a2179dc37237acd30cb0df9bf12978fe to your computer and use it in GitHub Desktop.
Save darkoromanov/a2179dc37237acd30cb0df9bf12978fe to your computer and use it in GitHub Desktop.
PHP script to scrape the world spider catalog
<?php
/**
* This is a recursive script to download the whole World Spider Catalog (https://wsc.nmbe.ch/)
* I'm not responsible for any unauthorized use of the data.
* Please refer to WSC By laws https://wsc.nmbe.ch/resources/By-laws_WSCA.pdf
*/
/**
* Download PHP Simple HTML DOM Parser library from http://simplehtmldom.sourceforge.net/
*/
include 'lib/simple_html_dom.php';
/**
* Enter your database params
*/
$servername = "";
$username = "";
$password = "";
$dbname = "";
/**
* Database script
* -- Create syntax for TABLE 'taxon_family'
CREATE TABLE `taxon_family` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(255) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
`link` varchar(1024) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=21 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
-- Create syntax for TABLE 'taxon_genus'
CREATE TABLE `taxon_genus` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`family_id` int(11) NOT NULL,
`name` varchar(255) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
`link` varchar(1024) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=649 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
-- Create syntax for TABLE 'taxon_species'
CREATE TABLE `taxon_species` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(255) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
`genus_id` int(11) NOT NULL,
`family_id` int(11) NOT NULL,
`link` varchar(1024) COLLATE utf8_unicode_ci NOT NULL DEFAULT '',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=8570 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
*/
function addFamily($name, $link)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("insert into taxon_family (name, link) values (?, ?)");
$stmt->bind_param("ss", $name, $link);
$stmt->execute();
return $stmt->insert_id;
}
function addGenus($name, $link, $familyId)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("insert into taxon_genus (name, link, family_id) values (?, ?, ?)");
$stmt->bind_param("ssi", $name, $link, $familyId);
$stmt->execute();
return $stmt->insert_id;
}
function addSpecies($name, $link, $genus_id, $familyId)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("insert into taxon_species (name, link, genus_id, family_id) values (?, ?, ?, ?)");
$stmt->bind_param("ssii", $name, $link, $genus_id, $familyId);
$stmt->execute();
return $stmt->insert_id;
}
function getFamily($id)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("select * from taxon_family where id=?");
$stmt->bind_param("i", $id);
$stmt->execute();
$result = $stmt->get_result();
$row = $result->fetch_assoc();
return $row;
}
function getFamilyByName($name)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("select * from taxon_family where name=?");
$stmt->bind_param("s", $name);
$stmt->execute();
$result = $stmt->get_result();
$row = $result->fetch_assoc();
return $row;
}
function getGenus($id)
{
global $servername, $username, $password, $dbname;
$conn = new mysqli($servername, $username, $password, $dbname);
$stmt = $conn->prepare("select * from taxon_genus where id=?");
$stmt->bind_param("i", $id);
$stmt->execute();
$result = $stmt->get_result();
$row = $result->fetch_assoc();
return $row;
}
error_reporting(E_ALL);
ini_set("display_errors", "On");
$baseURL = "https://wsc.nmbe.ch";
function scrapeFamilies()
{
$familiesURL = "https://wsc.nmbe.ch/statistics/";
$html = file_get_html($familiesURL);
foreach($html->find("table.narrow tr") as $tr)
{
$all = $tr->find("td");
if(count($all) > 1)
{
$td = $all[1];
if(is_object($td))
{
$text = trim($td->plaintext);
if($text != "Total")
{
$link = $td->find("a")[0]->href;
echo "Add family: $text\n";
addFamily($text, $link);
scrapeGenus($text);
}
}
}
}
}
function scrapeGenus($familyName)
{
global $baseURL;
$family = getFamilyByName($familyName);
$html = file_get_html($baseURL . $family["link"]);
foreach($html->find("table")[0]->find("tbody")[0]->find("tr") as $tr)
{
$all = $tr->find("td");
$td = $all[0]->find("i")[0];
$name = $td->plaintext;
$linkCell = $all[2];
$link = $linkCell->find("a")[1]->href;
echo "Add genus: $name\n";
$id = addGenus($name, $link, $family["id"]);
scrapeSpecies($id);
}
}
function scrapeSpecies($genusId)
{
global $baseURL;
$genus = getGenus($genusId);
$family = getFamily($genus["family_id"]);
$html = file_get_html($baseURL . $genus["link"]);
foreach($html->find("table")[0]->find("tbody")[0]->find("tr") as $tr)
{
$all = $tr->find("td");
$td = $all[0]->find("i")[0];
$name = $td->plaintext;
$linkCell = $all[2];
$link = $linkCell->find("a")[0]->href;
echo "Add species: $name $link\n";
addSpecies($name, $link, $genus["id"], $family["id"]);
}
}
scrapeFamilies();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment