Skip to content

Instantly share code, notes, and snippets.

@cookieguru
Last active September 1, 2016 05:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cookieguru/4d5af5ff3d0cb2b603c8a7d901b4b190 to your computer and use it in GitHub Desktop.
Save cookieguru/4d5af5ff3d0cb2b603c8a7d901b4b190 to your computer and use it in GitHub Desktop.
Scrapes the PNWPHP Dryfta site and generates a COD-style JSON document of the sessions
<?php
// Scheme and host of the conference site; relative hrefs are appended to this.
const URL_BASE = 'https://pnwphp2016.dryfta.com';
/**
 * Downloads pages with cURL and memoises each response body by URL, so a URL
 * requested more than once is only transferred the first time.
 */
class PageRetriever {
    /** @var array Response bodies keyed by the URL they were fetched from */
    private $cache = [];
    /** @var resource cURL handle reused for every request */
    private $curl;

    public function __construct() {
        $this->curl = curl_init();
        // Have curl_exec() return the body instead of printing it
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
    }

    /**
     * Returns the body found at $url, transferring it only on the first request.
     *
     * @param string $url
     * @return string
     * @throws RuntimeException When the transfer fails
     */
    public function get($url) {
        if(isset($this->cache[$url])) {
            return $this->cache[$url];
        }

        curl_setopt($this->curl, CURLOPT_URL, $url);
        $body = curl_exec($this->curl);
        if($body === false) {
            // Fail loudly and leave the cache untouched so that a later call can
            // retry, instead of handing `false` to callers that expect HTML
            throw new RuntimeException(
                "Unable to retrieve $url: " . curl_error($this->curl),
                curl_errno($this->curl)
            );
        }

        return $this->cache[$url] = $body;
    }
}
/**
 * One entry of the conference programme, populated by scraping its session page.
 *
 * Every private property is exported by __toArray(), so the property names and
 * their declaration order define the keys of the generated output.
 */
class Session {
    private $id;
    private $nid;
    private $title;
    private $abstract;
    private $type;
    private $presenter;
    private $category;
    private $session_type;
    private $experience_level = 'all';
    private $room;
    private $date;
    private $start;
    private $end;

    /**
     * @param string $id
     * @param string $href Link to the session page
     * @param PageRetriever $pr
     * @throws RuntimeException When the session page comes back empty
     */
    public function __construct($id, $href, PageRetriever $pr) {
        $this->id = $this->nid = $id;

        $html = $pr->get($href);
        if(!is_string($html) || $html === '') {
            throw new RuntimeException("Session page $href returned no content");
        }

        $doc = new DOMDocument();
        // Collect markup warnings internally rather than silencing them with @
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        $finder = new DomXPath($doc);

        $this->title = $this->firstText($finder, "//a[contains(@class,'sessionname')]");

        // The first "sessionformat" span holds the date; a second one, when
        // present, holds the session format (e.g. "Half-Day Workshop")
        $spans = $finder->query("//span[contains(@class,'sessionformat')]");
        $date_span = $spans->item(0);
        $date_text = $date_span === null ? '' : $date_span->textContent;
        $this->date = date('Y-m-d', strtotime($date_text));
        $this->category = date('F d', strtotime($date_text));

        // "<start> - <end>"; pad so a missing end time does not read an undefined offset
        $times = array_pad(
            explode(' - ', $this->firstText($finder, "//span[contains(@class,'session_type_dateB')]")),
            2,
            ''
        );
        $this->start = date('Y-m-d H:i:s', strtotime("$date_text {$times[0]}"));
        $this->end = date('Y-m-d H:i:s', strtotime("$date_text {$times[1]}"));

        $this->room = $this->firstText($finder, "//span[contains(@class,'sessionvenue')]");
        $this->abstract = $this->extractAbstract($doc, $finder);

        $has_track = $finder->query("//span[contains(@class,'sessiontrack_name')]")->length > 0;
        if($has_track || $spans->length > 1) {
            $this->session_type = $spans->length > 1 ? trim($spans->item(1)->textContent) : 'Session';
            if($this->session_type === 'Half-Day Workshop') {
                $this->type = 'workshop';
            } elseif($this->session_type === 'Special Event') {
                $this->type = 'keynote';
            } else {
                $this->type = 'session';
            }
            // This title is always reported as a special-event keynote
            if($this->title === 'Final Thoughts') {
                $this->type = 'keynote';
                $this->session_type = 'Special Event';
            }
            $this->presenter = $this->extractPresenters($finder, $pr);
        } else {
            // No track and no format span: classify by title keyword
            $this->type = $this->typeFromTitle($this->title);
            $this->session_type = ucfirst($this->type);
            $this->presenter = [];
        }
    }

    /**
     * @return array
     */
    public function __toArray() {
        return get_object_vars($this);
    }

    /**
     * Trimmed text of the first node matching $expression, or '' when nothing matches.
     *
     * @param DOMXPath $finder
     * @param string $expression
     * @return string
     */
    private function firstText(DOMXPath $finder, $expression) {
        $node = $finder->query($expression)->item(0);
        return $node === null ? '' : trim($node->textContent);
    }

    /**
     * Serialises the first child of the description block, keeping only inline markup.
     *
     * @param DOMDocument $doc
     * @param DOMXPath $finder
     * @return string
     */
    private function extractAbstract(DOMDocument $doc, DOMXPath $finder) {
        $container = $finder->query("//div[contains(@class,'sessiondisc')]")->item(0);
        if($container === null || $container->firstChild === null) {
            // saveHTML(null) would serialise the whole page into the abstract
            return '';
        }
        return trim(strip_tags(
            $doc->saveHTML($container->firstChild),
            '<a><del><ins><b><i><strong><em><big><small><br>'
        ));
    }

    /**
     * Builds the presenter list from the first profile link in the attendee box.
     *
     * @param DOMXPath $finder
     * @param PageRetriever $pr
     * @return array Zero or one presenter, each as an array
     */
    private function extractPresenters(DOMXPath $finder, PageRetriever $pr) {
        $presenter_box = $finder->query("//div[contains(@class,'profile_dryfta_user_attendee')]")->item(0);
        if($presenter_box === null) {
            return [];
        }
        $link = $finder->query('div/a', $presenter_box)->item(0);
        if(!$link instanceof DOMElement) {
            return [];
        }
        $href = $link->getAttribute('href');
        // The presenter id comes from the "?id=<digits>" part of the profile link
        $presenter_id = preg_match('/\?id=(\d+)/', $href, $matches) ? $matches[1] : null;
        $presenter = new Presenter($presenter_id, $pr->get(URL_BASE . $href));
        return [$presenter->__toArray()];
    }

    /**
     * Maps a title to a non-session type by keyword; the first matching keyword wins.
     *
     * @param string $title
     * @return string
     */
    private function typeFromTitle($title) {
        $keywords = [
            'Lunch' => 'lunch',
            'Networking' => 'networking',
            'Reception' => 'networking',
            'Registration' => 'registration',
            'Raffle' => 'fun',
        ];
        foreach($keywords as $keyword => $type) {
            if(stripos($title, $keyword) !== false) {
                return $type;
            }
        }
        return 'break';
    }
}
/**
 * A speaker, populated by scraping the HTML of their profile page.
 *
 * Every private property is exported by __toArray(). Fields that cannot be
 * found keep their default of null (text fields become '').
 */
class Presenter {
    private $id;
    private $fullname;
    private $twitter;
    private $picture;
    private $bio;
    private $jobtitle;
    private $organization;
    /** Never populated by the constructor; exported as null */
    private $interests;

    /**
     * @param string $id
     * @param string $html
     * @throws InvalidArgumentException When $html is empty or not a string
     */
    public function __construct($id, $html) {
        if(!is_string($html) || $html === '') {
            throw new InvalidArgumentException('Presenter profile HTML is empty');
        }
        $this->id = $id;

        $doc = new DOMDocument();
        // Collect markup warnings internally rather than silencing them with @
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        $finder = new DomXPath($doc);

        $this->fullname = $this->textOf($doc->getElementsByTagName('h2')->item(0));
        $this->jobtitle = $this->textOf($finder->query("//div[contains(@class,'dryfta_attendee_role')]")->item(0));
        // "orgnization" is the spelling used in the class name being matched
        $this->organization = $this->textOf($finder->query("//div[contains(@class,'dryfta_attendee_orgnization')]")->item(0));

        $logo = $doc->getElementById('prof_logo');
        if($logo !== null) {
            $this->picture = $logo->getAttribute('src');
        }

        // The link is taken from the grandparent of the Twitter icon
        $twitter_icon = $finder->query("//i[contains(@class,'fa-twitter')]")->item(0);
        if($twitter_icon !== null && $twitter_icon->parentNode !== null) {
            $twitter_link = $twitter_icon->parentNode->parentNode;
            if($twitter_link instanceof DOMElement) {
                $this->twitter = $twitter_link->getAttribute('href');
            }
        }

        foreach($finder->query("//div[contains(@class,'submittedAbstracts')]") as $div) {
            if(stripos($div->textContent, 'Public Profile') === false) {
                continue;
            }
            // Drop the "Public Profile" heading so only the biography text remains.
            // nodeName is used because text nodes have no tagName property.
            foreach($div->childNodes as $child_node) {
                if($child_node->nodeName === 'b' && trim($child_node->textContent) === 'Public Profile') {
                    $div->removeChild($child_node);
                    break;
                }
            }
            $this->bio = trim($div->textContent);
        }
    }

    /**
     * @return array
     */
    public function __toArray() {
        return get_object_vars($this);
    }

    /**
     * Trimmed text content of $node, or '' when the node was not found.
     *
     * @param DOMNode|null $node
     * @return string
     */
    private function textOf($node) {
        return $node === null ? '' : trim($node->textContent);
    }
}
// Entry point: read the programme schedule, scrape every linked session page
// and print all sessions as a single JSON array.
$doc = new DOMDocument();
// @ silences the markup warnings raised by real-world HTML; a failed load is
// detected through the return value instead
if(@$doc->loadHTMLFile(URL_BASE . '/en/program-schedule') === false) {
    throw new RuntimeException('Unable to load the program schedule page');
}
$finder = new DomXPath($doc);

// Session id => absolute URL of the session page
$session_links = [];
foreach($finder->query("//dl[@id='menu-pane']//div[contains(@class,'session_type_section')]") as $node) {
    foreach($finder->query('div//a', $node) as $link) {
        /** @var DOMElement $link */
        if(in_array(trim($link->textContent), ['Add to gCal', 'Add to iCal'], true)) {
            continue;
        }
        $href = $link->getAttribute('href');
        // Skip links that do not point at /program/<id>/ rather than recording
        // them under an empty key
        if(!preg_match('/\/program\/(\d+)\//', $href, $matches)) {
            continue;
        }
        $session_links[$matches[1]] = URL_BASE . $href;
        // Only the first session link of each section is used
        break;
    }
}

$pr = new PageRetriever();
$output = [];
foreach($session_links as $id => $link) {
    $session = new Session($id, $link, $pr);
    $output[] = $session->__toArray();
}

// JSON_NUMERIC_CHECK emits numeric strings (such as the ids) as numbers
$json = json_encode($output, JSON_NUMERIC_CHECK | JSON_UNESCAPED_UNICODE);
if($json === false) {
    // Encode before sending the header so a failure is not served as empty JSON
    throw new RuntimeException('Unable to encode the sessions as JSON (error code ' . json_last_error() . ')');
}
header('Content-type: application/json');
echo $json;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment