Skip to content

Instantly share code, notes, and snippets.

@jaseclamp
Created March 13, 2015 00:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaseclamp/e2e20e81a10fbcb8db88 to your computer and use it in GitHub Desktop.
Save jaseclamp/e2e20e81a10fbcb8db88 to your computer and use it in GitHub Desktop.
Scrape Stack Overflow for a given topic so that its top users can be mapped
<?php
//house cleaning
//scraperwiki::save_sqlite(array('id'), array('id'=>1, 'lat'=>1, 'lng'=>1), "users2");
//die;
require 'scraperwiki/simple_html_dom.php';
/**
 * Print (or return) a compact text outline of a value, derived from print_r().
 *
 * Numeric indexes such as "[0] => " are dropped unless the value is a nested
 * array, purely structural lines ("Array", "(", ")", blank) are skipped, and
 * the remaining print_r symbols ("Array", "[", "]", " =>") are stripped or
 * shortened on every emitted line.
 *
 * @param mixed $arr    Value to format; anything print_r() accepts.
 * @param bool  $return When truthy the text is returned instead of echoed.
 * @return string|null  The formatted text when $return is truthy; otherwise
 *                      the text is echoed and nothing is returned.
 */
function neat_r($arr, $return = false) {
    // NOTE(review): print_r() indents with runs of spaces; with a one-space
    // $oldtab the depth below also counts spaces inside keys and values.
    // Possibly a multi-space tab was intended here - confirm before changing.
    $oldtab = " ";
    $newtab = "-";
    // Leftover print_r symbols, replaced in this order on each emitted line.
    $symbols = array(
        "Array" => "",
        "["     => "",
        "]"     => "",
        " =>"   => ":",
    );
    $search = array_keys($symbols);
    $replace = array_values($symbols);

    $out = array();
    foreach (explode("\n", print_r($arr, true)) as $line) {
        //remove numeric indexes like "[0] =>" unless the value is an array
        if (substr($line, -5) != "Array") {
            $line = preg_replace("/^(\s*)\[[0-9]+\] => /", "$1", $line, 1);
        }
        //garbage lines
        if (in_array(trim($line), array("Array", "(", ")", ""), true)) {
            continue;
        }
        //indents: one $newtab per two $oldtab occurrences beyond the first
        $indents = (int) floor((substr_count($line, $oldtab) - 1) / 2);
        $indent = $indents > 0 ? str_repeat($newtab, $indents) : "";
        //garbage symbols: cleaned once, on the finished line, so the last
        //emitted line is cleaned too and earlier lines are not re-scanned
        $out[] = str_replace($search, $replace, $indent . trim($line));
    }
    $out = implode("\n", $out) . "\n";
    if ($return == true) return $out;
    echo $out;
}
//FIRST: walk the most-voted magento question listings and record every question link.
//The original note says the tag has 316 listing pages; raise the bound below to 316 for a full run.
//Change 1==2 to 1==1 below to run this section.
if (1==2) for ($i = 1; $i <= 300; $i++) {
    $url = "http://stackoverflow.com/questions/tagged/magento?page=".$i."&sort=votes&pagesize=50";
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('a[class=question-hyperlink]') as $data) {
        //skip links whose href carries no numeric question id; without this
        //guard a non-matching href would be saved with an undefined/empty id
        if (!preg_match("/\/questions\/(?<id>[0-9]+)\//", $data->href, $matches)) {
            continue;
        }
        //fresh row per link, keyed on the question id and flagged as not yet scraped
        $questions = array(
            'url'     => $data->href,
            'id'      => $matches['id'],
            'scraped' => 0,
        );
        scraperwiki::save_sqlite(array('id'), $questions, "questions");
    }
}
//SECOND: visit every question not yet marked as scraped and record each user link found on it.
$questions = scraperwiki::sqliteexecute("select * from questions where scraped=0");
//column name => column index for this result set; reset so no older entries linger
$keys = array();
foreach ($questions->keys as $key => $value) $keys[$value] = $key;
$questions = $questions->data;
if (1==2) foreach ($questions as $question) {
    //echo $question[$keys['scraped']]."\n";
    $url = "http://stackoverflow.com".$question[$keys['url']];
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    //first the user boxes of the main posters, then the comment author links
    foreach (array('div.user-details a', 'a.comment-user') as $selector) {
        foreach ($dom->find($selector) as $data) {
            //keep only profile links that carry a numeric user id; the match
            //also implies the "/users/" prefix the original checked separately
            if (!preg_match("/\/users\/(?<id>[0-9]+)\//", $data->href, $matches)) {
                continue;
            }
            $users = array(
                'url'  => $data->href,
                'id'   => $matches['id'],
                'name' => $data->innertext,
            );
            scraperwiki::save_sqlite(array('id'), $users, "users");
        }
    }
    //now update this question saying it's been scraped.
    $update_question = array(
        'id'      => $question[$keys['id']],
        'url'     => $question[$keys['url']],
        'scraped' => 1,
    );
    scraperwiki::save_sqlite(array('id'), $update_question, "questions");
}
//THIRD: visit every user in users2 whose scraped flag is still NULL, store the
//profile details back into users2 and one row per tag score into tags.
$users = scraperwiki::sqliteexecute("select * from users2 where scraped IS NULL");
//column name => column index for this result set; reset so no older entries linger
$keys = array();
foreach ($users->keys as $key => $value) $keys[$value] = $key;
$_users = $users->data;
unset($users);
//profile fields taken from the first element matching a selector:
//column => array(selector, attribute to read)
$profile_fields = array(
    'name'         => array("h1[id=user-displayname]", 'innertext'),
    'location'     => array("td[class=adr]", 'innertext'),
    'website'      => array('a[class=url]', 'href'),
    'member_since' => array('td[class=cool]', 'innertext'),
    'last_seen'    => array('span[class=relativetime]', 'innertext'),
    'about'        => array('div[class=user-about-me]', 'innertext'),
    'logo'         => array('img[class=logo]', 'src'),
);
if (1==2) foreach ($_users as $user) {
    //get out of here if there's no url or it's a generic user
    if ($user[$keys['url']] == '') { echo "empty url\n"; continue; }
    if ($user[$keys['name']] == 'Community') { echo "generic user\n"; continue; }
    $url = "http://stackoverflow.com".$user[$keys['url']];
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    //problem with the retrieval?
    if (!method_exists($dom, "find")) continue;
    if (!$dom->find('html')) continue;
    //is the url wrong? account deleted?
    if (strpos($dom->plaintext, 'Page Not Found') !== FALSE) { echo "page not found\n"; continue; }

    //start from a fresh row for every user so optional fields (age, views,
    //tab counts) found on one profile are never carried over to the next
    $users = array(
        'id'      => $user[$keys['id']],
        'url'     => $user[$keys['url']],
        'scraped' => 1,
    );
    //a field whose element is missing from the page is stored as null
    foreach ($profile_fields as $column => $field) {
        $node = $dom->find($field[0], 0);
        $users[$column] = is_object($node) ? $node->{$field[1]} : null;
    }

    //"age" and "profile views" are label cells; the value sits in the next cell
    $data_box = $dom->find('div[class=data]', 0);
    $data_table = is_object($data_box) ? $data_box->find('table', 0) : null;
    $cells = is_object($data_table) ? $data_table->find('td') : array();
    foreach ($cells as $data) {
        $value_cell = $data->next_sibling();
        if (!is_object($value_cell)) continue;
        if ($data->innertext == "age") $users['age'] = $value_cell->innertext;
        if ($data->innertext == "profile views") $users['views'] = $value_cell->innertext;
    }

    //reputation: first link inside the first span of the reputation box
    $node = $dom->find('div[class=reputation]', 0);
    $node = is_object($node) ? $node->find('span', 0) : null;
    $node = is_object($node) ? $node->find('a', 0) : null;
    $users['reputation'] = is_object($node) ? $node->innertext : null;

    //sub-header links: the text after the last "=" in the href becomes the
    //column name and the link's first span supplies the value
    foreach ($dom->find('div[class=subheader] h1 a') as $data) {
        $tail = strrchr($data->href, '=');
        if ($tail === false || $tail === '=') continue; //no usable column name
        $count = $data->find('span', 0);
        if (!is_object($count)) continue;
        $users[substr($tail, 1)] = $count->innertext;
    }

    //this is for tags. it would blow up the users table horizontally, so each
    //tag score goes into a separate vertical table linked by user_id
    foreach ($dom->find('div[class=answer-votes]') as $data) {
        $tag_node = $data->next_sibling();
        if (!is_object($tag_node)) continue; //score without a tag element
        $tags = array(
            'tag'     => preg_replace('/[^a-z]/i', '', $tag_node->innertext),
            'value'   => $data->innertext,
            'user_id' => $user[$keys['id']],
        );
        //NOTE(review): saved with '' as the unique-key argument, unlike the
        //array('id') used elsewhere - confirm this is what the API expects
        scraperwiki::save_sqlite( '', $tags, "tags");
    }
    scraperwiki::save_sqlite(array('id'), $users, "users2");
}
//FOURTH: geocode every distinct location in users2 that has no latitude yet and
//write the coordinates back to all rows sharing that location.
$users = scraperwiki::sqliteexecute("SELECT * FROM `users2` WHERE `location` NOT NULL AND `lat` IS NULL GROUP BY `location` ORDER BY `location`");
//column name => column index for this result set; reset so no older entries linger
$keys = array();
foreach ($users->keys as $key => $value) $keys[$value] = $key;
$users = $users->data;
foreach ($users as $user) {
    $location = $user[$keys['location']];
    $addr = urlencode($location);
    $url = 'http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address='.$addr;
    $get = file_get_contents($url);
    echo $addr.":";
    //a transport failure says nothing about the address itself: leave lat/lng
    //NULL so the location is picked up again on the next run
    if ($get === false) {
        echo "fetch failed\n";
        continue;
    }
    $records = json_decode($get, TRUE);
    if (is_array($records) && isset($records['status']) && $records['status'] == 'OK') {
        //neat_r($records['results'][]);
        $lat = $records['results'][0]['geometry']['location']['lat'];
        $lng = $records['results'][0]['geometry']['location']['lng'];
        echo $lat."-".$lng."\n";
    } else {
        //any non-OK answer is flagged with XXX so it is not queried again
        //NOTE(review): this presumably also flags quota/denied responses - confirm that is wanted
        $lat = 'XXX';
        $lng = 'XXX';
        echo "N/A\n";
    }
    //locations are scraped free text; double any single quote so names such
    //as "L'Aquila" cannot break (or alter) the SQL statement
    $escaped = str_replace("'", "''", $location);
    scraperwiki::sqliteexecute( "update `users2` set `lat`='".$lat."', `lng`='".$lng."' where `location`='".$escaped."'" );
    scraperwiki::sqlitecommit();
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment