Skip to content

Instantly share code, notes, and snippets.

@oomlaut
Created May 10, 2013 16:28
Show Gist options
  • Save oomlaut/5555568 to your computer and use it in GitHub Desktop.
Save oomlaut/5555568 to your computer and use it in GitHub Desktop.
Example of screen-scraping in PHP
<?php
// Optional dependencies of the original script, currently disabled:
//include('dbconfig.inc.php');
//include('db.class.php');
//include('simplehtmldom/simple_html_dom.php');

// Team abbreviations offered in the navigation bar at the top of the page.
$availableTeams = array("ARI","ATL","BAL","BOS","CIN","CHC","CHW","CLE","COL","DET","FLA","HOU","KCR","LAA","LAD","MIL","MIN","NYM","NYY","OAK","PHI","PIT","SD","SF","SEA","STL","TB","TEX","TOR","WSN");

// Emit one "batters.php?team=XXX" link per team, separated by " |".
// Full "<?php" tags are used because the bare "<?" form only works when the
// short_open_tag ini setting is enabled; "<?=" is always available.
foreach ($availableTeams as $availableTeam) {
    ?>
<a href='batters.php?team=<?=$availableTeam?>'><?=$availableTeam?></a> |
<?php
}
?>
<br><br>
<?php
// Season to scrape. Only a plain four-digit year is accepted; a missing or
// malformed "year" parameter falls back to 2011. The value is later placed in
// a remote URL and in HTML, so it must not be passed through unchecked.
$requestedYear = isset($_GET['year']) ? $_GET['year'] : '';
if (is_string($requestedYear) && preg_match('/\A\d{4}\z/', $requestedYear)) {
    $year = (int) $requestedYear;
} else {
    $year = 2011;
}

// Team to scrape. The parameter is mandatory and must look like a team
// abbreviation (two or three letters); it is normalised to upper case.
$requestedTeam = isset($_GET['team']) ? $_GET['team'] : '';
if (!is_string($requestedTeam) || $requestedTeam === '') {
    exit("No team specified");
}
if (!preg_match('/\A[A-Za-z]{2,3}\z/', $requestedTeam)) {
    exit("Invalid team specified");
}
// The scraping loop iterates over a list, so wrap the single team in an array.
$teams = array(strtoupper($requestedTeam));
//$teams = array("ARI","ATL","BAL","BOS","CIN","CHC","CHW","CLE","COL","DET","FLA","HOU","KCR","LAA","LAD","MIL","MIN","NYM","NYY","OAK","PHI","PIT","SD","SF","SEA","STL","TB","TEX","TOR","WSN");
// Per-player stat lines keyed by the player ID taken from the player's link;
// filled in by the loop below and consumed by the rendering code that follows.
$x = array();

// Cell index (0-based) of every counting stat that is accumulated per player.
// Column layout of the team_batting table:
// Rk,Pos,Name,Age,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
$statColumns = array(
    'G' => 4, 'AB' => 6, 'R' => 7, 'H' => 8, '2B' => 9, '3B' => 10, 'HR' => 11,
    'RBI' => 12, 'BB' => 15, 'HBP' => 24, 'SF' => 26, 'SB' => 13,
);

foreach ($teams as $team) {
    // A request such as ?team[]=x would put an array here; it cannot name a team.
    if (!is_scalar($team)) {
        continue;
    }
    set_time_limit(30);

    $teamText = (string) $team;
    $yearText = is_scalar($year) ? (string) $year : '2011';
    $url = "http://www.baseball-reference.com/teams/" . rawurlencode($teamText) . "/" . rawurlencode($yearText) . ".shtml";

    // Team and year may originate from the query string, so escape them for HTML.
    $safeUrl = htmlspecialchars($url, ENT_QUOTES);
    $safeTeam = htmlspecialchars($teamText, ENT_QUOTES);
    $safeYear = htmlspecialchars($yearText, ENT_QUOTES);
    print "<div><a href='$safeUrl'>$safeTeam Page ($safeYear)</a> - <span style='color:red;'>Red indicates a low power/high average player</span></div>";
    print "<hr>";

    $raw = file_get_contents($url);
    if ($raw === false) {
        // Download failed; with no players collected the caller reports "team data not found".
        continue;
    }

    // Flatten the page (tabs, line breaks, double spaces, NUL, vertical tab removed)
    // so the non-greedy row/cell patterns below can match across the original lines.
    $newlines = array("\t","\n","\r","\x20\x20","\0","\x0B");
    $content = str_replace($newlines, "", html_entity_decode($raw));

    // Cut out the batting table: from its id attribute up to and including </table>.
    $start = strpos($content, "id=\"team_batting\"");
    if ($start === false) {
        continue;
    }
    $end = strpos($content, '</table>', $start);
    if ($end === false) {
        continue;
    }
    $table = substr($content, $start, $end + 8 - $start);

    preg_match_all("|<tr(.*)</tr>|U", $table, $rows);
    foreach ($rows[0] as $row) {
        // Header rows contain <th> cells and carry no player data.
        if (strpos($row, '<th') !== false) {
            continue;
        }
        preg_match_all("|<td(.*)</td>|U", $row, $cells);
        $cellHtml = $cells[0];

        // Player rows start with a numeric rank; anything else is skipped.
        if (!isset($cellHtml[0]) || !is_numeric(strip_tags($cellHtml[0]))) {
            continue;
        }
        // The player ID is taken from the ".../players/<x>/<ID>.shtml" link in the name cell.
        // Rows without such a link cannot be keyed (or linked to later), so they are skipped.
        if (!isset($cellHtml[2]) || !preg_match("/.*?players\/*.?\/(.*?)\.shtml.*?/", $cellHtml[2], $matches)) {
            continue;
        }
        $key = $matches[1];

        // First sighting of this player: start every counter at zero so the
        // additions below never touch an undefined array key.
        if (!isset($x[$key])) {
            $x[$key] = array('Team' => $team, 'Position' => '', 'Name' => '')
                + array_fill_keys(array_keys($statColumns), 0);
        }

        $x[$key]['Team'] = $team;
        $x[$key]['Position'] = strip_tags($cellHtml[1]);
        $x[$key]['Name'] = strip_tags($cellHtml[2]);

        // Counting stats are added up in case the same player ID appears in more than one row.
        // The (int) cast turns blank or missing cells into 0 instead of a non-numeric operand.
        foreach ($statColumns as $stat => $column) {
            $cellText = isset($cellHtml[$column]) ? strip_tags($cellHtml[$column]) : '';
            $x[$key][$stat] += (int) $cellText;
        }

        // Rate stats are recomputed from the accumulated totals.
        $x[$key]['BA'] = GetBattingAverage($x[$key]);
        $x[$key]['OBP'] = GetOBP($x[$key]);
    }
}
// Render one comma-separated line per scraped player, or stop if nothing was collected.
if (!empty($x)) {
    print "<div>Batter, Bats, Avg, HR, Contact, Power, Speed</div>";
    foreach ($x as $key => $player) {
        // Batter Name (6 chars), Bats, Avg, HR, Contact, Power, Speed

        // Batting average expressed in thousandths ("0.300" -> "300").
        // Stripping the leading "0." only works below 1.000: the text "1.000"
        // contains no "0." and would otherwise be rated as an average of 1.
        if ((float) $player['BA'] >= 1) {
            $BA = (string) (int) round((float) $player['BA'] * 1000);
        } else {
            $BA = str_replace("0.", "", $player['BA']);
        }

        // A "*" or "#" marker in the scraped name selects "L"; everything else is "R".
        $hasMarker = strpos($player['Name'], "*") !== false || strpos($player['Name'], "#") !== false;
        $bats = $hasMarker ? "L" : "R";

        $PWR = GetPower($player['HR']);
        $CNT = GetContact($BA);
        $SPD = GetSpeed($player['3B'], $player['SB']);

        // check for good hitters with low power
        $warn = ($BA >= 280 && $PWR < 830 ? "red" : "black");

        // Name and ID come from a third-party page, so escape them before echoing into HTML.
        $url = "http://www.baseball-reference.com/players/" . substr($key, 0, 1) . "/$key.shtml";
        $safeUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE);
        $safeName = htmlspecialchars($player['Name'], ENT_QUOTES | ENT_SUBSTITUTE);

        print "<div><a href='$safeUrl'>$safeName</a>, $bats, $BA, $player[HR], $CNT, <span style='color:$warn;'>$PWR</span>, $SPD</div>";
    }
} else {
    exit("team data not found");
}
/**
 * Batting average (hits divided by at-bats) for one player record.
 *
 * @param array $player Record with numeric 'H' and 'AB' entries.
 * @return string|float Average formatted to three decimals (e.g. "0.300"),
 *                      or the float 0.0 when the player has no at-bats.
 */
function GetBattingAverage($player)
{
    $atBats = $player['AB'];

    // No at-bats: avoid dividing by zero.
    if (!$atBats) {
        return .000;
    }

    $average = $player['H'] / $atBats;
    return number_format($average, 3);
}
/**
 * On-base percentage for one player record:
 * (H + BB + HBP) / (AB + BB + HBP + SF).
 *
 * @param array $player Record with numeric 'H', 'BB', 'HBP', 'AB' and 'SF' entries.
 * @return string|float Percentage formatted to three decimals (e.g. "0.333"),
 *                      or the float 0.0 when the denominator is zero.
 */
function GetOBP($player)
{
    $timesOnBase = $player['H'] + $player['BB'] + $player['HBP'];
    $opportunities = $player['AB'] + $player['BB'] + $player['HBP'] + $player['SF'];

    // Nothing to divide by: report zero instead of dividing.
    if (!$opportunities) {
        return .000;
    }

    return number_format($timesOnBase / $opportunities, 3);
}
/**
 * Speed rating: half a point per stolen base plus one per triple on top of a
 * base of 120, capped at 144 and rounded up.
 *
 * @param int|float|string $triples Number of triples.
 * @param int|float|string $SB      Number of stolen bases.
 * @return float Rating between the base value and 144 (ceil() yields a float).
 */
function GetSpeed($triples, $SB)
{
    $rating = (.5 * $SB) + $triples + 120;

    if ($rating <= 144) {
        return ceil($rating);
    }

    // Anything above the cap is clamped to it.
    return ceil(144);
}
/**
 * Power rating: maps home runs linearly onto 725..954, where 0 HR gives 725
 * and 40 HR gives 954. Results above 954 are clamped; the result is rounded up.
 *
 * @param int|float|string $HR Number of home runs.
 * @return float Rating, at most 954 (ceil() yields a float).
 */
function GetPower($HR)
{
    $ratingCeiling = 954;
    $ratingFloor = 725;
    $homeRunsForCeiling = 40;

    $rating = ($HR / $homeRunsForCeiling) * ($ratingCeiling - $ratingFloor) + $ratingFloor;

    if ($rating <= $ratingCeiling) {
        return ceil($rating);
    }

    return ceil($ratingCeiling);
}
/**
 * Contact rating: maps a batting average given in thousandths onto 64..0,
 * where an average of 0 gives 64 and 400 gives 0. Results below 0 are clamped;
 * the result is rounded down.
 *
 * @param int|float|string $AVG Batting average in thousandths (e.g. 300 or "300").
 * @return float Rating, at least 0 (floor() yields a float).
 */
function GetContact($AVG)
{
    $ratingCeiling = 64;
    $ratingFloor = 0;
    $averageForFloor = 400;

    $rating = $ratingCeiling - (($AVG / $averageForFloor) * ($ratingCeiling - $ratingFloor));

    if ($rating >= $ratingFloor) {
        return floor($rating);
    }

    return floor($ratingFloor);
}
?>
<?
/**
 * Minimal screen-scraping helper: downloads a page once in the constructor and
 * extracts fragments of it by searching for literal start/end marker strings.
 */
class scraper
{
    /** Page body exactly as returned by file_get_contents() (false when the download failed). */
    public $rawContent;
    /** Entity-decoded page with tabs, line breaks, double spaces, NUL bytes and vertical tabs removed. */
    public $sanitizedContent;
    /** Offset of the start marker located by the latest fetch(), or false when it was not found. */
    public $startPosition;
    /** Offset just past the end marker located by the latest fetch(), or false when no span was found. */
    public $endPosition;
    private $url;

    /**
     * Downloads $url and prepares the flattened copy used by the fetch methods.
     * A failed download leaves $rawContent === false and an empty $sanitizedContent,
     * so every fetch method then returns an empty result.
     *
     * @param string $url Anything file_get_contents() can read (URL or file path).
     */
    public function __construct($url)
    {
        $this->url = $url;
        set_time_limit(30);
        $this->rawContent = file_get_contents($this->url);

        $body = ($this->rawContent === false) ? '' : $this->rawContent;
        // Removing line breaks lets the non-greedy row/cell patterns in fetchTable()
        // match markup that spanned several lines in the original page.
        $stripped = array("\t","\n","\r","\x20\x20","\0","\x0B");
        $this->sanitizedContent = str_replace($stripped, "", html_entity_decode($body));
    }

    /**
     * Fetches everything from the first occurrence of $start up to the next
     * occurrence of $end after it, including both markers. This function is
     * called by every other fetch function.
     *
     * @param string $start Literal text marking the beginning of the span.
     * @param string $end   Literal text marking the end of the span.
     * @return string The span, or '' when either marker cannot be found.
     */
    public function fetch($start, $end)
    {
        $this->startPosition = strpos($this->sanitizedContent, $start);
        if ($this->startPosition === false) {
            $this->endPosition = false;
            return '';
        }

        // Look for the end marker only after the start marker, so that identical
        // or overlapping markers cannot match inside the start text itself.
        $endMarkerAt = strpos($this->sanitizedContent, $end, $this->startPosition + strlen($start));
        if ($endMarkerAt === false) {
            $this->endPosition = false;
            return '';
        }

        $this->endPosition = $endMarkerAt + strlen($end);
        return substr($this->sanitizedContent, $this->startPosition, $this->endPosition - $this->startPosition);
    }

    /**
     * Fetches the text between $start and $end, excluding both markers, with
     * surrounding whitespace trimmed.
     *
     * @param string $start Literal text marking the beginning of the span.
     * @param string $end   Literal text marking the end of the span.
     * @return string The inner text, or '' when either marker cannot be found.
     */
    public function fetchBetween($start, $end)
    {
        $data = $this->fetch($start, $end);
        if ($data === '') {
            return '';
        }

        $innerLength = strlen($data) - strlen($start) - strlen($end);
        return trim(substr($data, strlen($start), $innerLength));
    }

    /**
     * Takes scraped tabular data and returns it as a list of rows, each row
     * being a list of its <td> cells. Rows without any <td> cell (for example
     * header rows made of <th>) are left out.
     *
     * @param string $start      Literal text marking the beginning of the table markup.
     * @param string $end        Literal text marking the end of the table markup.
     * @param bool   $retainTags True to keep each cell's full "<td ...>...</td>" markup,
     *                           false to reduce each cell to its text via strip_tags().
     * @return array List of rows; empty when the span is not found or has no data rows.
     */
    public function fetchTable($start, $end, $retainTags=false)
    {
        $data = $this->fetch($start, $end);
        preg_match_all("|<tr(.*)</tr>|U", $data, $rows);

        $arr = array();
        foreach ($rows[0] as $rowHtml) {
            preg_match_all("|<td(.*)</td>|U", $rowHtml, $cells);

            $row = array();
            foreach ($cells[0] as $cell) {
                $row[] = ($retainTags ? $cell : strip_tags($cell));
            }

            if (count($row) > 0) {
                $arr[] = $row;
            }
        }
        return $arr;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment