Created
May 16, 2011 11:54
-
-
Save omundy/974315 to your computer and use it in GitHub Desktop.
Basic scraping demo with "foreach" and "regex" parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/* Basic scraping demo with "foreach" and "regex" parsing
 * Owen Mundy Copyright 2011 GNU/GPL */

/**
 * Collects every line lying between an opening and a closing marker.
 *
 * Capturing starts on the first line that contains $open and stops after the
 * next line that contains $close; both marker lines are part of the result.
 * Each captured line is followed by an extra "\n". If the markers occur
 * several times, every marked region is collected.
 *
 * @param array  $lines Lines of the document, in order.
 * @param string $open  Substring that switches capturing on.
 * @param string $close Substring that switches capturing off.
 *
 * @return string The captured lines, or '' when $open never appears.
 */
function scrape_collect_between(array $lines, $open, $close)
{
    $collected = '';
    $capturing = false;

    foreach ($lines as $line) {
        // strpos() yields 0 for a marker at the very start of the line, which
        // is falsy; only a strict comparison with false means "not found".
        if (strpos($line, $open) !== false) {
            $capturing = true;
        }

        if ($capturing) {
            $collected .= $line . "\n";
        }

        if (strpos($line, $close) !== false) {
            $capturing = false;
        }
    }

    return $collected;
}

/**
 * Finds png / jpg / jpeg / gif URLs inside src="..." or src='...' attributes.
 *
 * The URL part may not contain quotes, whitespace or ">", so a match can never
 * run past the end of its own attribute into a later one on the same line.
 *
 * @param string $html Markup to search.
 *
 * @return array preg_match_all() result: [0] full matches, [1] image URLs,
 *               [2] file extensions.
 */
function scrape_match_images($html)
{
    $found = array();
    preg_match_all('/src=["\']?([^"\'\s>]+\.(png|jpe?g|gif))["\']?/i', $html, $found);

    return $found;
}

// url to start
$url = "http://www.bbc.co.uk/news/";

// get contents of url in an array; file() returns false when the fetch fails,
// in which case we carry on with an empty document instead of looping over false
$lines = file($url);
if ($lines === false) {
    $lines = array();
}

// keep only the markup of the top story headline
$data = scrape_collect_between($lines, '<h2 class="top-story-header ">', "</h2>");

// use regular expressions to extract only what we need...

// png, jpg, or gif inside a src="..." or src='...'
$images = scrape_match_images($data);

// text from link
$pattern = "/(<a.*>)(\w.*)(<.*>)/ismU";
preg_match_all($pattern, $data, $text);

// link
$pattern = "/(href=[\"'])(.*?)([\"'])/i";
preg_match_all($pattern, $data, $link);

/*
// test if you like
print "<pre>";
print_r($images);
print_r($text);
print_r($link);
print "</pre>";
*/
?>
<?php
// Take the first match of each kind from the scraper's preg_match_all()
// results; fall back to an empty string when nothing was matched so the page
// still renders without undefined-offset warnings.
$story_image = isset($images[1][0]) ? $images[1][0] : '';
$story_title = isset($text[2][0]) ? $text[2][0] : '';
$story_href  = isset($link[2][0]) ? $link[2][0] : '';

// Only site-relative links need the host prepended; a link that already has a
// scheme (or is protocol-relative) is used untouched.
$site_root = "http://www.bbc.co.uk";
if ($story_href === '') {
    $story_href = $site_root;
} elseif (!preg_match('#^(?:[a-z][a-z0-9+.-]*:)?//#i', $story_href)) {
    $story_href = $site_root . '/' . ltrim($story_href, '/');
}

// The values come from a remote page, so escape them for HTML output.
// double_encode is off because scraped text may already contain entities.
$escape_flags = ENT_QUOTES | ENT_SUBSTITUTE;
$story_image  = htmlspecialchars($story_image, $escape_flags, 'UTF-8', false);
$story_title  = htmlspecialchars($story_title, $escape_flags, 'UTF-8', false);
$story_href   = htmlspecialchars($story_href, $escape_flags, 'UTF-8', false);
?>
<html>
<head>
<style>
body { margin:0; }
.textblock { position:absolute; top:600px; left:0px; }
span { font:5.0em/1.0em Arial, Helvetica, sans-serif; line-height:normal;
background:url(trans.png); color:#fff; font-weight:bold; padding:5px }
a { text-decoration:none; color:#900 }
</style>
</head>
<body>
<img src="<?php print $story_image ?>" height="100%" alt="">
<div class="textblock"><span><a href="<?php print $story_href ?>"><?php print $story_title ?></a></span><br>
</div>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment