Skip to content

Instantly share code, notes, and snippets.

@ganmahmud
Created January 18, 2016 19:24
Show Gist options
  • Save ganmahmud/7308ebe2678f18525464 to your computer and use it in GitHub Desktop.
Save ganmahmud/7308ebe2678f18525464 to your computer and use it in GitHub Desktop.
Scraping example with PHP (used for learning purposes only)
<?php
/**
 * Scrapes the BBC Bengali front page, follows every article link found on it,
 * and writes the collected articles to bbc.json.
 *
 * Each entry in the output array has the keys:
 *   - id        : 1-based running number of the article
 *   - title     : base64 of the first <h1>'s plain text (base64 of "" if none)
 *   - newsImage : base64 of the first ".image img" src, or of the placeholder path
 *   - detail    : base64 of the concatenated plain text of all ".bodytext p" nodes
 *
 * Requires simple_html_dom.php (PHP Simple HTML DOM Parser) next to this script.
 */
include("simple_html_dom.php");

$baseUrl = 'http://www.bbc.co.uk';
$allLink = array();
$AllContent = array();

// Retrieve the DOM from a given URL
$html = file_get_html($baseUrl . '/bengali');
if (!$html) {
    // file_get_html() yields false when the page cannot be fetched or parsed;
    // calling ->find() on that would be a fatal error.
    error_log('Could not load the index page: ' . $baseUrl . '/bengali');
    exit(1);
}

// NOTE(review): the comma splits this into two selectors (".list li-plain" and "a"),
// so effectively every anchor on the page is visited. Kept as-is; confirm whether a
// narrower selector was intended.
foreach ($html->find('.list li-plain,a') as $e) {
    $pos = $e->href;

    // Elements without an href attribute do not give us a usable string.
    if (!is_string($pos) || $pos === '') {
        continue;
    }

    // strpos() returns 0 for a match at the very start of the string, so the result
    // must be compared against false explicitly instead of being used as a boolean.
    $looksLikeArticle = strpos($pos, '/201') !== false;
    $isMultimedia = strpos($pos, '/multimedia') !== false;
    if (!$looksLikeArticle || $isMultimedia) {
        continue;
    }

    // Only prefix the host onto relative links; absolute ones are used unchanged.
    if (preg_match('#^https?://#i', $pos) === 1) {
        $allLink[] = $pos;
    } else {
        $allLink[] = $baseUrl . $pos;
    }
}

// Release the parsed tree; the parser keeps circular references otherwise.
$html->clear();
unset($html);

$counter = 1;
foreach ($allLink as $link) {
    $singlePost = file_get_html($link);
    if (!$singlePost) {
        // Skip articles that cannot be loaded instead of aborting the whole run.
        error_log('Skipping unreadable article: ' . $link);
        continue;
    }

    // the title: first <h1> only; reset for every article so a page without a
    // heading never inherits the previous article's title.
    $heading = $singlePost->find('h1', 0);
    $newsTitle = base64_encode($heading ? $heading->plaintext : '');

    // image: first match, falling back to the local placeholder
    $firstImage = $singlePost->find('.image img', 0);
    if ($firstImage && is_string($firstImage->src)) {
        $pic = base64_encode($firstImage->src);
    } else {
        $pic = base64_encode("img/flyerBDHolder.jpg");
    }

    // the detail: all body paragraphs joined without a separator
    $txt_beta = '';
    foreach ($singlePost->find('.bodytext p') as $paragraph) {
        $txt_beta .= $paragraph->plaintext;
    }
    $txt = base64_encode($txt_beta);

    $AllContent[] = array(
        'id' => $counter,
        'title' => $newsTitle,
        'newsImage' => $pic,
        'detail' => $txt,
    );
    $counter++;

    // Free this article's DOM before fetching the next one.
    $singlePost->clear();
    unset($singlePost);
}

$json = json_encode($AllContent);
if ($json === false) {
    error_log('Could not encode the scraped articles as JSON');
    exit(1);
}

$fp = fopen('bbc.json', 'w');
if ($fp === false) {
    error_log('Could not open bbc.json for writing');
    exit(1);
}
fwrite($fp, $json);
fclose($fp);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment