Skip to content

Instantly share code, notes, and snippets.

@juanpablocs
Last active January 25, 2016 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save juanpablocs/19b29d5ad53656bce067 to your computer and use it in GitHub Desktop.
Save juanpablocs/19b29d5ad53656bce067 to your computer and use it in GitHub Desktop.
Scraper for the DirecTV program guide: fetches the listing, returns the data as JSON, and can save the result to XML.
<?php
// programador: juanpablocs21@gmail.com
//
/**
 * Scraper for the DirecTV Venezuela mobile program guide.
 *
 * Downloads the guide page, extracts one entry per table row that links to a
 * programme detail page, and optionally filters the entries by channel number.
 */
class directv
{
    /** @var string URL of the mobile program guide page that is scraped. */
    public $url = "http://www.directv.com.ve/movil/ProgramGuide/ProgramGuide";

    /**
     * Fetch the guide and return the parsed programme entries.
     *
     * @param array|false $ids Channel numbers to keep, in the order they should be
     *                         returned. Anything that is not a non-empty array
     *                         disables filtering.
     * @return array List of entries with the keys id, title, duration,
     *               channel_number and channel_name; or
     *               ['error' => true, 'message' => ...] when nothing could be scraped.
     */
    public function getProgramation($ids)
    {
        $entries = $this->parseGuide($this->html($this->url));
        if (count($entries) < 1) {
            return ['error' => true, 'message' => 'error get scraping'];
        }

        if (is_array($ids) && count($ids) > 0) {
            $entries = $this->filterByChannel($entries, $ids);
        }

        return $entries;
    }

    /**
     * Extract every programme entry from the guide page markup.
     *
     * @param mixed $html Page markup; anything that is not a non-empty string
     *                    (e.g. a failed download) yields an empty list.
     * @return array List of entries as produced by matchInfoData().
     */
    private function parseGuide($html)
    {
        if (!is_string($html) || $html === '') {
            return [];
        }
        // Guard both matches: the table may be missing when the site layout
        // changes or an error page is served.
        if (!preg_match('#<table id="program-guide"(?:[^>]+)>(.*?)</table>#si', $html, $table)) {
            return [];
        }
        if (!preg_match_all('#<tr>(.*?)</tr>#si', $table[1], $rows)) {
            return [];
        }

        $entries = [];
        foreach ($rows[1] as $row) {
            $entry = $this->matchInfoData($row);
            if ($entry) {
                $entries[] = $entry;
            }
        }

        return $entries;
    }

    /**
     * Keep only the entries whose channel number was requested.
     *
     * The result follows the order of $ids; for each id the first entry with
     * that channel number is used. Numbers are compared as trimmed strings.
     *
     * @param array $entries Entries as produced by matchInfoData().
     * @param array $ids     Requested channel numbers.
     * @return array Matching entries (possibly empty).
     */
    private function filterByChannel(array $entries, array $ids)
    {
        // Build the lookup once instead of rescanning the list for every id.
        $indexByChannel = [];
        foreach ($entries as $index => $entry) {
            $number = isset($entry['channel_number']) ? trim((string) $entry['channel_number']) : '';
            if ($number !== '' && !isset($indexByChannel[$number])) {
                $indexByChannel[$number] = $index;
            }
        }

        $selected = [];
        foreach ($ids as $id) {
            if (!is_scalar($id)) {
                continue;
            }
            $id = trim((string) $id);
            if ($id !== '' && isset($indexByChannel[$id])) {
                $selected[] = $entries[$indexByChannel[$id]];
            }
        }

        return $selected;
    }

    /**
     * Parse a single guide table row.
     *
     * @param string $html Inner markup of one <tr>.
     * @return array|false Entry array, or false when the row has no link to a
     *                     programme detail page. Fields whose markup is missing
     *                     are null.
     */
    private function matchInfoData($html)
    {
        // Collapse whitespace so the patterns below can match across line breaks.
        $html = preg_replace('/\s+/', ' ', $html);

        if (!preg_match('#href="/movil/ProgramGuide/ProgrammingDetails\?strEventID=([^"]+)"#si', $html, $event)) {
            return false;
        }

        preg_match('#<strong>([^<]+)</strong>#si', $html, $title);
        preg_match('#<dd><strong>Duraci&\#243\;n:</strong> (.*?) </dd>#si', $html, $duration);
        preg_match('#<p>([^<]+)<br />([^<]+)</p>#si', $html, $channel);

        return [
            'id' => $event[1],
            'title' => isset($title[1]) ? $title[1] : null,
            'duration' => isset($duration[1]) ? $duration[1] : null,
            'channel_number' => isset($channel[1]) ? $channel[1] : null,
            'channel_name' => isset($channel[2]) ? $channel[2] : null,
        ];
    }

    /**
     * Download a URL with cURL.
     *
     * @param string $url Address to fetch.
     * @return string Response body, or an empty string when the request fails.
     */
    private function html($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        $cookieFile = __DIR__ . "/cookie.txt";
        curl_setopt($ch, CURLOPT_REFERER, "http://www.google.com/");
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 4); // seconds to wait for the connection
        curl_setopt($ch, CURLOPT_TIMEOUT, 15); // cap the whole transfer so a stalled server cannot hang the script
        curl_setopt($ch, CURLOPT_URL, $url);

        $buffer = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; normalise to a string for the parser.
        return is_string($buffer) ? $buffer : '';
    }
}
// Entry point: scrape the guide, then either save it as XML (?save=1) or print
// it as JSON. An optional ?ids=101,202 restricts the result to those channels.
$ids = false;
if (isset($_GET['ids']) && is_string($_GET['ids'])) {
    // Drop blanks so "101, ,202" or a trailing comma does not produce empty ids.
    $requested = array_values(array_filter(array_map('trim', explode(',', $_GET['ids'])), 'strlen'));
    if (count($requested) > 0) {
        $ids = $requested;
    }
}

$d = new directv;
$json = $d->getProgramation($ids);

// The scraper reports failure as ['error' => true, ...]; a successful result is
// a numerically indexed list, so the 'error' key is unambiguous.
$failed = isset($json['error']);

if (!empty($_GET['save']) && !$failed) {
    $file = 'xmls/channels.xml';
    // An existing file must be writable; a missing one needs a writable folder.
    $writable = file_exists($file) ? is_writable($file) : is_writable(dirname($file));
    if (!$writable) {
        die('error folder o archivo no existe o no tiene permiso 777');
    }

    $dom = new DOMDocument('1.0', 'UTF-8');
    $dom->formatOutput = true;
    $channels = $dom->appendChild($dom->createElement('canales'));

    // XML attribute name => key in the scraped entry.
    $attributes = [
        'id' => 'id',
        'canal' => 'channel_name',
        'programa' => 'title',
        'duracion' => 'duration',
    ];
    foreach ($json as $row) {
        $channel = $channels->appendChild($dom->createElement('canal'));
        foreach ($attributes as $attribute => $key) {
            $channel->setAttribute($attribute, isset($row[$key]) ? (string) $row[$key] : '');
        }
    }

    if ($dom->save($file) === false) {
        die('error no se pudo guardar el archivo');
    }
    die('guardo correctamente');
}

$payload = json_encode($json);
if ($payload === false) {
    // Typically invalid UTF-8 in the scraped text; report it instead of printing nothing.
    $payload = json_encode(['error' => true, 'message' => json_last_error_msg()]);
}
if (!headers_sent()) {
    header('Content-Type: application/json; charset=utf-8');
}
print $payload;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment