Skip to content

Instantly share code, notes, and snippets.

@gbili
Last active December 16, 2015 21:50
Show Gist options
  • Save gbili/5503007 to your computer and use it in GitHub Desktop.
Save gbili/5503007 to your computer and use it in GitHub Desktop.
This is the script that I'm trying to run.
<?php
use Gbili\Miner\BluePrint,
Gbili\Miner\BluePrint\Action\Extract\Savable as ExtractSavable,
Gbili\Miner\BluePrint\Action\GetContents\Savable as GetContentsSavable,
Gbili\Vid\Savable\Lexer;
// Builds a Miner blueprint as a tree of actions and saves it.
// Every action below is attached, directly or through its ancestors, to the
// root action, which is the only one explicitly bound to the blueprint.
// Patterns are passed to setData() without delimiters, exactly as written.
$bluePrint = new BluePrint\Savable();
$bluePrint->setHost('myhost.com');
$bluePrint->setBasePath('./../');

// Root action: the only action given an explicit URL as its data.
$homePage = new GetContentsSavable();
$homePage->setBluePrint($bluePrint);
$homePage->setData('http://myhost.com');

// Child of the root: single match (match-all disabled) whose one capture
// group is the markup that follows the "Categories" heading.
$categoriesBlock = new ExtractSavable();
$homePage->addChild($categoriesBlock);
$categoriesBlock->setUseMatchAll(false);
$categoriesBlock->setData('<h3>Categories</h3>[^<]*<table width="100%" border="0" cellspacing="0" cellpadding="0">[^<]*<tr>[^<]*<td width="50%" valign="top"><strong>#</strong><br />(.+?)</td></tr>');

// CATEGORIES: match-all over the category links.
// Group 1 = link href (intercepted with 'prependHostToUrl'),
// group 2 = link text (emitted as Lexer::CATEGORY).
$categoryLinks = new ExtractSavable();
$categoriesBlock->addChild($categoryLinks);
$categoryLinks->setUseMatchAll(true);
$categoryLinks->spitGroupAsEntity(2, Lexer::CATEGORY);
$categoryLinks->interceptGroupsOneByOne(1, 'prependHostToUrl');
$categoryLinks->setData('<a href="([^"]+)" target="_blank">([^<]+)</a> ?\(\d*\)<br />');

// CATEGORY page: has no data of its own; its input is group 1 of the parent
// pattern (the category link href).
$categoryPage = new GetContentsSavable();
$categoryLinks->addChild($categoryPage);
$categoryPage->setInputParentRegexGroupNumber(1);

// VIDEO: match-all over the video tables of a category page; marked as the
// point where a new instance is generated.
// Groups: 1 = video link href, 2 = title, 3 = thumbnail src, 4 = time length,
// 5 = date, 6 = host name.
$videos = new ExtractSavable();
$categoryPage->addChild($videos);
$videos->setUseMatchAll(true);
$videos->setAsNewInstanceGeneratingPoint();
$videos->spitGroupAsEntity(2, Lexer::TITLE)
    ->spitGroupAsEntity(3, Lexer::IMAGE)
    ->spitGroupAsEntity(4, Lexer::TIME_LENGTH)
    ->spitGroupAsEntity(5, Lexer::DATE)
    ->spitGroupAsEntity(6, Lexer::HOST_NAME);
// Variant of the pattern below written with doubled backslashes, noted by the author as working:
//works : <table cellpadding=2 cellspacing=0 width="185">[^<]*<tr><td colspan=2><a href="([^"]+)" target="_blank">([^<]+)</a><br /></td></tr>[^<]*<tr><td colspan=2><a href="[^"]+" class="thumb" target="_blank"><img id="[^"]+" src="([^"]+)" width="180" height="135"></a><br /></td></tr>[^<]*<tr><td><font class="s">([^<]+)</font><br /></td><td align=right><font class="s">([^<]+)</font><br /><a href="/search/\\?rs=1\\&amp;c=0\\&amp;s=\\d+" target="_blank">([^<]+)</a><br /></td></tr>[^<]*</table>[^<]*<br />
$videos->setData('<table cellpadding=2 cellspacing=0 width="185">[^<]*<tr><td colspan=2><a href="([^"]+)" target="_blank">([^<]+)</a><br /></td></tr>[^<]*<tr><td colspan=2><a href="[^"]+" class="thumb" target="_blank"><img id="[^"]+" src="([^"]+)" width="180" height="135"></a><br /></td></tr>[^<]*<tr><td><font class="s">([^<]+)</font><br /></td><td align=right><font class="s">([^<]+)</font><br /><a href="/search/\?rs=1\&amp;c=0\&amp;s=\d+" target="_blank">([^<]+)</a><br /></td></tr>[^<]*</table>[^<]*<br />');

// VIDEO SOURCE: single match over group 1 of the video pattern (the link
// href); the captured URL is emitted as Lexer::SOURCE.
$videoSource = new ExtractSavable();
$videos->addChild($videoSource);
$videoSource->setInputParentRegexGroupNumber(1);
$videoSource->setUseMatchAll(false);
$videoSource->spitGroupAsEntity(1, Lexer::SOURCE);
$videoSource->setData('(http://.+)$');

// LOOP PARAMS: single match on the category page whose result is injected
// back into the category-page action (presumably to drive fetching of the
// following pages -- confirm against injectResultTo()).
// Groups: 1 = <h1> heading text, 2-4 = the numbers in "showing A-B of C".
// FIX: the separator group used to read "(:?,\s)?", i.e. a *capturing* group
// with an optional ':' -- a transposed "(?:". Every other group in the heading
// sub-pattern is non-capturing, so it is now "(?:,\s)?".
// NOTE(review): this moves the three numbers from groups 3-5 to groups 2-4;
// verify anything that consumed the old numbering.
$pagination = new ExtractSavable();
$categoryPage->addChild($pagination);
$pagination->injectResultTo($categoryPage);
$pagination->setUseMatchAll(false);
$pagination->setData('<h1>\W*((?:(?:,\s)?(?:(?:[\w\d]+)(?:\s[\w\d]+)*))+)[^<]*</h1>[^<]*<table width="97%" border="0" cellspacing="0" cellpadding="0">[^<]*<tr>[^<]*<td width="40%">\W*showing ?(\d+)-(\d+) of (\d+)\.[^<]*</td>');

$bluePrint->save();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment