Created
January 25, 2018 18:45
-
-
Save pentatonicfunk/3418a20a00dd48d1940baa2763a9f6a5 to your computer and use it in GitHub Desktop.
PHP API/Scraper Sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Hapenesia\GsmArena; | |
use App\Color; | |
use App\Spec; | |
use Carbon\Carbon; | |
use GuzzleHttp\Client; | |
use GuzzleHttp\Cookie\FileCookieJar; | |
use GuzzleHttp\Handler\CurlHandler; | |
use GuzzleHttp\HandlerStack; | |
use GuzzleHttp\Middleware; | |
use GuzzleHttp\TransferStats; | |
use Illuminate\Support\Facades\Cache; | |
use Sunra\PhpSimple\HtmlDomParser; | |
use Zend\Json\Json; | |
use function Stringy\create as s; | |
class Scraper | |
{ | |
const PROVIDER_NAME = 'GSMARENA'; | |
/** | |
* @var \GuzzleHttp\Client | |
*/ | |
private $client; | |
private $testValue = 'test'; | |
/**
 * Prepares the Guzzle client used for every request made by this scraper.
 *
 * The handler stack is assembled manually on top of a cURL handler, and the
 * client is given a cookie jar backed by a uniquely named file under the
 * application's storage path.
 */
public function __construct()
{
    $handlerStack = new HandlerStack();
    $handlerStack->setHandler(new CurlHandler());

    // Push order is significant for Guzzle middleware; it is kept unchanged.
    $middlewares = [
        Middleware::cookies(),
        Middleware::prepareBody(),
        Middleware::redirect(),
        Middleware::httpErrors(),
    ];
    foreach ($middlewares as $middleware) {
        $handlerStack->push($middleware);
    }

    $cookieFile = storage_path('app/scraper_cookies/'.uniqid(self::PROVIDER_NAME, true).'.txt');
    $jar = new FileCookieJar($cookieFile, true);

    $this->client = new Client([
        'handler' => $handlerStack,
        'cookies' => $jar,
    ]);
}
/**
 * Returns the provider's autocomplete gadget list, keyed by provider gadget id.
 *
 * The list is taken from the 'gsmarena_autocomplete' cache entry when that
 * entry carries both a 'url' and a 'formatted' value and its 'url' equals the
 * currently configured list URL. An entry that fails this check is forgotten.
 * When no usable (non-empty) list is available the list is downloaded,
 * rebuilt through buildMatchedGadgetItem() and cached again.
 *
 * @return Gadget[]
 * @throws \Exception when the downloaded payload is not the expected
 *                    [manufacturers, series] pair
 */
private function getFormattedGadgets()
{
    $cacheKey = 'gsmarena_autocomplete';
    $listUrl = config('hapenesia.gsmarena.autocomplete_list_url');

    /** @var Gadget[] $formattedGadgets */
    $formattedGadgets = [];

    $cached = Cache::get($cacheKey);
    if ($cached !== null) {
        $usable = is_array($cached)
            && isset($cached['url'], $cached['formatted'])
            && $cached['url'] === $listUrl;

        if ($usable) {
            $formattedGadgets = $cached['formatted'];
        } else {
            // Stale or malformed entry: drop it so it gets rebuilt below.
            Cache::forget($cacheKey);
        }
    }

    if ($formattedGadgets) {
        return $formattedGadgets;
    }

    // NOTE(review): TLS certificate verification is disabled for this request.
    $res = $this->client->request('GET', $listUrl, ['verify' => false]);
    $body = $res->getBody()->getContents();

    $gadgets = Json::decode($body, Json::TYPE_ARRAY);
    if (!is_array($gadgets)
        || !isset($gadgets[0], $gadgets[1])
        || !is_array($gadgets[0])
        || !is_array($gadgets[1])
    ) {
        // Without this guard a changed payload would be cached as an empty list.
        throw new \Exception('Unexpected '.self::PROVIDER_NAME.' autocomplete list format');
    }

    $manufacturers = $gadgets[0];
    $series = $gadgets[1];
    foreach ($series as $serie) {
        // The first occurrence of a gadget id wins.
        if (!isset($formattedGadgets[$serie[1]])) {
            $formattedGadgets[$serie[1]] = self::buildMatchedGadgetItem($serie, $manufacturers);
        }
    }

    $expiresAt = Carbon::now()->addMinutes(config('hapenesia.gsmarena.minute_expired'));
    Cache::put(
        $cacheKey,
        [
            'url' => $listUrl,
            'body' => $body,
            'formatted' => $formattedGadgets,
        ],
        $expiresAt
    );

    return $formattedGadgets;
}
/**
 * Finds gadgets whose searchable text contains every word of the query.
 *
 * Matching is case-insensitive. The query is split on single spaces and empty
 * pieces (from repeated, leading or trailing spaces) are discarded, so they
 * are never passed on as empty needles. A query without any word matches
 * nothing.
 *
 * @param string $gadgetName free-text query
 *
 * @return Gadget[] list of matching gadgets
 */
public function search($gadgetName)
{
    $searchParams = array_values(array_filter(
        explode(' ', (string)$gadgetName),
        static function ($term) {
            return $term !== '';
        }
    ));

    if (!$searchParams) {
        return [];
    }

    $gadgetsMatched = [];
    foreach ($this->getFormattedGadgets() as $formattedGadget) {
        if (s($formattedGadget->search)->containsAll($searchParams, false)) {
            $gadgetsMatched[] = $formattedGadget;
        }
    }

    return $gadgetsMatched;
}
/**
 * Drops the cached autocomplete list entry.
 */
public function clearListCache()
{
    $cacheKey = 'gsmarena_autocomplete';

    Cache::forget($cacheKey);
}
/**
 * Turns one row of the autocomplete list into a Gadget object.
 *
 * Row layout as read here: [0] manufacturer id, [1] gadget id, [2] name,
 * [3] alias, [4] thumbnail file name.
 *
 * @param array $serie         one row of the autocomplete series list
 * @param array $manufacturers manufacturer names indexed by manufacturer id
 *
 * @return Gadget
 */
public static function buildMatchedGadgetItem($serie, $manufacturers)
{
    $thumbBaseUrl = 'https://cdn2.gsmarena.com/vv/bigpic/';

    $gadget = new Gadget();
    $gadget->id = $serie[1];
    $gadget->name = $serie[2];
    $gadget->alias = $serie[3];
    $gadget->thumb = $thumbBaseUrl.$serie[4];

    $maker = new Manufacturer();
    $gadget->manufacturer = $maker;
    $maker->id = $serie[0];
    $maker->name = $manufacturers[$serie[0]];

    // Space-separated text that search() matches against.
    $searchParts = [
        $gadget->manufacturer->name,
        $gadget->name,
        $gadget->alias,
    ];
    $gadget->search = implode(' ', $searchParts);

    return $gadget;
}
/**
 * Scrapes the provider's detail page of a gadget and maps it onto an
 * \App\Gadget model.
 *
 * Every table inside "#specs-list" becomes one spec group keyed by the
 * slugified table header; every row with a "td.ttl" cell becomes one entry of
 * that group keyed by the slugified title. Rows sharing the same slug
 * overwrite each other, so the last one wins.
 *
 * Side effects: a manufacturer or colour that is not yet stored is created and
 * saved. The \App\Gadget itself is only populated, not saved.
 *
 * @param string|int $gadgetId provider gadget id (key of the autocomplete list)
 *
 * @return array ['provider' => array of spec groups, 'hapenesia_gadget' => \App\Gadget]
 * @throws \Exception when the page cannot be parsed, a required spec is
 *                    missing, or the gadget is not in the autocomplete list
 */
public function getSpecs($gadgetId)
{
    // Filled by the on_stats callback with the URI that was finally requested.
    $providerLink = false;

    // NOTE(review): TLS certificate verification is disabled for this request.
    $res = $this->client->request(
        'GET',
        config('hapenesia.gsmarena.gadget_url').$gadgetId.'.php',
        [
            'verify' => false,
            'allow_redirects' => [
                'max' => 2,
                'referer' => false,
                'protocols' => [
                    'http',
                    'https',
                ],
            ],
            'on_stats' => function (TransferStats $stats) use (&$providerLink) {
                $providerLink = $stats->getEffectiveUri();
            },
        ]
    );
    $providerLink = (string)$providerLink;

    $dom = HtmlDomParser::str_get_html($res->getBody()->getContents());
    if (!$dom) {
        throw new \Exception('specs page Tidak ditemukan');
    }

    $specList = $dom->find('#specs-list', 0);
    if (!$specList) {
        throw new \Exception('specs-list Tidak ditemukan');
    }

    $specs = [];
    foreach ($specList->find('table') as $table) {
        $header = $table->find('tr th', 0);
        if (!$header) {
            // A table without a header cannot be keyed; skip it.
            continue;
        }

        $spec = $header->plaintext;
        $normalizedSpec = (string)s($spec)->slugify();

        $specDetail = [];
        foreach ($table->find('tr') as $specItem) {
            $titleCell = $specItem->find('td.ttl', 0);
            if (!$titleCell) {
                continue;
            }

            $specName = $titleCell->plaintext;
            $normalizedSpecName = (string)s($specName)->slugify();

            // A row without a value cell is recorded with an empty value.
            $valueCell = $specItem->find('td.nfo', 0);
            $specValue = $valueCell ? (string)s($valueCell->plaintext)->trim() : '';

            $specDetail[$normalizedSpecName] = [
                'key' => $normalizedSpecName,
                'name' => $specName,
                'value' => $specValue,
            ];
        }

        $specs[$normalizedSpec] = [
            'key' => $normalizedSpec,
            'name' => $spec,
            'specs' => $specDetail,
        ];
    }

    // Each extractor throws when its spec is missing or unparsable.
    $memories = self::getMemories($specs);
    $screenSize = self::getScreenSize($specs);
    $mainCameraRes = self::getMainCameraResolutions($specs);
    $selfieCameraRes = self::getSelfieCameraResolutions($specs);
    $cpuCore = self::getCpuCore($specs);
    $battery = self::getBattery($specs);
    $colors = self::getColors($specs);

    $formattedGadgets = $this->getFormattedGadgets();
    if (!isset($formattedGadgets[$gadgetId])) {
        throw new \Exception('Gadget Tidak ditemukan');
    }
    if (!$providerLink) {
        throw new \Exception('Provider Link '.self::PROVIDER_NAME.' tidak ditemukan');
    }
    $gadget = $formattedGadgets[$gadgetId];

    // Look the manufacturer up by name, creating it on first sight.
    $manufacturer = \App\Manufacturer::where('name', $gadget->manufacturer->name)->first();
    if (!$manufacturer) {
        $manufacturer = new \App\Manufacturer();
        $manufacturer->name = $gadget->manufacturer->name;
        $manufacturer->save();
    }

    // Same for every colour name.
    $gadgetColors = [];
    foreach ($colors as $colorName) {
        $color = Color::where('name', $colorName)->first();
        if (!$color) {
            $color = new Color();
            $color->name = $colorName;
            $color->save();
        }
        $gadgetColors[] = $color;
    }

    $hapenesiaGadget = new \App\Gadget();
    $hapenesiaGadget->name = $gadget->name;
    $hapenesiaGadget->alias = $gadget->alias;
    $hapenesiaGadget->searchable = $gadget->search;
    $hapenesiaGadget->provider_id = $gadget->id;
    $hapenesiaGadget->provider_name = self::PROVIDER_NAME;
    $hapenesiaGadget->thumb_pic = $gadget->thumb;
    $hapenesiaGadget->provider_link = $providerLink;
    $hapenesiaGadget->manufacturer_name = $manufacturer->name;
    $hapenesiaGadget->screen_size = $screenSize;
    $hapenesiaGadget->cpu_core = $cpuCore[0];
    $hapenesiaGadget->cpu_core_name = $cpuCore[1];

    // Multi-valued attributes and related models.
    $hapenesiaGadget->main_camera_resolutions = $mainCameraRes;
    $hapenesiaGadget->selfie_camera_resolutions = $selfieCameraRes;
    $hapenesiaGadget->battery = $battery;
    $hapenesiaGadget->memories = $memories;
    $hapenesiaGadget->colors = $gadgetColors;
    $hapenesiaGadget->manufacturer = $manufacturer;

    // The gadget is populated but not saved here.
    return [
        'provider' => $specs,
        'hapenesia_gadget' => $hapenesiaGadget,
    ];
}
/**
 * Extracts the RAM size, in whole gigabytes, of every memory variant.
 *
 * The "internal" memory value is expected to look like
 * "32 GB, 3 GB RAM or 64 GB, 4 GB RAM": variants separated by " or ", each
 * being "<storage>, <ram>". The RAM part must carry an explicit "GB RAM" or
 * "MB RAM" unit; megabytes are converted to gigabytes, so a value such as
 * "512 MB RAM" is no longer read as 512 GB. Fractions are truncated and a
 * result below 1 GB is rejected.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return array list of ['ram' => int]
 * @throws \Exception when the memory spec is missing or cannot be parsed
 */
private static function getMemories($specs)
{
    if (!isset($specs['memory']) || !$specs['memory']) {
        throw new \Exception('Memory Tidak ditemukan');
    }
    if (!isset($specs['memory']['specs']['internal']['value'])) {
        throw new \Exception('Memory internal Tidak ditemukan');
    }

    $memoryInternal = (string)$specs['memory']['specs']['internal']['value'];

    $memories = [];
    foreach (explode(' or ', $memoryInternal) as $variant) {
        $parts = explode(', ', $variant);
        if (!isset($parts[1]) || !$parts[1]) {
            throw new \Exception('Memory RAM internal Tidak ditemukan');
        }

        // Leading number, an optional "/6"-style alternative, then the unit.
        if (!preg_match('/^\s*(\d+(?:\.\d+)?)[^a-z]*?(GB|MB)\s+RAM/i', $parts[1], $match)) {
            throw new \Exception('RAM Tidak ditemukan');
        }

        $ram = (float)$match[1];
        if (strtoupper($match[2]) === 'MB') {
            $ram /= 1024;
        }

        $ram = (int)$ram;
        if (!$ram) {
            throw new \Exception('RAM Tidak ditemukan');
        }

        $memories[] = ['ram' => $ram];
    }

    if (!$memories) {
        throw new \Exception('Data Memory Tidak ditemukan');
    }

    return $memories;
}
/**
 * Reads the display size from the "display" spec group.
 *
 * Only the text before the first ", " of the size value is considered; it is
 * cast to a float after removing a literal " INCHES".
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return float
 * @throws \Exception when the size is missing or evaluates to zero
 */
private static function getScreenSize($specs)
{
    if (empty($specs['display'])) {
        throw new \Exception('display Tidak ditemukan');
    }

    $hasSize = isset($specs['display']['specs']['size']['value']);
    if (!$hasSize) {
        throw new \Exception('display size Tidak ditemukan');
    }

    $leadingPart = explode(', ', $specs['display']['specs']['size']['value'])[0];
    $inches = (float)(string)s($leadingPart)->replace(' INCHES', '');

    if ($inches) {
        return $inches;
    }

    throw new \Exception('screenSize Tidak ditemukan');
}
/**
 * Extracts the main camera resolutions, in megapixels, from the "primary"
 * entry of the "camera" spec group.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return array list of ['resolution' => float], one item per sensor
 * @throws \Exception when the spec is missing or no resolution can be read
 */
private static function getMainCameraResolutions($specs)
{
    if (!isset($specs['camera']) || !$specs['camera']) {
        throw new \Exception('camera Tidak ditemukan');
    }
    if (!isset($specs['camera']['specs']['primary']['value'])) {
        throw new \Exception('camera primary Tidak ditemukan');
    }

    return self::parseCameraResolutions(
        $specs['camera']['specs']['primary']['value'],
        '$mainCameraRes Tidak ditemukan'
    );
}

/**
 * Extracts the selfie camera resolutions, in megapixels, from the
 * "secondary" entry of the "camera" spec group.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return array list of ['resolution' => float], one item per sensor
 * @throws \Exception when the spec is missing or no resolution can be read
 */
private static function getSelfieCameraResolutions($specs)
{
    if (!isset($specs['camera']) || !$specs['camera']) {
        throw new \Exception('camera Tidak ditemukan');
    }
    if (!isset($specs['camera']['specs']['secondary']['value'])) {
        throw new \Exception('camera secondary Tidak ditemukan');
    }

    return self::parseCameraResolutions(
        $specs['camera']['specs']['secondary']['value'],
        '$secondCameraRes Tidak ditemukan'
    );
}

/**
 * Parses a camera description such as
 * "Dual: 12 MP (f/1.8, OIS) + 12 MP (f/2.8), phase detection autofocus"
 * into its per-sensor megapixel values.
 *
 * @param string $raw             camera spec value
 * @param string $notFoundMessage exception message used when a sensor has no
 *                                readable, non-zero resolution
 *
 * @return array list of ['resolution' => float]
 * @throws \Exception
 */
private static function parseCameraResolutions($raw, $notFoundMessage)
{
    // Remove parenthesised details first: they may contain ", " or "+".
    $text = preg_replace('/\([^)]*\)/', '', (string)$raw);

    // Accept "Dual: ", "Dual " and any letter case as the multi-sensor prefix.
    $text = preg_replace('/^\s*Dual\b:?\s*/i', '', (string)$text);

    // Only the part before the first ", " lists the sensors.
    $sensorList = explode(', ', $text)[0];

    $resolutions = [];
    foreach (explode('+', $sensorList) as $sensor) {
        $found = preg_match('/^\s*(\d+(?:\.\d+)?)/', $sensor, $match);
        $resolution = $found ? (float)$match[1] : 0.0;
        if (!$resolution) {
            throw new \Exception($notFoundMessage);
        }
        $resolutions[] = ['resolution' => $resolution];
    }

    return $resolutions;
}
/**
 * Derives the CPU core count and its label from the "cpu" entry of the
 * "platform" spec group.
 *
 * Spaces and hyphens are removed from the value before it is searched,
 * case-insensitively, for a known "<N>Core" marker. The first marker found,
 * in the order listed below, decides the result; anything else is reported as
 * a single core.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return array [int core count, string label]
 * @throws \Exception when the platform or cpu spec is missing
 */
private static function getCpuCore($specs)
{
    if (empty($specs['platform'])) {
        throw new \Exception('platform Tidak ditemukan');
    }
    if (!isset($specs['platform']['specs']['cpu']['value'])) {
        throw new \Exception('platform cpu Tidak ditemukan');
    }

    $compactCpu = s($specs['platform']['specs']['cpu']['value'])
        ->replace(' ', '')
        ->replace('-', '');

    // Checked top to bottom; the order matters when several markers occur.
    $knownCores = [
        'OctaCore' => [8, 'Octa Core'],
        'DualCore' => [2, 'Dual Core'],
        'QuadCore' => [4, 'Quad Core'],
        'HexaCore' => [6, 'Hexa Core'],
        'DecaCore' => [10, 'Deca Core'],
    ];

    foreach ($knownCores as $marker => $coreInfo) {
        if ($compactCpu->contains($marker, false)) {
            return $coreInfo;
        }
    }

    return [1, 'Single Core'];
}
/**
 * Extracts the battery capacity in mAh from the "nbsp" entry of the
 * "battery" spec group.
 *
 * The first number directly followed by "mAh" (any letter case, with or
 * without whitespace in between, thousands separators allowed) is used. A
 * value with no such number is rejected instead of being reported as 0.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return float capacity in mAh
 * @throws \Exception when the spec is missing or holds no capacity
 */
private static function getBattery($specs)
{
    if (!isset($specs['battery']) || !$specs['battery']) {
        throw new \Exception('battery Tidak ditemukan');
    }
    if (!isset($specs['battery']['specs']['nbsp']['value'])) {
        throw new \Exception('battery nbsp Tidak ditemukan');
    }

    $description = (string)$specs['battery']['specs']['nbsp']['value'];

    $capacity = 0.0;
    if (preg_match('/(\d[\d,]*(?:\.\d+)?)\s*mAh/i', $description, $match)) {
        // "3,000" style thousands separators would otherwise truncate the cast.
        $capacity = (float)str_replace(',', '', $match[1]);
    }

    if (!$capacity) {
        throw new \Exception('$bacteryV Tidak ditemukan');
    }

    return $capacity;
}
/**
 * Extracts the list of colour names from the "colors" entry of the "misc"
 * spec group.
 *
 * Parenthesised remarks are removed before the value is split on ", ", so a
 * remark that itself contains ", " no longer produces broken names. Each name
 * is title-cased and names that end up empty are dropped.
 *
 * @param array $specs spec groups as built by getSpecs()
 *
 * @return string[] non-empty list of colour names
 * @throws \Exception when the spec is missing or yields no colour
 */
private static function getColors($specs)
{
    if (!isset($specs['misc']) || !$specs['misc']) {
        throw new \Exception('misc Tidak ditemukan');
    }
    if (!isset($specs['misc']['specs']['colors']['value'])) {
        throw new \Exception('misc colors Tidak ditemukan');
    }

    // Start from an array: appending to `false` is deprecated since PHP 8.1.
    $colors = [];

    $colorList = s($specs['misc']['specs']['colors']['value'])->regexReplace('\([^)]*\)', '');
    foreach ($colorList->split(', ') as $colorPart) {
        $colorName = (string)$colorPart->titleize();
        if ($colorName === '') {
            continue;
        }
        $colors[] = $colorName;
    }

    if (!$colors) {
        throw new \Exception('$colors Tidak ditemukan');
    }

    return $colors;
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment