Skip to content

Instantly share code, notes, and snippets.

@ilyasozkurt
Created September 24, 2021 08:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ilyasozkurt/7d26423e5889c7b899860ed66fede63f to your computer and use it in GitHub Desktop.
Save ilyasozkurt/7d26423e5889c7b899860ed66fede63f to your computer and use it in GitHub Desktop.
<?php
namespace App\Console\Commands;
use App\Models\Earthquake;
use Carbon\Carbon;
use Carbon\CarbonTimeZone;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\Http;
use PHPHtmlParser\Dom;
use PHPHtmlParser\Options;
/**
 * Artisan command that downloads the KOERI "last earthquakes" listing,
 * extracts the fixed-width text table inside its <pre> element and stores
 * every row that is not already known as an Earthquake model.
 */
class ScrapeEarthquakes extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'scrape:earthquakes';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'This command tries to scrape earthquakes from http://www.koeri.boun.edu.tr/scripts/lasteq.asp';

    /**
     * Header and separator rows of the listing that must not be treated as data.
     *
     * The constructor strips all spaces from these entries so that they can be
     * compared with space-stripped input rows regardless of column padding.
     *
     * @var string[]
     */
    private $excluded = [
        'RECENT EARTHQUAKES IN TURKEY',
        'KOERI REGIONAL EARTHQUAKE-TSUNAMI MONITORING CENTER',
        '(QUICK EPICENTER DETERMINATIONS)',
        ' Magnitude',
        'Date Time Latit(N) Long(E) Depth(km) MD ML Mw Region',
        '---------- -------- -------- ------- ---------- ------------ -----------',
    ];

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        //Normalise the unwanted rows once, before the command is registered
        foreach ($this->excluded as $index => $row) {
            $this->excluded[$index] = $this->clearSpaces($row);
        }

        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int Exit code: SUCCESS when the listing was fetched and processed,
     *             FAILURE when the request failed or no data block was found.
     */
    public function handle()
    {
        //Request the remote html source which contains the data.
        $response = Http::get('http://www.koeri.boun.edu.tr/scripts/lasteq.asp');

        if (!$response->successful()) {
            $this->error('Could not fetch the earthquake list (HTTP status ' . $response->status() . ').');

            return self::FAILURE;
        }

        //Create a dom parser configured to leave the text inside <pre> untouched.
        $parser = new Dom();
        $parser->setOptions(
            (new Options())
                ->setWhitespaceTextNode(false)
                ->setCleanupInput(false)
                ->setPreserveLineBreaks(false)
                ->setRemoveDoubleSpace(false)
        );
        $parser->loadStr($response->body());

        //The listing lives in the first <pre> element of the page.
        $rawData = $parser->find('pre')[0]->innerHtml ?? false;

        if (!$rawData) {
            $this->error('The earthquake list does not contain a <pre> data block.');

            return self::FAILURE;
        }

        //Split on any line ending: the remote server's convention is unrelated
        //to the PHP_EOL of the machine that runs this command.
        $rows = preg_split('/\r\n|\r|\n/', $rawData) ?: [];

        //Process all rows with progressbar.
        $this->withProgressBar($rows, function ($row) {
            $this->processEarthquake($row);
        });

        return self::SUCCESS;
    }

    /**
     * Parse one text row of the listing and store it when it is a new earthquake.
     *
     * Blank rows, known header rows, rows with fewer than eight columns and rows
     * whose first column is not a "Y.m.d H:i:s" timestamp are skipped.
     *
     * @param string $row One line of the <pre> block.
     * @return void
     */
    protected function processEarthquake($row)
    {
        $compactRow = $this->clearSpaces($row);

        //Skip empty rows and the known header / separator rows
        if ($compactRow === '' || in_array($compactRow, $this->excluded, true)) {
            return;
        }

        //Columns are separated by runs of spaces; mark them with # and slice
        $earthquakeData = [];

        foreach (explode('#', $this->convertDoubleSpaces($row)) as $part) {
            $part = trim($part);

            //Compare with '' instead of empty() so that a literal "0" is kept
            if ($part !== '') {
                $earthquakeData[] = $part;
            }
        }

        //Date, latitude, longitude, depth, MD, ML, Mw and region are required
        if (count($earthquakeData) < 8) {
            return;
        }

        //Hash of every column, used to detect rows that were stored before
        $earthquakeHash = hash('crc32', implode(',', $earthquakeData));

        //Create a carbon instance from the local (Istanbul) date time
        try {
            $happenedAt = Carbon::createFromFormat(
                'Y.m.d H:i:s',
                $earthquakeData[0],
                new CarbonTimeZone('Europe/Istanbul')
            );
        } catch (\InvalidArgumentException $exception) {
            //Not a data row: the first column is not a timestamp
            return;
        }

        if (!$happenedAt) {
            return;
        }

        //Check if it is already saved
        if (Earthquake::where('hash', $earthquakeHash)->exists()) {
            return;
        }

        Earthquake::create([
            'latitude' => $earthquakeData[1],
            'longitude' => $earthquakeData[2],
            'depth' => $earthquakeData[3],
            'md' => $this->magnitudeOrNull($earthquakeData[4]),
            'ml' => $this->magnitudeOrNull($earthquakeData[5]),
            'mw' => $this->magnitudeOrNull($earthquakeData[6]),
            'region' => $earthquakeData[7],
            'happened_at' => $happenedAt,
            'hash' => $earthquakeHash,
        ]);
    }

    /**
     * Map the "-.-" placeholder of a magnitude column to null.
     *
     * @param string $value Raw magnitude column.
     * @return string|null The value itself, or null when it is the placeholder.
     */
    private function magnitudeOrNull($value)
    {
        return $value !== '-.-' ? $value : null;
    }

    /**
     * Remove every space character from the trimmed text.
     *
     * @param string $text The text which we want to clear spaces
     * @return string
     */
    protected function clearSpaces($text)
    {
        return str_replace(' ', '', trim($text));
    }

    /**
     * Replace each run of two or more spaces in the trimmed text with "##".
     *
     * Single spaces are kept because they occur inside a column, e.g. between
     * the date and the time or between the words of a region name.
     *
     * @param string $text The text which we want to clear
     * @return string
     */
    protected function convertDoubleSpaces($text)
    {
        $trimmed = trim($text);

        //preg_replace() returns null on a PCRE error; fall back to the input
        return preg_replace('/ {2,}/', '##', $trimmed) ?? $trimmed;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment