Skip to content

Instantly share code, notes, and snippets.

@HelgeSverre
Created December 3, 2023 15:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HelgeSverre/b36d38081b5fe65859b9266f9779d6ad to your computer and use it in GitHub Desktop.
Save HelgeSverre/b36d38081b5fe65859b9266f9779d6ad to your computer and use it in GitHub Desktop.
AI-Powered Web Crawler with crwlr/crawler

Install the required packages

composer require crwlr/crawler
composer require helgesverre/extractor

php artisan vendor:publish --tag="extractor-config"
php artisan vendor:publish --provider="OpenAI\Laravel\ServiceProvider"

Make a command for running the crawler

php artisan make:command Crawl

Then copy the code for the command from the previous gist, and put the ExtractWithAI class into app/Crawler or wherever you want (remember to update the namespaces).

<?php
namespace App\Console\Commands;
use App\Crawler\ExtractWithAI;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Stores\SimpleCsvFileStore;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use HelgeSverre\Extractor\Engine;
use Illuminate\Console\Command;
/**
 * Artisan command that crawls a job-listing site and extracts structured
 * job-post fields from the linked pages using the ExtractWithAI step.
 */
class Crawl extends Command
{
    /** Console signature: the command is run as `php artisan crawl`. */
    protected $signature = 'crawl';

    /**
     * Configure the crawl pipeline and run it to completion.
     *
     * Pipeline: load the start page, collect the links matching the CSS
     * selector, load the linked pages (capped at five outputs), and hand
     * each response to the AI extraction step. Results go to a CSV store
     * rooted at the application base path.
     */
    public function handle(): void
    {
        $this->makeCrawler()
            ->setStore(new SimpleCsvFileStore(storePath: base_path()))
            ->input('https://promptengineering.no/')
            ->addStep(Http::get())
            ->addStep(Html::getLinks('a.text-gray-700'))
            ->addStep(Http::get()->maxOutputs(5))
            ->addStep($this->jobPostExtractionStep())
            ->runAndTraverse();
    }

    /**
     * Build an HTTP crawler that identifies itself as the "AI-Crawler" bot.
     */
    private function makeCrawler(): HttpCrawler
    {
        return new class extends HttpCrawler {
            protected function userAgent(): UserAgentInterface
            {
                return BotUserAgent::make('AI-Crawler');
            }
        };
    }

    /**
     * Build the extraction step describing which job-post fields to pull out.
     *
     * Plain entries are bare field names; keyed entries pair a field name
     * with a natural-language description of the wanted value.
     */
    private function jobPostExtractionStep(): ExtractWithAI
    {
        $fields = [
            'jobTitle',
            'companyName',
            'applicationDueDate' => 'the application due date as Y-m-d',
            'tags' => 'list of tags for the job post',
        ];

        return ExtractWithAI::fields($fields)->withModel(Engine::GPT_3_TURBO_INSTRUCT);
    }
}
<?php
namespace App\Crawler;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Step;
use Generator;
use HelgeSverre\Extractor\Facades\Extractor;
use HelgeSverre\Extractor\Facades\Text;
use InvalidArgumentException;
use Psr\Http\Message\ResponseInterface;
/**
 * Crawler step that extracts a set of named fields from a page using the
 * helgesverre/extractor package.
 *
 * The step is configured fluently:
 *
 *     ExtractWithAI::fields(['title', 'date' => 'publish date as Y-m-d'])
 *         ->withModel('gpt-3.5-turbo-1106')
 *         ->withMaxTokens(1000)
 *         ->withTemperature(0.0);
 */
class ExtractWithAI extends Step
{
    /** Model identifier forwarded to the extractor; null is passed through as-is. */
    protected ?string $model = 'gpt-3.5-turbo-1106';

    /** Token limit forwarded to the extractor. */
    protected int $maxTokens = 2000;

    /** Sampling temperature forwarded to the extractor. */
    protected float $temperature = 0.2;

    /**
     * @param array<int|string, mixed> $fields Fields to extract: either bare field names,
     *                                         or field name => description of the wanted value.
     */
    public function __construct(protected readonly array $fields)
    {
    }

    /**
     * Set the token limit used for the extraction request.
     */
    public function withMaxTokens(int $maxTokens): self
    {
        $this->maxTokens = $maxTokens;

        return $this;
    }

    /**
     * Set the sampling temperature used for the extraction request.
     */
    public function withTemperature(float $temperature): self
    {
        $this->temperature = $temperature;

        return $this;
    }

    /**
     * Set the model used for the extraction request.
     *
     * NOTE(review): null is presumably interpreted by the extractor as
     * "use the configured default model" — confirm against the package.
     */
    public function withModel(?string $model): self
    {
        $this->model = $model;

        return $this;
    }

    /**
     * Named constructor for fluent use inside a crawler pipeline.
     *
     * @param array<int|string, mixed> $fields See the constructor.
     */
    public static function fields(array $fields): self
    {
        return new self($fields);
    }

    /**
     * Normalise the step input by running it through Text::html().
     *
     * Accepts a raw string, a PSR-7 response, or a RespondedRequest; for the
     * latter two the body string is read first.
     *
     * @throws InvalidArgumentException When the input is none of the supported types.
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        if (is_string($input)) {
            return Text::html($input);
        }

        if ($input instanceof ResponseInterface || $input instanceof RespondedRequest) {
            return Text::html(Http::getBodyString($input));
        }

        throw new InvalidArgumentException('Input must be string, PSR-7 Response or RespondedRequest.');
    }

    /**
     * Run the extraction and yield its result as the single output of this step.
     *
     * The configured model, token limit and temperature are forwarded here;
     * without this the with*() setters would have no effect on the request.
     */
    protected function invoke(mixed $input): Generator
    {
        yield Extractor::fields(
            $input,
            fields: $this->fields,
            model: $this->model,
            maxTokens: $this->maxTokens,
            temperature: $this->temperature,
        );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment