Skip to content

Instantly share code, notes, and snippets.

@harrygr
Created November 26, 2015 15:57
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harrygr/aec8180cb43d992a3af1 to your computer and use it in GitHub Desktop.
Save harrygr/aec8180cb43d992a3af1 to your computer and use it in GitHub Desktop.
Paul Graham Essay Generator
{
"require": {
"fabpot/goutte": "^3.1",
"ucsdmath/pdf": "^1.4"
},
"autoload": {
"psr-4": {
"": ""
}
}
}
<?php
/**
* Paul Graham Essay Generator
*
* Author: Harry G (http://github.com/harrygr)
* Copyright: Licensed under the GPL-3 (http://www.gnu.org/licenses/gpl-3.0.html)
*
* This script scrapes Paul Graham's site and generates a PDF file of all the essays
*
* Usage:
* - copy the file and composer.json into a directory
* - run `composer install` to install the dependencies
* - run it from your terminal with `php pgessays.php`
*
* A PDF file named "PGs Essays.pdf" will be saved in the current directory
*
* To avoid hitting Paul's site on every run, the script caches the results of the scrape
* in a sections.json file. Delete this file if you want a fresh copy of the essays (e.g. if some new ones are added).
*
*
*
*/
require "vendor/autoload.php";
use Goutte\Client;
use UCSDMath\Pdf\Pdf;
/**
 * Scrapes the essay index at $main_url, fetches every linked essay and renders
 * them all into a single PDF named "PGs Essays.pdf".
 *
 * Scraped essays are cached as JSON in $cache_file (relative to the current
 * working directory). When a usable cache exists, no HTTP request is made.
 */
class PgEssayGenerator
{
    /** @var Client Scraping client used to fetch the index and the essay pages. */
    private $client;

    /** @var Pdf PDF builder that receives the essay HTML. */
    private $pdf;

    /** @var string URL of the page listing all essays. */
    private $main_url = 'http://www.paulgraham.com/articles.html';

    /** @var string Path of the JSON cache holding the scraped sections. */
    private $cache_file = 'sections.json';

    /** @var array List of ['title' => string, 'body' => string] entries, one per essay. */
    private $sections = [];

    /**
     * @param Client $client Scraping client used for all HTTP requests.
     * @param Pdf    $pdf    PDF builder the essays are appended to.
     */
    public function __construct(Client $client, Pdf $pdf)
    {
        $this->client = $client;
        $this->pdf = $pdf;
    }

    /**
     * Loads the essays (from the cache when possible, otherwise by scraping)
     * and writes the PDF.
     */
    public function generate()
    {
        $this->sections = $this->loadCachedSections();

        // Only touch the network when there is no usable cache; the index page
        // itself is not needed when the essays are already on disk.
        if (count($this->sections) === 0) {
            $this->scrapeSections();
        }

        $this->buildPdf();
    }

    /**
     * Reads the cached sections from disk.
     *
     * @return array The decoded sections, or an empty array when the cache is
     *               missing, unreadable or does not contain a JSON array.
     */
    private function loadCachedSections()
    {
        if (!is_file($this->cache_file)) {
            return [];
        }

        $json = file_get_contents($this->cache_file);
        if ($json === false) {
            return [];
        }

        // json_decode() yields null for corrupt or empty input; treat that as
        // "no cache" instead of handing null to the PDF loop.
        $sections = json_decode($json, true);

        return is_array($sections) ? $sections : [];
    }

    /**
     * Fetches the essay index, scrapes every linked essay and caches the result.
     */
    private function scrapeSections()
    {
        $crawler = $this->client->request('GET', $this->main_url);

        // NOTE(review): a user of this script reported that this selector also
        // matches the "suggested links" table at the top of the page and that
        // 'table:nth-child(6) td[width=435] a' avoids it - confirm against the
        // live page before changing.
        $crawler->filter('table td[width=435] a')->each(function ($node) {
            $this->addSection($node->link()->getUri(), $node->text());
        });

        // Never cache an empty scrape: it would be served on every later run.
        if (count($this->sections) === 0) {
            return;
        }

        // json_encode() returns false on failure (e.g. invalid UTF-8 in a body);
        // writing that would leave an empty cache file behind.
        $json = json_encode($this->sections);
        if ($json === false) {
            echo 'Could not encode sections for caching: ' . json_last_error_msg() . PHP_EOL;
            return;
        }

        file_put_contents($this->cache_file, $json);
    }

    /**
     * Fetches one essay and appends it to $sections.
     *
     * The essay body is taken from the first element matching the layout
     * selectors below; when nothing matches, the raw response of the URL is
     * used instead. Essays whose content cannot be fetched are skipped.
     *
     * @param string $link  Absolute URL of the essay.
     * @param string $title Essay title as shown on the index page.
     */
    private function addSection($link, $title)
    {
        echo "Generating section '$title'" . PHP_EOL;

        $crawler = $this->client->request('GET', $link);
        $node = $crawler->filter('table[width=435] td[width=435] font, table[width=374] td[width=374] font');

        $body = count($node) ? $node->html() : file_get_contents($link);

        // file_get_contents() returns false on failure; do not store a
        // non-string body that could never be rendered.
        if (!is_string($body)) {
            echo "Skipping section '$title': could not fetch content" . PHP_EOL;
            return;
        }

        $this->sections[] = [
            'title' => $title,
            'body' => $body,
        ];
    }

    /**
     * Appends every section to the PDF as an <h1> title followed by the essay
     * HTML, then renders the document.
     */
    private function buildPdf()
    {
        // 'F' is presumably the "write to file" destination - TODO confirm
        // against the PDF library.
        $this->pdf
            ->initializePageSetup('A4', 'Portrait')
            ->setOutputDestination('F')
            ->setFilename('PGs Essays.pdf');

        $count = count($this->sections);
        $n = 0;

        foreach ($this->sections as $section) {
            $n++;

            // Cached data is external input: verify its shape before use.
            $valid = is_array($section)
                && isset($section['title'], $section['body'])
                && is_string($section['title'])
                && is_string($section['body']);

            if (!$valid) {
                echo "Skipping section $n of $count: missing title or body" . PHP_EOL;
                continue;
            }

            echo "Adding section $n of $count: '{$section['title']}' to PDF" . PHP_EOL;

            // The title is plain text taken from the index page, so it must be
            // escaped before being embedded in markup. The body is already HTML.
            $title = htmlspecialchars($section['title'], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

            $this->pdf->appendPageContent('<h1>' . $title . '</h1>');
            $this->pdf->appendPageContent($section['body']);
        }

        $this->pdf->render();
    }
}
// Wire the scraper and the PDF builder together, then run the whole pipeline.
$generator = new PgEssayGenerator(
    new Client(),
    new Pdf()
);
$generator->generate();
@Lambik
Copy link

Lambik commented Apr 14, 2020

Hi,

I had to change line 49 to $crawler->filter('table:nth-child(6) td[width=435] a')->each(function ($node) { to avoid scraping the top table (with 'suggested links').

Also, the PDF library no longer works well with modern php (it loads an old version of mpdf), but I haven't had the chance to look into upgrading that..

Thanks for the script!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment