Elasticsearch Offline Documentation (using Browsertrix + Kiwix)

Problem

The official Elasticsearch documentation site is protected by a Cloudflare CAPTCHA, so it can't be scraped directly.

The built-docs repo isn't self-contained (it pulls in external resources, both images and JavaScript), and I don't understand how its air_gapped docs are supposed to work; there are no instructions on how to use them.

I want to read the documentation offline.

Purpose

This setup bundles the entire Elasticsearch documentation into an offline, searchable archive.

Operation:

  1. Do a shallow clone of the documentation: `git clone --depth=10 https://github.com/elastic/built-docs`
  2. Put the scripts from this gist inside the cloned repo
  3. Run `./start.sh` (the full sequence is sketched below)
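
For reference, the whole sequence looks roughly like this. The path to the saved gist files is a placeholder, and the compose file name is assumed to be docker-compose.yml (the default that docker-compose looks for):

# sketch of steps 1-3 above; /path/to/gist is an assumption
git clone --depth=10 https://github.com/elastic/built-docs
cd built-docs
cp /path/to/gist/{config.yaml,docker-compose.yml,nginx.default.conf,fix_sitemap.sh,start.sh} .
./start.sh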

Details

Browsertrix is used to crawl the documentation, which is served locally by nginx. A custom sitemap is generated so that Browsertrix can discover and crawl the entire site. The files below are: the Browsertrix crawl config (config.yaml), the docker-compose stack, the sitemap generator (fix_sitemap.sh), a small helper that follows the crawl log, the nginx config (nginx.default.conf), and start.sh, which ties it all together.

# config.yaml — Browsertrix crawl configuration
seeds:
  - url: http://web/
    sitemap: "http://web/sitemap_custom.xml"
    depth: 1
blockRules:
  - url: googleanalytics.com
  - url: www.googletagmanager.com
  - url: googletagmanager.com
combineWARC: true

# docker-compose.yml
version: '3.1'
services:
  # serves built-docs/html locally for the crawler
  web:
    image: nginx:1.25.2
    ports:
      - "80:80"
    healthcheck:
      test: ["CMD", "curl", "-I", "http://localhost"]
      interval: 1s
      start_period: 3s
    volumes:
      - ./html:/usr/share/nginx/html:ro
      - ./nginx.default.conf:/etc/nginx/conf.d/default.conf:ro
    shm_size: 1gb
    privileged: true
  # crawls the locally served docs and writes WARCs into ./crawls
  crawler:
    image: webrecorder/browsertrix-crawler:0.11.1
    depends_on:
      web:
        condition: service_healthy
    environment:
      - DISPLAY=:0.0
    volumes:
      - ./crawls:/crawls/
      - ./config.yaml:/app/crawl-config.yaml
      - /var/run/dbus:/var/run/dbus
      - /run/dbus:/run/dbus
      - /tmp/.X11-unix:/tmp/.X11-unix
      - /home/user/.Xauthority:/root/.Xauthority
    cap_add:
      - NET_ADMIN
      - SYS_ADMIN
    shm_size: 3gb
    privileged: true
    command: "crawl --behaviors autoscroll,autofetch --behaviorTimeout 8 --pageLoadTimeout 10 --workers 30 --headless --config /app/crawl-config.yaml"
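
Before kicking off a full crawl, the compose file can be validated on its own (optional; this uses the standard docker-compose config command, which prints nothing with --quiet if the file parses cleanly):

docker-compose config --quiet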

#!/bin/bash
# fix_sitemap.sh — generate html/sitemap_custom.xml pointing at the local nginx mirror
cat <<EOF > html/sitemap_custom.xml
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
EOF
# keep only the /guide/en URLs and rewrite them to the local "web" host
grep "<loc>" html/sitemap.xml | grep "/guide/en" | \
perl -ne '
($url) = $_ =~ m{<loc>(https?://www.elastic.co/[^<>]*?)</loc>};
$url =~ s{https://www.elastic.co/guide}{http://web}g;
print "
<url>
<loc>$url</loc>
<lastmod>2023-07-10T16:05:35+00:00</lastmod>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
";
END { print "</urlset>\n"; }
' >> html/sitemap_custom.xml
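
A quick sanity check after running it is to count how many URLs ended up in the custom sitemap (the expected number depends on your built-docs checkout):

grep -c "<loc>" html/sitemap_custom.xml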

#!/bin/bash
# Follow the most recent Browsertrix crawl log and print crawl-status updates as compact JSON
LATEST=$(find crawls/ -name "crawl*.log" -printf "%T@ %p\n" | sort -n | tail -1 | awk '{print $2}')
tail -f "$LATEST" | \
jq -c -r 'select(.context | contains("crawlStatus")) | .details | del(.pendingPages)'

# nginx.default.conf — serve built-docs/html and fix up static asset paths
server {
    listen 80;
    listen [::]:80;
    server_name localhost;

    location ~ ^/guide/static {
        rewrite ^/guide/(static.*)$ /$1 last;
    }

    location ~ ^/static-res {
        rewrite ^/static-res/styles/font-files/(.*)$ /static/$1 last;
    }

    location / {
        root /usr/share/nginx/html;
        index index.html index.htm;
    }

    error_page 500 502 503 504 /50x.html;
    location = /50x.html {
        root /usr/share/nginx/html;
    }
}
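
Once the stack is up, nginx can be smoke-tested from the host. The page path below is only illustrative (pick any URL from html/sitemap_custom.xml and swap the "web" host for "localhost"); a 200 response means the local mirror is being served:

curl -I http://localhost/en/elasticsearch/reference/current/index.html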

#!/bin/bash
# start.sh — regenerate the custom sitemap, then run the crawl
# rm -rf crawls/*
./fix_sitemap.sh
docker-compose rm --force
time docker-compose up --exit-code-from crawler
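
The Kiwix half of the title: once the crawl finishes, the combined WARC produced by combineWARC: true can be converted into a ZIM file with openzim's warc2zim and then read offline in Kiwix. This is a rough sketch rather than part of the scripts above; the collection path is an assumption (browsertrix-crawler writes under crawls/collections/), and flag names may vary between versions, so check warc2zim --help:

pip install warc2zim
# the WARC glob and the archive name are placeholders
warc2zim --name elasticsearch-docs --output . crawls/collections/*/*.warc.gz

The resulting .zim file can be opened in kiwix-desktop or served over HTTP with kiwix-serve.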