Skip to content

Instantly share code, notes, and snippets.

@toxatoor
Created November 13, 2016 23:31
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save toxatoor/58febad8add868d0a664c8887732e045 to your computer and use it in GitHub Desktop.
Save toxatoor/58febad8add868d0a664c8887732e045 to your computer and use it in GitHub Desktop.
HTTP Crawler
# HTTP crawler inside nginx.
#
# Start crawling with: curl "http://127.0.0.1:18080/?host=<START URL>"
#
# Let nginx pick the number of worker processes automatically.
worker_processes auto;
events {
# Upper bound on simultaneous connections handled by each worker process.
worker_connections 16384;
}
http {
include mime.types;
default_type application/octet-stream;
sendfile on;
keepalive_timeout 65;
# Shared memory zone named "queue" (1M); the Lua blocks below access it as
# ngx.shared.queue and use its keys as the list of URLs still to be fetched.
lua_shared_dict queue 1M;
# DNS server for run-time name resolution; proxy_pass in "location /" takes
# its target from a variable, so upstream names are not known at startup.
resolver 8.8.8.8 ;
server {
listen 18080;
server_name localhost;
# Access log entries are discarded.
access_log /dev/null ;
# Split the URL passed in ?host= into $crawlhost (host name) and $crawlfile
# (everything after the first slash following the host).
# One rule covers both http and https. The host class accepts word characters,
# "." and "-"; hyphenated host names are therefore matched as well.
if ($arg_host ~* '^https?://([\w.-]+)/(.*)$') { set $crawlfile $2; set $crawlhost $1 ; }
# A path ending in "/" names a directory: store it as <dir>/index.html.
if ($crawlfile ~* '^(.+)/$') { set $crawlfile "$1/index.html"; }
# A site root such as http://host/ leaves the path empty; give proxy_store a
# file name in that case too, instead of a bare directory path.
if ($crawlfile = '') { set $crawlfile "index.html"; }
# Fetches the page named by the host query argument, filters it through the
# Lua header/body filters below and stores it on disk.
location / {
# The upstream is the complete URL taken from ?host=.
proxy_pass $arg_host;
# Send the host part extracted by the server-level "if" rules as Host header.
proxy_set_header Host $crawlhost ;
header_filter_by_lua '
  -- Content-type gate: only text/html and text/plain responses pass through,
  -- every other response (or one without a Content-Type) is answered with 404.
  local allowed_types = {
    ["text/html"] = true,
    ["text/plain"] = true,
  }

  local raw = ngx.resp.get_headers()["Content-Type"]
  -- A header that occurs more than once is returned as a table; use the first value.
  if type(raw) == "table" then
    raw = raw[1]
  end

  local ctype
  if type(raw) == "string" then
    -- Drop parameters such as "; charset=utf-8" and compare case-insensitively.
    -- The extra parentheses discard the substitution count returned by gsub.
    ctype = string.lower((string.gsub(raw, "%s*;.*", "")))
  end

  if ctype == nil or not allowed_types[ctype] then
    return ngx.exit(ngx.HTTP_NOT_FOUND)
  end
';
body_filter_by_lua '
  -- Scans the current response body chunk (ngx.arg[1]) for a href="..." links
  -- and records every link that points at the crawled host as a key in the
  -- shared "queue" dictionary. Links that start with "/" are made absolute first.
  -- The body is scanned chunk by chunk, so a link split across two chunks is missed.
  local host = ngx.var.crawlhost
  local chunk = ngx.arg[1]

  -- Without a host the prefix would be nil (error) or a bare "http://" that
  -- matches every absolute link, so there is nothing sensible to queue.
  if host == nil or host == "" or chunk == nil or chunk == "" then
    return
  end

  local queue = ngx.shared.queue
  local prefix = "http://" .. host
  local prefix_lc = string.lower(prefix)
  local size = string.len(prefix)

  -- The tag and attribute names are matched case-insensitively through the
  -- character classes; the URL itself keeps its case because paths are
  -- case-sensitive.
  for target in string.gmatch(chunk, \'[Aa] [Hh][Rr][Ee][Ff]="([^"]*)"\') do
    -- NOTE(review): the trailing space is kept from the original code; its
    -- purpose is not visible here - confirm before removing it.
    local link = target .. " "
    if string.sub(link, 1, 1) == "/" then
      link = prefix .. link
    end
    -- Host names are case-insensitive, so compare the prefix in lower case.
    if string.lower(string.sub(link, 1, size)) == prefix_lc then
      local ok, err = queue:set(link, "1")
      if not ok then
        ngx.log(ngx.ERR, "crawler: cannot queue ", link, ": ", err)
      end
    end
  end
';
# Save the fetched response to disk under /tmp/store/<host>/<path>.
proxy_store /tmp/store/$crawlhost/$crawlfile;
# Hand over to the @queue location, which starts the fetch of the next URL.
# NOTE(review): post_action is not covered by the official nginx documentation -
# confirm its exact semantics for the nginx version in use.
post_action @queue;
}
# Loops a request back to this same server on port 18080 with the original
# query string; the @queue location targets it with ?host=<next URL>.
location /proxy {
proxy_pass http://127.0.0.1:18080/?$args ;
}
# Takes one URL out of the shared "queue" dictionary and requests it through
# the /proxy location.
location @queue {
content_by_lua '
  local queue = ngx.shared.queue
  -- Ask for at most one key; the table is empty once the queue is drained.
  local url = queue:get_keys(1)[1]
  if url == nil then
    return
  end

  -- Remove the entry before fetching, as the original code did.
  queue:delete(url)
  ngx.location.capture("/proxy/?host=" .. url)
';
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment