-
-
Save nordicdyno/97e90a7fcbf25f518884fca3275c4ed4 to your computer and use it in GitHub Desktop.
HTTP Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTTP crawler running inside nginx (uses the *_by_lua directives of lua-nginx-module).
#
# Start crawling with:
#   curl 'http://127.0.0.1:18080/?host=<START URL>'
#
# One worker process per CPU core.
worker_processes auto;

events {
    # Upper bound on simultaneous connections per worker. The crawler proxies
    # to itself through /proxy, so each fetch holds several connections.
    worker_connections 16384;
}
http {
    include           mime.types;
    default_type      application/octet-stream;
    sendfile          on;
    keepalive_timeout 65;

    # Shared-memory dictionary used as the crawl queue: every key is a URL
    # that is still waiting to be fetched.
    lua_shared_dict queue 1M;

    # proxy_pass below takes its target from a variable, so upstream host
    # names are resolved at request time and a resolver must be configured.
    resolver 8.8.8.8;

    server {
        # NOTE(review): this listens on every interface and proxies to whatever
        # URL ?host= names. Consider binding to 127.0.0.1 if only local use is
        # intended.
        listen      18080;
        server_name localhost;
        access_log  off;

        # Split ?host=<scheme>://<host>/<path> into its parts. The variables are
        # initialised first so that a non-matching ?host= leaves them empty
        # instead of uninitialised.
        set $crawlscheme "";
        set $crawlhost   "";
        set $crawlfile   "";
        if ($arg_host ~* '^(https?)://([\w.-]+)/(.*)$') {
            set $crawlscheme $1;
            set $crawlhost   $2;
            set $crawlfile   $3;
        }

        # Directory-style URLs are stored as <dir>/index.html, because
        # proxy_store needs a file name. The first rule covers the site root,
        # the second covers sub-directories.
        if ($arg_host ~* '^https?://[\w.-]+/$') { set $crawlfile "index.html"; }
        if ($crawlfile ~* '^(.+)/$')            { set $crawlfile "$1/index.html"; }

        # Fetch the URL given in ?host=, store the response on disk, queue the
        # same-host links found in it, then trigger the next queued URL.
        location / {
            proxy_pass       $arg_host;
            proxy_set_header Host $crawlhost;

            # Let only text responses through; everything else is answered
            # with 404.
            header_filter_by_lua '
                local allowed_types = {
                    ["text/html"]  = true,
                    ["text/plain"] = true,
                }

                -- The header may be missing (nil) or repeated (table).
                local raw = ngx.resp.get_headers()["Content-Type"]
                if type(raw) == "table" then
                    raw = raw[1]
                end

                local ctype = ""
                if type(raw) == "string" then
                    -- Drop parameters such as "; charset=utf-8". Media types
                    -- are case-insensitive, so compare in lower case.
                    ctype = string.lower((string.gsub(raw, "%s*;.*", "")))
                end

                if not allowed_types[ctype] then
                    return ngx.exit(ngx.HTTP_NOT_FOUND)
                end
            ';

            # Runs once per response body chunk and queues every same-host
            # link found in it. A link that is split across two chunks is not
            # detected.
            # NOTE(review): nothing records which URLs were already fetched, so
            # pages that link to each other are queued again and again.
            body_filter_by_lua '
                local host = ngx.var.crawlhost
                if host == nil or host == "" then
                    return
                end

                local queue  = ngx.shared.queue
                local scheme = ngx.var.crawlscheme or ""
                local prefix = string.lower(scheme .. "://" .. host)
                local size   = string.len(prefix)

                -- The tag and attribute names are matched case-insensitively,
                -- but the URL itself keeps its case because paths are
                -- case-sensitive.
                for link in string.gmatch(ngx.arg[1], \'[aA]%s+[hH][rR][eE][fF]="([^"]*)"\') do
                    local first_two = string.sub(link, 1, 2)
                    -- Host-relative link ("/path"). Protocol-relative links
                    -- ("//other.host/...") point elsewhere and are left alone.
                    if string.sub(first_two, 1, 1) == "/" and first_two ~= "//" then
                        link = prefix .. link
                    end

                    -- Same host only: the prefix must be followed by "/" so
                    -- that "http://host.example.org" does not pass for "host".
                    local same_host = string.lower(string.sub(link, 1, size)) == prefix
                        and string.sub(link, size + 1, size + 1) == "/"

                    if same_host then
                        -- NOTE(review): the trailing space is kept from the
                        -- original implementation; confirm whether it is
                        -- needed.
                        local ok, err = queue:set(link .. " ", "1")
                        if not ok then
                            ngx.log(ngx.ERR, "crawler: cannot queue ", link, ": ", err)
                        end
                    end
                end
            ';

            proxy_store /tmp/store/$crawlhost/$crawlfile;
            # NOTE(review): $crawlfile comes straight from the query string;
            # verify that "../" sequences cannot escape /tmp/store.

            # Presumably runs @queue once this request has finished, which
            # keeps the crawl going (post_action is not officially documented).
            post_action @queue;
        }

        # Loops a queued URL back into "location /" as an ordinary HTTP request.
        location /proxy {
            proxy_pass http://127.0.0.1:18080/?$args;
        }

        # Pops one URL from the queue and fetches it through /proxy.
        location @queue {
            content_by_lua '
                local queue = ngx.shared.queue
                local url = queue:get_keys(1)[1]
                if url == nil then
                    -- Queue is empty: nothing left to crawl.
                    return
                end

                queue:delete(url)
                ngx.location.capture("/proxy/?host=" .. url)
            ';
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment