@amokan
Created April 25, 2016 18:53
Elixir Crawling With Pools For Fun and Profit

PoolCrawler

This is a simple example in response to https://www.reddit.com/r/elixir/comments/4gcnmi/struggling_with_getting_this_worker_pool_and/ as well as a reminder for myself.

I used https://github.com/midas/conqueuer to do something similar to what the original poster was trying to accomplish.

Some code has been removed to keep it simple to follow, so dropping this straight into a project will not compile. Be sure to read the comments.

I also combined and/or renamed files in this gist to group related things together.

Also note that you can get the queue sizes from Conqueuer (using the names from my code here) like so:

Conqueuer.Queue.size(:CrawlersQueue)
Conqueuer.Queue.size(:ParsersQueue)
defmodule PoolCrawler do
  use Application

  def start(_type, _args) do
    import Supervisor.Spec, warn: false

    children = [
      supervisor(PoolCrawler.CrawlerPool.Supervisor, [[], [name: :CrawlerPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :CrawlersQueue]], id: :crawler_queue),
      worker(Conqueuer.Foreman, [[name: :crawlers], [name: :CrawlersForeman]], id: :crawler_foreman),

      supervisor(PoolCrawler.ParserPool.Supervisor, [[], [name: :ParserPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :ParsersQueue]], id: :parser_queue),
      worker(Conqueuer.Foreman, [[name: :parsers], [name: :ParsersForeman]], id: :parser_foreman)
    ]

    opts = [strategy: :one_for_one, name: PoolCrawler.Supervisor]
    Supervisor.start_link(children, opts)
  end

  def crawl(url) do
    # likely want to validate the url here
    Conqueuer.work(:crawlers, url)
  end
end
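Once the application's supervision tree is up, the whole cycle can be kicked off from IEx with a single call (the URL here is just a placeholder):

# queue the first url; the crawler pool hands the resulting HTML to the parser pool
PoolCrawler.crawl("https://example.com/")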
defmodule PoolCrawler.CrawlerPool.Supervisor do
  use Conqueuer.Pool, name: :crawlers,
                      worker: PoolCrawler.CrawlerPool.Worker,
                      size: 4,
                      max_overflow: 1
end
defmodule PoolCrawler.ParserPool.Supervisor do
  use Conqueuer.Pool, name: :parsers,
                      worker: PoolCrawler.ParserPool.Worker,
                      size: 30,
                      max_overflow: 10
end
defmodule PoolCrawler.CrawlerPool.Worker do
  use Conqueuer.Worker

  @default_user_agent "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36"
  @default_width 1720
  @default_height 1340

  def perform({url, width, height, user_agent}, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"
    IO.puts "width: #{width}"
    IO.puts "height: #{height}"
    IO.puts "user_agent: #{user_agent}"

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  def perform(url, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"

    user_agent = @default_user_agent
    width = @default_width
    height = @default_height

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  defp crawl({url, width, height, user_agent}) do
    # probably want to keep track of counts or some other metrics here
    #
    # call your module that actually does the crawl. I've used PhantomJS via Elixir/Erlang 'Ports' with success.
    # I found some good code to get that working by looking at the Elixir WebDriver implementation. I didn't use
    # that directly, but was able to put together a decent PhantomJS GenServer/Port that does what I need.
    # (see the sketch after this module)
    #
    # example call:
    #   `{:ok, pid} = PhantomServer.start_link(width, height, user_agent)`
    #   `PhantomServer.crawl_page(pid, url)`
  end

  # if we get no html back, probably want to keep track of that
  defp handle_html(html, _url) when is_nil(html), do: nil

  defp handle_html(html, _url) do
    # send HTML results to parsing queue
    Conqueuer.work(:parsers, html)
  end

  defp already_crawled?(url) do
    # if you want to avoid hitting the same url - store previously crawled
    # links someplace. Maybe a GenServer key/value store?
  end
end
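Since the PhantomJS plumbing is elided above, here is a minimal sketch of the kind of GenServer/Port module the `PhantomServer.crawl_page/2` example call refers to, plus a tiny Agent that could back `already_crawled?/1`. The module names, the `render.js` script, and the stdin/stdout protocol are assumptions for illustration only, not the original implementation:

# A sketch only: assumes a `phantomjs` binary on the PATH and a hypothetical
# `render.js` script that reads a url from stdin and writes rendered HTML to stdout.
defmodule PhantomServer do
  use GenServer

  # client API

  def start_link(width, height, user_agent) do
    GenServer.start_link(__MODULE__, {width, height, user_agent})
  end

  def crawl_page(pid, url) do
    # generous timeout - a headless browser render can take a while
    GenServer.call(pid, {:crawl_page, url}, 60_000)
  end

  # server callbacks

  def init({width, height, user_agent}) do
    port =
      Port.open({:spawn_executable, System.find_executable("phantomjs")},
                [:binary, :exit_status, args: ["render.js", "#{width}", "#{height}", user_agent]])

    {:ok, %{port: port}}
  end

  def handle_call({:crawl_page, url}, _from, %{port: port} = state) do
    Port.command(port, url <> "\n")

    # naive: treat the first chunk of output as the full page; reply nil on failure
    # so the worker's handle_html/2 nil clause picks it up
    receive do
      {^port, {:data, html}} -> {:reply, html, state}
      {^port, {:exit_status, _status}} -> {:reply, nil, state}
    after
      30_000 -> {:reply, nil, state}
    end
  end
end

# The simplest possible "have we seen this url?" store for already_crawled?/1
# (again, hypothetical - any key/value store would do):
defmodule PoolCrawler.CrawledSet do
  def start_link, do: Agent.start_link(fn -> MapSet.new() end, name: __MODULE__)

  def seen?(url), do: Agent.get(__MODULE__, &MapSet.member?(&1, url))

  def mark(url), do: Agent.update(__MODULE__, &MapSet.put(&1, url))
end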
defmodule PoolCrawler.ParserPool.Worker do
  use Conqueuer.Worker

  def perform(html, _state) do
    # call your parsing module. In my case this was a GenServer that used Floki for parsing logic.
    # I then piped the results into process_links
    # like: `MagicParserModule.parse_html(html)[:links] |> process_links`
    # (see the sketch after this module)
  end

  defp process_links([]), do: nil

  # skip links without a url and keep going
  defp process_links([%{url: nil} | rest]), do: process_links(rest)

  defp process_links([link = %{url: url} | rest]) do
    # validate the link before we bother doing anything else
    case validate_url(link) do
      true -> queue_link(link)
      false -> IO.puts "Url issues: `#{url}`"
    end

    process_links(rest)
  end

  defp validate_url(%{url: url}) do
    # makes sense to validate the url somehow based on your needs.
    # maybe you should strip anchor tags or anything else unique?
  end

  # lets push this url into the crawl pool, starting a new cycle
  defp queue_link(%{no_follow: false, url: url, text: _}) do
    # push to the :crawlers pool
    Conqueuer.work(:crawlers, url)
  end

  # when parsing the HTML, I was adhering to 'no follow' attributes - so skip this link
  defp queue_link(%{no_follow: true}) do
    # maybe keep track of how many of these no follow links in a stats module?
  end
end
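For reference, here is a rough sketch of what a Floki-based `MagicParserModule.parse_html/1` could look like, returning link maps in the %{url: ..., text: ..., no_follow: ...} shape the worker above pattern matches on, along with one possible body for the `validate_url/1` stub. Both are assumptions for illustration (Floki as a dependency, http/https-only urls), not the original parsing GenServer:

# A sketch of the parsing module referenced in the comment above.
defmodule MagicParserModule do
  def parse_html(html) do
    {:ok, document} = Floki.parse_document(html)

    links =
      document
      |> Floki.find("a[href]")
      |> Enum.map(fn anchor ->
        %{
          url: anchor |> Floki.attribute("href") |> List.first(),
          text: Floki.text(anchor),
          no_follow: anchor |> Floki.attribute("rel") |> Enum.any?(&String.contains?(&1, "nofollow"))
        }
      end)

    # keyword list so `parse_html(html)[:links]` works as shown in the worker comment
    [links: links]
  end
end

# One possible body for the validate_url/1 stub inside PoolCrawler.ParserPool.Worker:
# only accept absolute http(s) urls (the anchor stripping mentioned above is left out).
defp validate_url(%{url: url}) do
  case URI.parse(url) do
    %URI{scheme: scheme, host: host} when scheme in ["http", "https"] and not is_nil(host) -> true
    _ -> false
  end
end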