@amokan
Created April 25, 2016 18:53
Elixir Crawling With Pools For Fun and Profit

PoolCrawler

This is a simple example in response to https://www.reddit.com/r/elixir/comments/4gcnmi/struggling_with_getting_this_worker_pool_and/ as well as a reminder for myself.

I used https://github.com/midas/conqueuer to do something similar to what the original poster was trying to accomplish.

Some code has been removed to keep it simple to follow, so dropping this straight into a project will not compile. Be sure to read the comments.

I also combined and/or renamed files in this gist to group related things together.

Also note that you can get the queue sizes from Conqueuer (using the names from my code here) like so:

Conqueuer.Queue.size(:CrawlersQueue)
Conqueuer.Queue.size(:ParsersQueue)
defmodule PoolCrawler do
  use Application

  def start(_type, _args) do
    import Supervisor.Spec, warn: false

    children = [
      supervisor(PoolCrawler.CrawlerPool.Supervisor, [[], [name: :CrawlerPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :CrawlersQueue]], id: :crawler_queue),
      worker(Conqueuer.Foreman, [[name: :crawlers], [name: :CrawlersForeman]], id: :crawler_foreman),

      supervisor(PoolCrawler.ParserPool.Supervisor, [[], [name: :ParserPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :ParsersQueue]], id: :parser_queue),
      worker(Conqueuer.Foreman, [[name: :parsers], [name: :ParsersForeman]], id: :parser_foreman)
    ]

    opts = [strategy: :one_for_one, name: PoolCrawler.Supervisor]
    Supervisor.start_link(children, opts)
  end

  def crawl(url) do
    # likely want to validate the url here
    Conqueuer.work(:crawlers, url)
  end
end
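Once the application's supervision tree is up, the whole cycle can be kicked off from IEx with a single call (the URL here is just a placeholder):

# queue the first url; the crawler pool hands the resulting HTML to the parser pool
PoolCrawler.crawl("https://example.com/")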
defmodule PoolCrawler.CrawlerPool.Supervisor do
  use Conqueuer.Pool, name: :crawlers,
                      worker: PoolCrawler.CrawlerPool.Worker,
                      size: 4,
                      max_overflow: 1
end
defmodule PoolCrawler.ParserPool.Supervisor do
  use Conqueuer.Pool, name: :parsers,
                      worker: PoolCrawler.ParserPool.Worker,
                      size: 30,
                      max_overflow: 10
end
defmodule PoolCrawler.CrawlerPool.Worker do
  use Conqueuer.Worker

  @default_user_agent "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36"
  @default_width 1720
  @default_height 1340

  def perform({url, width, height, user_agent}, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"
    IO.puts "width: #{width}"
    IO.puts "height: #{height}"
    IO.puts "user_agent: #{user_agent}"

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  def perform(url, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"

    user_agent = @default_user_agent
    width = @default_width
    height = @default_height

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  defp crawl({url, width, height, user_agent}) do
    # probably want to keep track of counts or some other metrics here
    #
    # call your module that actually does the crawl. I've used PhantomJS via Elixir/Erlang 'Ports' with success.
    # I found some good code to get that working by looking at the Elixir WebDriver implementation. I didn't use
    # that directly, but was able to put together a decent PhantomJS GenServer/Port that does what I need.
    # (see the sketch after this module)
    #
    # example call:
    #   `{:ok, pid} = PhantomServer.start_link(width, height, user_agent)`
    #   `PhantomServer.crawl_page(pid, url)`
  end

  # if we get no html back, probably want to keep track of that
  defp handle_html(html, _url) when is_nil(html), do: nil

  defp handle_html(html, _url) do
    # send HTML results to parsing queue
    Conqueuer.work(:parsers, html)
  end

  defp already_crawled?(url) do
    # if you want to avoid hitting the same url - store previously crawled
    # links someplace. Maybe a GenServer key/value store?
  end
end
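Since the PhantomJS plumbing is elided above, here is a minimal sketch of the kind of GenServer/Port module the `PhantomServer.crawl_page/2` example call refers to, plus a tiny Agent that could back `already_crawled?/1`. The module names, the `render.js` script, and the stdin/stdout protocol are assumptions for illustration only, not the original implementation:

# A sketch only: assumes a `phantomjs` binary on the PATH and a hypothetical
# `render.js` script that reads a url from stdin and writes rendered HTML to stdout.
defmodule PhantomServer do
  use GenServer

  # client API

  def start_link(width, height, user_agent) do
    GenServer.start_link(__MODULE__, {width, height, user_agent})
  end

  def crawl_page(pid, url) do
    # generous timeout - a headless browser render can take a while
    GenServer.call(pid, {:crawl_page, url}, 60_000)
  end

  # server callbacks

  def init({width, height, user_agent}) do
    port =
      Port.open({:spawn_executable, System.find_executable("phantomjs")},
                [:binary, :exit_status, args: ["render.js", "#{width}", "#{height}", user_agent]])

    {:ok, %{port: port}}
  end

  def handle_call({:crawl_page, url}, _from, %{port: port} = state) do
    Port.command(port, url <> "\n")

    # naive: treat the first chunk of output as the full page; reply nil on failure
    # so the worker's handle_html/2 nil clause picks it up
    receive do
      {^port, {:data, html}} -> {:reply, html, state}
      {^port, {:exit_status, _status}} -> {:reply, nil, state}
    after
      30_000 -> {:reply, nil, state}
    end
  end
end

# The simplest possible "have we seen this url?" store for already_crawled?/1
# (again, hypothetical - any key/value store would do):
defmodule PoolCrawler.CrawledSet do
  def start_link, do: Agent.start_link(fn -> MapSet.new() end, name: __MODULE__)

  def seen?(url), do: Agent.get(__MODULE__, &MapSet.member?(&1, url))

  def mark(url), do: Agent.update(__MODULE__, &MapSet.put(&1, url))
end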
defmodule PoolCrawler.ParserPool.Worker do
  use Conqueuer.Worker

  def perform(html, _state) do
    # call your parsing module. In my case this was a GenServer that used Floki for parsing logic.
    # I then piped the results into process_links
    # like: `MagicParserModule.parse_html(html)[:links] |> process_links`
    # (see the sketch after this module)
  end

  defp process_links([]), do: nil

  # skip links without a url and keep going
  defp process_links([%{url: nil} | rest]), do: process_links(rest)

  defp process_links([link = %{url: url} | rest]) do
    # validate the link before we bother doing anything else
    case validate_url(link) do
      true -> queue_link(link)
      false -> IO.puts "Url issues: `#{url}`"
    end

    process_links(rest)
  end

  defp validate_url(%{url: url}) do
    # makes sense to validate the url somehow based on your needs.
    # maybe you should strip anchor tags or anything else unique?
  end

  # lets push this url into the crawl pool, starting a new cycle
  defp queue_link(%{no_follow: false, url: url, text: _}) do
    # push to the :crawlers pool
    Conqueuer.work(:crawlers, url)
  end

  # when parsing the HTML, I was adhering to 'no follow' attributes - so skip this link
  defp queue_link(%{no_follow: true}) do
    # maybe keep track of how many of these no follow links in a stats module?
  end
end
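For reference, here is a rough sketch of what a Floki-based `MagicParserModule.parse_html/1` could look like, returning link maps in the %{url: ..., text: ..., no_follow: ...} shape the worker above pattern matches on, along with one possible body for the `validate_url/1` stub. Both are assumptions for illustration (Floki as a dependency, http/https-only urls), not the original parsing GenServer:

# A sketch of the parsing module referenced in the comment above.
defmodule MagicParserModule do
  def parse_html(html) do
    {:ok, document} = Floki.parse_document(html)

    links =
      document
      |> Floki.find("a[href]")
      |> Enum.map(fn anchor ->
        %{
          url: anchor |> Floki.attribute("href") |> List.first(),
          text: Floki.text(anchor),
          no_follow: anchor |> Floki.attribute("rel") |> Enum.any?(&String.contains?(&1, "nofollow"))
        }
      end)

    # keyword list so `parse_html(html)[:links]` works as shown in the worker comment
    [links: links]
  end
end

# One possible body for the validate_url/1 stub inside PoolCrawler.ParserPool.Worker:
# only accept absolute http(s) urls (the anchor stripping mentioned above is left out).
defp validate_url(%{url: url}) do
  case URI.parse(url) do
    %URI{scheme: scheme, host: host} when scheme in ["http", "https"] and not is_nil(host) -> true
    _ -> false
  end
end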