Skip to content

Instantly share code, notes, and snippets.

@crova
Created February 16, 2021 15:06
Show Gist options
  • Save crova/9d67e853737c54c8301132e72375361a to your computer and use it in GitHub Desktop.
Save crova/9d67e853737c54c8301132e72375361a to your computer and use it in GitHub Desktop.
Crawly Spider
defmodule RampeFle.PhonetiqueCa do
  @moduledoc """
  Crawly spider for the discrimination exercises listed on phonetique.ca.

  For every crawled page the spider:

    1. collects the exercise links found in the `div#accordion.panel-group` element,
    2. fetches each exercise page and collects the practice links found under `h3 a`,
    3. schedules the (de-duplicated) practice pages as new requests, and
    4. emits an item holding the page's `h2.ExerciseTitle` text, when the page has one.
  """

  use Crawly.Spider

  @base_url "http://www.phonetique.ca"

  # Selector of the exercise links on the index page.
  @exercice_links "div#accordion.panel-group p a"
  # Selector of the practice links on an exercise page.
  @practice_links "h3 a"
  # Path prefix prepended to every practice link.
  @practice_path "/documents/exercices/"

  @doc """
  Returns the root URL of the crawled site.
  """
  @impl Crawly.Spider
  def base_url, do: @base_url

  @doc """
  Returns the spider's initial configuration: the exercise index page as the only start URL.
  """
  @impl Crawly.Spider
  def init do
    [
      start_urls: [
        "http://www.phonetique.ca/exercices_discrimination.html"
      ]
    ]
  end

  @doc """
  Parses a crawled page into a `Crawly.ParsedItem`.

  `response` must expose the page's HTML in its `body` field.

  The returned struct contains:

    * `:requests` - one request per unique practice URL discovered through the exercise
      links of this page (empty when the page has no exercise links);
    * `:items` - `[%{title: title}]` when the page has a non-blank `h2.ExerciseTitle`,
      otherwise `[]`.

  Raises `MatchError` when a document cannot be parsed.
  """
  @impl Crawly.Spider
  def parse_item(response) do
    {:ok, document} = Floki.parse_document(response.body)

    requests =
      document
      |> Floki.find(@exercice_links)
      |> Floki.attribute("href")
      |> Enum.flat_map(&practice_urls/1)
      |> Enum.uniq()
      |> Enum.map(&Crawly.Utils.request_from_url/1)

    %Crawly.ParsedItem{items: build_items(document), requests: requests}
  end

  # Fetches one exercise page and returns the absolute URLs of the practice pages it links to.
  #
  # NOTE(review): this fetch runs synchronously inside `parse_item/1`; presumably it bypasses
  # the scheduling applied to the requests returned in the parsed item - confirm this is
  # intended. A failed fetch is not handled here and crashes the parse.
  defp practice_urls(exercice_path) do
    page = Crawly.fetch(@base_url <> "/" <> exercice_path)
    {:ok, document} = Floki.parse_document(page.body)

    document
    |> Floki.find(@practice_links)
    |> Floki.attribute("href")
    |> Enum.map(&(@base_url <> @practice_path <> &1))
  end

  # Builds the item list for a page: a single `%{title: ...}` map, or `[]` when the page has
  # no (or only a blank) exercise title, so that title-less pages do not produce empty items.
  defp build_items(document) do
    title =
      document
      |> Floki.find("h2.ExerciseTitle")
      |> Floki.text()
      # Re-reads every byte as a Latin-1 character and encodes the result as UTF-8.
      # NOTE(review): presumably the site serves ISO-8859-1 pages - confirm; text that is
      # already UTF-8 would end up double-encoded.
      |> :unicode.characters_to_binary(:latin1)

    case String.trim(title) do
      "" -> []
      _present -> [%{title: title}]
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment