Created
February 16, 2021 15:06
-
-
Save crova/9d67e853737c54c8301132e72375361a to your computer and use it in GitHub Desktop.
Crawly Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule RampeFle.PhonetiqueCa do
  @moduledoc """
  Crawly spider for the discrimination exercises published on phonetique.ca.

  Crawling starts at the exercise index page. For every page handed to
  `parse_item/1` the spider:

    * collects the exercise links found inside the `#accordion` panel group,
    * fetches each exercise page and collects the practice links under `h3 a`,
    * schedules one request per unique practice URL,
    * emits an item carrying the text of the page's `h2.ExerciseTitle`.
  """

  use Crawly.Spider

  @base_url "http://www.phonetique.ca"

  # Practice hrefs are joined onto this directory.
  @practice_root @base_url <> "/documents/exercices/"

  @doc """
  Returns the base URL of the crawled site.
  """
  @impl Crawly.Spider
  def base_url(), do: @base_url

  @doc """
  Returns the spider's initial configuration: the single start URL of the
  exercise index page.
  """
  @impl Crawly.Spider
  def init() do
    [
      start_urls: [
        "http://www.phonetique.ca/exercices_discrimination.html"
      ]
    ]
  end

  @doc """
  Parses a fetched page into follow-up requests and one item.

  `response` must expose the HTML in its `body` field. Returns a
  `%Crawly.ParsedItem{}` with a single `%{title: title}` item and one request
  per unique practice URL discovered through the page's exercise links.

  Raises a `MatchError` if the body (or the body of a fetched exercise page)
  cannot be parsed by `Floki.parse_document/1`.
  """
  @impl Crawly.Spider
  def parse_item(response) do
    {:ok, document} = Floki.parse_document(response.body)

    requests =
      document
      |> Floki.find("div#accordion.panel-group p a")
      |> Floki.attribute("href")
      |> Enum.flat_map(&practice_hrefs/1)
      |> Enum.map(&(@practice_root <> &1))
      |> Enum.uniq()
      |> Enum.map(&Crawly.Utils.request_from_url/1)

    # The item is emitted for every page; the title is "" when the page has
    # no `h2.ExerciseTitle` element.
    item = %{title: exercise_title(document)}

    %Crawly.ParsedItem{items: [item], requests: requests}
  end

  # Fetches one exercise page (href relative to the site root) and returns the
  # hrefs of its practice links.
  # NOTE(review): this performs a fetch per exercise link from inside
  # parse_item/1, outside the crawler's own request queue -- confirm this is
  # intended.
  defp practice_hrefs(exercise_href) do
    page = Crawly.fetch(@base_url <> "/" <> exercise_href)
    {:ok, exercise_doc} = Floki.parse_document(page.body)

    exercise_doc
    |> Floki.find("h3 a")
    |> Floki.attribute("href")
  end

  # Extracts the exercise title and re-encodes it byte-for-byte from Latin-1
  # to UTF-8.
  # NOTE(review): presumably the site serves ISO-8859-1 pages -- confirm; on a
  # page that is already UTF-8 this would double-encode non-ASCII characters.
  defp exercise_title(document) do
    document
    |> Floki.find("h2.ExerciseTitle")
    |> Floki.text()
    |> :unicode.characters_to_binary(:latin1)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment