Skip to content

Instantly share code, notes, and snippets.

@zkessin
Created June 27, 2019 08:08
Show Gist options
  • Save zkessin/4add77aa33ed7dc2a9873986efcad426 to your computer and use it in GitHub Desktop.
Save zkessin/4add77aa33ed7dc2a9873986efcad426 to your computer and use it in GitHub Desktop.
defmodule FlightschoolParser do
  @moduledoc """
  Scrapes flight-school listings from a website using `:hackney` for HTTP and
  `Floki` for HTML queries.

  `run/0` is the entry point: it walks the state index page, follows every
  state's paginated listing, fetches each school's detail page and dumps the
  collected rows to disk. `format/2` renders such rows as CSV.

  All HTML patterns are matched assertively; markup that does not have the
  expected shape raises rather than being silently skipped.
  """
  alias :hackney, as: Hackney
  alias :timer, as: Timer

  # Pause before every detail-page request.
  @request_delay_ms 5_000
  # How long get_airport/2 waits for its worker before giving up with [].
  @reply_timeout_ms 500

  @doc """
  Scrapes every state reachable from the index page, writes the result as a
  pretty-printed Erlang term to `airport_data.csv` and returns it.

  A failure to write the file is reported on stderr; the scraped data is still
  returned.
  """
  def run() do
    result = get_states("http://www.flightschoollist.com/", "airplane-flight-schools/")
    # :file.write/2 expects an IoDevice, not a path; given a filename it only
    # returns {:error, :badarg}, so the file was never created. File.write/2
    # takes the path.
    write_file("airport_data.csv", :io_lib.format("~p~n", [result]))
    result
  end

  @doc """
  Returns `""` for `nil` and the argument unchanged otherwise.
  """
  def format_airport(nil), do: ""
  def format_airport(a), do: a

  @doc """
  Renders `data` as CSV, writes it to `filename` and returns the iodata lines.

  Each element of `data` is either a 10-element row
  `[school, airport, city, state, list_uri, address, airport2, phone, email, url]`
  (the seventh element is not emitted) or `[]`, which produces no output.
  Every field is wrapped in double quotes and embedded double quotes in binary
  fields are doubled. A write failure is reported on stderr.
  """
  def format(data, filename) do
    lines = Enum.map(data, &csv_line/1)
    write_file(filename, lines)
    lines
  end

  # Builds one CSV line as iodata: ["\"", f1, sep, f2, ..., f9, "\"", "\n"].
  defp csv_line([]), do: []

  defp csv_line([school, airport, city, state, list_uri, address, _airport, phone, email, url]) do
    separator = ["\"", ",", "\""]

    fields =
      [school, format_airport(airport), city, state, list_uri, address, phone, email, url]
      |> Enum.map(&csv_field/1)
      |> Enum.intersperse(separator)

    ["\"" | fields] ++ ["\"", "\n"]
  end

  # Doubles embedded double quotes so the quoted field stays well-formed.
  # Non-binary values are passed through untouched.
  defp csv_field(value) when is_binary(value), do: String.replace(value, "\"", "\"\"")
  defp csv_field(value), do: value

  # Writes iodata to path; reports (instead of silently dropping) any error.
  defp write_file(path, iodata) do
    case File.write(path, iodata) do
      :ok ->
        :ok

      {:error, reason} = error ->
        IO.puts(:stderr, "could not write #{path}: #{inspect(reason)}")
        error
    end
  end

  @doc """
  Manual smoke test: scrapes the listing pages of a single state.
  """
  def test() do
    get_state(
      "http://www.flightschoollist.com/",
      "/florida-airplane-flight-schools/",
      "",
      "Florida"
    )
  end

  @doc """
  Returns the fourth element of a list; raises if the list is shorter.

  `get_state/4` uses it to pick the pagination item that holds the next-page
  link.
  """
  def next([_, _, _, nth | _]), do: nth

  @doc """
  Extracts the `href` from a list holding exactly one `<a>` node whose only
  attribute is `href`.
  """
  def next_url([{"a", [{"href", uri}], _}]),
    do: uri

  @doc """
  Extracts one `{fs_uri, fs_name, airport, city, state}` tuple per table row of
  `body`, skipping the first `<tr>`.

  `airport` is the raw child list of the row's third cell; `city` is trimmed.
  Raises `FunctionClauseError` when a row does not have the expected shape.
  """
  def get_airports(body, state) do
    body
    |> Floki.find("tr")
    # NOTE(review): the first row is dropped, presumably the table header.
    |> Enum.drop(1)
    |> Enum.map(&airport_row(&1, state))
  end

  # Destructures one listing row into the tuple used by the rest of the module.
  defp airport_row(
         {"tr", _,
          [
            _,
            {"td", _, [{"a", [{"href", fs_uri}], [fs_name]}]},
            {"td", _, airport},
            {"td", _, [city]}
          ]},
         state
       ) do
    {fs_uri, fs_name, airport, String.trim(city), state}
  end

  @doc """
  Returns `""` for `nil` and the argument unchanged otherwise.
  """
  def rn(nil), do: ""
  def rn(x), do: x

  @doc """
  Turns the five detail-page list items into
  `[fs_uri, address, airport, phone, email, url]`.

  The first four items are reduced to their text; for the last one the `href`
  of its first link is used, or `""` when it has none. Raises
  `FunctionClauseError` unless exactly five items are given.
  """
  def get_airport_data([address, airport, phone, email, url], fs_uri) do
    [
      fs_uri,
      Floki.text(address) |> rn(),
      Floki.text(airport) |> rn(),
      Floki.text(phone) |> rn(),
      Floki.text(email) |> rn(),
      url
      |> Floki.find("a")
      |> Floki.attribute("href")
      |> List.first()
      |> rn()
    ]
  end

  @doc """
  Fetches the detail page of one school and returns its full row
  `[fs_name, airport, city, state, fs_uri, address, airport, phone, email, url]`.

  Sleeps #{@request_delay_ms} ms first, then does the fetch in a monitored
  worker process. Returns `[]` when the worker crashes (non-200 response,
  unexpected markup) or does not answer within #{@reply_timeout_ms} ms.
  """
  def get_airport(root, {{fs_uri, fs_name, airport, city, state}, index}) do
    Timer.sleep(@request_delay_ms)
    ref = make_ref()
    master = self()
    url = root <> fs_uri

    # spawn_monitor instead of spawn: a crashing worker is noticed at once
    # rather than only through the timeout.
    {pid, monitor} =
      spawn_monitor(fn ->
        IO.puts("#{inspect(self())} #{index} *** Flight School #{url}")

        result =
          url
          |> fetch_body()
          |> Floki.find("ul.list-icons-style-3")
          |> Floki.find("li")
          |> get_airport_data(url)

        send(master, {ref, [fs_name, List.first(airport), city, state | result]})
      end)

    receive do
      {^ref, data} ->
        Process.demonitor(monitor, [:flush])
        data

      {:DOWN, ^monitor, :process, ^pid, _reason} ->
        []
    after
      @reply_timeout_ms ->
        # Stop the worker and wait for it to be gone, then discard a reply it
        # may have sent in the meantime, so nothing piles up in our mailbox.
        Process.exit(pid, :kill)

        receive do
          {:DOWN, ^monitor, :process, ^pid, _reason} -> :ok
        end

        receive do
          {^ref, _late_reply} -> :ok
        after
          0 -> :ok
        end

        []
    end
  end

  @doc """
  Extracts the school tuples of one listing page; delegates to
  `get_airports/2`.

  This used to be a stub returning `:nyi`, which made `get_state/4` raise as
  soon as a state had more than one page and fed atoms into `get_states/2`.
  """
  def get_contents(body, state), do: get_airports(body, state)

  @doc """
  Collects the school tuples of a state's listing, following its pagination.

  `uri` is the page to fetch and the third argument the page visited before
  it. Recursion stops when both are equal, i.e. when a page's next-link points
  at the page itself. Results of later pages come first in the returned list.
  """
  def get_state(_root, uri, uri, _), do: []

  def get_state(root, uri, _prior_page, state) do
    body = fetch_body(root <> uri)
    next = get_state(root, next_page_uri(body, uri), uri, state)
    current = get_contents(body, state)
    next ++ current
  end

  # Returns the URI behind the fourth pagination item. A page with fewer than
  # four items is treated as the last page by returning its own URI.
  defp next_page_uri(body, current_uri) do
    case body |> Floki.find(".pagination") |> Floki.find("li") do
      [_, _, _, _ | _] = items -> items |> next() |> Floki.find("a") |> next_url()
      _no_next_item -> current_uri
    end
  end

  @doc """
  Scrapes the index page at `root <> uri`, collects the schools of every
  linked state and fetches each school's detail page.

  Schools are sorted by name and de-duplicated by URI before their detail
  pages are requested. Returns one row (or `[]`) per school, as produced by
  `get_airport/2`.
  """
  def get_states(root, uri) do
    body = fetch_body(root <> uri)

    body
    # NOTE(review): state links are looked up under the "#europe" element;
    # presumably that is the id the site gives its state list - confirm.
    |> Floki.find("#europe")
    |> Floki.find("a")
    |> Enum.map(fn {"a", [{"href", state_uri}], [state]} -> {state_uri, state} end)
    |> Enum.flat_map(fn {state_uri, state} -> get_state(root, state_uri, "", state) end)
    |> List.keysort(1)
    |> Enum.uniq_by(&elem(&1, 0))
    |> Enum.with_index()
    |> Enum.map(&get_airport(root, &1))
  end

  # GETs url and returns the response body; raises MatchError on anything but
  # a successful 200 response.
  defp fetch_body(url) do
    {:ok, 200, _headers, client_ref} = Hackney.request(:get, url)
    {:ok, body} = Hackney.body(client_ref)
    body
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment