Last active
June 26, 2017 04:24
-
-
Save tamanugi/6ccececefde54226f5f0b261037dedac to your computer and use it in GitHub Desktop.
wikipediaのhttpstatusからスクレイピングを行う
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule ScrapingWikiHttpstatus.Mixfile do
  @moduledoc false
  use Mix.Project

  # Mix project definition: application name, version, required Elixir
  # version, build flags, and dependencies.
  def project do
    [
      app: :scraping_wiki_httpstatus,
      version: "0.1.0",
      elixir: "~> 1.4",
      build_embedded: Mix.env() == :prod,
      start_permanent: Mix.env() == :prod,
      # Look for source files in the project root instead of Mix's default
      # "lib" directory.
      elixirc_paths: ["."],
      deps: deps()
    ]
  end

  # Configuration for the OTP application.
  #
  # Type "mix help compile.app" for more information.
  def application do
    # Only applications that are NOT listed in deps/0 belong here. The runtime
    # applications of the dependencies (httpoison, floki) are inferred by Mix.
    # Setting :applications by hand turns that inference off, which previously
    # left :floki out of the generated application list.
    [extra_applications: [:logger]]
  end

  # Dependencies can be Hex packages:
  #
  #   {:my_dep, "~> 0.3.0"}
  #
  # Or git/path repositories:
  #
  #   {:my_dep, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}
  #
  # Type "mix help deps" for more examples and options.
  defp deps do
    [
      {:floki, "~> 0.17.2"},
      {:httpoison, "~> 0.11.2"}
    ]
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Parser do
  @moduledoc """
  Folds a flat list of `dt` / `dd` nodes into one map per `dt`.

  Each node is expected to be a `{tag, attributes, children}` tuple whose tag
  is `"dt"` or `"dd"`; any other node raises `FunctionClauseError`.
  """

  @doc """
  Parses `nodes` and returns the entries in the order their `dt` appeared.

  Every `dt` starts a new entry of the form
  `%{summary: text, status_code: integer}`, where `status_code` is the part of
  the `dt` text before the first space. The text of each following `dd` is
  stored under `:description`; several `dd` nodes are joined with `"\\n"`.

  `dd` nodes that appear before the first `dt` end up in an entry that has
  only a `:description` key.

  Raises `ArgumentError` when the leading part of a `dt` text is not an
  integer.
  """
  def parse(nodes), do: nodes |> parse([], %{}) |> Enum.reverse()

  # Accumulator form. `result` holds the finished entries newest-first and
  # `tmp` is the entry currently being built (`%{}` when there is none yet).
  # When the input is exhausted the pending entry is flushed too, so the last
  # entry is not lost.
  def parse([], result, tmp), do: flush(tmp, result)

  def parse([{"dt", _attrs, children} | tail], result, tmp) do
    summary = Floki.text(children)
    # Everything from the first space onwards is dropped, leaving the code.
    status_code = summary |> String.replace(~r/ .*/, "") |> String.to_integer()

    parse(tail, flush(tmp, result), %{summary: summary, status_code: status_code})
  end

  def parse([{"dd", _attrs, children} | tail], result, tmp) do
    text = Floki.text(children)
    entry = Map.update(tmp, :description, text, fn desc -> "#{desc}\n#{text}" end)

    parse(tail, result, entry)
  end

  # Moves the pending entry onto the finished list, skipping the empty
  # placeholder that exists before the first node has been seen.
  defp flush(tmp, result) when map_size(tmp) == 0, do: result
  defp flush(tmp, result), do: [tmp | result]
end
# Download the page from ja.wikipedia.org, select every <dt> and <dd> element,
# fold them into a list of entries with Parser.parse/1, and print that list.
%{body: html} =
  HTTPoison.get!(
    "https://ja.wikipedia.org/wiki/HTTP%E3%82%B9%E3%83%86%E3%83%BC%E3%82%BF%E3%82%B9%E3%82%B3%E3%83%BC%E3%83%89"
  )

result =
  html
  |> Floki.find("dt, dd")
  |> Parser.parse()

# inspect/2 stops after 50 items by default and prints "..." for the rest;
# lift the limit so every parsed entry shows up in the output.
IO.puts(inspect(result, limit: :infinity))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment