Skip to content

Instantly share code, notes, and snippets.

@tamanugi
Last active June 26, 2017 04:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tamanugi/6ccececefde54226f5f0b261037dedac to your computer and use it in GitHub Desktop.
Save tamanugi/6ccececefde54226f5f0b261037dedac to your computer and use it in GitHub Desktop.
Wikipedia の HTTP ステータスコードのページをスクレイピングする
defmodule ScrapingWikiHttpstatus.Mixfile do
  use Mix.Project

  # Project definition read by Mix.
  def project do
    [
      app: :scraping_wiki_httpstatus,
      version: "0.1.0",
      elixir: "~> 1.4",
      build_embedded: Mix.env() == :prod,
      start_permanent: Mix.env() == :prod,
      # Compile the .ex files found in the project root instead of lib/.
      elixirc_paths: ["."],
      deps: deps()
    ]
  end

  # Configuration for the OTP application
  #
  # Type "mix help compile.app" for more information
  def application do
    # Only Erlang/Elixir-provided applications are listed here. The runtime
    # applications of the dependencies (:httpoison and :floki) are inferred
    # from deps/0 by Mix. An explicit `applications: [...]` list would switch
    # that inference off and silently leave :floki out of the application list.
    [extra_applications: [:logger]]
  end

  # Dependencies can be Hex packages:
  #
  # {:my_dep, "~> 0.3.0"}
  #
  # Or git/path repositories:
  #
  # {:my_dep, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}
  #
  # Type "mix help deps" for more examples and options
  defp deps do
    [
      # HTML parsing and CSS-selector queries.
      {:floki, "~> 0.17.2"},
      # HTTP client used to download the page.
      {:httpoison, "~> 0.11.2"}
    ]
  end
end
defmodule Parser do
  @moduledoc """
  Folds a flat list of `dt` / `dd` element nodes into a list of entry maps.

  Each `dt` node starts a new entry with `:summary` (the node text) and
  `:status_code` (the integer before the first space of that text). The text
  of every following `dd` node is collected under `:description`; several
  `dd` nodes are joined with a newline.
  """

  @doc """
  Parses `nodes` (element tuples of the form `{tag, attributes, children}`)
  and returns the entries in document order.

  Raises `ArgumentError` when the text of a `dt` node does not start with an
  integer, and `FunctionClauseError` for nodes that are neither `dt` nor `dd`.
  """
  def parse(nodes), do: parse(nodes, [], %{})

  @doc """
  Accumulator form of `parse/1`.

  `result` holds the finished entries, newest first; `tmp` is the entry that
  is currently being built (an empty map when there is none yet).
  """
  # End of input: the entry still being built must be emitted as well, and the
  # newest-first accumulator is reversed once to restore document order.
  def parse([], result, tmp), do: result |> flush(tmp) |> Enum.reverse()

  # A "dt" closes the previous entry and opens a new one.
  def parse([{"dt", _attrs, children_nodes} | tail], result, tmp) do
    text = Floki.text(children_nodes)
    # Keep only what precedes the first space, e.g. "404 Not Found" -> "404".
    status_code = text |> String.replace(~r/ .*/, "") |> String.to_integer()
    parse(tail, flush(result, tmp), %{summary: text, status_code: status_code})
  end

  # A "dd" adds (or appends to) the description of the current entry.
  def parse([{"dd", _attrs, children_nodes} | tail], result, tmp) do
    text = Floki.text(children_nodes)

    description =
      case tmp[:description] do
        nil -> text
        desc -> "#{desc}\n#{text}"
      end

    parse(tail, result, Map.put(tmp, :description, description))
  end

  # Prepends the entry being built to the finished ones. The empty placeholder
  # that exists before the first "dt" is not a real entry and is skipped.
  defp flush(result, tmp) when map_size(tmp) == 0, do: result
  defp flush(result, tmp), do: [tmp | result]
end
# Download the Japanese Wikipedia article on HTTP status codes.
# The bang variant raises instead of returning an error tuple if the request fails.
%{body: html} =
  HTTPoison.get!(
    "https://ja.wikipedia.org/wiki/HTTP%E3%82%B9%E3%83%86%E3%83%BC%E3%82%BF%E3%82%B9%E3%82%B3%E3%83%BC%E3%83%89"
  )

# Select every <dt> and <dd> element and fold them into status-code entries.
result =
  html
  |> Floki.find("dt, dd")
  |> Parser.parse()

# inspect/2 stops after 50 items by default and prints "..." for the rest,
# which would cut the list short; lift the limit so every entry is printed.
IO.puts(inspect(result, limit: :infinity))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment