# enpedasi/Imagenet.ex

## Imagenet.ex
defmodule Imagenet do
  @moduledoc """
  Scraping script for ImageNet.

  ## Dependencies

      defp deps do
        [
          {:flow, "~> 0.14"},
          {:httpoison, "~> 1.2"}
        ]
      end

  ## Usage

  1. Download `fall11_urls.txt` from ImageNet
     (http://image-net.org/download-imageurls).

  2. Download the class list `words.txt`
     (http://image-net.org/archive/words.txt).

  3. Run `Imagenet.labels_count()` from `iex -S mix`. It writes `labels.txt`;
     pick the labels you want to scrape from it.

  4. `list = Imagenet.get_urls("n03405725")` fetches the URL list. Passing an ID,
     as in this example, is faster than passing a label name.

  5. `Imagenet.scraping(list)` downloads the images into `images/`.

  ## Directories

    * `urls/` - put the ImageNet URL file and `words.txt` here.
    * `images/` - receives the scraped images (`.jpg`).

  ## Reference

  [How to download image data from ImageNet (Japanese article)](http://murayama.hatenablog.com/entry/2017/11/18/160818)
  """

  @urls_filename "urls/fall11_urls.txt"
  @words_filename "urls/words.txt"
  @images_dir "images"

  @doc """
  Returns a map of class ID => label name read from `urls/words.txt`.

  Every line is expected to look like `ID<TAB>label`. Blank lines are skipped and
  a line without a tab is stored with a `nil` label.

  Raises `File.Error` when the words file cannot be read.

  ## Examples

      iex> Imagenet.labels_to_map()
      %{
        "n12410205" => "Trema, genus Trema",
        "n14923207" => "Payne's grey, Payne's gray",
        "n02953056" => "cantilever",
        ...
      }
  """
  @spec labels_to_map() :: %{optional(String.t()) => String.t() | nil}
  def labels_to_map do
    Map.new(words_lines(), fn line ->
      case String.split(line, "\t", parts: 2) do
        [id, label] -> {id, label}
        [id] -> {id, nil}
      end
    end)
  end

  @doc """
  Returns the class IDs of every `words.txt` line that contains `word`.

  The whole line is searched, so `word` may be (part of) a label name or of an ID.

  Raises `File.Error` when the words file cannot be read.

  ## Examples

      iex> Imagenet.labels_to_list("peoples")
      ["n07943300"]
  """
  @spec labels_to_list(String.t()) :: [String.t()]
  def labels_to_list(word) do
    for line <- words_lines(), String.contains?(line, word) do
      line |> String.split("\t", parts: 2) |> hd()
    end
  end

  @doc """
  Generic helper: writes the inspected form of `val` to `fname` and returns `val`.

  The value is written without truncation so that `load_val/1` can read it back.
  Raises `File.Error` when the file cannot be written.
  """
  @spec file_write(val, Path.t()) :: val when val: term()
  def file_write(val, fname) do
    File.write!(fname, inspect(val, pretty: true, limit: :infinity, printable_limit: :infinity))
    val
  end

  @doc """
  Generic helper: loads and returns a value previously written by `file_write/2`.

  WARNING: the file is evaluated as Elixir code (`Code.eval_file/1`), so only load
  files that this module wrote itself; never point it at untrusted content.
  """
  @spec load_val(Path.t()) :: term()
  def load_val(fname \\ "debug.txt") do
    {value, _bindings} = Code.eval_file(fname)
    value
  end

  @doc """
  Returns the image URLs for every class matching `word` and saves them to `urls.txt`.

  `fall11_urls.txt` holds about 14 million lines; per the original author's note,
  one matching label ID takes about 90 seconds (2 cores / 4 threads).
  Lines that have no URL column are skipped.

  ## Examples

      iex> Imagenet.get_urls("peoples")
      ["http://..", ...]
  """
  @spec get_urls(String.t()) :: [String.t()]
  def get_urls(word) do
    word
    |> url_list()
    |> Enum.flat_map(&extract_url/1)
    |> file_write("urls.txt")
  end

  @doc """
  Performs an HTTP GET on `url`.

  Returns `{:ok, body}` for a 2xx response, `{:error, status_code}` for any other
  status, and `{:error, reason}` when the request itself fails or raises.
  """
  @spec get_image(String.t()) :: {:ok, binary()} | {:error, term()}
  def get_image(url) do
    try do
      case HTTPoison.get(url) do
        {:ok, %{status_code: code, body: body}} when code in 200..299 -> {:ok, body}
        {:ok, %{status_code: code}} -> {:error, code}
        {:error, %{reason: reason}} -> {:error, reason}
        _other -> {:error, "** Unknown error"}
      end
    rescue
      # Scraped URLs are untrusted input; one malformed URL must not abort the run.
      error -> {:error, "exception #{Exception.message(error)}"}
    catch
      kind, value -> {:error, "exception #{inspect({kind, value})}"}
    end
  end

  @doc """
  Counts how many URLs each label has in the URL list and writes the result to `labels.txt`.

  Returns a list of `{label_id, label_name, count}` tuples sorted by descending
  count. `label_name` is `nil` for IDs missing from `words.txt`.
  """
  @spec labels_count() :: [{String.t(), String.t() | nil, pos_integer()}]
  def labels_count do
    label_map = labels_to_map()

    @urls_filename
    |> File.stream!()
    |> Flow.from_enumerable()
    |> Flow.map(&label_id/1)
    |> Flow.partition()
    |> Flow.reduce(fn -> %{} end, fn id, counts -> Map.update(counts, id, 1, &(&1 + 1)) end)
    |> Enum.sort_by(fn {_id, count} -> count end, &>=/2)
    |> Enum.map(fn {id, count} -> {id, Map.get(label_map, id), count} end)
    |> file_write("labels.txt")
  end

  @doc """
  Downloads every URL in `urls` and stores the images under `images/`.

  Downloads run concurrently through Flow (10 stages, one URL at a time each).
  A failed download or write is reported and does not stop the remaining ones.
  Returns the list of printed status messages, one per URL.
  """
  @spec scraping(Enumerable.t()) :: [String.t()]
  def scraping(urls) do
    # Create the target directory up front so the first write cannot fail on a fresh checkout.
    File.mkdir_p!(@images_dir)

    urls
    |> Flow.from_enumerable(max_demand: 1, stages: 10)
    |> Flow.map(&download_image/1)
    |> Enum.to_list()
  end

  # Lines of the words file without blank lines; tolerates CRLF line endings.
  defp words_lines do
    @words_filename
    |> File.read!()
    |> String.split(["\r\n", "\n"], trim: true)
  end

  # Returns the raw lines of the URL file that belong to any class matching `word`,
  # printing the matched labels, the elapsed seconds and the number of hits.
  defp url_list(word) do
    {elapsed_us, matches} =
      :timer.tc(fn ->
        labels = labels_to_list(word)
        IO.inspect(labels: labels)
        matching_url_lines(labels)
      end)

    IO.inspect(elapsed: elapsed_us / 1_000_000)
    IO.inspect(count: length(matches))
    matches
  end

  # Nothing can match an empty label list, so skip the scan of the whole URL file.
  defp matching_url_lines([]), do: []

  defp matching_url_lines(labels) do
    @urls_filename
    |> File.stream!()
    |> Flow.from_enumerable()
    |> Flow.filter(&String.contains?(&1, labels))
    |> Enum.to_list()
  end

  # "ID<TAB>URL\n" -> ["URL"]; a line without a URL column yields [].
  defp extract_url(line) do
    case String.split(line, "\t") do
      [_id, url | _rest] -> [String.trim(url)]
      _ -> []
    end
  end

  # The label ID is the first nine characters of the first tab-separated field.
  defp label_id(line) do
    line
    |> String.split("\t", parts: 2)
    |> hd()
    |> String.slice(0, 9)
  end

  # Fetches one URL, stores it when the download succeeds, and prints/returns a status message.
  defp download_image(url) do
    message =
      case get_image(url) do
        {:ok, image} -> save_image(url, image)
        {:error, status} -> "error  =#{url} #{format_status(status)}"
        other -> "error  =#{url} #{inspect(other)}"
      end

    IO.inspect(message)
  end

  # Writes `image` to images/<last path segment of url> and returns the status message.
  defp save_image(url, image) do
    with name when name != "" <- Path.basename(URI.parse(url).path || ""),
         :ok <- File.write(Path.join(@images_dir, name), image) do
      "success=#{url}"
    else
      "" -> "error  =#{url} no file name in URL path"
      {:error, reason} -> "error  =#{url} #{inspect(reason)}"
    end
  end

  # Error reasons may be tuples or other terms that cannot be interpolated directly.
  defp format_status(status) when is_binary(status) or is_atom(status) or is_number(status),
    do: to_string(status)

  defp format_status(status), do: inspect(status)
end