Last active
January 24, 2020 01:54
-
-
Save by77er/301d851623e1630b7be38357f743eb4d to your computer and use it in GitHub Desktop.
sequentially scrape DeviantArt for Wix image URLs (Obsolete)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule DA.API do
  @moduledoc """
  Sequentially probes DeviantArt `orig01` image URLs for a range of numeric IDs
  and reports, for each ID, whether the server answered with a 301 redirect
  (together with the redirect target) or a 404.

  Intended to be run as a script:

      elixir da_scrape.exs <start-id> <end-id> [--out <filename>]
  """

  @doc """
  Script entry point.

  `args` is the raw argument list (as returned by `System.argv/0`). It must
  contain two base-10 integers `start` and `end` with `0 <= start < end`, and
  may contain `--out FILE` (alias `-o FILE`). On invalid arguments the usage
  text is printed and the VM halts with status 1.

  Every ID in `start..end` is requested concurrently. Lines for 301 and 404
  answers are written to the `--out` file if one was given, otherwise printed
  to stdout with a `[>] ` prefix. All other outcomes are dropped.
  """
  def main(args) do
    # :httpc is part of the :inets application, which has to be started first.
    setup()

    {named, unnamed, _invalid} =
      OptionParser.parse(args, aliases: [o: :out], strict: [out: :string])

    {first, last} = parse_range(unnamed)

    IO.puts(
      "[*] Scraping from ID #{Integer.to_string(first, 36)} to #{Integer.to_string(last, 36)}..."
    )

    # The work is network bound, so run many more tasks than schedulers.
    num_tasks = System.schedulers_online() * 20

    results =
      first..last
      |> Task.async_stream(&do_request/1, max_concurrency: num_tasks, on_timeout: :kill_task)
      |> Stream.map(&sort_result/1)
      |> Stream.reject(&(&1 == :error))
      |> Enum.to_list()

    IO.puts("[*] Finished scraping")

    case Keyword.fetch(named, :out) do
      {:ok, filename} -> write_results(results, filename)
      :error -> Enum.each(results, &IO.puts("[>] " <> &1))
    end
  end

  @doc """
  Prints the usage line to stdout and halts the VM with exit status 1.
  """
  def print_help do
    IO.puts("Usage: elixir da_scrape.exs <start-id> <end-id (> start-id)> [--out <filename>]")
    System.halt(1)
  end

  @doc """
  Starts the `:inets` application so that `:httpc` can be used.
  """
  def setup do
    :inets.start()
  end

  @doc """
  Issues a GET request for the image URL derived from the integer `i`
  (rendered in base 36) without following redirects.

  Returns `{url, 301, location}`, `{url, 404}` or `:error`; see
  `classify_response/2`.
  """
  def do_request(i) do
    url = "http://orig01.deviantart.net/1234/-d#{Integer.to_string(i, 36)}.png"
    request = {String.to_charlist(url), []}

    # Redirects are not followed: the redirect target is what gets reported.
    response = :httpc.request(:get, request, [{:autoredirect, false}], [])
    classify_response(url, response)
  end

  @doc """
  Maps a raw `:httpc.request/4` result for `url` to the scraper's result shape.

    * 301 with a `location` header -> `{url, 301, location}` (`location` is
      whatever value the header list carries)
    * 404 -> `{url, 404}`
    * anything else, including a 301 without a `location` header -> `:error`

  Kept separate from `do_request/1` so it can be exercised without network
  access.
  """
  def classify_response(url, {:ok, {{_version, 301, _reason}, headers, _body}}) do
    # A missing header must not raise: a crash inside an async_stream task is
    # propagated to the caller and would abort the whole run.
    case List.keyfind(headers, ~c"location", 0) do
      {_name, location} -> {url, 301, location}
      nil -> :error
    end
  end

  def classify_response(url, {:ok, {{_version, 404, _reason}, _headers, _body}}) do
    {url, 404}
  end

  def classify_response(_url, _response), do: :error

  @doc """
  Converts one element emitted by `Task.async_stream/3` into an output line.

  Returns `"<url> -> 301 -> <redirect>"` or `"<url> -> 404"` for successful
  tasks carrying a `do_request/1` result, and `:error` for everything else
  (failed requests, timed-out or exited tasks).
  """
  def sort_result({:ok, {url, 301, redirect}}), do: "#{url} -> 301 -> #{redirect}"
  def sort_result({:ok, {url, 404}}), do: "#{url} -> 404"
  def sort_result(_other), do: :error

  # Validates the two positional arguments and returns them as integers;
  # prints the usage text and halts on anything else.
  defp parse_range([first, last]) do
    case {Integer.parse(first), Integer.parse(last)} do
      {{f, ""}, {l, ""}} when f >= 0 and f < l -> {f, l}
      _ -> print_help()
    end
  end

  defp parse_range(_args), do: print_help()

  # Writes one result per line to `filename`, reporting failures on stderr.
  defp write_results(results, filename) do
    # The callback form of File.open/3 closes the file even if a write raises.
    writer = fn file -> Enum.each(results, &IO.puts(file, &1)) end

    case File.open(filename, [:write], writer) do
      {:ok, _} ->
        IO.puts("[*] Output written to " <> filename)

      {:error, reason} ->
        IO.puts(:stderr, "[!] Could not write #{filename}: #{:file.format_error(reason)}")
        System.halt(1)
    end
  end
end
# Script entry point: run the scraper with the command-line arguments.
DA.API.main(System.argv())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment