Skip to content

Instantly share code, notes, and snippets.

@mpfund
Last active April 3, 2023 08:57
Show Gist options
  • Save mpfund/16dbe9f911dc3daa1a6dd56b2b67b7b6 to your computer and use it in GitHub Desktop.
Save mpfund/16dbe9f911dc3daa1a6dd56b2b67b7b6 to your computer and use it in GitHub Desktop.
Crawler in Elixir: probes the sites listed in the Alexa top-1M CSV for exposed secret files.
Mix.install([
:req,
:csv,
:parallel_stream
])
Logger.configure(level: :info)

# Candidate file names collected while building the probe list. Not every
# name below is in the active `files` list; move entries over as needed.
#
#   Desktop.ini, /.git/config, /.DS_Store, /.subversion/config,
#   Dockerfile, package.json, .env, appsettings.json, secrets.json
#   deploy.sh,
#   .travis.yml, config.yml, docker-compose.yml, secrets.yml
#   secret.php, secrets.php, settings.php, config.php
#   config.py, settings.py, main.py, application.py
#   config.js, dev.js, app.js
#   config.json, default.json, credentials.json
#   db.properties, application.properties, app.properties, main.properties,
#   dev.properties, config.properties
#   log.log, app.log, db.log, staging.log, stage.log, production.log, prod.log
#   user.log
#   strings.xml, config.xml
#   app.module.ts, environment.ts

# File names requested from the root of every host. Each entry costs one HTTP
# request per host, so the list must not contain duplicates.
files = [
  ".env",
  "appsettings.json",
  "secrets.json",
  "db.log",
  "app.log",
  "strings.xml",
  "config.js",
  "dev.js",
  "app.js",
  "secrets.php",
  "httpd.conf",
  ".conf",
  "dump.sql",
  "db.sql",
  "all.sql",
  "backup_file.tar",
  "db.dump",
  "db.tar",
  "db.dump.gz",
  "env.js",
  "dotenv",
  "process.env",
  "config",
  "test.env",
  "prod.env",
  "stage.env",
  "production.env",
  "app_key.pem",
  "key.pem",
  ".env.js",
  ".env.json",
  "Dockerfile",
  "package.json",
  "dev.properties",
  "prod.properties",
  "production.properties"
]
# Hits are written one per line to out.txt in the current working directory.
outFile = File.open!("out.txt", [:write, :utf8])

# Returns true when the start of a response body looks like an HTML document.
# Many sites answer 200 with an HTML error page for unknown paths; those are
# not the files we are looking for. Only the first bytes are inspected, and
# the comparison is case-insensitive because "<!DOCTYPE" is usually upper-case.
looks_like_html? = fn body ->
  head =
    body
    |> binary_part(0, min(byte_size(body), 64))
    |> String.trim_leading()
    |> String.downcase()

  String.starts_with?(head, ["<!doctype", "<doctype", "<html"])
end

"~/Downloads/top-1m.csv"
|> Path.expand()
|> File.stream!()
|> CSV.decode()
# |> Enum.reverse()
|> Stream.drop(60000)
# |> Stream.map(&IO.inspect/1)
# Keep only successfully decoded two-column rows; malformed rows are skipped
# instead of crashing the whole run with a FunctionClauseError.
|> Stream.filter(&match?({:ok, [_num, _host]}, &1))
|> Stream.flat_map(fn {:ok, [num, url]} ->
  IO.puts("num# #{num}, #{url}")
  Enum.map(files, &("https://" <> url <> "/" <> &1))
end)
|> ParallelStream.map(
  fn url ->
    IO.puts("sending #{url}")

    try do
      # decode_body: false keeps the body a raw binary. Without it Req turns
      # JSON into maps and archives into entry lists, which breaks the string
      # checks further down the pipeline.
      %{url: url, req: Req.get(url, retry: false, decode_body: false)}
    rescue
      Jason.DecodeError ->
        %{url: url, req: nil}

      e in RuntimeError ->
        IO.puts("runtime error #{e.message}")
        %{url: url, req: nil}

      ErlangError ->
        IO.puts("erlang error")
        %{url: url, req: nil}
    end
  end,
  num_workers: 5
)
# A hit is a 200 response that declares a non-HTML content type and whose
# body does not start like an HTML page.
|> Stream.filter(fn
  %{req: {:ok, %{status: 200, body: body} = resp}} when is_binary(body) ->
    case Req.Response.get_header(resp, "content-type") do
      [content_type | _] ->
        not String.contains?(String.downcase(content_type), "text/html") and
          not looks_like_html?.(body)

      [] ->
        false
    end

  _ ->
    false
end)
|> Stream.map(fn %{url: url, req: {:ok, resp}} ->
  content = String.slice(resp.body, 0..20)

  # Binary payloads (tar, gz, dumps) are not valid UTF-8 and cannot be written
  # to a :utf8 device as-is; fall back to their inspected representation.
  preview = if String.valid?(content), do: content, else: inspect(content)

  outStr = "#{url}: #{preview}"
  IO.puts(outStr)
  IO.write(outFile, outStr <> "\n")
end)
# NOTE(review): take(1) stops the crawl after the first hit. If the whole
# list should be scanned, replace this with Stream.run() - confirm intent.
|> Enum.take(1)

File.close(outFile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment