Created
February 25, 2018 02:06
-
-
Save btbytes/4c894cf1ab5dd8cc99f5161b7b5ab92d to your computer and use it in GitHub Desktop.
Download documents matching a pattern from a webpage.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[
  downthemall.nim
  Download documents matching a pattern from a webpage.

  Build:
    nim c -r -d:ssl -d:release downthemall.nim

  Example: (downloads all the PDFs linked in the URL)
    ./downthemall https://www.btbytes.com/pl.html -A .pdf
]#
import htmlparser
import httpclient
import xmltree
import strtabs
import os
import strutils
import streams
import parseopt
import uri

const usage = "usage: downthemall URL -A .extension [-A .another]"

proc parseArgs(): tuple[url: string, extns: seq[string]] =
  ## Reads the process command line and returns the page URL together with
  ## the list of file-name suffixes requested via `-A`.
  ##
  ## Both `-A:.pdf` / `-A=.pdf` (value attached to the option) and
  ## `-A .pdf` (value given as the following argument) are accepted.
  ## Long options and unknown short options are ignored.
  result.url = ""
  result.extns = @[]
  # Set when `-A` arrived without an attached value: the next positional
  # argument is then the extension rather than the URL.
  var expectExtn = false
  for kind, key, val in getopt():
    case kind
    of cmdArgument:
      if expectExtn:
        result.extns.add(key)
        expectExtn = false
      else:
        result.url = key
    of cmdShortOption:
      if key == "A":
        if val.len > 0:
          result.extns.add(val)
        else:
          expectExtn = true
    of cmdLongOption, cmdEnd:
      discard

proc main() =
  ## Fetches the page at the URL given on the command line and downloads
  ## every `<a href>` target whose href ends with one of the requested
  ## suffixes into the current directory. Prints the usage line and exits
  ## with a failure status when the URL or the suffix list is missing.
  let (url, extns) = parseArgs()
  if url.len == 0 or extns.len == 0:
    echo usage
    quit(QuitFailure)

  let client = newHttpClient()
  defer: client.close()

  let html = parseHtml(newStringStream(client.getContent(url)))
  let base = parseUri(url)
  for a in html.findAll("a"):
    # `attr` yields "" for anchors without an href (or without any
    # attributes at all), so such anchors are skipped instead of failing.
    let href = a.attr("href")
    if href.len == 0:
      continue
    for extn in extns:
      if href.endsWith(extn):
        # Resolve relative links against the page URL before fetching.
        let target = $combine(base, parseUri(href))
        # Local file name is the last path segment of the href.
        let fname = href.split('/')[^1]
        client.downloadFile(target, fname)
        echo "Downloaded: $1 to $2" % [target, fname]
        # One match is enough; do not fetch the same link again for
        # another suffix that also matches.
        break

when isMainModule:
  main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment