Last active
December 2, 2020 23:38
-
-
Save iamtakingiteasy/dbab385394975c4e43993f5f482f781f to your computer and use it in GitHub Desktop.
Sankaku grabber
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
tags="$(echo $@ | tr ' ' '+')" | |
BOORU=${BOORU:-idol} | |
[ -z "$tags" ] || echo $@ | grep -q '\-\-' && { echo "BOORU=(idol|chan) $0 <tags>"; exit; } | |
awk -v "tags=$tags" -v "booru=$BOORU" ' | |
# Write message followed by a newline to standard output, then terminate the
# whole awk program.
function doexit(message) {
    printf("%s\n", message)
    exit
}
# Return the part of instr matched by the regular expression reg, with the
# first off characters of the match dropped. Returns "" when reg does not
# match anywhere in instr.
function extract(instr, reg, off) {
    if (!match(instr, reg))
        return ""
    return substr(instr, RSTART + off, RLENGTH - off)
}
# Sleep for the given number of seconds, one second at a time, redrawing a
# countdown on the current terminal line after every step.
#
#   seconds  how long to wait
#
# Returns seconds * 2, which the caller can use as the next, longer wait.
# Terminates the program through doexit() when a sleep command reports
# failure (for example because it was interrupted).
#
# The names after the extra spaces in the parameter list are locals.
function timeout(seconds,    elapsed, width) {
    # The wait is driven by the seconds argument itself rather than by any
    # global, so the function behaves the same for every caller.
    width = length(seconds)
    for (elapsed = 1; elapsed <= seconds; elapsed++) {
        # \r returns to the start of the line so the counter is overwritten.
        printf("\rWent too fast, sleeping for %" width "d/%d seconds...", elapsed, seconds)
        if (system("sleep 1"))
            doexit("")
    }
    print("")
    return seconds * 2
}
# Return the size of the file ofile in bytes as a number, measured with
# "wc -c". Returns 0 when wc exits with a non-zero status (for example when
# the file does not exist).
#
# The names after the extra spaces in the parameter list are locals. Keeping
# the command string local matters: callers may be in the middle of reading
# from a pipe whose command they hold in a variable of their own, and this
# function must not overwrite it.
function size(ofile,    quoted, wccmd, line) {
    # Wrap the file name in shell single quotes. \047 is the octal escape for
    # a single quote; every quote inside the name is rewritten as the usual
    # close-quote, escaped-quote, reopen-quote sequence so the shell never
    # interprets any character of the name.
    quoted = ofile
    gsub("\047", "\047\\\\\047\047", quoted)
    wccmd = "wc -c \047" quoted "\047 2>&1"

    # Start from an empty line so a failed read cannot reuse stale data.
    line = ""
    wccmd | getline line
    if (close(wccmd))
        return 0

    # Take the first run of digits (some wc implementations pad the count
    # with leading spaces) and force a numeric result.
    return extract(line, "[0-9]+", 0) + 0
}
# Format a byte count as a rounded number with a unit suffix, right-aligned
# in a field of at least six characters (for example "   1KB").
#
#   value  number of bytes
#
# The value is divided by 1024 until it drops below 1024 or the largest
# known unit (TB) is reached, so very large inputs are reported as a big
# number of TB instead of losing their suffix.
#
# The names after the extra spaces in the parameter list are locals.
function human(value,    unit, idx) {
    unit[0] = "B "
    unit[1] = "KB"
    unit[2] = "MB"
    unit[3] = "GB"
    unit[4] = "TB"
    # idx < 4 keeps the index inside the unit table.
    for (idx = 0; value >= 1024 && idx < 4; idx++)
        value /= 1024
    return sprintf("%6s", sprintf("%.0f%s", value, unit[idx]))
}
# Poll the size of ofile once per second and redraw a one-line progress
# report until the file has reached the expected size, then replace the
# report with the word "complete".
#
#   ofile    file that is being written by a download running elsewhere
#   content  expected final size of ofile in bytes
#
# Terminates the program through doexit() when a sleep command reports
# failure. The names after the extra spaces in the parameter list are locals.
function progress(ofile, content,    elapsed, rate, start, have, pct, ratio, line, prevlen, pad, k) {
    elapsed = 0
    rate = 0
    line = ""
    start = size(ofile)
    while (1) {
        have = size(ofile)

        # Guard the division: an expected size of 0 counts as fully done.
        if (content > 0)
            ratio = have / content * 100
        else
            ratio = 100
        pct = sprintf("%7s", sprintf("%.02f%%", ratio))

        if (elapsed > 0) {
            # Move the cursor back over the previous report.
            for (k = 0; k < length(line); k++)
                printf("\b")
            # Average rate since polling started.
            rate = sprintf("%.0f", (have - start) / (elapsed + 1))
        }

        prevlen = length(line)
        line = sprintf("downloading %s (%s / %s) ~%s / s", pct, human(have), human(content), human(rate))
        printf("%s", line)

        if (elapsed > 0) {
            # If the new report is shorter, blank out the leftover tail of
            # the old one and step back to the end of the new text.
            pad = prevlen - length(line)
            for (k = 0; k < pad; k++)
                printf(" ")
            for (k = 0; k < pad; k++)
                printf("\b")
        }

        # >= rather than == so an oversized file cannot loop forever.
        if (have >= content) {
            for (k = 0; k < length(line); k++)
                printf("\b")
            break
        }
        if (system("sleep 1"))
            doexit("")
        elapsed++
    }

    # Erase the report, then step back 15 more columns and erase those too
    # before writing the final status. The 15 extra columns presumably cover
    # the "getting url... " text that the main loop prints before a download.
    for (k = 0; k < length(line); k++)
        printf(" ")
    for (k = 0; k < length(line) + 15; k++)
        printf("\b")
    for (k = 0; k < 15; k++)
        printf(" ")
    for (k = 0; k < 15; k++)
        printf("\b")
    print("complete")
}
# Download url with wget and collect the output lines that follow the
# response headers.
#
#   arr    array that is emptied and then filled, from index 0, with every
#          output line that is not treated as a response header
#   url    address to request
#   ofile  file name handed to wget -O, or "-" to have the body arrive on
#          the pipe and end up in arr
#   flags  extra text appended to the wget command line (for example " -c")
#
# Behaviour driven by the response headers printed by wget -S:
#   * status 429: wait with timeout(), then repeat the request
#   * status 416 or 406: stop and treat the transfer as finished
#   * Content-Length while writing to a real file: show progress() until the
#     file has grown to its previous size plus the announced length
# Any other failure reported by wget terminates the program via doexit().
#
# The names after the extra spaces in the parameter list are locals. In
# particular cmd is local so that helper functions which build commands of
# their own cannot overwrite it while the wget pipe is still being read.
function fetch(arr, url, ofile, flags,    n, out, code, content, cmd, precontent, dobreak, headers, done, qurl, qofile) {
    delete arr
    # period is kept as a global rather than a local so that code outside
    # this function can still see the current back-off value.
    period = 16
    headers = 1
    done = 0

    # Wrap url and ofile in shell single quotes. \047 is the octal escape for
    # a single quote; embedded quotes are rewritten so the shell never
    # interprets any character of either value.
    qurl = url
    gsub("\047", "\047\\\\\047\047", qurl)
    qofile = ofile
    gsub("\047", "\047\\\\\047\047", qofile)

    while (1) {
        # dobreak stays 1 unless a header asks for a retry or an early stop.
        dobreak = 1
        if (ofile != "-")
            precontent = size(ofile)
        else
            precontent = 0

        # 2>&1 merges the header dump from -S into the pipe read below.
        cmd = "sleep 1 && wget -q -S -U CirnOS \047" qurl "\047 -O \047" qofile "\047" flags " 2>&1"
        for (n = 0; (cmd | getline out) > 0;) {
            # Header lines are recognised by their leading space, and only
            # until the first line without one has been seen.
            if (headers && match(out, /^ /)) {
                code = extract(out, "HTTP/1.1 [0-9]*", 9)
                if (code != "") {
                    if (code == 429) {
                        period = timeout(period)
                        dobreak = 0
                        break
                    }
                    if (code == 416 || code == 406) {
                        dobreak = 0
                        done = 1
                        break
                    }
                }
                content = extract(out, "Content-Length: [0-9]*", 16)
                # There is no file to poll when the body goes to the pipe.
                if (content != "" && ofile != "-")
                    progress(ofile, precontent + content)
                continue
            }
            headers = 0
            arr[n++] = out
        }

        # close() always runs so the pipe is released before any retry; its
        # status only counts when no header already explained the outcome.
        if (close(cmd) && dobreak)
            doexit("")
        if (dobreak || done)
            break
    }
}
# Download the media file at posturl into the current directory.
#
#   postid   identifier used as the prefix of the local file name
#   posturl  address of the file; the text after its last "/" becomes the
#            rest of the local file name
#
# The file is first fetched as "tmp-<name>" with the " -c" flag passed on to
# wget, and renamed to "<name>" once fetch() returns.
#
# The names after the extra spaces in the parameter list are locals; sink
# only receives output lines from fetch() that nobody reads.
function postfrom(postid, posturl,    fname, qname, sink) {
    fname = postid "-" extract(posturl, "/[^/]*$", 1)
    fetch(sink, posturl, "tmp-" fname, " -c")

    # Wrap the name in shell single quotes. \047 is the octal escape for a
    # single quote; embedded quotes are rewritten so the shell never
    # interprets any character of a name that came from a remote page.
    qname = fname
    gsub("\047", "\047\\\\\047\047", qname)
    system("mv \047tmp-" qname "\047 \047" qname "\047")
}
# Main program: walk the result pages for the requested tags and download
# the media of every post listed on them.
#
# Inputs (set with -v on the awk command line):
#   tags   tag query, already joined with "+"
#   booru  host name prefix placed in front of ".sankakucomplex.com"
#
# For every page: fetch it, remember the address of the next page, collect
# the post ids in the order they appear, and for each post that has no local
# file named "<id>-*" yet, fetch the post page and download the first media
# address found on it. The program ends with "done" after the first page
# that does not announce a next page.
BEGIN {
    baseurl = "https://" booru ".sankakucomplex.com"
    page = 1
    print("Downloading tags " tags)
    url = baseurl "/post/index.content?tags=" tags
    while (1) {
        print("Processing page " page++)
        fetch(data, url, "-", "")

        # Only the first line of the page is searched for the next-page
        # marker; the value found becomes the query string of the next
        # request.
        nextpage = extract(data[0], "next-page-url=\"?[^\"]*", 15)
        url = baseurl "/post/index.content" nextpage

        # Collect post ids by walking the lines by index, because for-in
        # visits array entries in no particular order.
        total = 0
        delete posts
        nlines = length(data)
        for (k = 0; k < nlines; k++) {
            postid = extract(data[k], "href=\"/post/show/[0-9]*", 17)
            if (postid)
                posts[total++] = postid
        }

        for (k = 0; k < total; k++) {
            postid = posts[k]
            printf("post %" length(total) "d/%d (#%s) -- ", k + 1, total, postid)

            # Skip posts that already have a finished file. ls is used
            # because it accepts any number of names matched by the pattern.
            if (system("ls " postid "-* >/dev/null 2>&1") == 0) {
                print("complete")
                continue
            }

            printf("getting url... ")
            fetch(post, baseurl "/post/show/" postid, "-", "")

            # Look through the post page from top to bottom for the first
            # line that carries the image element.
            found = 0
            plines = length(post)
            for (j = 0; j < plines; j++) {
                line = post[j]
                if (!match(line, /id=\"?image/))
                    continue
                if (match(line, /<a.*class=\"sample/)) {
                    # A link with the sample class: download its href target.
                    posturl = "https:" extract(line, "href=\"//[^\"?]*", 6)
                    postfrom(postid, posturl)
                    found = 1
                    break
                } else if (match(line, /<(img|video)/)) {
                    # A bare img or video element: download its src.
                    posturl = "https:" extract(line, "src=\"//[^\"?]*", 5)
                    postfrom(postid, posturl)
                    found = 1
                    break
                }
            }
            # End the status line even when nothing could be downloaded.
            if (!found)
                print("no media url found")
        }

        if (nextpage == "")
            doexit("done")
    }
}
' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment