Skip to content

Instantly share code, notes, and snippets.

@nucular
Created January 8, 2015 14:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nucular/a48224bc40f312fc1988 to your computer and use it in GitHub Desktop.
Save nucular/a48224bc40f312fc1988 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# python3 chanrip.py --help
"""
Chanrip
=======
Rips a thread from 4chan or 8chan, downloading all posted files and saving all
replies to an HTML file. Optionally keeps monitoring the thread, downloading new
replies automatically, and can download thumbnails as well.
TODO:
- De-spaghettify and cleanup (it's a real mess currently)
- Make the chans/imageboards separate classes inheriting from an abstract class
- Themes for the output html?
- Support more imageboards
LICENSE:
The MIT License (MIT)
Copyright (c) 2014/2015 nucular
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import os
import sys
import time, datetime
import re
import threading
import urllib.request
import urllib.error
import argparse
import json
# Per-imageboard configuration, keyed by a human-readable board-software name.
# Ripper picks the first entry whose "url" pattern matches the thread URL.
#
# Keys of every entry:
#   url       - compiled regex searched in the thread URL; its groups hold the
#               board name and the thread number
#   board     - index into the regex groups of the board name
#   thread    - index into the regex groups of the thread number
#   api       - format template of the thread's JSON API URL
#   file      - format template of the URL a posted file is downloaded from
#   thumb     - format template of the URL a thumbnail is downloaded from
#   filename  - format template of the local name a posted file is saved as
#   thumbname - format template of the local name a thumbnail is saved as
#   gifthumbs - when true, thumbnails of ".gif" files get the extension ".gif"
#               instead of ".jpg" (the "{thext}" template field)
CHANS = {
"4chan": {
"url": re.compile(r"boards\.4chan\.org\/(\w+)\/thread\/(\d+)"),
"board": 0,
"thread": 1,
"api": "https://a.4cdn.org/{board}/thread/{thread}.json",
"file": "https://i.4cdn.org/{board}/{tim}{ext}",
"thumb": "https://0.t.4cdn.org/{board}/{tim}s.jpg",
"filename": "{tim}-{filename}{ext}",
"thumbname": "{tim}-{filename}{thext}",
"gifthumbs": False
},
"8chan": {
"url": re.compile(r"8chan\.co\/(\w+)\/res\/(\d+)\.html"),
"board": 0,
"thread": 1,
"api": "https://8chan.co/{board}/res/{thread}.json",
"file": "https://media.8chan.co/{board}/src/{tim}{ext}",
"thumb": "https://media.8chan.co/{board}/thumb/{tim}{thext}",
"filename": "{tim}-{filename}{ext}",
"thumbname": "{tim}-{filename}{thext}",
"gifthumbs": True
}
}
# Default User-Agent header sent with every request unless the caller
# supplies another one.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
# Stylesheet inlined into the generated index.html (the "{css}" field of
# HTML_MAIN). Every declaration must be terminated by ";" - a stray ":" or an
# empty value makes browsers drop the declaration.
CSS_TOMORROW = """
body {
background: #1d1f21 none;
color: #C5C8C6;
font-family: arial,helvetica,sans-serif;
font-size: 10pt;
}
a:link, a:visited {
color: #81a2be;
text-decoration: underline;
}
a:link:hover {
color: #5F89AC;
}
span.deadlink {
color: #f22;
text-decoration: line-through;
}
header {
margin: 1em 0;
text-align: center;
}
header h1 {
font-family: tahoma;
letter-spacing: -2px;
font-size: 20pt;
margin: 0;
}
header div.subtitle {
font-size: 8pt;
}
header div.subtitle a {
color: #C5C8C6;
text-decoration: none;
}
header div.subtitle a:hover {
text-decoration: underline;
}
hr {
border: 0;
border-top: 1px solid #282a2e;
height: 1px;
clear: left;
}
p.fileinfo {
display: block;
margin: 0 0 0 20px;
}
.post {
max-width: 80%;
}
.file {
float: left;
margin-right: 2px;
width: 210px;
}
.file:not(.multifile) {
float: none;
}
.unimportant, .unimportant * {
font-size: 10px;
}
a .post-image {
float: left;
padding: 5px;
margin: 0 20px 0 0;
max-width: 98%;
width: auto;
height: auto;
max-height: 200px;
max-width: 200px;
}
a .full-image {
position: absolute;
left: 5px;
padding: 5px;
margin: 0 20px 0 0;
max-width: 98%; max-width: calc(100% - 20px);
}
div.post.op {
margin-right: 20px;
margin-bottom: 5px;
}
div.post {
padding-left: 20px;
clear: both;
}
div.post-hover {
position: absolute;
margin: 0 !important;
box-shadow: 0px 3px 10px rgba(0,0,0,0.5);
}
p.intro {
clear: none;
margin: 0.5em 0;
padding: 0;
padding-bottom: 0.2em;
}
p.intro span.subject {
color: #b294bb;
font-weight: bold;
}
p.intro span.name {
color: #C5C8C6;
font-weight: bold;
}
p.intro span.post_no a {
color: #C5C8C6;
margin: 0;
}
p.intro a {
text-decoration: none;
}
div.post div.body {
clear: both;
word-wrap: break-word;
white-space: pre-wrap;
}
div.post.reply {
display: inline-block;
background-color: #282a2e;
border: 1px solid #282a2e;
margin-bottom: 2px;
margin-left: 16px;
margin-top: 2px;
max-width: 94%; max-width: calc(100% - 16px);
padding: 0.2em 0.3em 0.5em 0.6em;
}
div.post.reply p {
margin: 0.3em 0 0 0;
}
div.post.reply div.body {
margin-left: 1.8em;
margin-top: 0.8em;
padding-right: 3em;
padding-bottom: 0.3em;
}
span.quote {
color: #adbd68;
}
"""
# Script inlined into the generated index.html (the "{js}" field of
# HTML_MAIN). It uses jQuery's "$", which HTML_MAIN loads right before it.
#
# - Clicking a ".post-image-link" (except with the middle button) toggles an
#   inline ".full-image" element: <video> for .mp4/.webm links, <object> for
#   .swf links, <img> otherwise. The scroll position at expansion time is
#   stored on the link and restored when the element is collapsed again.
# - Hovering a ".post-reply" or ".quotelink" clones the element its href
#   points at, appends the clone to <body> as ".post-hover" at the link's
#   position, and removes the clone once the mouse leaves the clone.
#
# The doubled backslashes are Python escapes: the emitted JavaScript contains
# \" inside its double-quoted string literals.
JS_MAIN = """
String.prototype.endsWith = function(suffix) {
return this.indexOf(suffix, this.length - suffix.length) !== -1;
};
$(".post-image-link").bind("click", function(e) {
if (e.which == 2)
return;
e.preventDefault();
var thumb = $(this).find(".post-image");
if ($(this).hasClass("expanded")) {
$(this).find(".full-image").remove();
$(this).removeClass("expanded");
$(window).scrollTop($(this).attr("data-scrolltop"));
} else {
var href = $(this).attr("href");
if (href.endsWith(".mp4") || href.endsWith(".webm"))
$("<video src=\\"" + $(this).attr("href") + "\\" class=\\"full-image\\" autoplay controls></video>").appendTo(this);
else if (href.endsWith(".swf"))
$("<object src=\\"" + $(this).attr("href") + "\\" class=\\"full-image\\"></object>").appendTo(this);
else
$("<img src=\\"" + $(this).attr("href") + "\\" class=\\"full-image\\">").appendTo(this);
$(this).addClass("expanded");
$(this).attr("data-scrolltop", $(window).scrollTop());
}
});
$(".post-reply,.quotelink").bind("mouseenter", function(e) {
var p = $($(this).attr("href"));
if (p) {
p.clone().appendTo("body")
.addClass("post-hover")
.css($(this).position())
.bind("mouseleave", function(e) {
$(this).remove();
});
}
});
"""
# Page skeleton of the generated index.html, filled in with str.format().
# Fields: {url} (original thread URL), {css} (inline stylesheet), {board}
# (board name shown in the header), {thread} (rendered posts) and {js}
# (inline script, placed after the jQuery <script> tag it depends on).
HTML_MAIN = """<!-- Thread ripped from {url} using ChanRip -->
<html>
<head>
<style>{css}</style>
</head>
<body>
<header>
<h1>/{board}/</h1>
<div class="subtitle"><a href="{url}">{url}</a></div>
</header>
<hr/>
<div class="thread">
{thread}
</div>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
<script>
{js}
</script>
</body>
</html>
"""
# Markup for one attached file, filled in with str.format().
# Fields: {path} (link target of the full file), {name} (displayed file name),
# {size} (human-readable size), {w}/{h} (dimensions), {origname} (name the
# file was uploaded with) and {thumbpath} (source of the preview image).
HTML_IMAGE = """
<p class="fileinfo">File <a href="{path}">{name}</a>
<span class="unimportant">({size}, {w}x{h}, {origname})</span>
</p>
<a href="{path}" target="_blank" class="post-image-link">
<img class="post-image" src="{thumbpath}">
</a>
"""
# Markup for the opening post, filled in with str.format().
# Fields: {no} (post number, also used for the "p<no>" anchor id), {subject},
# {name}, {uid} (poster id text, may be empty), {date}, {replies} (links to
# the posts quoting this one) and {body} (comment HTML).
# Unlike HTML_REPLY there is no {files} field; the caller emits the file
# markup in front of this block.
HTML_OP = """
<div class="post op" id="p{no}">
<p class="intro">
<span class="subject">{subject}</span>
<span class="name">{name}&nbsp;{uid}</span>
<time datetime="{date}">{date}</time>
<span class="post_no">No.<a href="#p{no}">{no}</a></span>
<span class="post_replies">{replies}</span>
</p>
<div class="body">{body}</div>
</div>
"""
# Markup for one reply, filled in with str.format().
# Fields: {no} (post number, also used for the "p<no>" anchor id), {name},
# {uid} (poster id text, may be empty), {date}, {replies} (links to the posts
# quoting this one), {files} (markup of the attached files, may be empty) and
# {body} (comment HTML).
HTML_REPLY = """
<div class="post reply" id="p{no}">
<p class="intro">
<span class="name">{name}&nbsp;{uid}</span>
<time datetime="{date}">{date}</time>
<span class="post_no">No.<a href="#p{no}">{no}</a></span>
<span class="post_replies">{replies}</span>
</p>
<div class="files">
{files}
</div>
<div class="body">{body}</div>
</div>
"""
def sizeof_fmt(num, suffix='B'):
    """Render a byte count with binary (1024-based) unit prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to ``Zi`` are exhausted, in which case ``Yi`` is used.

    Args:
        num: the quantity to format (may be negative).
        suffix: unit appended after the prefix, ``'B'`` by default.

    Returns:
        A string such as ``"1.5KiB"`` with one decimal place.
    """
    value = num
    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefix, suffix)
        value = value / 1024.0
    # Anything still >= 1024 Zi is reported in Yi, however large.
    return "%.1f%s%s" % (value, 'Yi', suffix)
def chunker(res, chunksize=4096, hook=None, store=False):
    """Read a response in chunks, optionally reporting progress and buffering.

    Args:
        res: response-like object offering ``info()`` (returning a mapping
            with ``get``) and ``read(size)``.
        chunksize: maximum number of bytes requested per ``read`` call.
        hook: optional callable ``hook(chunk, bytesread, totalbytes, percent)``
            invoked after every non-empty chunk. ``totalbytes`` is 0 and
            ``percent`` is the string ``"?"`` when the size is unknown;
            otherwise ``percent`` is an int.
        store: when true, collect everything read and return it.

    Returns:
        ``True`` if reading was aborted by ``KeyboardInterrupt``; otherwise
        the collected ``bytes`` when ``store`` is true, else ``None``.
    """
    # A missing, empty or malformed Content-Length only means the total is
    # unknown; it must not abort the download.
    try:
        totalbytes = int((res.info().get("Content-Length") or "0").strip())
    except ValueError:
        totalbytes = 0

    bytesread = 0
    # Collect chunks in a list and join once; repeated bytes concatenation
    # copies the whole buffer on every chunk.
    chunks = []
    try:
        while True:
            chunk = res.read(chunksize)
            if not chunk:
                break
            bytesread += len(chunk)
            if store:
                chunks.append(chunk)
            if hook:
                if totalbytes == 0:
                    percent = "?"
                else:
                    percent = int(bytesread / totalbytes * 100)
                hook(chunk, bytesread, totalbytes, percent)
    except KeyboardInterrupt:
        # Callers test the result for truthiness to detect a user abort.
        return True

    if store:
        return b"".join(chunks)
    return None
class UserAgentOpener(urllib.request.FancyURLopener):
    """URL opener that identifies itself with a caller-supplied User-Agent.

    NOTE: ``FancyURLopener`` has been deprecated since Python 3.3 and is
    removed in Python 3.14; new code should build a ``urllib.request.Request``
    and set the ``User-Agent`` header on it instead.

    Args:
        useragent: value sent as the ``User-Agent`` header.
        *args, **kwargs: forwarded unchanged to ``FancyURLopener``.
    """

    def __init__(self, useragent, *args, **kwargs):
        # ``version`` has to be set before the base initialiser runs, because
        # that is where the default User-Agent header is built from it.
        self.version = useragent
        # Without this call the opener lacks its headers and proxy settings
        # and fails on first use.
        super().__init__(*args, **kwargs)
class Ripper(object):
    """Rip one imageboard thread into a local directory.

    A ripper fetches the thread's JSON through the board API, downloads the
    attached files (and optionally their thumbnails) that are not on disk yet
    and renders all posts into ``index.html`` inside the target directory.

    Attributes:
        url: the thread URL passed to the constructor.
        chan: the matching entry of ``CHANS``.
        board: board name parsed from the URL.
        threadid: thread number (string) parsed from the URL.
        thread: the thread number until the first successful ``fetch()``;
            afterwards the decoded API response (a dict with a ``"posts"``
            list).
        apiurl: URL of the thread's JSON document.
        directory: target directory; thumbnails go to ``_thumbs`` inside it.
        postcount: number of posts handled by the last completed ``rip()``.
    """

    # One ">>123" quote inside an HTML-escaped comment; the greedy digits keep
    # ">>1234" from being mistaken for a quote of post 123.
    _QUOTE_RE = re.compile(r"&gt;&gt;(\d+)")
    # Board-relative reply links that get rewritten to in-page anchors.
    _REPLY_LINK_RE = re.compile(r'href="\/\w+\/res\/\d+\.html\#(\d+)"')
    _DATE_FORMAT = "%m/%d/%y (%a) %H:%M:%S"

    def __init__(self, url, directory, verbose=False, useragent=USER_AGENT, thumbs=True):
        """Parse ``url`` and prepare ``directory``.

        Args:
            url: link to the thread; must match one of the ``CHANS`` patterns.
            directory: folder to rip into, created when missing.
            verbose: stored on the instance; not consulted by this class.
            useragent: ``User-Agent`` header sent with every request.
            thumbs: whether thumbnails are downloaded and used as previews.

        Raises:
            NotImplementedError: if no entry of ``CHANS`` matches ``url``.
        """
        self.verbose = verbose
        self.useragent = useragent
        self.thumbs = thumbs
        self.ripped = []
        self.postcount = 0

        # Validate the URL first so an unsupported link does not leave empty
        # directories behind.
        self.chan = None
        match = None
        for chan in CHANS.values():
            match = chan["url"].search(url)
            if match:
                self.chan = chan
                break
        if not self.chan:
            raise NotImplementedError("{} is not supported (yet)".format(url))

        self.url = url
        groups = match.groups()
        self.board = groups[self.chan["board"]]
        self.threadid = groups[self.chan["thread"]]
        # ``thread`` is replaced by the API response in fetch(); the id stays
        # available as ``threadid``.
        self.thread = self.threadid

        os.makedirs(directory, exist_ok=True)
        if thumbs:
            os.makedirs(os.path.join(directory, "_thumbs"), exist_ok=True)
        self.directory = directory

        self.apiurl = self.chan["api"].format(board=self.board, thread=self.threadid)
        print("API: " + self.apiurl)

    def fetch(self):
        """Download and decode the thread JSON into ``self.thread``.

        Returns:
            ``True`` when the thread could not be fetched (request error,
            user interrupt or undecodable response), ``None`` on success.
        """
        def hook(chunk, read, total, percent):
            sys.stdout.write("\rFetching thread... {}/{} {}%".format(read, total, percent))
            # "\r" does not trigger a flush on a line-buffered terminal.
            sys.stdout.flush()

        req = urllib.request.Request(self.apiurl)
        req.add_header("User-Agent", self.useragent)
        try:
            with urllib.request.urlopen(req) as res:
                data = chunker(res, chunksize=128, hook=hook, store=True)
        except urllib.error.URLError as e:
            print(e)
            return True
        print("")
        if data is True:
            # chunker() reports a KeyboardInterrupt as True instead of bytes.
            print("Interrupted by user")
            return True
        try:
            self.thread = json.loads(data.decode("utf-8"))
        except ValueError as e:
            # Covers both undecodable bytes and malformed JSON.
            print(e)
            return True
        return None

    def downloadFile(self, name, path, url, no=1, of=1):
        """Download ``url`` to ``path`` while printing a progress line.

        Args:
            name: display name used in the progress line.
            path: local destination.
            url: remote source.
            no: 1-based position of this file in the current batch.
            of: size of the current batch.

        Returns:
            ``True`` when the user interrupted the download (the partial file
            is removed), ``None`` otherwise, including when the request
            failed and nothing was written.
        """
        req = urllib.request.Request(url)
        req.add_header("User-Agent", self.useragent)
        try:
            res = urllib.request.urlopen(req)
        except urllib.error.URLError as e:
            print(e)
            return None
        with res, open(path, "wb") as s:
            def hook(chunk, read, total, percent):
                sys.stdout.write("\r{}/{} {}... {}/{} {}%".format(no, of, name, read, total, percent))
                sys.stdout.flush()
                s.write(chunk)
            interrupted = chunker(res, hook=hook, chunksize=16384)
        if interrupted:
            print("\nInterrupted by user")
            # Remove only after the file is closed; deleting an open file
            # fails on Windows.
            os.remove(path)
            return True
        print("")
        return None

    def _fields(self, entry):
        """Return the template fields for one file entry (post or extra file)."""
        gif = self.chan["gifthumbs"] and entry["ext"] == ".gif"
        return dict(
            entry,
            board=self.board,
            thread=self.threadid,
            thext=".gif" if gif else ".jpg",
        )

    def _localName(self, key, entry):
        """Return the local file name for ``entry`` built from template ``key``."""
        name = self.chan[key].format(**self._fields(entry))
        # The uploader-chosen file name comes from the remote API; neutralise
        # path separators so it cannot point outside the target directory.
        return re.sub(r"[\\/]", "_", name)

    def _queueEntry(self, entry, files, thumbs):
        """Append the missing downloads for one file entry to the queues."""
        fields = self._fields(entry)
        name = self._localName("filename", entry)
        path = os.path.join(self.directory, name)
        if not os.path.exists(path):
            files.append([name, path, self.chan["file"].format(**fields)])
        if self.thumbs:
            name = self._localName("thumbname", entry)
            path = os.path.join(self.directory, "_thumbs", name)
            if not os.path.exists(path):
                thumbs.append([name, path, self.chan["thumb"].format(**fields)])

    def _renderFile(self, entry, multifile):
        """Return the markup of one attached file."""
        if self.thumbs:
            thumbpath = os.path.join("_thumbs", self._localName("thumbname", entry))
        else:
            # Without thumbnails the full file doubles as its own preview.
            thumbpath = self._localName("filename", entry)
        css = "file multifile" if multifile else "file"
        return "\n<div class=\"{}\">".format(css) + HTML_IMAGE.format(
            path=self._localName("filename", entry),
            thumbpath=thumbpath,
            name=str(entry["tim"]) + entry["ext"],
            size=sizeof_fmt(entry["fsize"]),
            w=entry["w"], h=entry["h"],
            origname=entry["filename"] + entry["ext"],
        ) + "</div>"

    def _renderFiles(self, post):
        """Return the markup of all files of ``post`` ("" when it has none)."""
        if "tim" not in post:
            return ""
        multifile = "extra_files" in post
        parts = ["<div class=\"files\">", self._renderFile(post, multifile)]
        if multifile:
            parts.extend(self._renderFile(extra, True) for extra in post["extra_files"])
        parts.append("</div>")
        return "".join(parts)

    def _renderPost(self, template, post, **extra):
        """Fill ``template`` with the fields shared by OP and reply markup."""
        stamp = datetime.datetime.fromtimestamp(post["time"])
        replies = " ".join(
            "<a class=\"post-reply\" href=\"#p{no}\">&gt;&gt;{no}</a>".format(no=no)
            for no in post["replies"]
        )
        return template.format(
            name=post.get("name", ""),
            uid=("(ID: " + post["id"] + ")") if "id" in post else "",
            date=stamp.strftime(self._DATE_FORMAT),
            no=post["no"],
            body=post.get("com") or "",
            replies=replies,
            **extra
        )

    def writeThread(self):
        """Render the fetched thread to ``index.html`` in the target directory.

        Expects ``rip()`` to have run, which gives every post its
        ``"replies"`` list.
        """
        print("Writing thread to index.html...")
        posts = self.thread["posts"]
        op = posts[0]
        parts = [
            self._renderFiles(op),
            self._renderPost(HTML_OP, op, subject=op.get("sub") or ""),
        ]
        for post in posts[1:]:
            parts.append(self._renderPost(HTML_REPLY, post, files=self._renderFiles(post)))
            parts.append("<br/>")
        # The page declares no charset, so write UTF-8 with a BOM: browsers
        # then decode it correctly and non-ASCII posts cannot fail to encode
        # under a narrow platform default encoding.
        with open(os.path.join(self.directory, "index.html"), "w", encoding="utf-8-sig") as s:
            s.write(HTML_MAIN.format(
                url=self.url,
                css=CSS_TOMORROW,
                js=JS_MAIN,
                board=self.board,
                thread="".join(parts)
            ))

    def rip(self):
        """Fetch the thread, download what is new and rewrite ``index.html``.

        Does nothing further when fetching fails or when the thread has no
        more posts than the previous rip saw.
        """
        if self.fetch():
            return
        posts = self.thread["posts"]
        postcount = len(posts)
        if postcount <= self.postcount:
            print("No new posts")
            return

        # Map all replies: every post gets the numbers of the posts (at or
        # after its own position) that quote it.
        position = {}
        for i, post in enumerate(posts):
            position.setdefault(post["no"], i)
            # "replies" may be absent or hold a non-list value.
            if not isinstance(post.get("replies"), list):
                post["replies"] = []
        for i, post in enumerate(posts):
            for quoted in self._QUOTE_RE.findall(post.get("com") or ""):
                target = position.get(int(quoted))
                if target is None or target > i:
                    continue
                replies = posts[target]["replies"]
                if post["no"] not in replies:
                    replies.append(post["no"])

        # Fix some reply links so they point at anchors inside index.html.
        for post in posts:
            if post.get("com"):
                post["com"] = self._REPLY_LINK_RE.sub(
                    "href=\"#p\\1\" class=\"quotelink\"", post["com"])

        # Only posts not seen by an earlier rip are scanned for downloads.
        files = []
        thumbs = []
        for post in posts[self.postcount:]:
            if "tim" not in post:
                continue
            self._queueEntry(post, files, thumbs)
            for extra in post.get("extra_files") or ():
                # Each extra file carries its own extension, which decides
                # its thumbnail extension.
                self._queueEntry(extra, files, thumbs)

        print("{} not indexed post(s), downloading {} file(s)".format(postcount - self.postcount, len(files)))
        for i, v in enumerate(files):
            if self.downloadFile(*v, no=i + 1, of=len(files)):
                break
        if self.thumbs and len(thumbs) > 0:
            print("Downloading {} thumbnail(s)".format(len(thumbs)))
            for i, v in enumerate(thumbs):
                if self.downloadFile(*v, no=i + 1, of=len(thumbs)):
                    break
        self.postcount = postcount
        self.writeThread()

    def monitor(self, delay):
        """Call ``rip()`` every ``delay`` seconds until interrupted.

        Args:
            delay: seconds between checks; values below 1 are raised to 1 so
                the loop can never poll the server without pausing.

        This method only returns by way of an exception such as
        ``KeyboardInterrupt``.
        """
        if delay < 1:
            delay = 1
        timer = delay
        # Wide enough to blank the longest "Monitoring... x/y" line.
        cleartimer = " " * (len(str(timer)) * 2 + 15)
        while True:
            while timer > 0:
                sys.stdout.write("\r" + cleartimer)
                sys.stdout.write("\rMonitoring... {}/{}".format(timer, delay))
                sys.stdout.flush()
                time.sleep(1)
                timer -= 1
            timer = delay
            print("")
            self.rip()
def main(argv=None):
    """Command-line entry point: rip a thread and optionally keep monitoring it.

    Args:
        argv: argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        # Adjacent literals instead of a backslash continuation inside the
        # string, which glues the two lines together.
        description="Thread ripper for image boards implementing the 4chan/8chan "
                    "APIs with optional monitoring."
    )
    parser.add_argument("URL", help="a link to the board thread")
    parser.add_argument("DIR", help="a folder to rip to")
    parser.add_argument("-m", "--monitor", action="store_true", help="keep running and regularly check thread for changes")
    parser.add_argument("-d", "--delay", type=int, metavar="SEC", default=20, help="delay between monitoring checks, defaults to 20")
    parser.add_argument("-v", "--verbose", action="store_true", help="show some debug logging")
    parser.add_argument("-u", "--useragent", default=USER_AGENT, help="the user agent to use on requests")
    parser.add_argument("--nothumbs", action="store_true", default=False, help="don't download thumbs with the files")
    args = parser.parse_args(argv)

    try:
        ripper = Ripper(args.URL, args.DIR, verbose=args.verbose, useragent=args.useragent, thumbs=not args.nothumbs)
    except NotImplementedError as e:
        # Unsupported URL: report it as a usage error instead of a traceback.
        parser.error(str(e))

    # Ctrl-C is the normal way to stop, during the first rip as well as
    # while monitoring.
    try:
        ripper.rip()
        if args.monitor:
            ripper.monitor(args.delay)
    except KeyboardInterrupt:
        pass
    print("\nDone!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment