Created
November 15, 2014 08:10
-
-
Save DanielOaks/7ad32b989558897d52ec to your computer and use it in GitHub Desktop.
Discussion/coding for the 4chan archiver I had with @antonizoon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Archiver options object proposal
""" might only need to be an Archiver constructor now?
not sure, how do you mean exactly?
as I see it, the main reason to make the Options class is to simplify things internally
like passing options between the Archiver and the SiteArchivers (or whatever we do to support 4chan/Fuuka/etc)
I can certainly see doing it like that, but even then I think we'd be creating an Options object internally. Definitely different files, but subclassing from a standard BaseSiteArchiver or something similar, so those objects have the same API to the controlling Archiver object (and can be used as a simple list to iterate over, below in add_thread and download_threads)
well, that makes sense. True, that would really help to have a unified GUI.
I might start playing about with refactoring the repo
probably want to create a brand new repo, since we wrote from scratch anyway, name it BASC_Archiver
sure, I'll be reusing lots of code from BA-4chan-thread-archiver as well for the 4chan-specific archiver class
"""
class Options:
    """Holds options for Archiver.

    Args:
        path: Path option; stored as ``self.path``.
        enable_ssl: Stored as ``self.enable_ssl`` (defaults to False).
        **kwargs: Any further options; each one becomes an attribute of the
            same name on the instance.
    """

    def __init__(self, path, enable_ssl=False, **kwargs):
        self.path = path
        self.enable_ssl = enable_ssl
        # setattr() attaches an attribute by name; the set() builtin builds a
        # set and would reject these three arguments.
        for arg_name, value in kwargs.items():
            setattr(self, arg_name, value)
class Archiver:
    """Front end that fans thread URLs out to the site-specific archivers.

    One instance of every class in the module-level ``default_archivers``
    list is created and given the shared options object.
    """

    def __init__(self, options=None):
        """Create the archiver and its site archivers.

        Args:
            options: Options object passed to every site archiver.  When
                omitted, a default Options object is created.
        """
        if options is None:
            # Options requires a path, so a bare Options() raises TypeError.
            # NOTE(review): defaulting to the current working directory is an
            # assumption -- confirm the intended default location.
            import os
            options = Options(os.getcwd())
        self.options = options
        self.archivers = [archiver(self.options) for archiver in default_archivers]

    def add_thread(self, url):
        """Archive the given thread if possible.

        Every site archiver whose ``url_valid(url)`` is truthy receives the
        URL through its ``add_thread``.

        Returns:
            True if at least one archiver accepted the URL, otherwise False
            (after printing a notice).
        """
        url_archived = False
        for archiver in self.archivers:
            if archiver.url_valid(url):
                archiver.add_thread(url)
                url_archived = True

        if not url_archived:
            # One pre-formatted argument prints the same text on Python 2 and 3.
            print('We could not find a valid archiver for: %s' % (url,))
        return url_archived

    def download_threads(self):
        """Download all the threads we currently hold."""
        for archiver in self.archivers:
            archiver.download_threads()
# site-specific archivers
# NOTE(review): this list is evaluated when the module is loaded, so it needs
# to sit below the FourChanSiteArchiver and FuukaSiteArchiver class
# definitions; at this position those names are not bound yet.
default_archivers = [FourChanSiteArchiver, FuukaSiteArchiver]
class BaseSiteArchiver:
    """Common interface shared by the site-specific archivers.

    Subclasses set their own ``name`` and override every method below that
    raises.  Instances keep their threads in ``self.threads`` (a dict) and
    the shared options object in ``self.options``.
    """

    name = 'base'

    def __init__(self, options):
        """Store the options and start with an empty thread dict.

        Raises:
            Exception: if the class still carries the base ``name``, i.e. it
                was not subclassed.
        """
        # ``name`` is a class attribute, so it has to be read through self;
        # a bare ``name`` is an unbound local/global inside this method.
        if self.name == 'base':
            raise Exception('BaseSiteArchiver must be subclassed!')
        self.threads = {}
        self.options = options

    def url_valid(self, url):
        """Return true if the given URL is for my site."""
        raise NotImplementedError('override this method')

    def add_thread(self, url):
        """Try to add the given thread to our internal list."""
        raise NotImplementedError('override this method')

    def download_threads(self):
        """Download all the threads we currently hold."""
        raise NotImplementedError('override this method')

    def _download_thread(self, thread_info):
        """Download the given thread, from the thread info."""
        raise NotImplementedError('override this method')
# Built on py4chan -- specifically our improved BASC-py4chan fork:
# https://github.com/bibanon/BASC-py4chan
class FourChanSiteArchiver(BaseSiteArchiver):
    """Site archiver for 4chan; only the ``name`` is set so far."""

    name = '4chan'
# just an example concept for now, since we have no Python wrapper for the Fuuka API yet
class FuukaSiteArchiver(BaseSiteArchiver):
    """Concept sketch of a site archiver for Fuuka.

    URL recognition, URL parsing and the download itself are still
    placeholders; only the bookkeeping around ``self.threads`` is real.
    """

    name = 'fuuka'

    def __init__(self, options):
        BaseSiteArchiver.__init__(self, options)

    def url_valid(self, url):
        """Return True if this is a Fuuka thread URL."""
        # TODO: use regexes, etc to determine this.  Until that exists no URL
        # is claimed (the earlier stub fell through and returned None).
        return False

    def add_thread(self, url):
        """Add thread to our internal list, keyed by its thread id."""
        board_name, thread_id, thread_dir = self._parse_thread_url(url)
        self.threads[thread_id] = {
            'board': board_name,
            'dir': thread_dir,
            'id': thread_id,
        }

    def _parse_thread_url(self, url):
        """Return ``(board_name, thread_id, thread_dir)`` for a thread URL.

        Placeholder for the "url magic" of the sketch: how a Fuuka URL maps
        to these three values has not been decided yet.
        """
        raise NotImplementedError('Fuuka thread URL parsing is not implemented yet')

    def download_threads(self):
        """Download all the threads in our list."""
        for thread_info in self.threads.values():
            self._download_thread(thread_info)

    def _download_thread(self, thread_info):
        """Download the given thread, from the thread info."""
        # do all the complex downloading junk here
        pass
# CLI-specific script file
# NOTE(review): prototype only -- this call runs at import time, above the
# `from docopt import docopt` line further down, and the placeholder text has
# no "Usage:" section.  Confirm whether these two statements should stay.
__doc__ = "BA-4chan-thread-archiver. Uses the 4chan API (with the py4chan wrapper) etc..."
arguments = docopt(__doc__)
# nicer to do the docopt stuff this way (setting the docopt string as __doc__)
# because then the docstring shows up properly in help()
# I'm just doing some prototyping up here, don't mind me
"""
I once thought up a BASC_Archiver library (akin to py-4chan, or basically an extension of it)
This way, we could develop multiple interfaces for the same functions, and third parties could
import it to create interesting new scripts (one that archives certain posts, etc.)
Also might be a good idea to make it usable with Fuuka
making it extensible would be nice
Ah yeah, that Options class can be shared between CLI and GUI clients
"""
# CLI Interface: BA-4chan-thread-archiver
"""
Designed to be a fully compatible, drop-in replacement for the current thread archiver. I made a great example here:
https://github.com/bibanon/BASC_Archiver/blob/master/4chan-thread-archiver
such a script would only have a main method and some CLI interface helper methods
Make sure to update it to work with the class you are making
"""
import os
import re
import time

from docopt import docopt

import BASC_Fourchan.Archiver  # class for archiving from 4chan

"""=== Docopt Arguments and Documentation ==="""
# docopt usage text.  The "Options:" header needs its colon for docopt to
# find the section (and with it the [default: 20] for --delay), and every
# flag the script reads has to appear in the usage pattern to be accepted.
doc = """BA-4chan-thread-archiver. Uses the 4chan API (with the py4chan wrapper)
to download thread images and/or thumbnails, along with thread HTML, JSON,
and a list of referenced external links.

Usage:
  4chan-thread-archiver <url> [--path=<string>] [--runonce] [--silent] [--delay=<int>] [--nothumbs] [--thumbsonly] [--enablessl]
  4chan-thread-archiver -h | --help
  4chan-thread-archiver -v | --version

Options:
  --path=<string>   Base folder the thread is dumped into
  --runonce         Downloads the thread as it is presently, then exits
  --silent          Suppresses mundane printouts, prints what's important
  --nothumbs        Don't download thumbnails
  --thumbsonly      Download thumbnails, no images
  --enablessl       Download using HTTPS
  --delay=<int>     Delay between thread checks [default: 20]
  -h --help         Show help
  -v --version      Show version
"""
# Dhole's awesome 4chan URL check regex function
def check_url(url):
    """Make sure that the given URL is a valid 4chan thread URL.

    Originates from The Chandler by Dhole.

    Args:
        url: The string to inspect.

    Returns:
        The first part of ``url`` that matches the thread-URL pattern, or
        "" when nothing matches.
    """
    # Raw string with the literal dots escaped, so that ".php" / ".html" (and
    # "boards.") only match an actual dot instead of any character.
    match = re.search(
        r"https?://(?:boards\.)?.*/*/(?:res|thread)/[0-9]+(?:\.php|\.html)?",
        url,
    )
    return match.group(0) if match else ""
def timestamp():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string.

    `Timestamp` <http://www.interfaceware.com/manual/timestamps_with_milliseconds.html>_
    """
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def main(args=None):
    """Dump one 4chan thread, then keep re-checking it for new posts.

    Args:
        args: Parsed docopt argument dictionary.  When omitted, the command
            line is parsed here with ``docopt(doc, version=0.3)``.

    Raises:
        SystemExit: with status 0 when the URL is rejected by ``check_url``,
            when the thread is reported as 404'ed, or when Ctrl-C is pressed.

    NOTE(review): ``_TAG`` and ``_DEFAULT_FOLDER`` are not defined anywhere in
    the visible file; they have to exist at module level for this to run.
    """
    if args is None:
        args = docopt(doc, version=0.3)

    url = args.get('<url>')

    # Stop the script if the given URL is malformed.  This check comes before
    # the URL is split, otherwise a malformed URL dies with an IndexError
    # instead of the message below.
    if check_url(url) == "":
        print(_TAG + "The URL is invalid, or it isn't a 4chan thread URL.")
        raise SystemExit(0)

    # Copy data from docopt arguments
    url_parts = url.split('/')
    board = url_parts[3]
    thread = url_parts[5]
    path = args.get('--path')
    runonce = args.get('--runonce', False)
    # Fall back to the documented 20 seconds when docopt supplied no default.
    delay = int(args.get('--delay') or 20)
    nothumbs = args.get('--nothumbs', False)
    thumbsonly = args.get('--thumbsonly', False)
    enablessl = args.get('--enablessl', False)
    # NOTE(review): --silent is documented but nothing here acts on it yet.

    # Set a default path if none is given.  This has to be settled before the
    # destination directory is built, since os.path.join() rejects None.
    if path is None:
        path = os.path.join(os.getcwd(), _DEFAULT_FOLDER)

    # Set destination directory
    dst_dir = os.path.join(path, board, thread)

    # Initialize Archiver object
    curr_archiver = BASC_Fourchan.Archiver(board, thread, https=enablessl)

    try:
        # dump thread for the first time
        print(_TAG + "Dumping the thread...")
        curr_archiver.dump(dst_dir, nothumbs, thumbsonly)
        if runonce:
            return

        while True:
            # Wait before checking the thread again
            print("\n" + _TAG + "Waiting %s seconds before retrying (Type Ctrl-C to stop)" % delay)
            time.sleep(delay)

            # NOTE(review): ``curr_thread`` is never assigned in the visible
            # file; it presumably has to come from the archiver/py4chan
            # thread object -- confirm and bind it before this loop.
            if curr_thread.is_404:
                # Stop when thread gets 404'ed
                print(_TAG + "%s - [Thread 404'ed or Connection Lost]" % timestamp())
                print(" :: Dump complete. To resume dumping the same thread,\nrun this script again.")
                raise SystemExit(0)

            # Update thread and check if new replies have appeared
            new_replies = curr_thread.update()
            if new_replies == 0:
                print(_TAG + "%s - [No new posts.]" % timestamp())
                continue
            print(_TAG + "%s - [%s new post(s) found!]" % (timestamp(), new_replies))

            # If all tests are OK, dump thread again
            curr_archiver.dump(dst_dir, nothumbs, thumbsonly)
    except KeyboardInterrupt:
        # Stop the loop when [Ctrl-C] is pressed
        print("\n")
        print(" :: Dump complete. To resume dumping the same thread,\nrun this script again.")
        raise SystemExit(0)


# Use docopt to get arguments, and run main function
if __name__ == '__main__':
    # Parse the command line against the usage text in ``doc`` and hand the
    # resulting dictionary to main().
    args = docopt(doc, version=0.3)
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment