Last active
December 15, 2015 11:08
-
-
Save daramcq/5250314 to your computer and use it in GitHub Desktop.
Work in progress script to download threads from 4chan using their API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import requests | |
import json | |
import csv | |
import datetime | |
import os | |
from PIL import Image | |
from StringIO import StringIO | |
import fcntl | |
import sys | |
import time | |
def writeThreadToFile(num, j_obj):
    """Save one thread to ``<num>.json``, relative to the working directory.

    num: thread number (anything ``str()`` can render); used as the file stem.
    j_obj: the decoded thread object, as produced by ``json.loads``.

    The object is serialised with ``json.dump`` so the file holds valid JSON
    that can be loaded again. Writing ``str(j_obj)`` stored the Python repr
    (single quotes, ``None``, ``u'...'``), which JSON parsers reject.
    """
    # Text mode: json.dump emits str, which a binary-mode file rejects on
    # Python 3. The with-block closes the file even if serialisation fails.
    with open(str(num) + '.json', 'w') as f:
        json.dump(j_obj, f)
def writeThreadsToFile(threadObject):
    """Write every thread held in the mapping to its own file.

    threadObject: dict mapping a thread number to the fetched thread. Each
    pair is handed to writeThreadToFile unchanged.
    """
    for num, thread in threadObject.items():
        writeThreadToFile(num, thread)
def writeImagesToFile(images):
    """Download each named image from the /b/ image server into ``img/``.

    images: iterable of image file names (``<tim><ext>``), such as the list
    built by getAllThreadImages.

    Each response body is opened with PIL and re-saved under ``img/``. A
    failure for one image is printed and does not stop the remaining
    downloads.
    """
    # Without the target directory every single save would fail, so create
    # it up front. If that is impossible nothing can be saved: report once
    # and skip the downloads.
    if not os.path.isdir('img'):
        try:
            os.makedirs('img')
        except OSError as e:
            print("Unable to save image "+str(e))
            return
    for image in images:
        try:
            ir = requests.get('http://images.4chan.org/b/src/'+image)
            i = Image.open(StringIO(ir.content))
            # Binary mode: image bytes must not pass through text/newline
            # translation. The with-block closes the handle even when
            # save() raises.
            with open('img/'+image, "wb") as image_file:
                i.save(image_file)
        except Exception as e:
            # Exception rather than BaseException, so Ctrl-C and
            # SystemExit still stop the script.
            print("Unable to save image "+str(e))
def getThreadImages(thread):
    """Return the image file names referenced by the posts of one thread.

    thread: decoded thread object with a 'posts' list, or None when the
    thread could not be fetched.

    A post contributes ``str(post['tim']) + str(post['ext'])`` when it has a
    'tim' key. Returns an empty list when the thread has no usable 'posts'.
    A malformed post is reported and skipped, so it no longer discards the
    posts that follow it.
    """
    images = []
    try:
        # list() also rejects a non-iterable 'posts' value here, inside the
        # guarded section.
        posts = list(thread['posts'])
    except (TypeError, KeyError) as e:
        print('Error getting images out '+str(e))
        return images
    for post in posts:
        try:
            if 'tim' in post:
                images.append(str(post['tim']) + str(post['ext']))
        except (TypeError, KeyError) as e:
            print('Error getting images out '+str(e))
    return images
def getAllThreadImages(threadObject):
    """Collect the image names of every thread into one flat list.

    threadObject: dict whose values are thread objects; each value is passed
    to getThreadImages and the results are concatenated in iteration order.
    """
    return [
        name
        for thread in threadObject.values()
        for name in getThreadImages(thread)
    ]
def getThread(num):
    """Fetch thread ``num`` of /b/ from the JSON API.

    Returns the decoded JSON object when the server answers HTTP 200.
    Returns None for any other status, or when the request or the JSON
    decoding raises (the error is printed).
    """
    try:
        r = requests.get('http://api.4chan.org/b/res/'+str(num)+'.json')
        if r.status_code == 200:
            return json.loads(r.content)
    except Exception as e:
        # Exception rather than BaseException: a Ctrl-C during the download
        # loop must abort the script, not be logged as a failed thread.
        print("Error in getting thread "+str(num)+": "+str(e))
    return None
def getThreadObject(threadNums):
    """Fetch every listed thread and return them keyed by thread number.

    threadNums: iterable of thread numbers. Each value in the result is
    whatever getThread returned for that number.
    """
    return {num: getThread(num) for num in threadNums}
def getThreadList():
    """Return the thread numbers listed in the /b/ threads index.

    The index is read as a list of pages, each carrying a 'threads' list
    whose entries hold the thread number under 'no'.

    Never raises for request or decoding problems: the failure is printed
    and the numbers collected so far (possibly none) are returned.
    """
    threadNums = []
    try:
        r = requests.get('http://api.4chan.org/b/threads.json')
    except Exception as e:
        # The request used to sit outside any try, so a network error crashed
        # the script while a decode error did not. Handle both the same way.
        print("Error in getting thread list: "+str(e))
        return threadNums
    try:
        for page in json.loads(r.content):
            for thread in page['threads']:
                threadNums.append(thread['no'])
    except Exception:
        print('Decoding Threads JSON has failed')
    return threadNums
def main():
    """Run one pass: list threads, fetch them, save threads, save images."""
    threadNums = getThreadList()
    threadObject = getThreadObject(threadNums)
    writeThreadsToFile(threadObject)
    images = getAllThreadImages(threadObject)
    writeImagesToFile(images)
if __name__ == '__main__':
    # Take an exclusive, non-blocking lock on naggle.lock so that only one
    # instance runs at a time. The handle is deliberately kept open for the
    # lifetime of the process: closing it would release the lock.
    fl = open('naggle.lock', 'w')
    try:
        fcntl.lockf(fl, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        # lockf raises IOError (an alias of OSError on Python 3) when another
        # process already holds the lock. Anything else is a real error and
        # is no longer swallowed by a bare except.
        # no need to log this...
        sys.stderr.write('[%s] nagglebot.py already running.\n' % time.strftime('%c'))
        sys.exit(-1)
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment