Skip to content

Instantly share code, notes, and snippets.

@muse
Last active August 29, 2015 14:14
Show Gist options
  • Save muse/1821c22c050b708f6579 to your computer and use it in GitHub Desktop.
Save muse/1821c22c050b708f6579 to your computer and use it in GitHub Desktop.
Extract all images from a 4chan thread.
#!/usr/bin/env python
# Made by 'Mirko van der Waal'
# Distributed under terms of the MIT license.
try:
import getopt
import urllib
import os
from sys import exit, argv
from re import findall
from HTMLParser import HTMLParser
except ImportError as e:
print e, exit(0)
# Shared module state: `images` receives the attribute lists of the anchors
# matched by the parser, `image_links` the download URLs derived from them.
images, image_links = [], []
# Home directory as given by the HOME environment variable
# (None when the variable is not set).
home_dir = os.environ.get('HOME')
# A subclass of HTMLParser.
# Copies the attribute list of every thumbnail anchor into `images`.
class MyHTMLParser(HTMLParser):
    """HTML parser that records ``<a>`` tags marked as file thumbnails."""

    def handle_starttag(self, tag, attrs):
        """Append ``attrs`` to the module-level ``images`` list for a match.

        tag   -- name of the start tag being processed.
        attrs -- list of (name, value) pairs holding the tag's attributes.

        A tag matches when it is an ``a`` element whose first attribute
        has the value "fileThumb".
        """
        # `attrs` is empty for a bare <a>; without the emptiness check the
        # attrs[0] lookup raised IndexError and aborted the whole parse.
        if tag == "a" and attrs and attrs[0][1] == "fileThumb":
            images.append(attrs)
# Process links from the images array.
def getLinks():
    """Turn the anchors collected in ``images`` into download URLs.

    Every entry of ``images`` is a list of (name, value) attribute pairs.
    The value of its ``href`` attribute is appended to ``image_links``:

    * protocol-relative links ("//host/path") get an "http:" scheme,
    * links that already start with "http://" or "https://" are kept as is,
    * entries without an href, or with any other kind of href, are skipped.

    Returns None; the result is the mutation of ``image_links``.
    """
    for image in images:
        # Look the href up by name instead of assuming it is the second
        # attribute: that assumption raised IndexError on short attribute
        # lists and picked the wrong value when the order differed.
        href = dict(image).get('href')
        if not href:
            continue
        if href.startswith('//'):
            href = 'http:' + href
        elif not href.startswith(('http://', 'https://')):
            # Not an absolute web address. Skip it rather than emit a
            # mangled URL or something urlopen would treat as a local path.
            continue
        image_links.append(href)
# Command-line driver: parse the options, scrape the thread page for image
# links and download every image into the target directory.
def _parse_options(arguments):
    """Return (web_addr, fol_name, our_dir, img_name) parsed from arguments.

    Options that were not supplied come back as empty strings.
    Raises getopt.GetoptError for an unknown option or a missing value.
    """
    web_addr = fol_name = our_dir = img_name = ''
    opts, _ = getopt.getopt(arguments,
                            ':u:n:p:x:',
                            ['url=', 'name=', 'path=', 'prefix='])
    for o, a in opts:
        if o in ('-u', '--url'):
            web_addr = a
        elif o in ('-n', '--name'):
            fol_name = a
        elif o in ('-p', '--path'):
            our_dir = a
        elif o in ('-x', '--prefix'):
            img_name = a
    return web_addr, fol_name, our_dir, img_name


def _extension_of(link):
    """Return the extension of link including the dot, or '' if none."""
    # Only the last path segment is inspected, so a dot in the host name is
    # never mistaken for a file extension.
    return ''.join(findall(r'\.\w+', link.rsplit('/', 1)[-1])[-1:])


def _fetch(address):
    """Return the body found at address, closing the connection afterwards."""
    response = urllib.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


def main(arguments=None):
    """Download every image of a thread into a directory.

    arguments -- list of command-line options; defaults to sys.argv[1:].

    Recognised options:
      -u/--url     address of the page to scrape (prompted for if absent)
      -n/--name    folder created under ~/Pictures (prompted for if neither
                   this nor --path is given)
      -p/--path    explicit target directory; takes precedence over --name
      -x/--prefix  file name prefix, "image_" by default

    Bad options and a target directory that cannot be created terminate the
    program with the error message on stderr and a non-zero exit status.
    """
    if arguments is None:
        arguments = argv[1:]
    try:
        web_addr, fol_name, our_dir, img_name = _parse_options(arguments)
    except getopt.GetoptError as e:
        # sys.exit with a string writes it to stderr and exits with status 1.
        exit(str(e))

    if img_name == '':
        img_name = 'image_'
    if web_addr == '':
        web_addr = raw_input("Board to extract images from: ")
    if our_dir == '':
        # The folder name only matters when no explicit path was supplied.
        if fol_name == '':
            fol_name = raw_input("Name of folder to extract to: ")
        # home_dir is None when HOME is unset; fall back to expanduser.
        base = home_dir or os.path.expanduser('~')
        our_dir = os.path.join(base, 'Pictures', fol_name)

    parser = MyHTMLParser()
    parser.feed(_fetch(web_addr))
    getLinks()

    try:
        # makedirs also creates a missing ~/Pictures; like mkdir it still
        # refuses to reuse a directory that already exists.
        os.makedirs(our_dir)
    except OSError as e:
        exit(str(e))

    total = len(image_links)
    for ind, image in enumerate(image_links):
        # Build the path once so the progress line names the file that is
        # really written (files are numbered from 0, progress from 1).
        destination = os.path.join(
            our_dir, img_name + str(ind) + _extension_of(image))
        print('%i/%s\t%s\t%s' % (ind + 1, total, image, destination))
        # Download before opening, so a failed transfer leaves no empty file.
        data = _fetch(image)
        with open(destination, "wb") as output:
            output.write(data)
    print("\nFinished.")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment