Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
4chan image dumper w/ following links
#!/bin/python
from bs4 import BeautifulSoup
import urllib.request
import sys
import re
import os
# Matches "<board>/thread/<numeric id>" anywhere in a thread URL; anything
# after the id (slug, "#p123" fragment, ...) is deliberately not captured.
THREAD_URL_RE = re.compile(r'([^/\s]+)/thread/(\d+)')

# Matches a same-board quote such as ">>12345" at the start of a link's text.
# A cross-board quote (">>>/g/12345") does not match because the character
# after ">>" is not a digit.
QUOTE_RE = re.compile(r'>>(\d+)')

# Archive tried when the original thread URL answers with an HTTP error.
ARCHIVE_URL = 'https://archive.nyafuu.org/'


def parse_thread_url(url):
    """Return ``(board, thread_id)`` extracted from a thread URL.

    ``thread_id`` is returned as the digit string found after ``thread/``;
    ``board`` is the path segment immediately before ``/thread/``.

    Raises:
        ValueError: if *url* contains no ``<board>/thread/<digits>`` part.
    """
    match = THREAD_URL_RE.search(url)
    if match is None:
        raise ValueError('Not a thread URL: ' + url)
    return match.group(1), match.group(2)


def thread_url_with_id(url, thread_id):
    """Return *url* pointing at *thread_id* instead of its current thread.

    Everything after the original thread id (slug, fragment) is dropped so
    the result is a clean thread URL.

    Raises:
        ValueError: if *url* contains no ``<board>/thread/<digits>`` part.
    """
    match = THREAD_URL_RE.search(url)
    if match is None:
        raise ValueError('Not a thread URL: ' + url)
    return url[:match.start(2)] + thread_id


def absolute_image_url(href):
    """Return *href* with a scheme, prefixing ``http:`` when it has none.

    Only a leading ``http://`` / ``https://`` counts as "already absolute";
    an "http" substring elsewhere (e.g. in the file name) does not.
    """
    if href.startswith(('http://', 'https://')):
        return href
    return 'http:' + href


def quoted_post_number(text):
    """Return the post number quoted by link text like ``>>12345``.

    Returns ``None`` when the text is not a same-board quote (empty text,
    cross-board ``>>>/board/123`` quotes, arbitrary text), so callers can
    skip such links instead of crashing on ``int()``.
    """
    match = QUOTE_RE.match(text.strip())
    if match is None:
        return None
    return int(match.group(1))


def fetch_thread_soup(url, board, thread_id):
    """Download a thread page and return it parsed with BeautifulSoup.

    If the request for *url* fails with an HTTP error, the same board and
    thread are requested from ``ARCHIVE_URL`` instead; an error from the
    archive request propagates to the caller.
    """
    try:
        with urllib.request.urlopen(urllib.request.Request(url)) as response:
            html = response.read()
    except urllib.error.HTTPError:
        print("404, Trying archive.nyafuu.org...")
        fallback = urllib.request.Request(
            ARCHIVE_URL + board + '/thread/' + thread_id)
        with urllib.request.urlopen(fallback) as response:
            html = response.read()
    return BeautifulSoup(html, features="html.parser")


def download_images(soup, directory):
    """Save every linked image of the parsed thread into *directory*.

    Image links are the ``<a>`` tags with class ``fileThumb`` or
    ``thread_image_link`` (presumably the board's and the archive's markup
    respectively -- verify against the live pages). Files that already
    exist in *directory* are skipped.
    """
    image_links = (soup.find_all("a", {"class": "fileThumb"})
                   + soup.find_all("a", {"class": "thread_image_link"}))
    for link in image_links:
        href = link.get('href')
        if not href:
            # An anchor without a target has nothing to download.
            continue
        url = absolute_image_url(href)
        filename = url.rsplit('/', 1)[-1]
        target = os.path.join(directory, filename)
        if os.path.isfile(target):
            print('Skipping '+filename+', because it already exists.')
            continue
        print('Downloading '+filename+'...')
        # Download under a temporary name and rename on success, so an
        # interrupted transfer never leaves a truncated file that a later
        # run would skip as "already exists".
        partial = target + '.part'
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, target)


def find_previous_thread_id(soup, thread_id):
    """Return the id of the first quoted post older than *thread_id*.

    Scans ``deadlink`` spans, then ``quotelink`` and ``backlink`` anchors,
    in that order, and returns the first quoted number that is smaller than
    the current thread id, as a string. Returns ``None`` if there is none.
    """
    current = int(thread_id)
    candidates = (soup.find_all("span", {"class": "deadlink"})
                  + soup.find_all("a", {"class": "quotelink"})
                  + soup.find_all("a", {"class": "backlink"}))
    for link in candidates:
        number = quoted_post_number(link.text)
        if number is not None and number < current:
            return str(number)
    return None


def main(argv=None):
    """Dump a thread's images, then follow links to previous threads.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the thread URL.

    Returns:
        Process exit status: 0 on success, 1 for an unusable URL, 2 when no
        URL was given.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print('Usage: ' + sys.argv[0] + ' THREAD_URL', file=sys.stderr)
        return 2
    url = args[0]

    # Install a browser-like User-Agent for every urllib request made below
    # (both urlopen and urlretrieve go through the installed opener).
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)

    while True:
        try:
            board, thread_id = parse_thread_url(url)
        except ValueError as error:
            print(error, file=sys.stderr)
            return 1

        soup = fetch_thread_soup(url, board, thread_id)

        # Images of each thread go into a directory named after its id.
        try:
            os.mkdir(thread_id)
        except FileExistsError:
            print('Directory '+thread_id+' already exists, not creating...')
        download_images(soup, thread_id)

        # The followed id is always strictly smaller than the current one,
        # so this loop terminates.
        previous = find_previous_thread_id(soup, thread_id)
        if previous is None:
            print('Previous thread not found. Exitting..')
            return 0
        print('Found previous thread, following the link')
        url = thread_url_with_id(url, previous)


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment