Skip to content

Instantly share code, notes, and snippets.

@gartnera
Last active July 24, 2016 01:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gartnera/36ea72cdc10defaba720bcf7a23b7212 to your computer and use it in GitHub Desktop.
Save gartnera/36ea72cdc10defaba720bcf7a23b7212 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
import sys
import os
import tempfile
import shutil
import hashlib
import time
import re
import argparse
import subprocess
import Tkinter as tk
import ttk
import tkFileDialog
import urllib2
from bs4 import BeautifulSoup
import dataset
from fuzzywuzzy import fuzz, process
def _gui_path_row(parent, caption, ask_path):
    """Pack a captioned path entry with a '...' browse button into *parent*.

    Args:
        parent: widget that receives the new row (a Frame is created in it).
        caption: label text shown above the entry.
        ask_path: zero-argument callable that opens a chooser dialog and
            returns the selected path.

    Returns:
        The tk.StringVar bound to the entry.
    """
    frame = tk.Frame(parent)
    tk.Label(frame, text=caption, justify=tk.LEFT, anchor="w").pack(fill=tk.X)
    path_var = tk.StringVar()
    tk.Entry(frame, textvariable=path_var).pack(fill=tk.X, expand=True, side="left")

    def browse():
        chosen = ask_path()
        # only overwrite the entry when the dialog actually returned a path,
        # so backing out of the chooser keeps whatever was typed
        if chosen:
            path_var.set(chosen)

    tk.Button(frame, text='...', command=browse).pack(side="left")
    frame.pack(fill=tk.X, pady=(0, 5), padx=5)
    return path_var


def gui_get_config():
    """Show the configuration window and block until 'Go!' is pressed.

    Returns:
        A ``(params, root)`` tuple. ``params`` is a dict with the keys
        "thread", "folder", "rename", "preview", "webm_preview" and
        "gif_preview"; ``root`` is the Tk root window with all of its
        child widgets destroyed so it can be reused for later screens.
    """
    root = tk.Tk()
    root.geometry("300x300+300+300")
    root.wm_title("4chan downloader")

    tk.Label(root, text="Url:", justify=tk.LEFT, anchor="w").pack(fill=tk.X, padx=5)
    url_var = tk.StringVar()
    tk.Entry(root, textvariable=url_var).pack(fill=tk.X, pady=(0, 5), padx=5)

    folder_var = _gui_path_row(root, "Destination:", tkFileDialog.askdirectory)

    rename_var = tk.IntVar()
    tk.Checkbutton(root, text="Prompt for rename?", variable=rename_var,
                   anchor="w").pack(fill=tk.X)
    preview_var = tk.IntVar()
    tk.Checkbutton(root, text="Preview?", variable=preview_var,
                   anchor="w").pack(fill=tk.X)

    webm_var = _gui_path_row(root, "Webm preview executable:",
                             tkFileDialog.askopenfilename)
    gif_var = _gui_path_row(root, "Gif preview executable:",
                            tkFileDialog.askopenfilename)

    # 'Go!' only leaves the event loop; the values are read afterwards
    tk.Button(root, text='Go!', command=root.quit).pack(side="bottom", pady=(0, 5))
    root.mainloop()

    params = {
        "thread": url_var.get(),
        "folder": folder_var.get(),
        "rename": bool(rename_var.get()),
        "preview": bool(preview_var.get()),
        "webm_preview": webm_var.get(),
        "gif_preview": gif_var.get(),
    }

    # clear the window so the caller can pack the next screen into it
    for widget in root.winfo_children():
        widget.destroy()
    return (params, root)
def gui_setup_progress(gui, max):
    """Pack the download-progress widgets into *gui* and draw them once.

    The widgets are created in a fixed order -- url label, url entry,
    name label, name entry, progress bar, progress label -- because
    gui_update fetches them by position from ``gui.winfo_children()``.

    Args:
        gui: container widget to pack into.
        max: total number of files; used as the progress bar maximum and
            in the initial "1/<max>" label. (The name shadows the builtin
            but is kept as-is for callers.)
    """
    url_label = tk.Label(gui, text="Url:", justify=tk.LEFT, anchor="w")
    url_label.pack(fill=tk.X, padx=5)
    url_input = tk.Entry(gui, state=tk.DISABLED)
    url_input.pack(fill=tk.X, pady=(0, 5), padx=5)

    name_label = tk.Label(gui, text="FileName:", justify=tk.LEFT, anchor="w")
    name_label.pack(fill=tk.X, padx=5)
    name_input = tk.Entry(gui, state=tk.DISABLED)
    name_input.pack(fill=tk.X, pady=(0, 5), padx=5)

    progress = ttk.Progressbar(gui, maximum=max)
    progress.pack(fill=tk.X, pady=(0, 5), padx=5)
    progress_label = tk.Label(gui, text="1/" + str(max))
    progress_label.pack(fill=tk.X, padx=5)

    gui_render(gui)
def gui_update(gui, url, filename):
    """Show the file currently being processed and advance the progress bar.

    The widgets are looked up by position in ``gui.winfo_children()``:
    index 1 is the url entry, 3 the file name entry, 4 the progress bar
    and 5 the progress label (the order gui_setup_progress packs them in).

    Args:
        gui: container holding the progress widgets.
        url: text to display in the url entry.
        filename: text to display in the file name entry.
    """
    children = gui.winfo_children()
    url_input = children[1]
    name_input = children[3]
    progress = children[4]
    progress_label = children[5]

    def show(entry, text):
        # the entries stay DISABLED between updates; enable them only
        # while their content is being replaced
        entry.config(state=tk.NORMAL)
        entry.delete(0, tk.END)
        entry.insert(0, text)
        entry.config(state=tk.DISABLED)

    show(url_input, url)
    show(name_input, filename)

    # hack: .step() doesn't work right on the last increment (it wraps
    # around), so set the final value explicitly
    if progress['value'] == progress['maximum'] - 1:
        progress['value'] = progress['maximum']
    else:
        progress.step()

    # set the label text directly; no per-call StringVar is needed
    progress_label.config(
        text=str(int(progress['value'])) + "/" + str(progress['maximum']))
    gui_render(gui)
def gui_rename(gui):
    """Ask the user for a new file name inside *gui*.

    Temporarily packs a label, an entry and a 'Rename!' button into
    *gui*, runs the event loop until the button is pressed, then removes
    the three widgets again.

    Args:
        gui: container widget to show the prompt in.

    Returns:
        The text that was in the entry when 'Rename!' was pressed (an
        empty string if nothing was typed).
    """
    name_label = tk.Label(gui, text="New Name:", justify=tk.LEFT, anchor="w")
    name_label.pack(fill=tk.X, padx=5)
    name_input = tk.Entry(gui)
    name_input.pack(fill=tk.X, pady=(0, 5), padx=5)
    submit = tk.Button(gui, text='Rename!', command=gui.quit)
    submit.pack(side="bottom", pady=(0, 5))

    # blocks here until the button calls gui.quit()
    gui.mainloop()

    # read the value before the entry is destroyed
    name = name_input.get()
    for widget in (name_label, name_input, submit):
        widget.destroy()
    return name
def gui_render(gui):
    """Redraw *gui* now, without entering its main loop.

    Calls ``gui.update_idletasks()`` followed by ``gui.update()`` so that
    widget changes made during the download loop become visible.
    """
    gui.update_idletasks()
    gui.update()
# --- main script -----------------------------------------------------------
# Reads the thread url and options from the command line (or from the GUI
# when the command line cannot be parsed), scrapes the thread page for file
# links, downloads every file that is not already known by image id or by
# SHA-1, optionally prompts for a better name, and records what was stored
# in a sqlite database (_data.db) inside the destination folder.


def _download_to_tempfile(url):
    """Stream *url* into a new temporary file and return that file's path."""
    response = urllib2.urlopen(url)
    try:
        # delete=False: the file has to survive closing this handle so it
        # can be hashed, previewed and finally moved into place
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            shutil.copyfileobj(response, tmp)
            return tmp.name
    finally:
        response.close()


def _sha1_of_file(path):
    """Return the hex SHA-1 digest of the file at *path*."""
    digest = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _unique_path(filename):
    """Return an absolute path for *filename* that is not an existing file.

    If *filename* already exists, "_1", "_2", ... is inserted before the
    extension until an unused name is found.
    """
    if not os.path.isfile(filename):
        return os.path.abspath(filename)
    stem, ext = os.path.splitext(filename)
    counter = 1
    while True:
        candidate = stem + "_" + str(counter) + ext
        if not os.path.isfile(candidate):
            return os.path.abspath(candidate)
        counter += 1


parser = argparse.ArgumentParser(description="4chanDL.py")
parser.add_argument('thread', help='url of the thread')
parser.add_argument('-f', '--folder', default='.', help='Folder to download into (default: current directory)')
parser.add_argument('-r', '--rename', action='store_true', help='Prompts you to rename unknown files')
parser.add_argument('-p', '--preview', action='store_true', help='Whether to preview file')
parser.add_argument('-wp', '--webm_preview', type=str, help='Program to preview webm with')
parser.add_argument('-gp', '--gif_preview', type=str, help='Program to preview gif with')

gui = None
try:
    # vars(): the rest of the script indexes args like the dict that
    # gui_get_config returns, and a Namespace is not subscriptable
    args = vars(parser.parse_args())
except SystemExit:
    # argparse exits when the command line cannot be used; fall back to
    # asking for the settings in a window
    args, gui = gui_get_config()

# the GUI returns an empty folder when none was entered; stay in the
# current directory in that case
if args['folder']:
    os.chdir(args['folder'])

db = dataset.connect("sqlite:///_data.db")
hashTable = db['hash_table']
altNameTable = db['altname']
threadTable = db['threads']

req = urllib2.Request(args['thread'], headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
response = urllib2.urlopen(req)
try:
    htmlStr = response.read()
finally:
    response.close()

soup = BeautifulSoup(htmlStr, 'html.parser')
res = soup.findAll('div', attrs={'class': 'fileText'})

# the thread id is the trailing number of the url
threadid = int(re.search(r'\/(\d+)$', args['thread']).group(1))
row = threadTable.find_one(threadid=threadid)
if row:
    maxKnownImageId = int(row['max_image_id'])
else:
    maxKnownImageId = 0

if not res:
    print("Invalid URL or you're banned :/")
    sys.exit()

if gui:
    gui_setup_progress(gui, len(res))

# largest image id seen in this run; stored at the end so the next run can
# skip everything up to it
maxSeenImageId = maxKnownImageId

for item in res:
    tag = list(item.children)[1]
    url = "https:" + tag.get('href')
    filename = tag.get('title')
    if not filename:
        filename = tag.get_text()
    # the name comes from the remote page: never let it carry a directory
    filename = os.path.basename(filename)

    if gui:
        gui_update(gui, url, filename)
    else:
        # concatenate instead of str.format so unicode names are not forced
        # through an ascii encode
        print(filename + "\n\t" + url)

    imageid = int(re.search(r'\/(\d+)\.', url).group(1))
    maxSeenImageId = max(maxSeenImageId, imageid)
    if imageid <= maxKnownImageId:
        print("\tIgnoring known image id")
        continue

    # names starting with "tmp" or consisting only of digits are treated as
    # garbage that needs a rename
    firstPart, extension = os.path.splitext(filename)
    shouldRename = filename.startswith("tmp") or bool(re.match(r'^\d+$', firstPart))
    if not args['rename'] and shouldRename:
        continue

    tempName = _download_to_tempfile(url)
    fHash = _sha1_of_file(tempName)

    row = hashTable.find_one(fHash=fHash)
    # image already exists
    if row:
        if fuzz.ratio(filename, row['filename']) < 90:
            print("Image already exists:")
            print("\tExisting: " + row['filename'])
            print("\tNew: " + filename)
        else:
            print("\t^ Dupe ^")
        # don't store name if it's garbage
        if not shouldRename:
            data = dict(existing=row['filename'], new=filename)
            altNameTable.upsert(data, ['existing', 'new'])
        os.remove(tempName)
        continue

    # if name is garbage, prompt to rename
    if args['rename'] and shouldRename:
        if not gui:
            print("Rename: " + filename)
        # NOTE(review): args['preview'] is never consulted; a preview runs
        # whenever the matching executable is configured -- confirm intent
        if extension == '.webm' and args['webm_preview']:
            subprocess.call([args['webm_preview'], tempName])
        if extension == '.gif' and args['gif_preview']:
            subprocess.call([args['gif_preview'], tempName])

        if gui:
            filename = gui_rename(gui)
        else:
            filename = raw_input('> ')

        # if nothing entered, skip and blacklist
        if filename == "":
            hashTable.insert(dict(fHash=fHash, filename=filename))
            os.remove(tempName)
            continue
        filename += extension

    # duplicate filename, but different content -> pick a free name
    filePath = _unique_path(filename)
    shutil.move(tempName, filePath)
    hashTable.insert(dict(fHash=fHash, filename=filename))

    # don't get banned
    time.sleep(.1)

# upsert so a thread keeps a single row that find_one will return next run
threadTable.upsert(dict(threadid=threadid, max_image_id=maxSeenImageId), ['threadid'])

if gui:
    gui.destroy()
dataset
bs4
fuzzywuzzy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment