Last active
July 24, 2016 01:23
-
-
Save gartnera/36ea72cdc10defaba720bcf7a23b7212 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import sys | |
import os | |
import tempfile | |
import shutil | |
import hashlib | |
import time | |
import re | |
import argparse | |
import subprocess | |
import Tkinter as tk | |
import ttk | |
import tkFileDialog | |
import urllib2 | |
from bs4 import BeautifulSoup | |
import dataset | |
from fuzzywuzzy import fuzz, process | |
def gui_get_config():
    """Show the configuration window and collect the download settings.

    Blocks inside the Tk main loop until the "Go!" button is pressed, then
    reads every field, destroys all widgets in the window and returns the
    (now empty) window so the caller can reuse it.

    Returns:
        A ``(params, root)`` tuple. ``params`` is a dict with the keys
        ``thread``, ``folder``, ``rename``, ``preview``, ``webm_preview``
        and ``gif_preview``; ``root`` is the ``tk.Tk`` window.
    """
    window = tk.Tk()
    window.geometry("300x300+300+300")
    window.wm_title("4chan downloader")

    def add_label(parent, caption, **pack_opts):
        """Pack a left-aligned caption label into *parent*."""
        label = tk.Label(parent, text=caption, justify=tk.LEFT, anchor="w")
        label.pack(fill=tk.X, **pack_opts)

    def add_picker_row(caption, ask):
        """Pack a captioned entry plus a '...' button that fills it via *ask*."""
        row = tk.Frame(window)
        add_label(row, caption)
        value = tk.StringVar()
        entry = tk.Entry(row, textvariable=value)
        entry.pack(fill=tk.X, expand=True, side="left")
        browse = tk.Button(row, text='...', command=lambda: value.set(ask()))
        browse.pack(side="left")
        row.pack(fill=tk.X, pady=(0, 5), padx=5)
        return value

    def add_checkbox(caption):
        """Pack a checkbox and return the IntVar holding its state."""
        state = tk.IntVar()
        box = tk.Checkbutton(window, text=caption, variable=state, anchor="w")
        box.pack(fill=tk.X)
        return state

    # Thread URL field.
    add_label(window, "Url:", padx=5)
    thread_url = tk.StringVar()
    tk.Entry(window, textvariable=thread_url).pack(fill=tk.X, pady=(0, 5), padx=5)

    # Remaining fields, top to bottom.
    destination = add_picker_row("Destination:", tkFileDialog.askdirectory)
    rename_flag = add_checkbox("Prompt for rename?")
    preview_flag = add_checkbox("Preview?")
    webm_player = add_picker_row("Webm preview executable:", tkFileDialog.askopenfilename)
    gif_player = add_picker_row("Gif preview executable:", tkFileDialog.askopenfilename)

    # "Go!" only leaves the main loop; the window itself stays alive.
    go_button = tk.Button(window, text='Go!', command=window.quit)
    go_button.pack(side="bottom", pady=(0, 5))
    window.mainloop()

    params = {
        "thread": thread_url.get(),
        "folder": destination.get(),
        "rename": bool(rename_flag.get()),
        "preview": bool(preview_flag.get()),
        "webm_preview": webm_player.get(),
        "gif_preview": gif_player.get(),
    }

    # Empty the window so the caller can repopulate it.
    for child in window.winfo_children():
        child.destroy()
    return (params, window)
def gui_setup_progress(gui, max):
    """Fill *gui* with the read-only progress view.

    Widgets are created in a fixed order -- URL label, URL entry, file-name
    label, file-name entry, progress bar, counter label -- because
    ``gui_update`` finds them by position in ``gui.winfo_children()``.

    Args:
        gui: The Tk window to populate.
        max: Total number of files; used as the progress bar maximum and in
            the initial ``"1/<max>"`` counter text. The name shadows the
            builtin but is kept so existing callers are unaffected.
    """
    # Two identical caption + disabled-entry pairs; gui_update fills them in.
    for caption in ("Url:", "FileName:"):
        label = tk.Label(gui, text=caption, justify=tk.LEFT, anchor="w")
        label.pack(fill=tk.X, padx=5)
        entry = tk.Entry(gui, state=tk.DISABLED)
        entry.pack(fill=tk.X, pady=(0, 5), padx=5)

    progress = ttk.Progressbar(gui, maximum=max)
    progress.pack(fill=tk.X, pady=(0, 5), padx=5)

    progress_label = tk.Label(gui, text="1/" + str(max))
    progress_label.pack(fill=tk.X, padx=5)

    gui_render(gui)
def gui_update(gui, url, filename):
    """Show the file being processed and advance the progress bar by one.

    Looks widgets up by position in ``gui.winfo_children()``: index 1 is the
    URL entry, 3 the file-name entry, 4 the progress bar and 5 the counter
    label (the order ``gui_setup_progress`` creates them in).

    Args:
        gui: The Tk window holding the progress view.
        url: Text to display in the URL entry.
        filename: Text to display in the file-name entry.
    """
    children = gui.winfo_children()
    url_input = children[1]
    name_input = children[3]
    progress = children[4]
    progress_label = children[5]

    for entry, text in ((url_input, url), (name_input, filename)):
        # The entries are kept disabled, so enable them just long enough to
        # replace their contents.
        entry.config(state=tk.NORMAL)
        entry.delete(0, tk.END)
        entry.insert(0, text)
        entry.config(state=tk.DISABLED)

    # HACK: .step() wraps around once the maximum is reached, so the final
    # increment is applied by assigning the value directly.
    if progress['value'] == progress['maximum'] - 1:
        progress['value'] = progress['maximum']
    else:
        progress.step()

    # Set the text directly: a function-local StringVar would be garbage
    # collected as soon as this function returns.
    counter_text = str(int(progress['value'])) + "/" + str(progress['maximum'])
    progress_label.config(text=counter_text)

    gui_render(gui)
def gui_rename(gui):
    """Ask for a new file name inside *gui* and return what was typed.

    Adds a caption, an entry and a "Rename!" button, runs the Tk main loop
    until the button is pressed, then removes the three widgets again.

    Args:
        gui: The Tk window to show the prompt in.

    Returns:
        The contents of the entry when the button was pressed (may be empty).
    """
    prompt = tk.Label(gui, text="New Name:", justify=tk.LEFT, anchor="w")
    prompt.pack(fill=tk.X, padx=5)

    entry = tk.Entry(gui)
    entry.pack(fill=tk.X, pady=(0, 5), padx=5)

    # The button only leaves the main loop; the window stays open.
    confirm = tk.Button(gui, text='Rename!', command=gui.quit)
    confirm.pack(side="bottom", pady=(0, 5))

    gui.mainloop()
    new_name = entry.get()

    # Remove the temporary prompt widgets so the window returns to its
    # previous contents.
    for widget in (prompt, entry, confirm):
        widget.destroy()
    return new_name
def gui_render(gui):
    """Force *gui* to redraw now.

    Calls ``update_idletasks()`` and then ``update()`` on the window, which
    lets the display refresh while the script is busy outside a main loop.
    """
    gui.update_idletasks()
    gui.update()
# Browser-like User-Agent sent when fetching the thread page.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')


def _download_to_temp(url):
    """Download *url* into a new temporary file, hashing it on the way.

    Returns:
        ``(path, sha1_hexdigest)``. The caller owns the file and must move
        or delete it. If the download fails the file is removed and the
        exception propagates.
    """
    digest = hashlib.sha1()
    # delete=False: the file has to outlive this handle so it can be moved
    # into place later. Asking for the name of an auto-deleting temp file
    # would hand back a path that no longer exists.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    completed = False
    try:
        response = urllib2.urlopen(url)
        try:
            for chunk in iter(lambda: response.read(65536), b""):
                digest.update(chunk)
                tmp.write(chunk)
        finally:
            response.close()
        completed = True
    finally:
        tmp.close()
        if not completed:
            os.remove(tmp.name)
    return tmp.name, digest.hexdigest()


def _unique_path(filename):
    """Return an absolute path for *filename* that does not exist yet.

    If *filename* is taken, ``_1``, ``_2``, ... is appended to the stem
    until a free name is found.
    """
    if not os.path.isfile(filename):
        return os.path.abspath(filename)
    stem, ext = os.path.splitext(filename)
    counter = 1
    while True:
        candidate = u"{}_{}{}".format(stem, counter, ext)
        if not os.path.isfile(candidate):
            return os.path.abspath(candidate)
        counter += 1


parser = argparse.ArgumentParser(description="4chanDL.py")
parser.add_argument('thread', help='url of the thread')
parser.add_argument('-f', '--folder', default=os.curdir, help='Folder to download into (default: current directory)')
parser.add_argument('-r', '--rename', action='store_true', help='Prompts you to rename unknown files')
parser.add_argument('-p', '--preview', action='store_true', help='Whether to preview file')
parser.add_argument('-wp', '--webm_preview', type=str, help='Program to preview webm with')
parser.add_argument('-gp', '--gif_preview', type=str, help='Program to preview gif with')

gui = None
try:
    # vars(): the rest of the script indexes args like the dict the GUI
    # returns, and an argparse Namespace is not subscriptable.
    args = vars(parser.parse_args())
except SystemExit as exc:
    # argparse exits with status 0 after printing --help; honour that
    # instead of opening the GUI. Any parse error falls back to the GUI.
    if exc.code in (0, None):
        raise
    args, gui = gui_get_config()

# The GUI may return an empty folder; stay in the current directory then.
if args['folder']:
    os.chdir(args['folder'])

db = dataset.connect("sqlite:///_data.db")
hashTable = db['hash_table']
altNameTable = db['altname']
threadTable = db['threads']

req = urllib2.Request(args['thread'], headers={'User-Agent': USER_AGENT})
page = urllib2.urlopen(req)
try:
    htmlStr = page.read()
finally:
    page.close()
soup = BeautifulSoup(htmlStr, 'html.parser')
res = soup.findAll('div', attrs={'class': 'fileText'})

if not res:
    print("Invalid URL or you're banned :/")
    sys.exit(1)

threadMatch = re.search(r'/(\d+)$', args['thread'])
if threadMatch is None:
    print("Could not find a thread id at the end of the URL: " + args['thread'])
    sys.exit(1)
threadid = int(threadMatch.group(1))

row = threadTable.find_one(threadid=threadid)
maxKnownImageId = int(row['max_image_id']) if row else 0
newestImageId = maxKnownImageId

# Loop invariants.
# NOTE(review): args['preview'] is parsed but never consulted; a previewer
# runs whenever one is configured for the extension -- confirm intended.
previewers = {'.webm': args['webm_preview'], '.gif': args['gif_preview']}
numericName = re.compile(r'^\d+$')
imageIdPattern = re.compile(r'/(\d+)\.')

if gui:
    gui_setup_progress(gui, len(res))

for item in res:
    tag = item.find('a')
    if tag is None or not tag.get('href'):
        continue
    url = "https:" + tag.get('href')
    # The name comes from the remote page: keep only its last path component
    # so it can never point outside the download folder.
    filename = os.path.basename(tag.get('title') or tag.get_text())

    if gui:
        gui_update(gui, url, filename)
    else:
        # Unicode template: a byte-string template cannot format non-ASCII
        # file names on Python 2.
        print(u"{}\n\t{}".format(filename, url))

    idMatch = imageIdPattern.search(url)
    if not filename or idMatch is None:
        continue
    imageid = int(idMatch.group(1))
    newestImageId = max(newestImageId, imageid)

    if imageid <= maxKnownImageId:
        print("\tIgnoring known image id")
        continue

    # Names starting with "tmp" or consisting only of digits are treated as
    # garbage and need a rename.
    firstPart, extension = os.path.splitext(filename)
    shouldRename = filename.startswith("tmp") or bool(numericName.match(firstPart))
    if shouldRename and not args['rename']:
        continue

    #download to tempfile
    tempName, fHash = _download_to_temp(url)
    #don't get banned (throttle after every download, duplicates included)
    time.sleep(.1)

    row = hashTable.find_one(fHash=fHash)

    #image already exists
    if row:
        if fuzz.ratio(filename, row['filename']) < 90:
            print("Image already exists:")
            print("\tExisting: " + row['filename'])
            print("\tNew: " + filename)
        else:
            print("\t^ Dupe ^")

        #don't store name if it's garbage
        if not shouldRename:
            data = dict(existing=row['filename'], new=filename)
            altNameTable.upsert(data, ['existing', 'new'])

        os.remove(tempName)
        continue

    #if name is garbage, prompt to rename
    if args['rename'] and shouldRename:
        if not gui:
            print("Rename: " + filename)

        previewer = previewers.get(extension)
        if previewer:
            subprocess.call([previewer, tempName])

        if gui:
            newName = gui_rename(gui)
        else:
            newName = raw_input('> ')

        #if nothing entered, skip and blacklist
        if newName == "":
            hashTable.insert(dict(fHash=fHash, filename=newName))
            # The download is being discarded; don't leave it behind.
            os.remove(tempName)
            continue

        filename = newName + extension

    #duplicate filename, but different content -> pick a free "_N" name
    filePath = _unique_path(filename)
    shutil.move(tempName, filePath)
    # Record the name the file was actually saved under.
    hashTable.insert(dict(fHash=fHash, filename=os.path.basename(filePath)))

# Upsert keyed on threadid: a plain insert would add a new row every run
# while find_one keeps returning the oldest one.
threadTable.upsert(dict(threadid=threadid, max_image_id=newestImageId), ['threadid'])

if gui:
    gui.destroy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dataset | |
bs4 | |
fuzzywuzzy |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment