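# Reddit comment-media downloader.
# Searches a subreddit for submissions, walks every comment in each thread,
# extracts the markdown links and either downloads the linked media or records
# the URL, depending on the host.
# Usage (see the argv handling at the bottom): <script> "<search term>" [post limit]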
import os
import json
import praw
import re
import requests
import string
import sys
import time
from urllib import parse
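

# Configuration. Fill in your Reddit and Imgur API credentials.
# "desired_files" are extensions that are downloaded directly; "wanted_links"
# are hosts whose URLs are only written to saved_links.txt instead of being
# downloaded (see check_link_action below).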
config = {
    "client_id": "REDDIT_CLIENT_ID",
    "client_secret": "REDDIT_CLIENT_SECRET",
    "imgur_client_id": "IMGUR_CLIENT_ID",
    "desired_files": (
        ".gif",
        ".gifv",
        ".webm",
        ".jpeg",
        ".jpg",
        ".png",
        ".mp4"
    ),
    "wanted_links": (
        "gyazo",
        "youtube",
        "myanimelist",
        "wikipedia",
        "youtu.be",
        "twitter"
    )
}
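
# Module-level state: log file handles and per-submission bookkeeping,
# (re)initialised in download_posts_media().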
ignored = {}
exceptions = {}
errors_file = None
saved_links = None
print_output = None
post = {}
user_agent = "Reddit Downloader 0.1"


def my_print(text):
    print_output.write("{}\n".format(text))


def parse_url(url):
    return parse.urlparse(url)


def valid_filename(filename):
    # Keep only characters that are safe in filenames on common filesystems.
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in filename if c in valid_chars)


def format_filename(folder, title, file, extension='', separator=' - '):
    # The title is truncated to 100 characters to keep the path length sane.
    return folder + valid_filename(title[:100] + separator + file + extension)
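

# Download `url` into `filename`, creating the target directory and skipping
# files that already exist. The service, id and title parameters are currently
# unused; they mirror the arguments passed by the special handlers.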
def download_file(filename, url, service, id, title=None):
    try:
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            r = requests.get(url, timeout=30)
            with open(filename, 'wb') as outfile:
                outfile.write(r.content)
    except Exception as e:
        errors_file.write("{} | {} | {}\n".format(url, filename, e))
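

# Bookkeeping helpers: save_link records a link in saved_links.txt instead of
# downloading it, save_exception marks a link as problematic in the same file
# (currently unused), and ignore_link groups links from unhandled domains for
# later inspection.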
def save_link(folder, link):
    text = "{}-{} | {} - {}\n"
    saved_links.write(text.format(post["id"], post["title"], link["title"], link["url"]))


def save_exception(link):
    text = "Exception: {}-{} | {} - {}\n"
    saved_links.write(text.format(post["id"], post["title"], link["title"], link["url"]))


def ignore_link(folder, link):
    domain = parse_url(link["url"]).netloc
    # Group ignored links by domain without clobbering earlier entries.
    if domain not in exceptions:
        exceptions[domain] = []
    exceptions[domain].append(link)


def get_service_and_id(link):
    parts = link.split('/')
    service = parts[2]
    id = parts[-1]
    return service, id


def download_link(folder, link, service=None, id=None):
    if not service:
        service, id = get_service_and_id(link["url"])

    url = link["url"]

    fn = format_filename(folder, link["title"], url[url.rfind('/')+1:])
    download_file(fn, url, service, id, link["title"])
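

# The special_* handlers deal with hosts where the link points at a page rather
# than a file: they rewrite the URL or query the host's API to find the actual
# media before downloading (or, for imgur .gifv, just record the link).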
def special_imgur_album(folder, link):
    url = link["url"]
    # Strip any '#image-id' fragment so only the album name remains.
    rf = url.rfind('#')
    if rf > 0:
        url = url[:rf]
    albumname = url[url.rfind('/')+1:]

    down_url = 'https://api.imgur.com/3/album/{}/images?client_id={}'.format(albumname, config["imgur_client_id"])
    r = requests.get(down_url, timeout=30)

    try:
        api_data = json.loads(r.content.decode("utf-8"))
        data = api_data["data"]

        df = format_filename(folder, link["title"], albumname)
        df = df + "/"

        if 'error' in data:
            errors_file.write("{} | {} | {}\n".format("special_imgur_album", link["url"], data['error']))
            return

        for i, img in enumerate(data, start=1):
            ttl = str(i) + " - " + (img.get("title") or img["id"])
            download_link(df, {'url': img["link"], 'title': ttl}, 'imgur', img["id"])
    except Exception as inst:
        print("ERROR::: ", albumname)
        print(inst)


def special_imgur_image(folder, link):
    url = link["url"]
    fn = url[url.rfind('/')+1:]
    # Rewrite the page URL to imgur's /download/ path to fetch the raw file.
    url = url[:url.rfind('/')+1] + "download/" + fn

    # TODO: change to download_file()
    r = requests.get(url, timeout=30)
    if "Content-Type" in r.headers:
        try:
            ft = r.headers["Content-Type"]

            # Derive the extension from the MIME type (e.g. image/png -> .png).
            fn = fn + "." + ft[ft.rfind('/')+1:]
            fn = format_filename(folder, link["title"], fn)
            if not os.path.isfile(fn):
                os.makedirs(os.path.dirname(fn), exist_ok=True)
                with open(fn, 'wb') as outfile:
                    outfile.write(r.content)
        except Exception as e:
            errors_file.write("{} | {}\n".format(url, e))
    else:
        errors_file.write("{} | {}\n".format("IGNORED: ", url))
        ignore_link(folder, link)


def special_imgur_gifv(folder, link):
    # Downloading .gifv is not implemented yet: the URL is rewritten to .mp4
    # only for the log message, and the link itself is recorded instead.
    url = link["url"].replace("gifv", "mp4")

    my_print(" SKIPPING IMGUR GIFV {} {}".format(url[url.rfind('/')+1:], link["url"]))

    save_link(folder, link)


def special_gfycat(folder, link):
    gfycat_api_url = 'https://api.gfycat.com/v1/gfycats/{}'
    name = link["url"]
    name = name[name.rfind('/')+1:]

    api_request = requests.get(gfycat_api_url.format(name), timeout=30)
    api_data = json.loads(api_request.content.decode("utf-8"))

    if api_data and "gfyItem" in api_data:
        url = api_data["gfyItem"]["webmUrl"]

        fn = format_filename(folder, link["title"], name, '.webm')

        # TODO: change to download_link
        download_file(fn, url, 'gfycat', name, link["title"])
    else:
        errors_file.write("{} | {} | {}\n".format("GFYCAT ", link, api_data))


def special_streamable(folder, link):
    apiurl = 'https://api.streamable.com/videos/{}'
    name = link["url"]
    name = name[name.rfind('/')+1:]
    api_request = requests.get(apiurl.format(name), timeout=30)

    try:
        my_print(link["url"])
        api_data = json.loads(api_request.content.decode("utf-8"))

        if api_data and "files" in api_data:
            data = api_data["files"]
            if "mp4" in data:
                url = data["mp4"]["url"]
                # The API returns protocol-relative URLs ('//...'); make them absolute.
                if url.startswith("//"):
                    url = "https:" + url
                fn = format_filename(folder, link["title"], name, '.mp4')
                download_file(fn, url, 'streamable', name, link["title"])
        else:
            errors_file.write("{} | {}\n".format("streamable", api_data))
    except Exception:
        errors_file.write("{} | {}\n".format("streamable", api_request.content))
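

# Dispatch table: maps the action name returned by check_link_action() to the
# handler that processes the link.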
actions = {
    "download": download_link,
    "special_imgur_album": special_imgur_album,
    "special_imgur_image": special_imgur_image,
    "special_imgur_gifv": special_imgur_gifv,
    "special_gfycat": special_gfycat,
    "special_streamable": special_streamable,
    "save": save_link,
    "ignore": ignore_link,
}
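

# get_links() extracts markdown-style [title](url) links from a comment body.
# As an illustration (hypothetical URL), a body containing
# "[OP](https://example.com/ep1.png)" yields
# [{"title": "OP", "url": "https://example.com/ep1.png"}].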
def get_links(comment):
    # Links without a title are not processed for now :(

    # Anything that isn't a closing square bracket
    name_regex = r"[^]]+"
    # http:// or https:// followed by anything but a closing parenthesis
    url_regex = r"http[s]?://[^)]+"
    markup_regex = r"\[({0})]\(\s*({1})\s*\)".format(name_regex, url_regex)
    ret = re.findall(markup_regex, comment.body)
    links = []
    for itm in ret:
        links.append({"title": itm[0], "url": itm[1]})

    return links
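

# Decide how a URL should be handled. Order matters: the .gifv check must run
# before the generic "imgur" check, and imgur albums/galleries before single
# images.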
def check_link_action(url):
    if url.endswith(".gifv"):
        return "special_imgur_gifv"
    elif url.endswith(config["desired_files"]):
        return "download"
    elif "imgur.com/a/" in url:
        return "special_imgur_album"
    elif "imgur.com/gallery/" in url:
        return "special_imgur_album"
    elif "imgur" in url:
        return "special_imgur_image"
    elif "gfycat" in url:
        return "special_gfycat"
    elif "streamable" in url:
        return "special_streamable"
    elif any(x in url for x in config["wanted_links"]):
        return "save"
    else:
        return "ignore"


def download_links(folder, links):
    for link in links:
        link_action = check_link_action(link["url"])
        my_print(link["url"])
        actions[link_action](folder, link)
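

# Flatten a PRAW comment object into a plain dict of the fields of interest.
# The list built from these in download_posts_media() is currently only kept
# in memory.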
def format_comment_dict(c):
    return {
        "author": c.author.name if c.author else "",
        "body": c.body,
        "controversiality": c.controversiality,
        "depth": c.depth,
        "gilded": c.gilded,
        "id": c.id,
        "permalink": "https://reddit.com" + c.permalink,
        "score": c.score
    }
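

# Main worker: opens the log files in the download folder, then for each
# submission walks the whole comment tree breadth-first, extracting and
# handling every markdown link it finds.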
def download_posts_media(reddit, submission_list):
    global ignored, exceptions, errors_file, saved_links, print_output, post

    folder = "downloaded/"

    if not os.path.exists(folder):
        os.makedirs(folder)

    errors_file = open("{}error_logs.txt".format(folder), "a+", encoding="utf-8")
    saved_links = open("{}saved_links.txt".format(folder), "a+", encoding="utf-8")
    print_output = open("{}output.txt".format(folder), "a+", encoding="utf-8")

    if not isinstance(submission_list, list):
        submission_list = [submission_list]

    for idx, submission_id in enumerate(submission_list, start=1):
        print('Downloading post {}'.format(idx))
        comments = []
        ignored = {}
        exceptions = {}
        submission = reddit.submission(id=submission_id)

        my_print('Getting submission {} {}'.format(submission_id, submission.title))

        download_folder = "{}{}/".format(folder, valid_filename(submission.subreddit.display_name))
        # Sanitise the title so it can safely be used as a directory name.
        comments_folder = "{}[{}]-{}/".format(download_folder, submission.id, valid_filename(submission.title))

        post = {"url": submission.url, "title": submission.title, "id": submission.id}

        if not os.path.exists(comments_folder):
            os.makedirs(comments_folder)

        # Expand every "load more comments" stub so the whole tree is visited.
        submission.comments.replace_more(limit=None)
        comment_queue = submission.comments[:]  # Seed with top-level comments

        while comment_queue:
            comment = comment_queue.pop(0)

            comments.append(format_comment_dict(comment))
            url_list = get_links(comment)

            download_links(comments_folder, url_list)
            comment_queue.extend(comment.replies)

        # Make sure everything logged for this submission is on disk.
        errors_file.flush()
        saved_links.flush()
        print_output.flush()
        os.fsync(errors_file.fileno())
        os.fsync(saved_links.fileno())
        os.fsync(print_output.fileno())

        # Small pause between submissions.
        time.sleep(0.1)

    errors_file.close()
    saved_links.close()
    print_output.close()
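

# Search a subreddit using Reddit's Lucene search syntax and return the
# matching submission ids. The defaults look for posts flaired "Episode" on
# r/anime from the past year, sorted by newest.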
def search_posts(reddit, query='', subreddit='anime', sort='new', t='year', flair='Episode', l=None):
    if flair and flair != '':
        query = ' '.join((query, 'flair_name:"{}"'.format(flair)))

    sr = reddit.subreddit(subreddit)
    post_list = sr.search(query, sort=sort, syntax='lucene',
                          time_filter=t, limit=l)

    posts = [post.id for post in post_list]

    return posts
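

# Script entry point: builds a read-only PRAW client, searches for posts
# matching argv[1] (with an optional result limit in argv[2], default 24) and
# downloads the media linked in their comments.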
reddit = praw.Reddit(
    user_agent=user_agent, client_id=config["client_id"], client_secret=config["client_secret"])

if len(sys.argv) < 2:
    sys.exit('Usage: {} "<search term>" [post limit]'.format(sys.argv[0]))

search_term = sys.argv[1]
limit = int(sys.argv[2]) if len(sys.argv) >= 3 else 24
posts = search_posts(reddit, search_term, l=limit)
print('{} posts found'.format(len(posts)))

download_posts_media(reddit, posts)