@zeratax
Last active January 2, 2021 23:55
archive your bookmarks -- scans exported HTML files (e.g. browser bookmark exports) for links to specific websites and downloads them with youtube-dl
#####
# Installation:
# install Python 3, then install "bs4" and "youtube_dl" via pip
#
# Usage:
# python3 crawler.py -h
#
# Example:
# export your bookmarks with Firefox, then crawl them for YouTube and SoundCloud by running:
# python3 crawler.py -f bookmarks.html -w youtube.com -w youtu.be -w soundcloud.com
#
# A plain .txt file with one URL per line also works as input.
#####
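
# The youtube-dl options below write audio into a per-uploader tree, with
# the full metadata saved alongside. An illustrative (hypothetical) result
# for the example above:
#
#   downloads/
#     SomeUploader/
#       Some Title.m4a
#       Some Title.json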
from __future__ import unicode_literals

import argparse
import json
import os

from bs4 import BeautifulSoup
import youtube_dl

fails = []


def crawler(file, websites):
    # Gather candidate URLs: either one URL per line from a plain .txt
    # file, or every <a href> from an exported bookmarks HTML file.
    if file.endswith(".txt"):
        with open(file) as f:
            urls = f.read().splitlines()
    else:
        with open(file, 'r') as html_doc:
            soup = BeautifulSoup(html_doc, 'html.parser')
        links = soup.find_all('a')
        urls = []
        for link in links:
            urls.append(link["href"])
    for url in urls:
        # Crude domain extraction: strip the scheme and a leading "www.",
        # then compare everything before the first slash against the
        # requested websites.
        domain = url.replace("https://", '').replace("http://", '').replace("www.", '').split('/')[0]
        for website in websites:
            if website == domain:
                print(url)
                if not download(url):
                    fails.append(url)
    with open("failed_downloads.txt", 'w') as f:
        for fail in fails:
            f.write("%s\n" % fail)
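
# A sturdier way to pull the host out of a URL would be the standard
# library's urllib.parse; a sketch, not used above:
#
#   from urllib.parse import urlparse
#   domain = urlparse(url).netloc.replace("www.", "")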


def download(url):
    format = "m4a"
    ydl_opts = {
        "outtmpl": "downloads/%(uploader)s/%(title)s.%(ext)s",
        "format": "bestaudio",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": format
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            # Fetch metadata first so playlists can be expanded and
            # already-downloaded files can be skipped.
            info = ydl.extract_info(url, download=False)
            if "_type" in info and info["_type"] == "playlist":
                # Recurse into every playlist entry individually.
                for entry in info["entries"]:
                    url = entry["webpage_url"]
                    if not download(url):
                        fails.append(url)
            else:
                title = info["title"]
                uploader = info["uploader"]
                # Keep the file name comfortably under common 255-byte
                # filesystem limits.
                if len(title) > 199:
                    title = title[:197] + "..."
                uploader = prepare_filename(uploader)
                title = prepare_filename(title)
                path = "downloads/{}/{}".format(uploader, title)
                if not os.path.isfile(path + "." + format):
                    ydl.download([url])
                else:
                    print("already downloaded.")
                # Save the full metadata next to the audio file.
                with open(path + ".json", 'w') as f:
                    json.dump(info, f)
            return True
        except youtube_dl.utils.DownloadError:
            print("file couldn't be downloaded, possibly deleted!")
            return False


def prepare_filename(filename):
    # Strip or substitute characters that are awkward in file names so the
    # os.path.isfile() check above lines up with the names youtube-dl
    # writes to disk.
    filename = filename.replace("?", "")
    filename = filename.replace("/", "_")
    filename = filename.replace(": ", " - ")
    filename = filename.replace(" :", " -")
    filename = filename.replace(":", "_")
    filename = filename.replace('"', "'")
    filename = filename.replace("||", '|')
    filename = filename.replace("|", '_')
    filename = filename.replace("__", '_')
    filename = filename.replace("*", '_')
    if filename.startswith("_"):
        filename = filename[1:]
    if filename.endswith("_"):
        filename = filename[:-1]
    if filename.startswith("-"):
        filename = "_" + filename[1:]
    if filename.endswith("-"):
        filename = filename[:-1] + "_"
    return filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="archive your bookmarks")
    parser.add_argument("-f", "--file", metavar="FILE", required=True,
                        help="exported bookmarks html file, or a .txt file with one url per line")
    parser.add_argument("-w", "--website", action="append", required=True,
                        help='websites that should be archived, e.g. "youtube.com"')
    args = parser.parse_args()
    crawler(args.file, args.website)
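
# Failed URLs end up in failed_downloads.txt, one per line. Since crawler()
# also accepts plain .txt input, a retry pass could look like this
# (hypothetical follow-up invocation, same flags as the example above):
#
#   python3 crawler.py -f failed_downloads.txt -w youtube.com -w youtu.be -w soundcloud.com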