@zeratax
Last active January 2, 2021 23:55
archive your bookmarks -- scans exported HTML files (e.g. browser bookmark exports) for links to specific websites and downloads them with youtube-dl
#####
# Installation:
# install Python 3, then install "bs4" and "youtube_dl" via pip
#
# Usage:
# python3 crawler.py -h
#
# Example:
# export your bookmarks with Firefox, then crawl them for YouTube and SoundCloud by running:
# python3 crawler.py -f bookmarks.html -w youtube.com -w youtu.be -w soundcloud.com
#
# A plain .txt file with one URL per line also works as input.
#####
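
# The youtube-dl options below write audio into a per-uploader tree, with
# the full metadata saved alongside. An illustrative (hypothetical) result
# for the example above:
#
#   downloads/
#     SomeUploader/
#       Some Title.m4a
#       Some Title.json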
from __future__ import unicode_literals

import argparse
import json
import os

from bs4 import BeautifulSoup
import youtube_dl

fails = []


def crawler(file, websites):
    # Gather candidate URLs: either one URL per line from a plain .txt
    # file, or every <a href> from an exported bookmarks HTML file.
    if file.endswith(".txt"):
        with open(file) as f:
            urls = f.read().splitlines()
    else:
        with open(file, 'r') as html_doc:
            soup = BeautifulSoup(html_doc, 'html.parser')
        links = soup.find_all('a')
        urls = []
        for link in links:
            urls.append(link["href"])
    for url in urls:
        # Crude domain extraction: strip the scheme and a leading "www.",
        # then compare everything before the first slash against the
        # requested websites.
        domain = url.replace("https://", '').replace("http://", '').replace("www.", '').split('/')[0]
        for website in websites:
            if website == domain:
                print(url)
                if not download(url):
                    fails.append(url)
    with open("failed_downloads.txt", 'w') as f:
        for fail in fails:
            f.write("%s\n" % fail)
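
# A sturdier way to pull the host out of a URL would be the standard
# library's urllib.parse; a sketch, not used above:
#
#   from urllib.parse import urlparse
#   domain = urlparse(url).netloc.replace("www.", "")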


def download(url):
    format = "m4a"
    ydl_opts = {
        "outtmpl": "downloads/%(uploader)s/%(title)s.%(ext)s",
        "format": "bestaudio",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": format
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            # Fetch metadata first so playlists can be expanded and
            # already-downloaded files can be skipped.
            info = ydl.extract_info(url, download=False)
            if "_type" in info and info["_type"] == "playlist":
                # Recurse into every playlist entry individually.
                for entry in info["entries"]:
                    url = entry["webpage_url"]
                    if not download(url):
                        fails.append(url)
            else:
                title = info["title"]
                uploader = info["uploader"]
                # Keep the file name comfortably under common 255-byte
                # filesystem limits.
                if len(title) > 199:
                    title = title[:197] + "..."
                uploader = prepare_filename(uploader)
                title = prepare_filename(title)
                path = "downloads/{}/{}".format(uploader, title)
                if not os.path.isfile(path + "." + format):
                    ydl.download([url])
                else:
                    print("already downloaded.")
                # Save the full metadata next to the audio file.
                with open(path + ".json", 'w') as f:
                    json.dump(info, f)
            return True
        except youtube_dl.utils.DownloadError:
            print("file couldn't be downloaded, possibly deleted!")
            return False


def prepare_filename(filename):
    # Strip or substitute characters that are awkward in file names so the
    # os.path.isfile() check above lines up with the names youtube-dl
    # writes to disk.
    filename = filename.replace("?", "")
    filename = filename.replace("/", "_")
    filename = filename.replace(": ", " - ")
    filename = filename.replace(" :", " -")
    filename = filename.replace(":", "_")
    filename = filename.replace('"', "'")
    filename = filename.replace("||", '|')
    filename = filename.replace("|", '_')
    filename = filename.replace("__", '_')
    filename = filename.replace("*", '_')
    if filename.startswith("_"):
        filename = filename[1:]
    if filename.endswith("_"):
        filename = filename[:-1]
    if filename.startswith("-"):
        filename = "_" + filename[1:]
    if filename.endswith("-"):
        filename = filename[:-1] + "_"
    return filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="archive your bookmarks")
    parser.add_argument("-f", "--file", metavar="FILE", required=True,
                        help="exported bookmarks html file, or a .txt file with one url per line")
    parser.add_argument("-w", "--website", action="append", required=True,
                        help='websites that should be archived, e.g. "youtube.com"')
    args = parser.parse_args()
    crawler(args.file, args.website)
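
# Failed URLs end up in failed_downloads.txt, one per line. Since crawler()
# also accepts plain .txt input, a retry pass could look like this
# (hypothetical follow-up invocation, same flags as the example above):
#
#   python3 crawler.py -f failed_downloads.txt -w youtube.com -w youtu.be -w soundcloud.com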