Skip to content

Instantly share code, notes, and snippets.

@ksamuel
Created May 5, 2023 16:46
Show Gist options
  • Save ksamuel/609d645af40711447c5adde634fe4d66 to your computer and use it in GitHub Desktop.
Save ksamuel/609d645af40711447c5adde634fe4d66 to your computer and use it in GitHub Desktop.
"""
usage: download_bookmarks.py [-h] [--concurrency [CONCURRENCY]] [--directory DIRECTORY] bookmarks
positional arguments:
bookmarks The path to the sqlite db file containing
the bookmarks. It's the places.sqlite file
in your default profile dir.
optional arguments:
-h, --help show this help message and exit
--concurrency [CONCURRENCY], -c [CONCURRENCY]
Max number of bookmarks to process in parallel
--directory DIRECTORY, -d DIRECTORY
Directory to store the downloaded files. Will be recursively created if it doesn't exist. Otherwise,
a temp dir will be used.
"""
import argparse
import asyncio
import sqlite3
import sys
from asyncio.exceptions import CancelledError
from pathlib import Path
from tempfile import TemporaryDirectory
if not sys.version_info >= (3, 8):
sys.exit("This script requires Python 3.8 or higher")
UA = (
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20070802 SeaMonkey/1.1.4"
)
async def download(i, total, url, directory, concurrency_limit):
async with concurrency_limit:
print(f"Downloading: {url} - START ({i}/{total})")
proc = None
try:
proc = await asyncio.create_subprocess_shell(
f'wget -o /dev/null -H -U "{UA}" -p -k -P {directory} {url}',
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=10)
if stderr:
print(f"Downloading: {url} - ERROR ({i}/{total})", file=sys.stderr)
err = stderr.decode("utf8", errors="replace")
print(f"\n[stderr]\n{err}", file=sys.stderr)
print(f"Downloading: {url} - DONE ({i}/{total})")
except (TimeoutError, CancelledError):
print(f"Downloading: {url} - TIMEOUT ({i}/{total})", file=sys.stderr)
except Exception as e:
print(f"Downloading: {url} - ERR...: '{e}' ({i}/{total})", file=sys.stderr)
finally:
if proc:
try:
proc.terminate()
except ProcessLookupError:
pass
async def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"bookmarks",
help="The path to the sqlite db file containing the bookmarks. It's the places.sqlite file in your default profile dir.",
)
parser.add_argument(
"--concurrency",
"-c",
type=int,
nargs="?",
help="Max number of bookmarks to process in parallel",
default=40,
)
parser.add_argument(
"--directory",
"-d",
help="Directory to store the downloaded files. Will be recursively created if it doesn't exist. Otherwise, a temp dir will be used.",
)
args = parser.parse_args()
directory = args.directory or TemporaryDirectory().name
directory = Path(directory)
try:
directory.mkdir(exist_ok=True, parents=True)
except OSError as e:
sys.exit(f"Error while creating the output directory: {e}")
bookmark_file = Path(args.bookmarks)
if not bookmark_file.is_file():
sys.exit(f'Cannot find "{bookmark_file}"')
if not bookmark_file.name == "places.sqlite":
sys.exit(
f'The bookmark file should be a "place.sqlite" file, got "{bookmark_file}"'
)
with sqlite3.connect(bookmark_file) as con:
query = """
SELECT url from moz_places, moz_bookmarks
WHERE moz_places.id = moz_bookmarks.fk;
"""
try:
urls = {url for [url] in con.execute(query)}
except sqlite3.OperationalError as e:
if "locked" in str(e):
sys.exit("Close Firefox before running this script")
raise
total = len(urls)
print(f"Ready to process {total} bookmarks")
print(f"Saving results in: {directory}")
running_tasks = set()
concurrency_limit = asyncio.Semaphore(args.concurrency)
for i, url in enumerate(urls, 1):
running_tasks.add(download(i, total, url, directory, concurrency_limit))
await asyncio.wait(running_tasks)
print(f"Results saved in: {directory}")
asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment