Skip to content

Instantly share code, notes, and snippets.

@nijave
Last active April 27, 2023 00:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nijave/0759b2e2cd2ae63a59bf53121d314295 to your computer and use it in GitHub Desktop.
Save nijave/0759b2e2cd2ae63a59bf53121d314295 to your computer and use it in GitHub Desktop.
Generates a qdirstat cache file from a borgbackup archive
"""
Generate a QDirStat cache file from the last archive listed in a borgbackup repository.

See the "Reading and writing cache files" section of the QDirStat README for how
QDirStat uses cache files: https://github.com/shundhammer/qdirstat/blob/master/README.md?plain=1#L848

`borg list` must be able to run without prompting for credentials. See the borg docs on
configuring environment variables: https://borgbackup.readthedocs.io/en/stable/quickstart.html#automating-backups
"""
import dataclasses
import datetime
import gzip
import io
import logging
import subprocess
import sys
import time
# can be swapped out with native json (and references below fixed)
# but it will likely be a lot slower
import orjson
import typing
# percentcoding can be swapped out with urllib.parse.quote to make
# it easier to run but it's significantly slower when processing lots of data
# The version on pypi segfaults on amd64
# git+https://github.com/nijave/python-percentcoding.git
import percentcoding
# Module-level logger; basicConfig sets the root logging level to INFO so the
# progress messages emitted below are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Default path of the gzip-compressed cache file written by write_borg_output.
OUTPUT_FILENAME = "qdirstat.cache.gz"
@dataclasses.dataclass
class QDirStatCacheItem:
    """One entry (one line) of a QDirStat cache file.

    ``str(item)`` renders the entry as tab-separated ``type``, percent-encoded
    ``path``, ``size`` and ``mtime`` (whole seconds since the epoch).
    """

    type: str  # F=File, D=Directory, L=Link
    path: str  # Raw "/"-separated path; __str__ percent-encodes each component
    size: int  # Bytes
    mtime: typing.Optional[datetime.datetime]  # None is written as 0
    blocks: typing.Optional[int] = None  # Not included in the __str__ output
    links: typing.Optional[int] = None  # Not included in the __str__ output

    def __str__(self) -> str:
        """Return the cache-file line for this item (without trailing newline).

        Raises:
            ValueError: if the rendered line is longer than 1024 characters.
        """
        if self.mtime is None:
            mtime = 0
        else:
            # time.mktime returns a float; truncate so the field is written
            # as an integer number of seconds rather than e.g. "1682555040.0".
            mtime = int(time.mktime(self.mtime.timetuple()))

        # Encode each component separately so the "/" separators stay literal.
        encoded_path = "/".join(
            percentcoding.quote(part) for part in self.path.split("/")
        )
        line = f"{self.type}\t{encoded_path}\t{self.size}\t{mtime}"
        if len(line) > 1024:
            raise ValueError("generated item is longer than 1024 bytes")
        return line
def write_qdirstat_cache_header(writer: io.TextIOBase) -> None:
    """Write the fixed cache-file preamble to *writer*.

    The preamble consists of the format marker line, a few ``#`` comment
    lines, and an entry for the root directory ``/`` with size 0 and mtime 0.
    """
    header_lines = (
        "[qdirstat 1.0 cache file]",
        "# Generated by qdirstat-generate (borg-qdirstat.py)",
        "# Do not edit!",
        "#",
        "# Type path size mtime <optional fields>",
        "D / 0 0",
    )
    # Emit everything in a single write, each line newline-terminated.
    writer.write("\n".join(header_lines) + "\n")
def borg_list_archives(repository: str) -> typing.List[str]:
    """Return the names of the archives in *repository*.

    Runs ``borg list --json <repository>`` and collects the ``name`` of every
    entry under the ``archives`` key of its JSON output, in the order given.

    Raises:
        subprocess.CalledProcessError: if borg exits with a non-zero status.
    """
    logger.info("Getting list of borg archives in repository")
    raw_listing = subprocess.check_output(
        ["borg", "list", "--json", repository], text=True
    )
    listing = orjson.loads(raw_listing)

    names = []
    for entry in listing["archives"]:
        names.append(entry["name"])

    logger.info("Found %d archives", len(names))
    return names
def write_borg_output(
    file_stream: io.TextIOWrapper, output_filename: str = OUTPUT_FILENAME
) -> None:
    """Convert ``borg list --json-lines`` output into a gzipped QDirStat cache.

    Each line read from *file_stream* is parsed as a JSON object; its
    ``type``, ``path``, ``size`` and ``mtime`` keys are used to build one
    cache entry. Entries whose ``type`` is ``"d"`` become directories,
    everything else is written as a file.

    Args:
        file_stream: Text stream yielding one JSON document per line.
        output_filename: Path of the gzip-compressed cache file to create.
    """
    items_written = 0
    # mtime is parsed with this format (naive timestamp with microseconds).
    fmt = "%Y-%m-%dT%H:%M:%S.%f"
    with io.open(
        output_filename, mode="wb", buffering=1024 * 1024 * 64
    ) as raw_output:
        # The text wrapper must be closed (and thereby flushed) BEFORE the
        # gzip stream is finalized; otherwise text still pending in the
        # wrapper never reaches the file and the end of the cache is lost.
        # Context managers exit in reverse order, so the wrapper goes first.
        with gzip.GzipFile(
            fileobj=raw_output,
            mode="w",
            compresslevel=1,
        ) as compressed_binary, io.TextIOWrapper(
            compressed_binary, encoding="utf-8"
        ) as compressed:
            write_qdirstat_cache_header(compressed)
            for line in file_stream:
                file = orjson.loads(line)
                item = QDirStatCacheItem(
                    type="D" if file["type"] == "d" else "F",
                    path=f'/{file["path"]}',
                    size=file["size"],
                    mtime=datetime.datetime.strptime(file["mtime"], fmt),
                )
                compressed.write(f"{item}\n")
                items_written += 1
                if items_written % 50000 == 0:
                    logger.info("Wrote %d items", items_written)
    logger.info("Wrote %d items total", items_written)
if __name__ == "__main__":
    # Script entry point: dump the last archive listed for the repository
    # given on the command line into a QDirStat cache file.
    if len(sys.argv) != 2:
        # Exit with the usage message instead of a traceback.
        sys.exit(f"Usage: {sys.argv[0]} repo-name")
    repo = sys.argv[1]
    archives = borg_list_archives(repo)
    if not archives:
        # Avoid an opaque IndexError on a repository without archives.
        sys.exit(f"No archives found in repository {repo}")
    archive = archives[-1]
    logger.info("Listing files for archive %s", archive)
    borg_command = ["borg", "list", "--json-lines", f"{repo}::{archive}"]
    # The context manager closes the pipe and waits for borg to exit, so the
    # child is reaped and its return code is available afterwards.
    with subprocess.Popen(
        borg_command,
        text=True,
        stdout=subprocess.PIPE,
    ) as borg_list_files:
        write_borg_output(borg_list_files.stdout)
    if borg_list_files.returncode != 0:
        # A failed listing means the cache file is incomplete; surface that.
        raise subprocess.CalledProcessError(
            borg_list_files.returncode, borg_command
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment