Techcable/convert_bookmarks.py

## convert_bookmarks.py
"""Converts firefox's JSON bookmarks format to the CSV format raindrop.io expects


See here for docs on raindrop: https://help.raindrop.io/import/#csv
"""
from __future__ import annotations
from typing import (
    ClassVar, Iterable, NewType, Any, Iterator
)
from enum import Enum, nonmember as enum_nonmember
import dataclasses as dc
import json
import operator
import functools
import sys
import csv
from io import TextIOBase
from datetime import datetime

import click

from support.simple_serde import NameStyle, ParseError, parse_type

class EntryType(Enum):
    """The ``type`` tag carried by each entry of a Firefox bookmarks backup."""

    PLACE = "text/x-moz-place"
    PLACE_CONTAINER = "text/x-moz-place-container"
    # NOTE: misspelled, but this is the established public name. It stays
    # first so it remains the canonical member (``.name`` is unchanged).
    PLACE_SEPERATOR = "text/x-moz-place-separator"
    # Correctly spelled alias: same value, so Enum resolves it to the member
    # above and it does not show up as an extra member when iterating.
    PLACE_SEPARATOR = "text/x-moz-place-separator"

def parse_moz_datetime(tp: type, value: Any) -> datetime:
    """Turn an integer count of microseconds since the epoch into a datetime.

    Args:
        tp: The target type requested by the parser; must be ``datetime``
            or a subclass of it.
        value: The raw decoded JSON value.

    Returns:
        A naive ``datetime`` in local time (``datetime.fromtimestamp`` is
        called without a timezone) with microsecond precision.

    Raises:
        ParseError: If ``value`` is not an integer (booleans are rejected).
    """
    assert issubclass(tp, datetime), tp
    # bool is a subclass of int, so a JSON true/false would otherwise be
    # silently accepted as a timestamp of 1 or 0 microseconds.
    if isinstance(value, bool) or not isinstance(value, int):
        raise ParseError(f"Value should be an integer timestamp: {value!r}")
    # Split into whole seconds and the leftover microseconds so the
    # sub-second part is kept exactly instead of going through a float.
    seconds, micros = divmod(value, 1_000_000)
    return datetime.fromtimestamp(seconds).replace(microsecond=micros)


@dc.dataclass(kw_only=True)
class NormalizedEntry:
    """One node of the bookmarks tree, built from decoded JSON by ``parse``.

    Equality and hashing use ``guid`` only, so entries can be used as
    dictionary keys and two entries with the same guid compare equal.
    """

    guid: str
    title: str
    # The custom parser splits one comma-separated string into a list.
    tags: list[str] = dc.field(
        metadata={"parser": lambda _tp, val: val.split(',')},
        default_factory=list
    )
    date_added: datetime = dc.field(metadata={"parser": parse_moz_datetime})
    last_modified: datetime = dc.field(metadata={"parser": parse_moz_datetime})
    entry_type: EntryType = dc.field(metadata={"name": "type"})
    root_id: str | None = dc.field(metadata={"name": "root"}, default=None)
    # repr=False keeps a folder's repr from dumping its whole subtree.
    children: list[NormalizedEntry] | None = dc.field(default=None, repr=False)
    uri: str | None = None
    keyword: str | None = None
    # NOTE(review): presumably the input keys parse_type should skip;
    # confirm against support.simple_serde.
    _ignored_fields: ClassVar[set[str]] = {"index", "id", "typeCode", "iconUri", "postData", "charset"}
    # convert from camel case -> snake case
    _name_styling: ClassVar = (NameStyle.CAMEL_CASE, NameStyle.SNAKE_CASE)

    @staticmethod
    def parse(data: Any) -> NormalizedEntry:
        """Build an entry from decoded JSON ``data`` using ``parse_type``."""
        return parse_type(NormalizedEntry, data)

    def print(
        self, target: TextIOBase | None = None, *,
        level: int, child_limit: int | None = None
    ):
        """Write an indented outline of this entry and its children.

        Args:
            target: Stream to write to; ``None`` means the current
                ``sys.stdout``.
            level: Nesting depth; each level indents by two spaces.
            child_limit: If given, print at most this many children per
                entry (applied at every depth).
        """
        if target is None:
            # Looked up at call time: a default of ``sys.stdout`` in the
            # signature is frozen at import and ignores later redirection.
            target = sys.stdout
        indent = ' ' * (2 * level)
        num_children = len(self.children) if self.children is not None else 0
        # Show the URI when there is one, otherwise the child count;
        # either way only the first 30 characters are printed.
        data = self.uri or str(num_children)
        print(f"{indent}* {self.title} -> {data[:30]}", file=target)
        if self.children is not None:
            for child in self.children[:child_limit]:
                child.print(target, level=level + 1, child_limit=child_limit)

    def __hash__(self):
        return hash(self.guid)

    def __eq__(self, other):
        if isinstance(other, NormalizedEntry):
            return self.guid == other.guid
        else:
            return NotImplemented

class RootChildType(Enum):
    """The four folders expected directly beneath the bookmarks root.

    Members are declared as ``(title, root_id)``. The member value is the
    title alone, so ``RootChildType("menu")`` resolves to ``MENU``.
    """

    MENU = ("menu", "bookmarksMenuFolder")
    TOOLBAR = ("toolbar", "toolbarFolder")
    UNFILED = ("unfiled", "unfiledBookmarksFolder")
    MOBILE = ("mobile", "mobileFolder")

    title: str
    root_id: str

    def __new__(cls, title, root_id):
        member = object.__new__(cls)
        member.title = title
        member.root_id = root_id
        # Value lookup is keyed on the title only, not on the whole tuple.
        member._value_ = title
        return member

    @property
    def guid_prefix(self) -> str:
        """The title followed by three underscores."""
        return f"{self.title}___"

    @property
    def human_name(self):
        """The display label used for this folder."""
        labels = {
            RootChildType.MENU: "Bookmarks Menu",
            RootChildType.TOOLBAR: "Bookmarks Toolbar",
            RootChildType.UNFILED: "Other Bookmarks",
            RootChildType.MOBILE: "Mobile Bookmarks",
        }
        label = labels.get(self)
        if label is None:
            # Every member is listed above, so this cannot normally happen.
            raise AssertionError
        return label

@dc.dataclass(frozen=True)
class EntryCsvMeta:
    entry: NormalizedEntry
    _: dc.KW_ONLY
    human_name: str
    parent: EntryCsvMeta | None

    def __post_init__(self):
        if ('/' in self.human_name and
            self.entry.entry_type != EntryType.PLACE):
            # HACK
            object.__setattr__(
                self, 'human_name',
                self.human_name.replace('/', '<slash>')
            )

    def parents(self) -> Iterator[EntryCsvMeta]:
        parent = self.parent
        while parent is not None:
            yield parent
            parent = parent.parent

    @functools.cached_property
    def full_human_name(self) -> str:
        parts = [self.human_name]
        for parent in self.parents():
            assert '/' not in parent.human_name, parent
            parts.append(parent.human_name)
        parts.reverse()
        return '/'.join(parts)

def write_csv(root_node: NormalizedEntry, output_file: TextIOBase):
    """Write every bookmark below ``root_node`` to ``output_file`` as CSV.

    The tree must be an untitled ``placesRoot`` container whose children are
    exactly the four ``RootChildType`` folders. Only ``EntryType.PLACE``
    entries become rows. Each row's ``folder`` is the '/'-joined path of its
    ancestors, and rows are sorted by full path (folders plus title). The
    ``description`` column is declared but never filled.

    Args:
        root_node: Root of the parsed bookmarks tree.
        output_file: Writable text stream that receives the header and rows.

    Raises:
        AssertionError: If the tree does not have the expected layout or an
            entry is reached twice (entries compare by guid).
        ValueError: If a root child's title is not a ``RootChildType`` value.
    """
    assert root_node.guid.startswith("root___"), root_node.guid
    assert root_node.root_id == "placesRoot", root_node.root_id
    assert root_node.title == "", root_node.title
    assert root_node.children is not None
    assert root_node.entry_type == EntryType.PLACE_CONTAINER
    root_children: dict[RootChildType, NormalizedEntry] = {}
    resolved_meta: dict[NormalizedEntry, EntryCsvMeta] = {}
    for child in root_node.children:
        child_type: RootChildType = RootChildType(child.title)
        assert child_type not in root_children, (
            f"Duplicate types: {child.title!r}"
        )
        assert child.guid.startswith(child_type.guid_prefix), child.guid
        # Compare with the id expected for this folder type. This used to
        # compare child.root_id with itself, which could never fail.
        assert child.root_id == child_type.root_id, child.root_id
        assert child.entry_type == EntryType.PLACE_CONTAINER
        assert child.children is not None
        root_children[child_type] = child
        # Root folders get their display label and have no parent, so
        # folder paths start at e.g. "Bookmarks Menu".
        resolved_meta[child] = EntryCsvMeta(
            entry=child,
            human_name=child_type.human_name,
            parent=None
        )
    assert set(root_children.keys()) == set(RootChildType), set(root_children.keys())
    assert len(root_children) == 4

    # Depth-first walk that gives every descendant its metadata.
    stack: list[NormalizedEntry] = list(root_children.values())
    while stack:
        parent = stack.pop()
        resolved_parent = resolved_meta[parent]
        assert parent.children is not None
        for child in parent.children:
            assert child not in resolved_meta
            resolved_meta[child] = EntryCsvMeta(
                entry=child,
                human_name=child.title,
                parent=resolved_parent
            )
            # Only entries with a non-empty child list need visiting.
            if child.children:
                stack.append(child)

    bookmarks = sorted(
        (
            meta for meta in resolved_meta.values()
            if meta.entry.entry_type == EntryType.PLACE
        ),
        key=operator.attrgetter('full_human_name')
    )
    writer = csv.DictWriter(
        output_file,
        ('folder', 'title', 'url', 'description', 'tags', 'created'),
    )
    writer.writeheader()
    for meta in bookmarks:
        assert meta.parent is not None, meta
        writer.writerow(dict(
            url=meta.entry.uri,
            folder=meta.parent.full_human_name,
            title=meta.entry.title,
            # description: <missing>
            tags=','.join(meta.entry.tags),
            created=meta.entry.date_added.isoformat()
        ))


@click.command('convert')
@click.argument('input_file', type=click.File())
@click.argument('output_file', type=click.File(mode='wt'))
@click.option('output_format', '--format', type=click.Choice(('text', 'csv')))
def convert(input_file, output_format, output_file):
    raw_data = json.load(input_file)
    result = NormalizedEntry.parse(raw_data)
    match output_format:
        case 'text':
            result.print(level=0)
        case 'csv':
            write_csv(result, output_file)
        case _:
            raise AssertionError

# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    convert()
	"""Converts firefox's JSON bookmarks format to the CSV format raindrop.io expects


	See here for docs on raindrop: https://help.raindrop.io/import/#csv
	"""
	from __future__ import annotations
	from typing import (
	ClassVar, Iterable, NewType, Any, Iterator
	)
	from enum import Enum, nonmember as enum_nonmember
	import dataclasses as dc
	import json
	import operator
	import functools
	import sys
	import csv
	from io import TextIOBase
	from datetime import datetime

	import click

	from support.simple_serde import NameStyle, ParseError, parse_type

	class EntryType(Enum):
	    """The ``type`` tag carried by each entry of a Firefox bookmarks backup."""

	    PLACE = "text/x-moz-place"
	    PLACE_CONTAINER = "text/x-moz-place-container"
	    # NOTE: misspelled, but this is the established public name. It stays
	    # first so it remains the canonical member (``.name`` is unchanged).
	    PLACE_SEPERATOR = "text/x-moz-place-separator"
	    # Correctly spelled alias: same value, so Enum resolves it to the member
	    # above and it does not show up as an extra member when iterating.
	    PLACE_SEPARATOR = "text/x-moz-place-separator"

	def parse_moz_datetime(tp: type, value: Any) -> datetime:
	    """Turn an integer count of microseconds since the epoch into a datetime.

	    Args:
	        tp: The target type requested by the parser; must be ``datetime``
	            or a subclass of it.
	        value: The raw decoded JSON value.

	    Returns:
	        A naive ``datetime`` in local time (``datetime.fromtimestamp`` is
	        called without a timezone) with microsecond precision.

	    Raises:
	        ParseError: If ``value`` is not an integer (booleans are rejected).
	    """
	    assert issubclass(tp, datetime), tp
	    # bool is a subclass of int, so a JSON true/false would otherwise be
	    # silently accepted as a timestamp of 1 or 0 microseconds.
	    if isinstance(value, bool) or not isinstance(value, int):
	        raise ParseError(f"Value should be an integer timestamp: {value!r}")
	    # Split into whole seconds and the leftover microseconds so the
	    # sub-second part is kept exactly instead of going through a float.
	    seconds, micros = divmod(value, 1_000_000)
	    return datetime.fromtimestamp(seconds).replace(microsecond=micros)



	@dc.dataclass(kw_only=True)
	class NormalizedEntry:
	    """One node of the bookmarks tree, built from decoded JSON by ``parse``.

	    Equality and hashing use ``guid`` only, so entries can be used as
	    dictionary keys and two entries with the same guid compare equal.
	    """

	    guid: str
	    title: str
	    # The custom parser splits one comma-separated string into a list.
	    tags: list[str] = dc.field(
	        metadata={"parser": lambda _tp, val: val.split(',')},
	        default_factory=list
	    )
	    date_added: datetime = dc.field(metadata={"parser": parse_moz_datetime})
	    last_modified: datetime = dc.field(metadata={"parser": parse_moz_datetime})
	    entry_type: EntryType = dc.field(metadata={"name": "type"})
	    root_id: str | None = dc.field(metadata={"name": "root"}, default=None)
	    # repr=False keeps a folder's repr from dumping its whole subtree.
	    children: list[NormalizedEntry] | None = dc.field(default=None, repr=False)
	    uri: str | None = None
	    keyword: str | None = None
	    # NOTE(review): presumably the input keys parse_type should skip;
	    # confirm against support.simple_serde.
	    _ignored_fields: ClassVar[set[str]] = {"index", "id", "typeCode", "iconUri", "postData", "charset"}
	    # convert from camel case -> snake case
	    _name_styling: ClassVar = (NameStyle.CAMEL_CASE, NameStyle.SNAKE_CASE)

	    @staticmethod
	    def parse(data: Any) -> NormalizedEntry:
	        """Build an entry from decoded JSON ``data`` using ``parse_type``."""
	        return parse_type(NormalizedEntry, data)

	    def print(
	        self, target: TextIOBase | None = None, *,
	        level: int, child_limit: int | None = None
	    ):
	        """Write an indented outline of this entry and its children.

	        Args:
	            target: Stream to write to; ``None`` means the current
	                ``sys.stdout``.
	            level: Nesting depth; each level indents by two spaces.
	            child_limit: If given, print at most this many children per
	                entry (applied at every depth).
	        """
	        if target is None:
	            # Looked up at call time: a default of ``sys.stdout`` in the
	            # signature is frozen at import and ignores later redirection.
	            target = sys.stdout
	        indent = ' ' * (2 * level)
	        num_children = len(self.children) if self.children is not None else 0
	        # Show the URI when there is one, otherwise the child count;
	        # either way only the first 30 characters are printed.
	        data = self.uri or str(num_children)
	        print(f"{indent}* {self.title} -> {data[:30]}", file=target)
	        if self.children is not None:
	            for child in self.children[:child_limit]:
	                child.print(target, level=level + 1, child_limit=child_limit)

	    def __hash__(self):
	        return hash(self.guid)

	    def __eq__(self, other):
	        if isinstance(other, NormalizedEntry):
	            return self.guid == other.guid
	        else:
	            return NotImplemented

	class RootChildType(Enum):
	    """The four folders expected directly beneath the bookmarks root.

	    Members are declared as ``(title, root_id)``. The member value is the
	    title alone, so ``RootChildType("menu")`` resolves to ``MENU``.
	    """

	    MENU = ("menu", "bookmarksMenuFolder")
	    TOOLBAR = ("toolbar", "toolbarFolder")
	    UNFILED = ("unfiled", "unfiledBookmarksFolder")
	    MOBILE = ("mobile", "mobileFolder")

	    title: str
	    root_id: str

	    def __new__(cls, title, root_id):
	        member = object.__new__(cls)
	        member.title = title
	        member.root_id = root_id
	        # Value lookup is keyed on the title only, not on the whole tuple.
	        member._value_ = title
	        return member

	    @property
	    def guid_prefix(self) -> str:
	        """The title followed by three underscores."""
	        return f"{self.title}___"

	    @property
	    def human_name(self):
	        """The display label used for this folder."""
	        labels = {
	            RootChildType.MENU: "Bookmarks Menu",
	            RootChildType.TOOLBAR: "Bookmarks Toolbar",
	            RootChildType.UNFILED: "Other Bookmarks",
	            RootChildType.MOBILE: "Mobile Bookmarks",
	        }
	        label = labels.get(self)
	        if label is None:
	            # Every member is listed above, so this cannot normally happen.
	            raise AssertionError
	        return label

	@dc.dataclass(frozen=True)
	class EntryCsvMeta:
	entry: NormalizedEntry
	_: dc.KW_ONLY
	human_name: str
	parent: EntryCsvMeta \| None

	def __post_init__(self):
	if ('/' in self.human_name and
	self.entry.entry_type != EntryType.PLACE):
	# HACK
	object.__setattr__(
	self, 'human_name',
	self.human_name.replace('/', '<slash>')
	)

	def parents(self) -> Iterator[EntryCsvMeta]:
	parent = self.parent
	while parent is not None:
	yield parent
	parent = parent.parent

	@functools.cached_property
	def full_human_name(self) -> str:
	parts = [self.human_name]
	for parent in self.parents():
	assert '/' not in parent.human_name, parent
	parts.append(parent.human_name)
	parts.reverse()
	return '/'.join(parts)

	def write_csv(root_node: NormalizedEntry, output_file: TextIOBase):
	    """Write every bookmark below ``root_node`` to ``output_file`` as CSV.

	    The tree must be an untitled ``placesRoot`` container whose children are
	    exactly the four ``RootChildType`` folders. Only ``EntryType.PLACE``
	    entries become rows. Each row's ``folder`` is the '/'-joined path of its
	    ancestors, and rows are sorted by full path (folders plus title). The
	    ``description`` column is declared but never filled.

	    Args:
	        root_node: Root of the parsed bookmarks tree.
	        output_file: Writable text stream that receives the header and rows.

	    Raises:
	        AssertionError: If the tree does not have the expected layout or an
	            entry is reached twice (entries compare by guid).
	        ValueError: If a root child's title is not a ``RootChildType`` value.
	    """
	    assert root_node.guid.startswith("root___"), root_node.guid
	    assert root_node.root_id == "placesRoot", root_node.root_id
	    assert root_node.title == "", root_node.title
	    assert root_node.children is not None
	    assert root_node.entry_type == EntryType.PLACE_CONTAINER
	    root_children: dict[RootChildType, NormalizedEntry] = {}
	    resolved_meta: dict[NormalizedEntry, EntryCsvMeta] = {}
	    for child in root_node.children:
	        child_type: RootChildType = RootChildType(child.title)
	        assert child_type not in root_children, (
	            f"Duplicate types: {child.title!r}"
	        )
	        assert child.guid.startswith(child_type.guid_prefix), child.guid
	        # Compare with the id expected for this folder type. This used to
	        # compare child.root_id with itself, which could never fail.
	        assert child.root_id == child_type.root_id, child.root_id
	        assert child.entry_type == EntryType.PLACE_CONTAINER
	        assert child.children is not None
	        root_children[child_type] = child
	        # Root folders get their display label and have no parent, so
	        # folder paths start at e.g. "Bookmarks Menu".
	        resolved_meta[child] = EntryCsvMeta(
	            entry=child,
	            human_name=child_type.human_name,
	            parent=None
	        )
	    assert set(root_children.keys()) == set(RootChildType), set(root_children.keys())
	    assert len(root_children) == 4

	    # Depth-first walk that gives every descendant its metadata.
	    stack: list[NormalizedEntry] = list(root_children.values())
	    while stack:
	        parent = stack.pop()
	        resolved_parent = resolved_meta[parent]
	        assert parent.children is not None
	        for child in parent.children:
	            assert child not in resolved_meta
	            resolved_meta[child] = EntryCsvMeta(
	                entry=child,
	                human_name=child.title,
	                parent=resolved_parent
	            )
	            # Only entries with a non-empty child list need visiting.
	            if child.children:
	                stack.append(child)

	    bookmarks = sorted(
	        (
	            meta for meta in resolved_meta.values()
	            if meta.entry.entry_type == EntryType.PLACE
	        ),
	        key=operator.attrgetter('full_human_name')
	    )
	    writer = csv.DictWriter(
	        output_file,
	        ('folder', 'title', 'url', 'description', 'tags', 'created'),
	    )
	    writer.writeheader()
	    for meta in bookmarks:
	        assert meta.parent is not None, meta
	        writer.writerow(dict(
	            url=meta.entry.uri,
	            folder=meta.parent.full_human_name,
	            title=meta.entry.title,
	            # description: <missing>
	            tags=','.join(meta.entry.tags),
	            created=meta.entry.date_added.isoformat()
	        ))


	@click.command('convert')
	@click.argument('input_file', type=click.File())
	@click.argument('output_file', type=click.File(mode='wt'))
	@click.option('output_format', '--format', type=click.Choice(('text', 'csv')))
	def convert(input_file, output_format, output_file):
	raw_data = json.load(input_file)
	result = NormalizedEntry.parse(raw_data)
	match output_format:
	case 'text':
	result.print(level=0)
	case 'csv':
	write_csv(result, output_file)
	case _:
	raise AssertionError

	# Run the click command only when this file is executed as a script.
	if __name__ == "__main__":
	    convert()