astoeckel/transfer_rhythmdb_ratings.py

## transfer_rhythmdb_ratings.py
#!/usr/bin/env python3

import os
import sys
import urllib.parse
import urllib.request
import lxml.etree as ET
import icu
import collections
import math

class RatingDB:
    """Index of rated songs that supports fuzzy lookup by title/artist/album.

    Rated ``<entry type="song">`` elements are loaded with ``read_rhythmdb``;
    ``transfer_ratings`` then re-assigns those ratings to the entries of
    another database file by approximate (shingle/Jaccard) matching.
    """

    # Plain record for one song entry. ``rating`` keeps the raw element text
    # (or None when the entry carries no rating).
    Song = collections.namedtuple(
        'Song', 'title artist album duration rating filesize')

    def __init__(self):
        """Create an empty database and the ICU transliterator it relies on."""
        # Rated songs; a song's position in this list is its "song index"
        self.songs = []
        # Maps arbitrary scripts onto plain ASCII before shingling
        self.tl = icu.Transliterator.createInstance('Any-Latin; Latin-ASCII')
        # Attributes that take part in the fuzzy match, and their weights in
        # the overall match score
        self.attrs = ('title', 'artist', 'album')
        self.attrs_importance = {
            'title': 0.6,
            'artist': 0.2,
            'album': 0.2
        }

        # Inverted index: shingle -> set of song indices containing that
        # shingle in any of the attributes
        self.shingles_to_songs = {}
        # Forward index: attribute -> list (by song index) of shingle sets
        self.songs_to_shingles = {attr: [] for attr in self.attrs}

    def shingle(self, s, l=3):
        """Return the set of length-``l`` character shingles of ``s``.

        The string is transliterated, lower-cased and stripped of everything
        that is not alphanumeric before it is cut into overlapping pieces.
        A non-empty normalised string shorter than ``l`` is returned as a
        single shingle so that very short names can still be matched; an
        empty normalised string yields an empty set.
        """
        t = ''.join(c for c in self.tl.transliterate(s).lower() if c.isalnum())
        if not t:
            return set()
        if len(t) < l:
            return {t}
        # "+ 1" so that the final window (ending at the last character) is
        # included as well
        return {t[i:i + l] for i in range(len(t) - l + 1)}

    def read_song(self, x):
        """Convert an XML song entry ``x`` into a ``RatingDB.Song``.

        Missing or empty child elements fall back to ``None`` (title, artist,
        album, rating) or zero (duration, file size). A title, artist or
        album whose text is "unknown" (case-insensitive) is mapped to
        ``None`` as well.
        """
        def text_or_default(key, default=None):
            elem = x.find(key)
            # An element without text (e.g. <title/>) counts as missing
            if elem is None or elem.text is None:
                return default
            return elem.text

        def known_or_none(value):
            # Map "unknown" onto nothing
            if value is None or value.lower() == "unknown":
                return None
            return value

        # Fetch the important metadata
        title = known_or_none(text_or_default("title"))
        artist = known_or_none(text_or_default("artist"))
        album = known_or_none(text_or_default("album"))
        duration = float(text_or_default("duration", 0.0))
        rating = text_or_default("rating")
        filesize = int(text_or_default("file-size", 0))

        return RatingDB.Song(title, artist, album, duration, rating, filesize)

    def read_rhythmdb(self, filename):
        """Add all rated song entries from the XML file ``filename``.

        ``filename`` is handed to ``ET.parse`` unchanged. Entries that are
        not of type "song" or that have no rating are skipped.
        """
        x_root = ET.parse(filename).getroot()
        for x_child in x_root:
            if x_child.tag != 'entry' or x_child.get('type') != 'song':
                continue

            song = self.read_song(x_child)
            if song.rating is None:
                continue
            song_idx = len(self.songs)
            self.songs.append(song)
            for attr in self.attrs:
                value = getattr(song, attr)
                # Every attribute list gets one entry per song so that it can
                # be addressed by song index
                shingles = set() if value is None else self.shingle(value)
                self.songs_to_shingles[attr].append(shingles)
                for shingle in shingles:
                    self.shingles_to_songs.setdefault(
                        shingle, set()).add(song_idx)

    def infer_rating(self, song):
        """Find the stored song that ``song`` most plausibly corresponds to.

        Returns the index into ``self.songs`` of the chosen match, or
        ``None`` if no stored song scores above the threshold.
        """
        # For relevant attributes, compute the similarity to other songs in
        # the local database
        matched_songs = {}
        for attr in self.attrs:
            # Convert the attribute to shingles
            value = getattr(song, attr)
            if value is None:
                continue
            shingles = self.shingle(value)

            # Fetch all songs sharing at least one shingle
            candidates = set()
            for shingle in shingles:
                candidates.update(self.shingles_to_songs.get(shingle, ()))

            # For each candidate, compute the Jaccard similarity between its
            # shingles for this attribute and the query shingles. The union
            # is never empty here: candidates only exist if ``shingles`` is
            # non-empty.
            for song_idx in candidates:
                song_shingles = self.songs_to_shingles[attr][song_idx]
                sim = (len(song_shingles & shingles) /
                       len(song_shingles | shingles))
                if sim > 0.5:
                    if song_idx not in matched_songs:
                        matched_songs[song_idx] = {
                            a: 0.0 for a in self.attrs}
                    matched_songs[song_idx][attr] = sim

        # Compute the overall score, i.e. the likelihood that the given song
        # is actually one of the matched_songs. Compare the song durations.
        # Return the above-threshold song with the highest rating; among
        # equally rated songs the one with the highest score wins.
        best_rating, best_song_idx, best_p = 0, None, 0.5
        for song_idx, sims in matched_songs.items():
            matched_song = self.songs[song_idx]
            p = 0.0
            for attr, w in self.attrs_importance.items():
                p += w * sims[attr]
            if song.duration > 0.0 and matched_song.duration > 0.0:
                # Penalise differing durations with a Gaussian-shaped factor
                delta = song.duration - matched_song.duration
                p *= math.exp(-delta**2 / 1000)
            rating = 0 if matched_song.rating is None else int(matched_song.rating)
            if p > 0.5 and rating > best_rating:
                best_rating = rating
                best_song_idx = song_idx
                best_p = p
            elif p > best_p and rating == best_rating:
                best_song_idx = song_idx
                best_p = p
        return best_song_idx

    def transfer_ratings(self, filename):
        """Rewrite the ratings in the XML file ``filename`` from this database.

        Every existing "rating" element of a song entry is removed. Entries
        are then matched against the stored songs; for each stored song that
        was matched, exactly one matching entry (the one with the largest
        file size, the first one on ties) receives a new "rating" element.
        Each match is logged to stderr. Returns the modified root element;
        nothing is written back to ``filename``.
        """
        matched_songs = {}
        x_root = ET.parse(filename).getroot()
        for x_child in x_root:
            if x_child.tag != 'entry' or x_child.get('type') != 'song':
                continue

            # Convert the element into a "song" element
            song = self.read_song(x_child)

            # Remove the "rating" element(s); there can be only one song
            # matched to the same logical song
            for x_rating in x_child.findall("rating"):
                x_child.remove(x_rating)

            # Try to infer the rating and the logical song
            song_idx = self.infer_rating(song)
            if song_idx is None:
                continue
            sys.stderr.write(str(song) + ' --> ' + str(self.songs[song_idx]) + '\n')

            # Remember that this song was matched to the given song index
            matched_songs.setdefault(song_idx, []).append((song, x_child))

        # For each matched logical song, find the song in the XML file with
        # the best quality. Add the "rating" tag with the inferred rating to
        # exactly one song. max() returns the first maximal entry, so a
        # rating is written even when no file sizes are known (all zero).
        for song_idx, songs in matched_songs.items():
            _, x_tar = max(songs, key=lambda entry: entry[0].filesize)
            x_rating = ET.SubElement(x_tar, "rating")
            x_rating.text = str(self.songs[song_idx].rating)

        return x_root

# Build the rating index from the live database and its backup, then emit a
# copy of the live database with the transferred ratings on stdout.
db = RatingDB()
for rhythmdb_path in (
        '/home/andreas/.local/share/rhythmbox/rhythmdb.xml',
        '/home/andreas/.local/share/rhythmbox/rhythmdb.bck.xml'):
    db.read_rhythmdb(rhythmdb_path)
x_new = db.transfer_ratings('/home/andreas/.local/share/rhythmbox/rhythmdb.xml')
xml_bytes = ET.tostring(x_new, pretty_print=True)
sys.stdout.buffer.write(xml_bytes)
	#!/usr/bin/env python3

	import os
	import sys
	import urllib.parse
	import urllib.request
	import lxml.etree as ET
	import icu
	import collections
	import math

	class RatingDB:
	    """Index of rated songs that supports fuzzy lookup by title/artist/album.

	    Rated ``<entry type="song">`` elements are loaded with ``read_rhythmdb``;
	    ``transfer_ratings`` then re-assigns those ratings to the entries of
	    another database file by approximate (shingle/Jaccard) matching.
	    """

	    # Plain record for one song entry. ``rating`` keeps the raw element text
	    # (or None when the entry carries no rating).
	    Song = collections.namedtuple(
	        'Song', 'title artist album duration rating filesize')

	    def __init__(self):
	        """Create an empty database and the ICU transliterator it relies on."""
	        # Rated songs; a song's position in this list is its "song index"
	        self.songs = []
	        # Maps arbitrary scripts onto plain ASCII before shingling
	        self.tl = icu.Transliterator.createInstance('Any-Latin; Latin-ASCII')
	        # Attributes that take part in the fuzzy match, and their weights in
	        # the overall match score
	        self.attrs = ('title', 'artist', 'album')
	        self.attrs_importance = {
	            'title': 0.6,
	            'artist': 0.2,
	            'album': 0.2
	        }

	        # Inverted index: shingle -> set of song indices containing that
	        # shingle in any of the attributes
	        self.shingles_to_songs = {}
	        # Forward index: attribute -> list (by song index) of shingle sets
	        self.songs_to_shingles = {attr: [] for attr in self.attrs}

	    def shingle(self, s, l=3):
	        """Return the set of length-``l`` character shingles of ``s``.

	        The string is transliterated, lower-cased and stripped of everything
	        that is not alphanumeric before it is cut into overlapping pieces.
	        A non-empty normalised string shorter than ``l`` is returned as a
	        single shingle so that very short names can still be matched; an
	        empty normalised string yields an empty set.
	        """
	        t = ''.join(c for c in self.tl.transliterate(s).lower() if c.isalnum())
	        if not t:
	            return set()
	        if len(t) < l:
	            return {t}
	        # "+ 1" so that the final window (ending at the last character) is
	        # included as well
	        return {t[i:i + l] for i in range(len(t) - l + 1)}

	    def read_song(self, x):
	        """Convert an XML song entry ``x`` into a ``RatingDB.Song``.

	        Missing or empty child elements fall back to ``None`` (title, artist,
	        album, rating) or zero (duration, file size). A title, artist or
	        album whose text is "unknown" (case-insensitive) is mapped to
	        ``None`` as well.
	        """
	        def text_or_default(key, default=None):
	            elem = x.find(key)
	            # An element without text (e.g. <title/>) counts as missing
	            if elem is None or elem.text is None:
	                return default
	            return elem.text

	        def known_or_none(value):
	            # Map "unknown" onto nothing
	            if value is None or value.lower() == "unknown":
	                return None
	            return value

	        # Fetch the important metadata
	        title = known_or_none(text_or_default("title"))
	        artist = known_or_none(text_or_default("artist"))
	        album = known_or_none(text_or_default("album"))
	        duration = float(text_or_default("duration", 0.0))
	        rating = text_or_default("rating")
	        filesize = int(text_or_default("file-size", 0))

	        return RatingDB.Song(title, artist, album, duration, rating, filesize)

	    def read_rhythmdb(self, filename):
	        """Add all rated song entries from the XML file ``filename``.

	        ``filename`` is handed to ``ET.parse`` unchanged. Entries that are
	        not of type "song" or that have no rating are skipped.
	        """
	        x_root = ET.parse(filename).getroot()
	        for x_child in x_root:
	            if x_child.tag != 'entry' or x_child.get('type') != 'song':
	                continue

	            song = self.read_song(x_child)
	            if song.rating is None:
	                continue
	            song_idx = len(self.songs)
	            self.songs.append(song)
	            for attr in self.attrs:
	                value = getattr(song, attr)
	                # Every attribute list gets one entry per song so that it can
	                # be addressed by song index
	                shingles = set() if value is None else self.shingle(value)
	                self.songs_to_shingles[attr].append(shingles)
	                for shingle in shingles:
	                    self.shingles_to_songs.setdefault(
	                        shingle, set()).add(song_idx)

	    def infer_rating(self, song):
	        """Find the stored song that ``song`` most plausibly corresponds to.

	        Returns the index into ``self.songs`` of the chosen match, or
	        ``None`` if no stored song scores above the threshold.
	        """
	        # For relevant attributes, compute the similarity to other songs in
	        # the local database
	        matched_songs = {}
	        for attr in self.attrs:
	            # Convert the attribute to shingles
	            value = getattr(song, attr)
	            if value is None:
	                continue
	            shingles = self.shingle(value)

	            # Fetch all songs sharing at least one shingle
	            candidates = set()
	            for shingle in shingles:
	                candidates.update(self.shingles_to_songs.get(shingle, ()))

	            # For each candidate, compute the Jaccard similarity between its
	            # shingles for this attribute and the query shingles. The union
	            # is never empty here: candidates only exist if ``shingles`` is
	            # non-empty.
	            for song_idx in candidates:
	                song_shingles = self.songs_to_shingles[attr][song_idx]
	                sim = (len(song_shingles & shingles) /
	                       len(song_shingles | shingles))
	                if sim > 0.5:
	                    if song_idx not in matched_songs:
	                        matched_songs[song_idx] = {
	                            a: 0.0 for a in self.attrs}
	                    matched_songs[song_idx][attr] = sim

	        # Compute the overall score, i.e. the likelihood that the given song
	        # is actually one of the matched_songs. Compare the song durations.
	        # Return the above-threshold song with the highest rating; among
	        # equally rated songs the one with the highest score wins.
	        best_rating, best_song_idx, best_p = 0, None, 0.5
	        for song_idx, sims in matched_songs.items():
	            matched_song = self.songs[song_idx]
	            p = 0.0
	            for attr, w in self.attrs_importance.items():
	                p += w * sims[attr]
	            if song.duration > 0.0 and matched_song.duration > 0.0:
	                # Penalise differing durations with a Gaussian-shaped factor
	                delta = song.duration - matched_song.duration
	                p *= math.exp(-delta**2 / 1000)
	            rating = 0 if matched_song.rating is None else int(matched_song.rating)
	            if p > 0.5 and rating > best_rating:
	                best_rating = rating
	                best_song_idx = song_idx
	                best_p = p
	            elif p > best_p and rating == best_rating:
	                best_song_idx = song_idx
	                best_p = p
	        return best_song_idx

	    def transfer_ratings(self, filename):
	        """Rewrite the ratings in the XML file ``filename`` from this database.

	        Every existing "rating" element of a song entry is removed. Entries
	        are then matched against the stored songs; for each stored song that
	        was matched, exactly one matching entry (the one with the largest
	        file size, the first one on ties) receives a new "rating" element.
	        Each match is logged to stderr. Returns the modified root element;
	        nothing is written back to ``filename``.
	        """
	        matched_songs = {}
	        x_root = ET.parse(filename).getroot()
	        for x_child in x_root:
	            if x_child.tag != 'entry' or x_child.get('type') != 'song':
	                continue

	            # Convert the element into a "song" element
	            song = self.read_song(x_child)

	            # Remove the "rating" element(s); there can be only one song
	            # matched to the same logical song
	            for x_rating in x_child.findall("rating"):
	                x_child.remove(x_rating)

	            # Try to infer the rating and the logical song
	            song_idx = self.infer_rating(song)
	            if song_idx is None:
	                continue
	            sys.stderr.write(str(song) + ' --> ' + str(self.songs[song_idx]) + '\n')

	            # Remember that this song was matched to the given song index
	            matched_songs.setdefault(song_idx, []).append((song, x_child))

	        # For each matched logical song, find the song in the XML file with
	        # the best quality. Add the "rating" tag with the inferred rating to
	        # exactly one song. max() returns the first maximal entry, so a
	        # rating is written even when no file sizes are known (all zero).
	        for song_idx, songs in matched_songs.items():
	            _, x_tar = max(songs, key=lambda entry: entry[0].filesize)
	            x_rating = ET.SubElement(x_tar, "rating")
	            x_rating.text = str(self.songs[song_idx].rating)

	        return x_root

	# Build the rating index from the live database and its backup, then emit a
	# copy of the live database with the transferred ratings on stdout.
	db = RatingDB()
	for rhythmdb_path in (
	        '/home/andreas/.local/share/rhythmbox/rhythmdb.xml',
	        '/home/andreas/.local/share/rhythmbox/rhythmdb.bck.xml'):
	    db.read_rhythmdb(rhythmdb_path)
	x_new = db.transfer_ratings('/home/andreas/.local/share/rhythmbox/rhythmdb.xml')
	xml_bytes = ET.tostring(x_new, pretty_print=True)
	sys.stdout.buffer.write(xml_bytes)