# jeamland/playlist_extractor.py

## playlist_extractor.py
#!/usr/bin/env python

# Extract playlists from a non-XML iTunes Library file (.itl)
# Copyright (c) 2018 Benno Rice, released under the BSD (2 Clause) Licence.

# Important information on the encryption used in the .itl file found here:
# https://mrexodia.cf/reversing/2014/12/16/iTunes-Library-Format-1
# Highly useful information on the .itl format itself found here:
# https://github.com/josephw/titl/blob/master/titl-core/src/main/java/org/kafsemo/titl/ParseLibrary.java

import argparse
import collections
import csv
import enum
import io
import struct
import zlib

from Crypto.Cipher import AES


# Number of leading bytes that the script below slices off as the plain header
# before decrypting and decompressing the rest of the file.
HEADER_LENGTH = 0x90
# Key handed to AES.new() (ECB mode) to decrypt the start of the payload.
CRYPTO_KEY = b'BHUILuilfghuila3'


# One namedtuple type per four-character record tag that RecordParser knows.
# Record kinds whose contents are not extracted have no fields; they exist so
# that consumers can tell the kinds apart by type.  Every typename matches the
# variable it is bound to so that repr() output names the right record.

Hdfm = collections.namedtuple('Hdfm', field_names=[
    'file_length',
    'version',
])

Hdsm = collections.namedtuple('Hdsm', field_names=[
    'block_type',
    'block_length',
])

Hghm = collections.namedtuple('Hghm', field_names=[])

Hohm = collections.namedtuple('Hohm', field_names=[
    'record_length',
    'type',
    'data',
])

Halm = collections.namedtuple('Halm', field_names=[])

Haim = collections.namedtuple('Haim', field_names=[])

Hilm = collections.namedtuple('Hilm', field_names=[])

Hiim = collections.namedtuple('Hiim', field_names=[])

Htlm = collections.namedtuple('Htlm', field_names=[])

Htim = collections.namedtuple('Htim', field_names=[
    'record_length',
    'sub_blocks',
    'song_id',
    'block_type',
    # Candidate fields that are not extracted yet:
    # 'file_type',
    # 'playtime',
    # 'track_number',
    # 'track_total',
    # 'year',
    # 'bit_rate',
    # 'sample_rate',
    # 'volume_adjustment',
    # 'start_time',
    # 'end_time',
    # 'play_count',
    # 'compilation',
    # 'last_played',
    # 'disk_number',
    # 'disk_total',
    # 'rating',
    # 'added',
])

Hqlm = collections.namedtuple('Hqlm', field_names=[])

Hqim = collections.namedtuple('Hqim', field_names=[])

Hsts = collections.namedtuple('Hsts', field_names=[])

Hplm = collections.namedtuple('Hplm', field_names=[])

Hpim = collections.namedtuple('Hpim', field_names=[
    'item_count',
])

Hptm = collections.namedtuple('Hptm', field_names=[
    'key',
])

Hslm = collections.namedtuple('Hslm', field_names=[])

Hpsm = collections.namedtuple('Hpsm', field_names=[])

Hrlm = collections.namedtuple('Hrlm', field_names=[])

Hrpm = collections.namedtuple('Hrpm', field_names=[])


@enum.unique
class HohmType(enum.IntEnum):
    """``hohm`` type codes that the extraction script looks at."""

    TITLE = 2                # 0x02
    ALBUM_TITLE = 3          # 0x03
    ARTIST = 4               # 0x04
    PLAYLIST_TITLE = 100     # 0x64


# hohm type codes whose payload RecordParser.parse_hohm() returns as raw bytes
# instead of trying to decode it as text.
HOHM_ODD_TYPES = (
    0x42,
    0x68, 0x69, 0x6a, 0x6b, 0x6c,
    0x192,
    0x1f7, 0x1f4,
    0x202,
    0x320,
)


class ItlIO(io.BytesIO):
    """In-memory byte stream with helpers for reading .itl record fields.

    The ``flipped`` attribute selects the byte order used by
    :meth:`read_uint`: big-endian while it is ``False`` (the initial state),
    little-endian once a caller sets it to ``True``.
    """

    def __init__(self, *args, **kwargs):
        self.flipped = False
        super().__init__(*args, **kwargs)

    def skip(self, nbytes):
        """Advance the stream by reading and discarding ``nbytes`` bytes."""
        self.read(nbytes)

    def read_ascii(self, nbytes):
        """Read up to ``nbytes`` bytes and return them decoded as ASCII."""
        raw = self.read(nbytes)
        return raw.decode('ascii')

    def read_byte(self):
        """Read a single byte and return it as an int."""
        chunk = self.read(1)
        return chunk[0]

    def read_uint(self):
        """Read four bytes as an unsigned 32-bit integer.

        The byte order follows ``self.flipped`` (little-endian when set).
        """
        layout = '<I' if self.flipped else '>I'
        (value,) = struct.unpack(layout, self.read(4))
        return value


class RecordParser:
    """Iterate over the records of a decrypted, decompressed .itl byte string.

    Every record starts with a four-character ASCII tag followed by a 32-bit
    header length that includes those first eight bytes.  :meth:`parse`
    hands the rest of the header to the ``parse_<tag>`` method named after
    the tag and yields the namedtuple that method returns.
    """

    def __init__(self, data):
        self.data = ItlIO(data)

    def parse(self):
        """Yield one parsed record per record header until the data runs out.

        A tag without a handler is retried reversed; when the reversed tag
        has a handler the stream is switched to "flipped" (little-endian)
        mode, which stays in force for all later records.

        Raises:
            ValueError: if a tag has no handler in either byte order, or if
                a header length is smaller than the eight bytes that were
                already consumed for the tag and the length itself.
        """
        while True:
            record_type = self.data.read_ascii(4)
            if not record_type:
                return

            if self.data.flipped:
                record_type = record_type[::-1]

            handler = getattr(self, f'parse_{record_type}', None)
            if handler is None:
                handler = getattr(self, f'parse_{record_type[::-1]}', None)
                if handler is None:
                    # Report a bounded preview instead of dumping the whole
                    # remainder of the library to stdout.
                    pos = self.data.tell()
                    preview = self.data.getvalue()[pos:pos + 32]
                    raise ValueError(
                        f"unknown record type: {record_type} "
                        f"(next bytes: {preview!r})")
                self.data.flipped = True

            length = self.data.read_uint()
            if length < 8:
                # read() treats a negative size as "everything up to EOF", so
                # a bogus length must not reach it.
                raise ValueError(
                    f"record {record_type!r} has impossible header length "
                    f"{length}")
            data = ItlIO(self.data.read(length - 8))
            data.flipped = self.data.flipped
            yield handler(data)

    def _read_trailing(self, record_length, data):
        """Read the bytes of a record that follow its header.

        ``data`` holds the header minus its first eight bytes, so the part
        still waiting in the main stream is ``record_length`` minus the full
        header size.

        Raises:
            ValueError: if ``record_length`` is smaller than the header.
        """
        trailing = record_length - len(data.getvalue()) - 8
        if trailing < 0:
            raise ValueError(
                f"record length {record_length} is smaller than its header")
        return self.data.read(trailing)

    def parse_hdfm(self, data):
        """Return the file length and the length-prefixed version string."""
        file_length = data.read_uint()
        data.skip(4)
        version_length = data.read_byte()
        version = data.read_ascii(version_length)
        return Hdfm(file_length=file_length,
                    version=version)

    def parse_hdsm(self, data):
        """Return the block type and length of a section header.

        For block types 4 and 22 the remainder of the block is consumed from
        the main stream so that its contents are never parsed as records.
        """
        record_length = data.read_uint()
        block_type = data.read_uint()

        if block_type in (4, 22):
            self._read_trailing(record_length, data)

        return Hdsm(block_type=block_type, block_length=record_length)

    def parse_hghm(self, data):
        return Hghm()

    def parse_hohm(self, data):
        """Return a data record, with its payload decoded unless it is "odd".

        The payload is read from the main stream after the header.  For types
        listed in ``HOHM_ODD_TYPES`` it is returned as raw bytes; otherwise
        its first 16 bytes are dropped and the rest is decoded as text.
        """
        record_length = data.read_uint()
        hohm_type = data.read_uint()
        hohm_data = self._read_trailing(record_length, data)

        if hohm_type not in HOHM_ODD_TYPES:
            hohm_data = hohm_data[16:]
            # What even is character encoding?
            # There might be something telling us what the encoding is but
            # this guess is sufficient for current purposes: an even-length
            # payload with a zero first byte is taken as UTF-16BE, with a
            # zero last byte as UTF-16LE, anything else as ISO-8859-1.
            maybe_utf16 = len(hohm_data) > 1 and len(hohm_data) % 2 == 0
            if maybe_utf16 and hohm_data[0] == 0:
                hohm_data = hohm_data.decode('utf-16be')
            elif maybe_utf16 and hohm_data[-1] == 0:
                hohm_data = hohm_data.decode('utf-16le')
            else:
                hohm_data = hohm_data.decode('iso-8859-1')

        return Hohm(record_length=record_length, type=hohm_type, data=hohm_data)

    def parse_halm(self, data):
        return Halm()

    def parse_haim(self, data):
        return Haim()

    def parse_hilm(self, data):
        return Hilm()

    def parse_hiim(self, data):
        return Hiim()

    def parse_htlm(self, data):
        return Htlm()

    def parse_htim(self, data):
        """Return the first four 32-bit fields of a track header."""
        record_length = data.read_uint()
        sub_blocks = data.read_uint()
        song_id = data.read_uint()
        block_type = data.read_uint()

        return Htim(record_length, sub_blocks, song_id, block_type)

    def parse_hqlm(self, data):
        return Hqlm()

    def parse_hqim(self, data):
        return Hqim()

    def parse_hsts(self, data):
        return Hsts()

    def parse_hplm(self, data):
        return Hplm()

    def parse_hpim(self, data):
        """Return the item count stored after eight skipped header bytes."""
        data.skip(4 + 4)
        item_count = data.read_uint()
        return Hpim(item_count)

    def parse_hptm(self, data):
        """Return the key stored after sixteen skipped header bytes."""
        data.skip(16)
        key = data.read_uint()
        return Hptm(key)

    def parse_hslm(self, data):
        return Hslm()

    def parse_hpsm(self, data):
        return Hpsm()

    def parse_hrlm(self, data):
        return Hrlm()

    def parse_hrpm(self, data):
        return Hrpm()


parser = argparse.ArgumentParser()
parser.add_argument('filename', nargs='?', default='iTunes Library.itl',
                    help='iTunes Library Filename')
args = parser.parse_args()

# So it appears that the .itl format, in modern versions of iTunes, has a header
# block containing some information, one part of which tells us how much of the
# following data is AES/ECB encrypted with a key that's made it around the
# Internet a bit. To get at the actual data you need to decrypt that bit in place
# then decompress (zlib) the bit after the initial header. After that it's a similar
# format to older iTunes library files.

with open(args.filename, 'rb') as itl_file:
    itl = itl_file.read()
header = itl[:HEADER_LENGTH]

# Decrypt whole 16-byte AES blocks only, and never more than the limit stored
# in the header at offset 0x5C.
crypt_length = (len(itl) - HEADER_LENGTH) & ~0xf
max_crypt_length = struct.unpack('>I', header[0x5C:0x60])[0]
crypt_length = min(crypt_length, max_crypt_length)

cipher = AES.new(CRYPTO_KEY, AES.MODE_ECB)
decrypted = cipher.decrypt(itl[HEADER_LENGTH:crypt_length + HEADER_LENGTH])

itl = decrypted + itl[crypt_length + HEADER_LENGTH:]
itl = header + zlib.decompress(itl)

# Records arrive as a flat stream: an htim starts a new track and an hpim
# starts a new playlist; the hohm/hptm records that follow belong to whichever
# of those was opened most recently.
track = {}
tracks = {}
playlist = {}
playlists = {}

for record in RecordParser(itl).parse():
    if isinstance(record, Htim):
        # Only file a track that was opened by an htim (i.e. has an id).
        if 'song_id' in track:
            tracks[track['song_id']] = track
        track = {'song_id': record.song_id}
    elif isinstance(record, Hohm):
        if record.type == HohmType.TITLE:
            track['title'] = record.data
        elif record.type == HohmType.ALBUM_TITLE:
            track['album'] = record.data
        elif record.type == HohmType.ARTIST:
            track['artist'] = record.data
        elif record.type == HohmType.PLAYLIST_TITLE:
            playlist['title'] = record.data
    elif isinstance(record, Hpim):
        # Playlists are keyed by title, so an untitled one cannot be filed.
        if 'title' in playlist:
            playlists[playlist['title']] = playlist
        playlist = {'items': []}
    elif isinstance(record, Hptm):
        playlist.setdefault('items', []).append(record.key)

if 'song_id' in track:
    tracks[track['song_id']] = track

if 'title' in playlist:
    playlists[playlist['title']] = playlist

# newline='' lets the csv module control line endings itself.
with open('playlists.csv', 'w', newline='') as csv_file:
    output = csv.writer(csv_file)

    for title, playlist in playlists.items():
        # The playlists I was after had titles of the form 'YYYY-M' or 'YYYY-MM'...
        if len(title) < 5 or title[0] != '2' or title[4] != '-':
            continue
        try:
            year, month = title.split('-')
            # ... and I wanted to make them consistently 'YYYY-MM'.
            title = f'{year}-{int(month):02d}'
        except ValueError:
            # Passed the quick check but is not '<year>-<number>' after all.
            continue
        for item in (tracks[x] for x in playlist.get('items', [])):
            print(repr(item))
            output.writerow([title, item.get('title', ''),
                             item.get('artist', ''), item.get('album', '')])
	#!/usr/bin/env python

	# Extract playlists from a non-XML iTunes Library file (.itl)
	# Copyright (c) 2018 Benno Rice, released under the BSD (2 Clause) Licence.

	# Important information on the encryption used in the .itl file found here:
	# https://mrexodia.cf/reversing/2014/12/16/iTunes-Library-Format-1
	# Highly useful information on the .itl format itself found here:
	# https://github.com/josephw/titl/blob/master/titl-core/src/main/java/org/kafsemo/titl/ParseLibrary.java

	import argparse
	import collections
	import csv
	import enum
	import io
	import struct
	import zlib

	from Crypto.Cipher import AES


	# Number of leading bytes that the script below slices off as the plain
	# header before decrypting and decompressing the rest of the file.
	HEADER_LENGTH = 0x90
	# Key handed to AES.new() (ECB mode) to decrypt the start of the payload.
	CRYPTO_KEY = b'BHUILuilfghuila3'


	# One namedtuple type per four-character record tag that RecordParser
	# knows.  Record kinds whose contents are not extracted have no fields.
	# Every typename matches the variable it is bound to so that repr() output
	# names the right record.

	Hdfm = collections.namedtuple('Hdfm', field_names=[
	    'file_length',
	    'version',
	])

	Hdsm = collections.namedtuple('Hdsm', field_names=[
	    'block_type',
	    'block_length',
	])

	Hghm = collections.namedtuple('Hghm', field_names=[])

	Hohm = collections.namedtuple('Hohm', field_names=[
	    'record_length',
	    'type',
	    'data',
	])

	Halm = collections.namedtuple('Halm', field_names=[])

	Haim = collections.namedtuple('Haim', field_names=[])

	Hilm = collections.namedtuple('Hilm', field_names=[])

	Hiim = collections.namedtuple('Hiim', field_names=[])

	Htlm = collections.namedtuple('Htlm', field_names=[])

	Htim = collections.namedtuple('Htim', field_names=[
	    'record_length',
	    'sub_blocks',
	    'song_id',
	    'block_type',
	    # Candidate fields that are not extracted yet:
	    # 'file_type',
	    # 'playtime',
	    # 'track_number',
	    # 'track_total',
	    # 'year',
	    # 'bit_rate',
	    # 'sample_rate',
	    # 'volume_adjustment',
	    # 'start_time',
	    # 'end_time',
	    # 'play_count',
	    # 'compilation',
	    # 'last_played',
	    # 'disk_number',
	    # 'disk_total',
	    # 'rating',
	    # 'added',
	])

	Hqlm = collections.namedtuple('Hqlm', field_names=[])

	Hqim = collections.namedtuple('Hqim', field_names=[])

	Hsts = collections.namedtuple('Hsts', field_names=[])

	Hplm = collections.namedtuple('Hplm', field_names=[])

	Hpim = collections.namedtuple('Hpim', field_names=[
	    'item_count',
	])

	Hptm = collections.namedtuple('Hptm', field_names=[
	    'key',
	])

	Hslm = collections.namedtuple('Hslm', field_names=[])

	Hpsm = collections.namedtuple('Hpsm', field_names=[])

	Hrlm = collections.namedtuple('Hrlm', field_names=[])

	Hrpm = collections.namedtuple('Hrpm', field_names=[])


	class HohmType(enum.IntEnum):
	    """``hohm`` type codes that the extraction script looks at."""

	    TITLE = 0x02
	    ALBUM_TITLE = 0x03
	    ARTIST = 0x04
	    PLAYLIST_TITLE = 0x64


	# hohm type codes whose payload RecordParser.parse_hohm() returns as raw
	# bytes instead of trying to decode it as text.
	HOHM_ODD_TYPES = (
	    0x42,
	    0x68, 0x69, 0x6a, 0x6b, 0x6c,
	    0x192,
	    0x1f7, 0x1f4,
	    0x202,
	    0x320,
	)


	class ItlIO(io.BytesIO):
	    """In-memory byte stream with helpers for reading .itl record fields.

	    The ``flipped`` attribute selects the byte order used by
	    :meth:`read_uint`: big-endian while it is ``False`` (the initial
	    state), little-endian once a caller sets it to ``True``.
	    """

	    def __init__(self, *args, **kwargs):
	        # Forward everything untouched so the class can be constructed
	        # exactly like io.BytesIO (no argument, positional or keyword).
	        self.flipped = False
	        super().__init__(*args, **kwargs)

	    def skip(self, nbytes):
	        """Advance the stream by reading and discarding ``nbytes`` bytes."""
	        self.read(nbytes)

	    def read_ascii(self, nbytes):
	        """Read up to ``nbytes`` bytes and return them decoded as ASCII."""
	        return self.read(nbytes).decode('ascii')

	    def read_byte(self):
	        """Read a single byte and return it as an int."""
	        return self.read(1)[0]

	    def read_uint(self):
	        """Read four bytes as an unsigned 32-bit integer.

	        The byte order follows ``self.flipped`` (little-endian when set).
	        """
	        if self.flipped:
	            return struct.unpack('<I', self.read(4))[0]
	        else:
	            return struct.unpack('>I', self.read(4))[0]


	class RecordParser:
	    """Iterate over the records of a decrypted, decompressed .itl byte string.

	    Every record starts with a four-character ASCII tag followed by a
	    32-bit header length that includes those first eight bytes.
	    :meth:`parse` hands the rest of the header to the ``parse_<tag>``
	    method named after the tag and yields the namedtuple it returns.
	    """

	    def __init__(self, data):
	        self.data = ItlIO(data)

	    def parse(self):
	        """Yield one parsed record per record header until the data runs out.

	        A tag without a handler is retried reversed; when the reversed tag
	        has a handler the stream is switched to "flipped" (little-endian)
	        mode, which stays in force for all later records.

	        Raises:
	            ValueError: if a tag has no handler in either byte order, or
	                if a header length is smaller than the eight bytes that
	                were already consumed for the tag and the length itself.
	        """
	        while True:
	            record_type = self.data.read_ascii(4)
	            if not record_type:
	                return

	            if self.data.flipped:
	                record_type = record_type[::-1]

	            handler = getattr(self, f'parse_{record_type}', None)
	            if handler is None:
	                handler = getattr(self, f'parse_{record_type[::-1]}', None)
	                if handler is None:
	                    # Report a bounded preview instead of dumping the
	                    # whole remainder of the library to stdout.
	                    pos = self.data.tell()
	                    preview = self.data.getvalue()[pos:pos + 32]
	                    raise ValueError(
	                        f"unknown record type: {record_type} "
	                        f"(next bytes: {preview!r})")
	                self.data.flipped = True

	            length = self.data.read_uint()
	            if length < 8:
	                # read() treats a negative size as "everything up to
	                # EOF", so a bogus length must not reach it.
	                raise ValueError(
	                    f"record {record_type!r} has impossible header length "
	                    f"{length}")
	            data = ItlIO(self.data.read(length - 8))
	            data.flipped = self.data.flipped
	            yield handler(data)

	    def _read_trailing(self, record_length, data):
	        """Read the bytes of a record that follow its header.

	        ``data`` holds the header minus its first eight bytes, so the part
	        still waiting in the main stream is ``record_length`` minus the
	        full header size.

	        Raises:
	            ValueError: if ``record_length`` is smaller than the header.
	        """
	        trailing = record_length - len(data.getvalue()) - 8
	        if trailing < 0:
	            raise ValueError(
	                f"record length {record_length} is smaller than its header")
	        return self.data.read(trailing)

	    def parse_hdfm(self, data):
	        """Return the file length and the length-prefixed version string."""
	        file_length = data.read_uint()
	        data.skip(4)
	        version_length = data.read_byte()
	        version = data.read_ascii(version_length)
	        return Hdfm(file_length=file_length,
	                    version=version)

	    def parse_hdsm(self, data):
	        """Return the block type and length of a section header.

	        For block types 4 and 22 the remainder of the block is consumed
	        from the main stream so its contents are never parsed as records.
	        """
	        record_length = data.read_uint()
	        block_type = data.read_uint()

	        if block_type in (4, 22):
	            self._read_trailing(record_length, data)

	        return Hdsm(block_type=block_type, block_length=record_length)

	    def parse_hghm(self, data):
	        return Hghm()

	    def parse_hohm(self, data):
	        """Return a data record, with its payload decoded unless it is "odd".

	        The payload is read from the main stream after the header.  For
	        types listed in ``HOHM_ODD_TYPES`` it is returned as raw bytes;
	        otherwise its first 16 bytes are dropped and the rest is decoded
	        as text.
	        """
	        record_length = data.read_uint()
	        hohm_type = data.read_uint()
	        hohm_data = self._read_trailing(record_length, data)

	        if hohm_type not in HOHM_ODD_TYPES:
	            hohm_data = hohm_data[16:]
	            # What even is character encoding?
	            # There might be something telling us what the encoding is but
	            # this guess is sufficient for current purposes: an even-length
	            # payload with a zero first byte is taken as UTF-16BE, with a
	            # zero last byte as UTF-16LE, anything else as ISO-8859-1.
	            maybe_utf16 = len(hohm_data) > 1 and len(hohm_data) % 2 == 0
	            if maybe_utf16 and hohm_data[0] == 0:
	                hohm_data = hohm_data.decode('utf-16be')
	            elif maybe_utf16 and hohm_data[-1] == 0:
	                hohm_data = hohm_data.decode('utf-16le')
	            else:
	                hohm_data = hohm_data.decode('iso-8859-1')

	        return Hohm(record_length=record_length, type=hohm_type, data=hohm_data)

	    def parse_halm(self, data):
	        return Halm()

	    def parse_haim(self, data):
	        return Haim()

	    def parse_hilm(self, data):
	        return Hilm()

	    def parse_hiim(self, data):
	        return Hiim()

	    def parse_htlm(self, data):
	        return Htlm()

	    def parse_htim(self, data):
	        """Return the first four 32-bit fields of a track header."""
	        record_length = data.read_uint()
	        sub_blocks = data.read_uint()
	        song_id = data.read_uint()
	        block_type = data.read_uint()

	        return Htim(record_length, sub_blocks, song_id, block_type)

	    def parse_hqlm(self, data):
	        return Hqlm()

	    def parse_hqim(self, data):
	        return Hqim()

	    def parse_hsts(self, data):
	        return Hsts()

	    def parse_hplm(self, data):
	        return Hplm()

	    def parse_hpim(self, data):
	        """Return the item count stored after eight skipped header bytes."""
	        data.skip(4 + 4)
	        item_count = data.read_uint()
	        return Hpim(item_count)

	    def parse_hptm(self, data):
	        """Return the key stored after sixteen skipped header bytes."""
	        data.skip(16)
	        key = data.read_uint()
	        return Hptm(key)

	    def parse_hslm(self, data):
	        return Hslm()

	    def parse_hpsm(self, data):
	        return Hpsm()

	    def parse_hrlm(self, data):
	        return Hrlm()

	    def parse_hrpm(self, data):
	        return Hrpm()


	parser = argparse.ArgumentParser()
	parser.add_argument('filename', nargs='?', default='iTunes Library.itl',
	                    help='iTunes Library Filename')
	args = parser.parse_args()

	# So it appears that the .itl format, in modern versions of iTunes, has a header
	# block containing some information, one part of which tells us how much of the
	# following data is AES/ECB encrypted with a key that's made it around the
	# Internet a bit. To get at the actual data you need to decrypt that bit in place
	# then decompress (zlib) the bit after the initial header. After that it's a similar
	# format to older iTunes library files.

	with open(args.filename, 'rb') as itl_file:
	    itl = itl_file.read()
	header = itl[:HEADER_LENGTH]

	# Decrypt whole 16-byte AES blocks only, and never more than the limit
	# stored in the header at offset 0x5C.
	crypt_length = (len(itl) - HEADER_LENGTH) & ~0xf
	max_crypt_length = struct.unpack('>I', header[0x5C:0x60])[0]
	crypt_length = min(crypt_length, max_crypt_length)

	cipher = AES.new(CRYPTO_KEY, AES.MODE_ECB)
	decrypted = cipher.decrypt(itl[HEADER_LENGTH:crypt_length + HEADER_LENGTH])

	itl = decrypted + itl[crypt_length + HEADER_LENGTH:]
	itl = header + zlib.decompress(itl)

	# Records arrive as a flat stream: an htim starts a new track and an hpim
	# starts a new playlist; the hohm/hptm records that follow belong to
	# whichever of those was opened most recently.
	track = {}
	tracks = {}
	playlist = {}
	playlists = {}

	for record in RecordParser(itl).parse():
	    if isinstance(record, Htim):
	        # Only file a track that was opened by an htim (i.e. has an id).
	        if 'song_id' in track:
	            tracks[track['song_id']] = track
	        track = {'song_id': record.song_id}
	    elif isinstance(record, Hohm):
	        if record.type == HohmType.TITLE:
	            track['title'] = record.data
	        elif record.type == HohmType.ALBUM_TITLE:
	            track['album'] = record.data
	        elif record.type == HohmType.ARTIST:
	            track['artist'] = record.data
	        elif record.type == HohmType.PLAYLIST_TITLE:
	            playlist['title'] = record.data
	    elif isinstance(record, Hpim):
	        # Playlists are keyed by title, so an untitled one cannot be filed.
	        if 'title' in playlist:
	            playlists[playlist['title']] = playlist
	        playlist = {'items': []}
	    elif isinstance(record, Hptm):
	        playlist.setdefault('items', []).append(record.key)

	if 'song_id' in track:
	    tracks[track['song_id']] = track

	if 'title' in playlist:
	    playlists[playlist['title']] = playlist

	# newline='' lets the csv module control line endings itself.
	with open('playlists.csv', 'w', newline='') as csv_file:
	    output = csv.writer(csv_file)

	    for title, playlist in playlists.items():
	        # The playlists I was after had titles of the form 'YYYY-M' or 'YYYY-MM'...
	        if len(title) < 5 or title[0] != '2' or title[4] != '-':
	            continue
	        try:
	            year, month = title.split('-')
	            # ... and I wanted to make them consistently 'YYYY-MM'.
	            title = f'{year}-{int(month):02d}'
	        except ValueError:
	            # Passed the quick check but is not '<year>-<number>' after all.
	            continue
	        for item in (tracks[x] for x in playlist.get('items', [])):
	            print(repr(item))
	            output.writerow([title, item.get('title', ''),
	                             item.get('artist', ''), item.get('album', '')])