Skip to content

Instantly share code, notes, and snippets.

@falcondai
Created May 2, 2012 17:36
Show Gist options
  • Save falcondai/2578542 to your computer and use it in GitHub Desktop.
Save falcondai/2578542 to your computer and use it in GitHub Desktop.
This module provides MP3 tags conversion from any (python-supported) encoding to UTF-8.
# File: mp3_tag.py
#
# Description: This module provides MP3 tags correction, i.e. from
# any encoding X to UTF-8, the standard across different apps
# and platforms. The default parameters target the common Chinese
# encoding GBK.
#
# Features: 1. it recursively handles MP3 files > 512KB in all subdirectories.
# 2. outputs a log formatted to show the directory structure
# with failures (decoding/encoding error, the field will be
# kept in its original state).
# 3. decodes the tags character by character and preserves
# UTF-8 characters (some tags mixed characters in UTF-8 and
# other encodings). This also makes it safe to run this program
# multiple times over the same files (it won't change the
# previously converted content).
#
# Requirement: This uses the mutagen library.
#
# Note: mutagen saves all output tags in ID3v2.4 which is not supported
# by win 7 file explorer or windows media player(they support up to
# ID3v2.3). So if you want to see the tags in win 7 explorer, you
# need to convert them with other tools, such as iTunes.
#
# Use: at your command prompt, type:
# python mp3_tag.py [<dir>]
# where the optional <dir> is the directory of MP3 files (handles the
# current directory if missing)
#
# More info: http://blog.falcondai.com/2012/05/text-encoding-conversion.html
#
# Author: Falcon Dai
#
# Date: 4/26/2012
#
# License: Regarding the utility code, I do not provide any warranty but
# you are free to do whatever you like with this code. It edits some tags
# of your MP3 files, specifically only the album, artist, title, album
# artist, performer, and genre fields (you can modify this), so you might
# want to test it on a few duplicates before running it on your entire
# music library.
from __future__ import unicode_literals
import os
from mutagen.easyid3 import EasyID3
def decode_mixed(entry, encoding):
    """
    Decode a (mixed) string character by character.

    Characters whose code point is 0xFF or below are treated as raw bytes
    of text stored in `encoding`; each consecutive run of them is decoded
    as one byte sequence. Characters above 0xFF are taken to be genuine
    Unicode text already and are copied through unchanged.

    @param: entry - a unicode string that might contain text in mixed
                    form (real code points and raw bytes of `encoding`)
            encoding - the encoding used for the raw bytes in the entry
    @return: a unicode string decoded from the entry
    @raise: UnicodeDecodeError if a run of raw bytes is not valid text
            in `encoding` (e.g. a multi-byte sequence cut short)
    """
    pieces = []
    raw = bytearray()
    for c in entry:
        code = ord(c)
        if code > 0xFF:
            # Classify by code point, not by the length of the
            # raw_unicode_escape form: that form is 6 bytes for BMP
            # characters but 10 for characters above U+FFFF, which would
            # otherwise be mistaken for raw bytes.
            if raw:
                # A real character ends the current run of raw bytes.
                pieces.append(raw.decode(encoding))
                raw = bytearray()
            pieces.append(c)
        else:
            # raw byte: keep collecting until the run ends
            raw.append(code)
    if raw:
        # flush the run of raw bytes left at the end of the entry
        pieces.append(raw.decode(encoding))
    return u''.join(pieces)
def main(path):
    """
    Fix the MP3 tags under `path` and record the run in a log file.

    Creates (or truncates) 'mp3_tag_change.log' in the directory that is
    current when main is called, hands the traversal to fix(), then reports
    the total error count on stdout and as the last line of the log.

    @param: path - the directory whose MP3 files (and those of all its
                   subdirectories) are to be fixed
    """
    print('working on %s and all its subdirectories...' % path)
    # change encoding to your guess of the encoding, for supported
    # encodings, check http://docs.python.org/library/codecs.html#standard-encodings
    # GBK is a good guess for Chinese songs
    encoding = 'gbk'
    # The with-block closes the log even when fix() raises part-way through,
    # so whatever was logged up to that point is not lost.
    with open('mp3_tag_change.log', 'w') as log:
        error_count = fix(path, log, 0, encoding)
        print('SUMMARY')
        print('encountered %d errors.' % error_count)
        log.write('encountered %d errors.' % error_count)
def fix(dir, log, nt, encoding):
    """
    recursively fixes the mp3 tags in the designated directory with
    a guess encoding. Note that the mp3 tags will be saved in ID3v2.4 format.

    The working directory is changed to `dir` while it is processed and is
    restored to what it was on entry before returning, even on error.

    @param: dir - the directory that contains the mp3 files to be fixed
            log - the log file to dump the logging information
                  (needs write() and flush())
            nt - the numbers of tabs to start with for nice logging
            encoding - the guessed encoding of the raw bytes in the tags
    @return: number of errors encountered (counted by fields, plus one
             for each file whose tags could not be read or saved)
    """
    print(dir.encode('raw_unicode_escape'))
    # Remember where we are so we can come back reliably: chdir('..') does
    # not undo chdir(dir) when dir has several components or is a symlink.
    origin = os.getcwd()
    # change to the desired directory
    os.chdir(dir)
    try:
        log.write('\t' * nt)
        log.write(dir.encode('utf8'))
        log.write('\\\n')
        log.flush()

        # scan for sub-directories and mp3 files
        subdirs = []
        mp3s = []
        for name in os.listdir('.'):
            if os.path.isdir(name):
                # found a directory
                subdirs.append(name)
            # change the size criterion to convert smaller MP3's
            elif (os.path.isfile(name)
                    and os.path.splitext(name)[1].lower() == '.mp3'
                    and os.stat(name).st_size > 512 * 1024):
                # found an mp3 file larger than 512kb; the extension is
                # compared case-insensitively so '.MP3' is picked up too
                mp3s.append(name)

        # handle each file with tags in the following fields
        # change this list to convert more or less fields
        # NOTE(review): 'album artist' may not be a key EasyID3 knows
        # (mutagen appears to spell it 'albumartist') - confirm; an
        # unknown key is simply skipped by the membership test.
        fields = ['album', 'artist', 'title', 'album artist', 'performer', 'genre']
        ne = 0
        for name in mp3s:
            ne += _fix_file(name, fields, log, nt + 1, encoding)
        print('edited %d mp3 files.' % len(mp3s))

        for sub in subdirs:
            # update the error counts
            ne += fix(sub, log, nt + 1, encoding)
        return ne
    finally:
        os.chdir(origin)


def _fix_file(name, fields, log, nt, encoding):
    """
    re-decodes the given tag fields of one mp3 file in the current
    directory and logs the outcome on a single line.

    @param: name - the file name of the mp3 file
            fields - the EasyID3 keys to convert
            log - the log file to dump the logging information
            nt - the numbers of tabs to prefix the log line with
            encoding - the guessed encoding of the raw bytes in the tags
    @return: number of errors encountered for this file
    """
    print(name.encode('raw_unicode_escape'))
    # log the file name with prefixing tabs
    log.write('\t' * nt)
    log.write(name.encode('utf8'))
    errors = 0
    try:
        # read and rewrite tags
        tags = EasyID3(name)
        for field in fields:
            if field not in tags:
                continue
            try:
                # empty entries are dropped, the rest are re-decoded
                tags[field] = [decode_mixed(entry, encoding)
                               for entry in tags[field] if entry != '']
            except Exception:
                # the field is left in its original state
                print('encountered an error in ' + field)
                log.write(' encountered an error in ' + field)
                errors += 1
        tags.save()
    except Exception:
        # this program only handles ID3 tags as it is
        # written now. But you can do more with mutagen
        log.write(' not in IDv3 format')
        errors += 1
    log.write('\n')
    log.flush()
    return errors
if __name__ == '__main__':
    import sys
    # Use the <dir> argument when one is given; otherwise handle the
    # current directory.
    target = sys.argv[1] if len(sys.argv) >= 2 else '.'
    main(target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment