Skip to content

Instantly share code, notes, and snippets.

@swdunlop
Created January 31, 2011 23:29
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save swdunlop/805058 to your computer and use it in GitHub Desktop.
Save swdunlop/805058 to your computer and use it in GitHub Desktop.
a simple utility to hardlink mp3 files to a hash of their non-ID3 contents
package main
import "io"
import "os"
import "fmt"
import "encoding/hex"
import "crypto/sha256"
import "path"
import "strings"
//TODO: if link fails, use copy.
//TODO: permit specification of directory.
//TODO: if file is a jpg | gif | pdf ignore.
//TODO: handle m4a
//TODO: handle ogg vorbis
//TODO: handle flac
// Source is the minimal capability mp3collect needs from an input: it must be
// readable and repositionable, because the tag scanners jump around the file
// to locate and step over ID3 metadata.
type Source interface {
	io.ReadSeeker
}
// main is the entry point of the mp3collect command. With no directory
// arguments it prints the usage text and exits; otherwise every argument is
// walked in turn with a Collector as the visitor.
func main() {
	if len(os.Args) <= 1 {
		fmt.Println(usage)
		return
	}
	visitor := new(Collector)
	roots := os.Args[1:]
	for i := range roots {
		// NOTE(review): the nil third argument is presumably the walk's error
		// channel, meaning traversal errors are not reported -- confirm.
		path.Walk(roots[i], visitor, nil)
	}
}
// Collector is the visitor that main hands to path.Walk. It carries no state:
// everything it needs is derived from the path passed to each Visit call.
type Collector struct{}
// VisitDir is called for each directory encountered during the walk. It
// ignores both arguments and always returns true, so no directory is ever
// rejected (presumably true tells the walker to descend -- confirm against
// the path.Visitor contract).
func (col *Collector) VisitDir(path string, f *os.FileInfo) bool {
return true
}
// VisitFile is called for each regular file encountered during the walk.
// Files whose extension is not ".mp3" (compared case-insensitively) are
// skipped. Every other file is hashed and hard-linked into /data/music under
// its hash; the outcome is reported on stdout ("is new" / "is old") or, for
// any failure, on stderr.
func (col *Collector) VisitFile(src string, f *os.FileInfo) {
	if strings.ToLower(path.Ext(src)) != ".mp3" {
		return
	}
	digest, err := hashMp3(src)
	if err == nil {
		_, err = linkMp3("/data/music", src, digest)
		if err == os.EEXIST {
			// a link with this hash already exists: same audio seen before
			fmt.Println(src, "-- is old.")
			return
		}
		if err == nil {
			fmt.Println(src, "-- is new.")
			return
		}
	}
	// either hashing or linking failed for a reason other than EEXIST
	fmt.Fprintln(os.Stderr, src, "!!", err)
}
// linkMp3 hard-links the file at path to "<root>/<hash>.mp3" and returns the
// destination name together with any error.
//
// os.Link reports failures wrapped in *os.LinkError, which cannot be compared
// directly against sentinels such as os.EEXIST; the wrapper is therefore
// stripped so callers can test the underlying error by identity.
func linkMp3(root, path, hash string) (dst string, err os.Error) {
	dst = root + "/" + hash + ".mp3"
	if err = os.Link(path, dst); err != nil {
		if wrapped, ok := err.(*os.LinkError); ok {
			err = wrapped.Error
		}
	}
	return dst, err
}
// hashMp3 returns the hex-encoded SHA-256 of the audio portion of the mp3
// file at path, i.e. the bytes between the offset reported by findMp3Start
// and the offset reported by findMp3End. On any failure the hash is empty
// and the error is returned.
func hashMp3(path string) (hash string, err os.Error) {
	file, err := os.Open(path, os.O_RDONLY, 0600)
	if err != nil {
		return "", err
	}
	defer file.Close()

	first, err := findMp3Start(file)
	if err != nil {
		return "", err
	}
	last, err := findMp3End(file)
	if err != nil {
		return "", err
	}

	// findMp3End left the cursor near the end of the file; rewind to the
	// first audio byte before streaming into the digest.
	if _, err = file.Seek(first, 0); err != nil {
		return "", err
	}
	digest := sha256.New()
	if _, err = io.Copy(digest, io.LimitReader(file, last-first)); err != nil {
		return "", err
	}
	return hex.EncodeToString(digest.Sum()), nil
}
// findMp3Start returns the offset of the first byte of src that follows any
// leading ID3v2 tags.
//
// It rewinds src to the beginning and calls skipId3v2 until that function
// reports that no further tag was skipped; the resulting cursor position is
// the returned offset. Any error from seeking or from skipId3v2 is returned
// to the caller rather than discarded, so a failed header read can no longer
// be mistaken for a valid start offset.
func findMp3Start(src Source) (ofs int64, err os.Error) {
	// reset file position to start of the source
	if _, err = src.Seek(0, 0); err != nil {
		return 0, err
	}
	for {
		var more bool
		more, err = skipId3v2(src)
		if err != nil {
			return 0, err
		}
		if !more {
			break
		}
	}
	// Seek(0, 1) is a "tell": it reports the cursor without moving it.
	return src.Seek(0, 1)
}
// findMp3End returns the offset just past the last byte of src that precedes
// any trailing ID3v1 tag.
//
// It moves the cursor to the end of src, lets skipId3v1 back up over a tag if
// one is present, and reports where the cursor ended up.
func findMp3End(src Source) (ofs int64, err os.Error) {
	// reset file position to end of the source
	if _, err = src.Seek(0, 2); err != nil {
		return 0, err
	}
	// The result of skipId3v1 is intentionally not inspected: whatever it
	// reports is superseded by the outcome of the position query below.
	skipId3v1(src)
	return src.Seek(0, 1)
}
// skipId3v2 steps over one ID3v2 tag if a tag begins at the current position
// of src. findMp3Start calls it repeatedly to get past stacked tags.
//
// Returns:
//   more == true  -- a tag was recognised and the cursor now sits on the
//                    first byte after it (header, body and optional footer).
//   more == false -- no tag was skipped; the cursor has been restored to
//                    where it was on entry.
//   err           -- a failure to query the position, to read the 10-byte
//                    header, or to restore the cursor.
func skipId3v2(src Source) (more bool, err os.Error) {
	origin, err := src.Seek(0, 1)
	if err != nil {
		return false, err
	}

	// The ID3v2 header is 10 bytes: "ID3", two version bytes, one flags byte
	// and a 4-byte sync-safe size.
	hdr, err := ReadBytes(src, 10)
	if err != nil {
		// Best-effort rewind so a caller that ignores the error still sees
		// the cursor where it left it; the read error is what gets reported.
		src.Seek(origin, 0)
		return false, err
	}

	// A genuine header has version bytes below 0xFF and every size byte with
	// its top bit clear.
	isTag := hdr[0] == 'I' && hdr[1] == 'D' && hdr[2] == '3' &&
		hdr[3] < 0xFF && hdr[4] < 0xFF &&
		hdr[6] < 0x80 && hdr[7] < 0x80 && hdr[8] < 0x80 && hdr[9] < 0x80
	if !isTag {
		_, err = src.Seek(origin, 0)
		return false, err
	}

	// size is big-endian with 8th bit clean, 28 bits in total
	sz := int64(hdr[6])<<21 | int64(hdr[7])<<14 | int64(hdr[8])<<7 | int64(hdr[9])

	// ID3v2.4 may append a 10-byte footer, announced by bit 0x10 of the flags
	// byte; like the header, it is not counted in the size field.
	if hdr[3] >= 4 && hdr[5]&0x10 != 0 {
		sz += 10
	}

	// The size field does NOT include the 10 header bytes that were just
	// consumed, so the tag ends exactly sz bytes ahead of the cursor. (Seeking
	// sz-10 would stop inside the tag, and for an empty tag would land back on
	// the header and be skipped forever.)
	if _, err = src.Seek(sz, 1); err != nil {
		_, err = src.Seek(origin, 0)
		return false, err
	}
	return true, nil
}
// skipId3v1 backs the cursor of src up over a trailing ID3v1 tag, and over an
// ID3v1 extended tag if one precedes it. findMp3End calls it once with the
// cursor at the end of the file.
//
// A basic tag is the 128 bytes before the entry position starting with "TAG";
// an extended tag is the 227 bytes before that starting with "TAG+". When a
// probe does not match, the cursor is returned to the last confirmed boundary.
// Seek and read failures are returned as-is without further repositioning.
func skipId3v1(src Source) (err os.Error) {
	var marker []byte
	var boundary int64

	if boundary, err = src.Seek(0, 1); err != nil {
		return
	}

	// ID3v1 BASIC: probe 128 bytes back from the entry position.
	if _, err = src.Seek(-128, 1); err != nil {
		return
	}
	if marker, err = ReadBytes(src, 3); err != nil {
		return
	}
	if string(marker) != "TAG" {
		_, err = src.Seek(boundary, 0)
		return
	}
	// Matched: step back over the 3 marker bytes; the start of the basic tag
	// becomes the new boundary.
	if boundary, err = src.Seek(-3, 1); err != nil {
		return
	}

	// ID3v1 EXTENDED: probe 227 bytes before the basic tag.
	if _, err = src.Seek(-227, 1); err != nil {
		return
	}
	if marker, err = ReadBytes(src, 4); err != nil {
		return
	}
	if string(marker) != "TAG+" {
		_, err = src.Seek(boundary, 0)
		return
	}
	// Matched: step back over the 4 marker bytes so the cursor rests on the
	// start of the extended tag.
	_, err = src.Seek(-4, 1)
	return
}
// ReadBytes reads exactly req bytes from r and returns them in a freshly
// allocated slice. If the full amount cannot be read, the slice is nil and
// the error from ReadSlice is returned.
func ReadBytes(r io.Reader, req int) ([]byte, os.Error) {
	data := make([]byte, req)
	if err := ReadSlice(r, data); err != nil {
		return nil, err
	}
	return data, nil
}
// ReadSlice fills buf completely from r, issuing as many reads as needed.
//
// It returns nil once len(buf) bytes have been stored, and a non-nil error if
// r fails or runs dry first. The target is len(buf), not cap(buf): reading
// toward the capacity would never terminate (or would index out of range) for
// a slice whose capacity exceeds its length. Delegating to io.ReadFull also
// keeps bytes that arrive in the same Read call as an error.
func ReadSlice(r io.Reader, buf []byte) os.Error {
	_, err := io.ReadFull(r, buf)
	return err
}
// usage is the help text printed by main when no directories are given.
const usage = "\n" +
	"mp3collect dir1 ... dirN\n" +
	"\n" +
	"Recursively searches each directory for mp3 files, and creates a hard " +
	"link from the file to /data/music/hash.mp3, where hash is the sha256 " +
	"hash of the media portion of the song. Mp3collect deliberately ignores " +
	"the ID3v1 and ID3v2 elements of the file. MP3 library applications may " +
	"alter these entries to reflect user edits.\n" +
	"\n" +
	"The final product should be a directory containing only unique copies of " +
	"the original MP3 music. These copies can then be safely managed by other " +
	"MP3 library applications without worrying about the possibility of ID3 " +
	"induced duplication.\n"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment