Created
January 31, 2011 23:29
-
-
Save swdunlop/805058 to your computer and use it in GitHub Desktop.
a simple utility to hardlink mp3 files to a hash of their non-ID3 contents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import "io" | |
import "os" | |
import "fmt" | |
import "encoding/hex" | |
import "crypto/sha256" | |
import "path" | |
import "strings" | |
//TODO: if link fails, use copy. | |
//TODO: permit specification of directory. | |
//TODO: if file is a jpg | gif | pdf ignore. | |
//TODO: handle m4a | |
//TODO: handle ogg vorbis | |
//TODO: handle flac | |
// Source is what the tag-skipping helpers need from an mp3 file: the ability
// to read bytes and to reposition the read offset, since locating ID3 data
// involves seeking back and forth.
type Source interface {
	io.ReadSeeker
}
// implementation of the mp3collect command. | |
func main() { | |
if len(os.Args) < 2 { | |
fmt.Println(usage) | |
return | |
} | |
var col Collector | |
for _, root := range os.Args[1:] { | |
path.Walk(root, &col, nil) | |
} | |
} | |
// Collector is the visitor handed to path.Walk. It has no fields, so it
// carries no state from one visited entry to the next.
type Collector struct{}

// VisitDir is called for each directory encountered during the walk. It
// always answers true, so every directory is descended into.
func (col *Collector) VisitDir(path string, f *os.FileInfo) bool {
	const descend = true
	return descend
}
// we will visit each file ending with .mp3 | |
func (col *Collector) VisitFile(src string, f *os.FileInfo) { | |
ext := strings.ToLower(path.Ext(src)) | |
if ext != ".mp3" { | |
return | |
} | |
hash, err := hashMp3(src) | |
if err != nil { | |
goto alert | |
} | |
_, err = linkMp3("/data/music", src, hash) | |
switch err { | |
case os.EEXIST: | |
fmt.Println(src, "-- is old.") | |
return | |
case nil: | |
fmt.Println(src, "-- is new.") | |
return | |
} | |
alert: | |
fmt.Fprintln(os.Stderr, src, "!!", err) | |
} | |
// constructs a link from an mp3 to its hash in the specified root | |
func linkMp3(root, path, hash string) (dst string, err os.Error) { | |
dst = fmt.Sprint(root, "/", hash, ".mp3") | |
err = os.Link(path, dst) | |
// Link does exactly adhere to os.Error for the purpose of identity testing | |
// this makes it difficult to identify EEXIST | |
if ler, ok := err.(*os.LinkError); ok { | |
err = ler.Error | |
} | |
return | |
} | |
// determines the media-specific hash of an mp3 file, ignoring ID3 metadata | |
func hashMp3(path string) (hash string, err os.Error) { | |
src, err := os.Open(path, os.O_RDONLY, 0600) | |
if err != nil { | |
return | |
} | |
defer src.Close() | |
start, err := findMp3Start(src) | |
if err != nil { | |
return | |
} | |
end, err := findMp3End(src) | |
if err != nil { | |
return | |
} | |
hs := sha256.New() | |
_, err = src.Seek(start, 0) | |
if err != nil { | |
return | |
} | |
_, err = io.Copy(hs, io.LimitReader(src, end-start)) | |
if err != nil { | |
return | |
} | |
hash = hex.EncodeToString(hs.Sum()) | |
return | |
} | |
// determines the start of media content for an mp3 file | |
func findMp3Start(src Source) (ofs int64, err os.Error) { | |
// reset file position to start of the source | |
_, err = src.Seek(0, 0) | |
if err != nil { | |
return | |
} | |
again: | |
more, err := skipId3v2(src) | |
if more { | |
goto again | |
} | |
return src.Seek(0, 1) | |
} | |
// determines the end of media content for an mp3 file | |
func findMp3End(src Source) (ofs int64, err os.Error) { | |
// reset file position to end of the source | |
_, err = src.Seek(0, 2) | |
if err != nil { | |
return | |
} | |
err = skipId3v1(src) | |
return src.Seek(0, 1) | |
} | |
// used by findMp3Start repeatedly to bypass ID3v2 noise | |
func skipId3v2(src Source) (more bool, err os.Error) { | |
more = false | |
origin, err := src.Seek(0, 1) | |
if err != nil { | |
goto alert | |
} | |
// as defined by ID3.org, ID3v2 can be positiively identified by looking | |
// for "ID3" followed by a number of bytes which are "sync-safe" | |
buf, err := ReadBytes(src, 10) | |
switch { | |
case err != nil: | |
goto alert | |
case buf[0] != 'I': | |
goto reset | |
case buf[1] != 'D': | |
goto reset | |
case buf[2] != '3': | |
goto reset | |
case buf[3] >= 0xFF: | |
goto reset | |
case buf[4] >= 0xFF: | |
goto reset | |
case buf[6] >= 0x80: | |
goto reset | |
case buf[7] >= 0x80: | |
goto reset | |
case buf[8] >= 0x80: | |
goto reset | |
case buf[9] >= 0x80: | |
goto reset | |
} | |
// size is big-endian with 8th bit clean, 28 bits in total | |
sz := int64(buf[6] & 0x7F) | |
sz = sz<<7 | int64(buf[7]) | |
sz = sz<<7 | int64(buf[8]) | |
sz = sz<<7 | int64(buf[9]) | |
// id3v2 headers are variable in size | |
_, err = src.Seek(sz-10, 1) | |
if err == nil { | |
more = true | |
return | |
} | |
reset: | |
_, err = src.Seek(origin, 0) | |
alert: | |
return | |
} | |
// used by findMp3End, once, to truncate the media stream, removing id3v1 tags | |
func skipId3v1(src Source) (err os.Error) { | |
origin, err := src.Seek(0, 1) | |
if err != nil { | |
goto alert | |
} | |
// ID3v1 BASIC | |
_, err = src.Seek(-128, 1) | |
if err != nil { | |
goto alert | |
} | |
buf, err := ReadBytes(src, 3) | |
switch { | |
case err != nil: | |
goto alert | |
case buf[0] != 'T': | |
goto reset | |
case buf[1] != 'A': | |
goto reset | |
case buf[2] != 'G': | |
goto reset | |
} | |
// unread those bytes, because that didn't look like an ID3 Basic Tag | |
origin, err = src.Seek(-3, 1) | |
if err != nil { | |
goto alert | |
} | |
// ID3v1 EXTENDED | |
_, err = src.Seek(-227, 1) | |
if err != nil { | |
goto alert | |
} | |
buf, err = ReadBytes(src, 4) | |
switch { | |
case err != nil: | |
goto alert | |
case buf[0] != 'T': | |
goto reset | |
case buf[1] != 'A': | |
goto reset | |
case buf[2] != 'G': | |
goto reset | |
case buf[3] != '+': | |
goto reset | |
} | |
// unread those bytes, because that didn't look like an ID3 Extended Tag | |
_, err = src.Seek(-4, 1) | |
return | |
reset: | |
_, err = src.Seek(origin, 0) | |
alert: | |
return | |
} | |
// reads a number of bytes, looping until the full requirement is satisfied. | |
func ReadBytes(r io.Reader, req int) ([]byte, os.Error) { | |
buf := make([]byte, req) | |
err := ReadSlice(r, buf) | |
if err != nil { | |
buf = nil | |
} | |
return buf, err | |
} | |
// fills a slice with bytes, looping until the buffer is full. | |
func ReadSlice(r io.Reader, buf []byte) os.Error { | |
req := cap(buf) | |
tot := 0 | |
for tot < req { | |
amt, err := r.Read(buf[tot:]) | |
if err != nil { | |
return err | |
} | |
tot += amt | |
} | |
return nil | |
} | |
// usage is the help text that main prints when mp3collect is started without
// any directory arguments.
const usage = "\n" +
	"mp3collect dir1 ... dirN\n" +
	"\n" +
	"Recursively searches each directory for mp3 files, and creates a hard " +
	"link from the file to /data/music/hash.mp3, where hash is the sha256 " +
	"hash of the media portion of the song. Mp3collect deliberately ignores " +
	"the ID3v1 and ID3v2 elements of the file. MP3 library applications may " +
	"alter these entries to reflect user edits.\n" +
	"\n" +
	"The final product should be a directory containing only unique copies of " +
	"the original MP3 music. These copies can then be safely managed by other " +
	"MP3 library applications without worrying about the possibility of ID3 " +
	"induced duplication.\n"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment