Skip to content

Instantly share code, notes, and snippets.

@anatolebeuzon
Created January 4, 2021 20:06
Show Gist options
  • Save anatolebeuzon/5a147269675ed50e9d0d60dd99abd524 to your computer and use it in GitHub Desktop.
Save anatolebeuzon/5a147269675ed50e9d0d60dd99abd524 to your computer and use it in GitHub Desktop.
CLI: set difference of two folders
// Suppose you have two folders, A and B, containing various files.
// You want to get the files in A that are not in B. This is what this CLI provides.
// A is called the 'in' set and B is called the 'discard' set.
// Items in A that are also in B will be _removed_ from A.
// File equality is based on MD5 hashes, not filenames.
package main
import (
"crypto/md5"
"flag"
"fmt"
"io"
"os"
"path/filepath"
)
// flags holds the values of the command-line options.
var flags struct {
	// in is the folder that is scanned for files to remove; discard is the
	// folder whose file contents identify what should be removed.
	in, discard string
	// dryRun, when true, means matching files are only counted, not removed.
	dryRun bool
}
// init registers the command-line flags. Parsing is intentionally left to
// main so that every flag is defined before flag.Parse runs and so that merely
// initialising the package never terminates the process.
func init() {
	flag.StringVar(&flags.discard, "discard", "", "folder containing the files you don't want to keep if they are found in the 'in' folder")
	flag.StringVar(&flags.in, "in", "", "folder containing a mix of files to keep and to discard")
	flag.BoolVar(&flags.dryRun, "dry-run", true, "by default, dry run is enabled, so no file is actually removed")
}

// main parses and validates the flags, hashes every file of the 'discard'
// folder, then walks the 'in' folder and removes (or, in dry-run mode, only
// counts) each file whose content hash is present in the 'discard' set.
//
// The process exits with status 1 after printing the usage text when either
// folder flag is missing, and panics if a matching file cannot be removed.
//
// NOTE(review): if the 'discard' folder is the same as, or located inside, the
// 'in' folder, its files match their own hashes and are removed as well.
func main() {
	flag.Parse()
	if flags.discard == "" || flags.in == "" {
		flag.Usage()
		os.Exit(1)
	}

	fmt.Println("Loading MD5 hashes of files to discard...")
	discard := make(map[string]struct{})
	doWithProgress(flags.discard, func(path string) {
		discard[hash(path)] = struct{}{}
	})

	fmt.Println("Looking for items to discard in the 'in' folder...")
	deleted := 0
	doWithProgress(flags.in, func(path string) {
		if _, ok := discard[hash(path)]; !ok {
			return
		}
		if !flags.dryRun {
			if err := os.Remove(path); err != nil {
				panic(err)
			}
		}
		// Counted in dry-run mode too, so the summary reports what would happen.
		deleted++
	})

	summary := fmt.Sprintf("deleted %d files", deleted)
	if !flags.dryRun {
		fmt.Println(summary)
		return
	}
	fmt.Println("would have " + summary)
	fmt.Println("To actually remove deduped files, re-run the command with '-dry-run=false'")
}
// doWithProgress calls doFn once for every non-directory entry found under
// folder (recursively) while printing a percentage progress indicator to
// standard output.
//
// The entries are counted up front with countFiles so a percentage can be
// computed. Any error reported by the directory walk (for example a missing or
// unreadable folder) aborts the walk and is raised as a panic, which is how the
// rest of this program reports failures.
func doWithProgress(folder string, doFn func(path string)) {
	todo := countFiles(folder)
	done := 0
	err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil when err is set, so stop before touching it.
			return err
		}
		// A folder without files has todo == 0; skip the division instead of
		// panicking with an integer divide by zero.
		if todo > 0 {
			fmt.Printf("\r%d%%", 100*done/todo)
		}
		if !info.IsDir() {
			doFn(path)
			done++
		}
		return nil
	})
	if err != nil {
		panic(fmt.Errorf("walking %q: %w", folder, err))
	}
	fmt.Println("\r100%")
}

// countFiles returns the number of non-directory entries found under folder
// (recursively). It panics with a descriptive error if the walk fails.
func countFiles(folder string) int {
	files := 0
	err := filepath.Walk(folder, func(_ string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil when err is set, so stop before touching it.
			return err
		}
		if !info.IsDir() {
			files++
		}
		return nil
	})
	if err != nil {
		panic(fmt.Errorf("counting files in %q: %w", folder, err))
	}
	return files
}
// hash computes the MD5 digest of the *contents* of the file located at path
// and returns it as a lowercase hexadecimal string. It panics when the file
// cannot be opened or read.
func hash(path string) string {
	file, openErr := os.Open(path)
	if openErr != nil {
		panic(openErr)
	}
	defer file.Close()

	digest := md5.New()
	// Stream the file through the digest rather than loading it in memory.
	_, copyErr := io.Copy(digest, file)
	if copyErr != nil {
		panic(copyErr)
	}

	sum := digest.Sum(nil)
	return fmt.Sprintf("%x", sum)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment