Skip to content

Instantly share code, notes, and snippets.

@maddie
Last active July 5, 2018 16:59
Show Gist options
  • Save maddie/4b86f7ebc0d42fabe695bf7c4f208833 to your computer and use it in GitHub Desktop.
find duplicate files in a directory recursively
package main
import (
"crypto/sha1"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"runtime"
"strings"
"sync"
"time"
// this dependency is optional, you can always use raw value of instead of the humanized ones
"github.com/dustin/go-humanize"
)
var (
// the root directory to scan for duplicates; defaults to the current directory
findPath = flag.String("p", "./", "path to find duplicates")
// relative paths to be excluded from the scan, collected from repeated -s flags
skipPaths stringSlice
// how many hash workers should be spawned for working. this isn't really useful in most cases because
// the hashing speed will most likely be IO bound instead of CPU bound, but for the sake of exercise
// this option is provided and set its default to the max number of CPUs on the running machine
concurrency = flag.Int("c", runtime.NumCPU(), "number of concurrent hash workers")
// optional log file path; an empty string means no log file will be written
logFile = flag.String("l", "", "output log to file")
// optional JSON result file path; an empty string means no result file will be written
output = flag.String("o", "", "output result as JSON to file")
// enables verbose output on os.Stdout; output to the -l file is always verbose
verbose = flag.Bool("v", false, "turn on verbose logging")
// destination for log lines: os.Stdout alone, or a MultiWriter combining it with the -l file
resultOutput io.Writer = os.Stdout
// SHA1 hex checksum -> all file paths found with that checksum
hashMap = make(map[string][]string)
// unbuffered channel distributing file paths to the hash workers
hashChan = make(chan string)
// handle of the -l log file (nil when -l is not given); used directly by printVerbose
outputFile *os.File
// paths that could not be accessed during the walk; reported in the end summary
failedPaths []failure
// paths excluded because they matched the -s skip list
skippedPaths []string
// files that failed to be hashed (a slice, despite the original "map" comment)
hashErrors []failure
// guards hashMap against concurrent writes from the hash workers
// NOTE(review): hashErrors is appended to by concurrent workers WITHOUT this lock — verify
lock sync.Mutex
// counts outstanding hash jobs so main can wait for the workers to drain
wg sync.WaitGroup
)
// result is the top-level document marshaled to the -o JSON output file,
// capturing everything a single run discovered.
type result struct {
// directory that was scanned (absolute when filepath.Abs succeeds in main)
WorkDir string `json:"work_dir"`
// the skip list supplied via repeated -s flags
Skip []string `json:"skip"`
// total number of files successfully hashed
TotalFilesHashed int `json:"total_hashed"`
// number of checksums shared by more than one file
TotalDuplicates int `json:"total_duplicates"`
// every checksum mapped to all files bearing it
AllHashes map[string][]string `json:"all_hashes"`
// only the checksums with more than one file
Duplicates map[string][]string `json:"duplicates"`
// paths the walk could not access
FailedPaths []failure `json:"failed_paths"`
// paths excluded by the skip list
SkippedPaths []string `json:"skipped_paths"`
// files that could not be hashed
Errors []failure `json:"errors"`
}
// failure pairs a path with the reason (error text) it could not be processed;
// used for both walk failures and hash failures.
type failure struct {
// the file or directory path that failed
Path string `json:"path"`
// the error message explaining the failure
Reason string `json:"reason"`
}
// stringSlice is a repeatable string flag (implements flag.Value) that
// collects every occurrence of the flag into a slice.
type stringSlice struct {
	strings []string
}

// String returns the collected values joined by commas (flag.Value interface).
func (slice *stringSlice) String() string {
	return strings.Join(slice.strings, ",")
}

// Set appends one flag occurrence to the slice (flag.Value interface).
func (slice *stringSlice) Set(value string) error {
	slice.strings = append(slice.strings, value)
	return nil
}

// Contains reports whether value is exactly one of the collected strings.
func (slice *stringSlice) Contains(value string) bool {
	for _, val := range slice.strings {
		if val == value {
			return true
		}
	}
	return false
}

// MatchesPrefix reports whether value lies under any collected path, i.e.
// whether value starts with "<entry>/" for some entry.
// BUGFIX(perf): the original compiled a fresh regexp (`^` + QuoteMeta(entry) +
// `/.*`) for every entry on every call; that pattern is exactly a string
// prefix check, so strings.HasPrefix is equivalent and far cheaper.
func (slice *stringSlice) MatchesPrefix(value string) bool {
	for _, val := range slice.strings {
		if strings.HasPrefix(value, val+"/") {
			return true
		}
	}
	return false
}
// main parses the flags, walks the target directory with a pool of hash
// workers, then prints a summary of duplicate files (grouped by SHA1 checksum)
// and optionally writes the full result as JSON.
func main() {
	// register the repeatable -s flag and parse everything
	flag.Var(&skipPaths, "s", "paths to be skipped, use relative path. use multiple times for more paths")
	flag.Parse()
	// path usually defaults to working directory. if user gives an empty path, show help message and quit
	if *findPath == "" {
		flag.PrintDefaults()
		os.Exit(1)
	}
	// when -p is not a "./"-style relative path, prefix skip paths with it so
	// they compare correctly against the paths produced by filepath.Walk
	if len(skipPaths.strings) > 0 && !regexp.MustCompile(`^\./?`).MatchString(*findPath) {
		for idx, p := range skipPaths.strings {
			skipPaths.strings[idx] = filepath.Join(*findPath, p)
		}
	}
	if len(skipPaths.strings) > 0 {
		printf("skip paths: ")
		for _, p := range skipPaths.strings {
			printf("- %s", p)
		}
	}
	// grace period so the user can abort before a potentially long scan
	printf("will be starting in 5 seconds...")
	time.Sleep(5 * time.Second)
	// combine the log file and os.Stdout as resultOutput if a log file is given
	if *logFile != "" {
		// tries to open the log file; if it already exists, it will be truncated
		f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0644)
		if err != nil {
			printf("failed to open file for output: %+v", err)
			os.Exit(1)
		}
		// save the file pointer for printVerbose
		outputFile = f
		// combine the writers
		resultOutput = io.MultiWriter(os.Stdout, f)
	}
	// tries to get the given path's information first
	dir, err := os.Stat(*findPath)
	if err != nil {
		printf("error opening directory: %+v", err)
		os.Exit(1)
	}
	// only continue if the given path is a directory
	if !dir.IsDir() {
		printf("%s is not a directory", *findPath)
		os.Exit(1)
	}
	// spawn hash workers
	for i := 0; i < *concurrency; i++ {
		go hashWorker()
	}
	// start a timer for calculating total time elapsed
	start := time.Now()
	// start walking through the given path recursively
	walkDir(*findPath)
	// wait for all hash workers to finish
	wg.Wait()
	// resolve the scanned directory to an absolute path for reporting
	workDir := *findPath
	if p, err := filepath.Abs(*findPath); err == nil {
		workDir = p
	}
	// from here to the end of the main function is the summarize part
	printf("====================================================================================")
	printf("working directory: %s", workDir)
	printf("====================================================================================")
	// a counter for showing how many files get hashed in the summary
	filesTotal := 0
	// a counter for showing how many duplicates (grouped by SHA1 checksum) in the summary
	dupes := 0
	// a map for recording duplicates
	dupeMap := make(map[string][]string)
	for hash, files := range hashMap {
		filesTotal += len(files)
		// more than one file has the same SHA1 checksum, duplicate(s) found
		if len(files) > 1 {
			dupes++
			var paths []string
			for _, f := range files {
				if workDir != *findPath {
					// BUGFIX: walked paths already start with *findPath, so
					// filepath.Join(workDir, f) duplicated that segment
					// (e.g. /abs/sub/sub/file); resolving against the current
					// directory yields the correct absolute path
					if abs, err := filepath.Abs(f); err == nil {
						f = abs
					}
				}
				paths = append(paths, f)
			}
			dupeMap[hash] = paths
			printf("found %d duplicate entries with SHA1 hash %s", len(files), hash)
			for _, p := range paths {
				printf("- %s", p)
			}
			// newline for readability
			printf("")
		}
	}
	// summary
	if dupes == 0 {
		// use printf (not fmt.Println) so the line also reaches the -l log file
		printf("no duplicates found")
	} else {
		printf("%d duplicates found", dupes)
	}
	if len(failedPaths) > 0 {
		printf("failed to access these files/directories:")
		for _, path := range failedPaths {
			if workDir != *findPath {
				// same absolute-path fix as for duplicates above
				if abs, err := filepath.Abs(path.Path); err == nil {
					path.Path = abs
				}
			}
			// BUGFIX: printing the failure struct with %s rendered "{path reason}";
			// print the fields explicitly, matching the hash-error report below
			printf("- %s (%s)", path.Path, path.Reason)
		}
	}
	// see if there's any files that failed to be hashed
	if len(hashErrors) > 0 {
		printf("failed to hash these files:")
		for _, err := range hashErrors {
			printf("- %s (%s)", err.Path, err.Reason)
		}
	}
	printf("%d files hashed", filesTotal)
	printf("total time used: %s", time.Since(start).String())
	// outputs result to a JSON file for later use
	if *output != "" {
		// res renamed from "result" to stop shadowing the type name
		var res result
		res.WorkDir = workDir
		res.Skip = skipPaths.strings
		res.TotalDuplicates = dupes
		res.TotalFilesHashed = filesTotal
		res.AllHashes = hashMap
		res.Duplicates = dupeMap
		res.FailedPaths = failedPaths
		res.SkippedPaths = skippedPaths
		res.Errors = hashErrors
		b, err := json.Marshal(&res)
		if err != nil {
			printf("failed to generate result JSON: %+v", err)
			os.Exit(1)
		}
		if err := ioutil.WriteFile(*output, b, 0644); err != nil {
			printf("failed to write result JSON: %+v", err)
		}
	}
}
// hashWorker consumes file paths sent by walkDir over hashChan, hashes each
// file, and records the checksum in hashMap. main spawns *concurrency of these.
func hashWorker() {
	// range over the channel: identical to the original bare receive loop while
	// the channel is open, but exits cleanly if hashChan is ever closed instead
	// of spinning on zero values
	for path := range hashChan {
		// getFileMD5 returns "" on error; errors are recorded inside it
		if hash := getFileMD5(path); hash != "" {
			// record the result under the file's checksum
			lock.Lock()
			hashMap[hash] = append(hashMap[hash], path)
			lock.Unlock()
		}
		// tell the wait group that this job is done
		wg.Done()
	}
}
// walkDir traverses rootPath recursively, feeding every file it finds to the
// hash workers via hashChan, recording skipped and inaccessible paths for the
// end summary.
func walkDir(rootPath string) {
	walkErr := filepath.Walk(rootPath, func(path string, info os.FileInfo, err error) error {
		// skip the root path itself since it's the directory being scanned
		if path == rootPath {
			return nil
		}
		// skip-list check (deliberately before the error check, so unreadable
		// entries inside skipped trees count as skipped, not failed)
		if skipPaths.MatchesPrefix(path) {
			printf("path %s is in skip list, skipping", path)
			skippedPaths = append(skippedPaths, path)
			// BUGFIX: don't descend into skipped directories at all; previously
			// every file below them was still visited and logged one by one
			if err == nil && info != nil && info.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		// record inaccessible paths for the summary, print a log line, and continue
		if err != nil {
			printf("failed to open file or directory: %+v", err)
			var f failure
			f.Path = path
			f.Reason = err.Error()
			failedPaths = append(failedPaths, f)
			return nil
		}
		// path is accessible; if it's not a directory, hand it to a hash worker
		if !info.IsDir() {
			// register the job before sending so main's wg.Wait covers it
			wg.Add(1)
			hashChan <- path
		}
		return nil
	})
	// BUGFIX: the Walk return value (e.g. an unreadable root) was silently dropped
	if walkErr != nil {
		printf("error walking %s: %+v", rootPath, walkErr)
	}
}
// getFileMD5 computes the SHA1 checksum of the file at path and returns it as
// a lowercase hex string; on any failure it records the error in hashErrors
// and returns "".
// NOTE(review): the name says MD5 but the hash is SHA1; renaming would break
// the caller in hashWorker, so the misnomer is kept and documented instead.
func getFileMD5(path string) string {
	// a timer for reporting how long each file took to hash
	start := time.Now()
	// open read-only; os.Open is the idiomatic equivalent of the original
	// os.OpenFile(path, os.O_RDONLY, 0644)
	f, err := os.Open(path)
	if err != nil {
		printf("error opening file for hashing: %+v", err)
		recordHashError(path, err)
		return ""
	}
	// close the file when work is done
	defer f.Close()
	// file size is only needed for the verbose log line below
	stat, err := f.Stat()
	if err != nil {
		printf("error getting file information: %+v", err)
		recordHashError(path, err)
		return ""
	}
	// stream the file content straight into the hash so large files are never
	// loaded into memory
	h := sha1.New()
	if _, err := io.Copy(h, f); err != nil {
		printf("error reading file for hashing: %+v", err)
		recordHashError(path, err)
		return ""
	}
	// output the hash as a hex string
	hash := fmt.Sprintf("%x", h.Sum(nil))
	// humanize is optional; substitute stat.Size() with %d to drop the dependency
	printVerbose("hashed file %s (size %s, SHA1 %s), took %s", path, humanize.Bytes(uint64(stat.Size())), hash, time.Since(start).String())
	return hash
}

// recordHashError appends a failure to hashErrors under the shared lock.
// BUGFIX: getFileMD5 runs concurrently on every worker goroutine, so the
// original unlocked appends to hashErrors were a data race (hashMap appends
// in hashWorker already take this lock).
func recordHashError(path string, err error) {
	lock.Lock()
	hashErrors = append(hashErrors, failure{Path: path, Reason: err.Error()})
	lock.Unlock()
}
// printf formats a message and writes it to resultOutput, which is either
// os.Stdout alone or an io.MultiWriter combining os.Stdout with the -l log
// file; it returns whatever the underlying write returns.
func printf(format string, args ...interface{}) (int, error) {
	written, writeErr := fprintf(resultOutput, format, args...)
	return written, writeErr
}
// printVerbose emits noisy per-file logging. With -v the message goes wherever
// printf writes (os.Stdout, plus the -l file when one is open); without -v it
// is written only to the -l log file if available, and otherwise discarded.
func printVerbose(format string, args ...interface{}) (int, error) {
	switch {
	case *verbose:
		// verbose mode: same destinations as regular logging
		return printf(format, args...)
	case outputFile != nil:
		// quiet mode with a log file: write to the file only, keep stdout clean
		return fprintf(outputFile, format, args...)
	default:
		// quiet mode without a log file: drop the message
		return 0, nil
	}
}
// fprintf adds newline to printed string, and output it to the given writer
func fprintf(writer io.Writer, format string, args ...interface{}) (int, error) {
// check if there's a newline in the end, add one if there's none
if !strings.HasPrefix(format, "\n") {
format = format + "\n"
}
// print the formatted string to the given writer
return fmt.Fprintf(writer, format, args...)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment