Skip to content

Instantly share code, notes, and snippets.

@wolf0403
Created October 3, 2016 11:50
Show Gist options
  • Save wolf0403/07de296b5cb131b54099392b9ab97207 to your computer and use it in GitHub Desktop.
Save wolf0403/07de296b5cb131b54099392b9ab97207 to your computer and use it in GitHub Desktop.
Scan multiple filesystem roots, looking for duplicate files.
package main
import (
	"crypto/sha1"
	"encoding/json"
	"flag"
	"fmt"
	"hash/crc64"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
)
// config is the path of the JSON file that main reads its settings from and,
// when writeback is enabled, overwrites with the results. It is also the
// prefix of the snapshot files written by Runtime.Dump.
const config = "config.json"

var (
	// writeback is the -w flag: when true main writes the final state back
	// to config, otherwise it prints the JSON to stdout. The usage text is
	// derived from config so it always names the file that is really written.
	writeback = flag.Bool("w", true, "write results back to "+config)
	// crcTable is the ECMA polynomial table shared by every CRC-64 computation.
	crcTable = crc64.MakeTable(crc64.ECMA)
)
// FolderInfo describes one directory discovered during a scan.
type FolderInfo struct {
// Name is the directory path; getFolderInfo always stores it with a
// trailing "/".
Name string `json:"name"`
// Children lists the paths of sub-folders and files registered under this
// directory by getFolderInfo and getFileInfo.
Children []string `json:"children"`
}
// FileInfo describes one file discovered during a scan.
type FileInfo struct {
// Name is the file path as reported by the directory walk; it is also the
// key of the entry in Runtime.Files.
Name string `json:"name"`
// Size is the file size in bytes, taken from os.FileInfo.Size during ScanFS.
Size int64 `json:"size"`
// First1k, Cksum and Sha1 are optional checksum fields. Nothing in this
// file assigns them; they only round-trip through JSON.
// NOTE(review): presumably intended as a checksum cache - confirm.
First1k string `json:"cksum1k,omitempty"`
Cksum string `json:"cksum,omitempty"`
Sha1 string `json:"sha1,omitempty"`
}
// Runtime is the whole program state. It is loaded from, and written back to,
// the JSON config file.
type Runtime struct {
// Roots are the directories passed to ScanFS by main.
Roots []string `json:"roots"`
// Filters maps a regular expression to an enabled flag; ScanFS skips every
// path matching an enabled expression.
Filters map[string]bool `json:"filter"`
// Files maps a file path to its FileInfo. Dedup deletes entries that turn
// out to be unique.
Files map[string]*FileInfo `json:"files"`
// Folders maps a directory path (with trailing "/") to its FolderInfo.
// It is excluded from the JSON output.
Folders map[string]*FolderInfo `json:"-"`
// Dups holds one slice of file paths per group of files that shared a key
// at every dedup stage.
Dups [][]string `json:"dups"`
// Links maps a symlink target to the paths of the symlinks pointing at it.
Links map[string][]string `json:"links"`
// config is the base file name used by Dump; unexported, so never serialized.
config string
}
// Dump writes a JSON snapshot of rt to the file named rt.config+key
// (for example "config.json-scanfs"). Any marshalling or write failure
// terminates the program through log.Fatalf.
func (rt *Runtime) Dump(key string) {
	encoded, marshalErr := json.MarshalIndent(rt, "", " ")
	if marshalErr != nil {
		log.Fatalf("Error generating result: %v", marshalErr)
	}

	target := rt.config + key
	writeErr := ioutil.WriteFile(target, encoded, 0644)
	if writeErr != nil {
		log.Fatalf("Error writing result: %v", writeErr)
	}
}
// keyFn computes a comparison key for a file. A key function must never
// report two identical files as different:
// if keyFn(f1) != keyFn(f2) then f1 != f2.
// Equal keys do not prove the files are equal, which is why Dedup chains
// several key functions one after another.
type keyFn func(fi *FileInfo) (string, error)
// keySize returns the file size in bytes as a decimal string. It never fails.
//
// The size is formatted as an int64 directly; converting through int would
// truncate sizes above 2 GiB on platforms where int is 32 bits wide and make
// unrelated large files look like duplicates at this stage.
func keySize(fi *FileInfo) (string, error) {
	return strconv.FormatInt(fi.Size, 10), nil
}
// keyCRC64 returns the CRC-64 (crcTable polynomial) of the contents of the
// file named fi.Name, formatted as a decimal string.
//
// The file is streamed through the hash instead of being loaded into memory
// in one piece, so very large files do not exhaust memory. The resulting key
// is identical to crc64.Checksum over the whole content.
//
// An error is returned when the file cannot be opened or read.
func keyCRC64(fi *FileInfo) (string, error) {
	f, err := os.Open(fi.Name)
	if err != nil {
		return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err)
	}
	defer f.Close()

	h := crc64.New(crcTable)
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err)
	}
	return strconv.FormatUint(h.Sum64(), 10), nil
}
// keySHA1 returns the SHA-1 digest of the contents of the file named fi.Name.
//
// The file is streamed through the hash instead of being loaded into memory
// in one piece, so very large files do not exhaust memory. The key keeps the
// historical textual form: the %v rendering of the 20-byte digest array
// (e.g. "[218 57 163 ...]").
//
// An error is returned when the file cannot be opened or read.
func keySHA1(fi *FileInfo) (string, error) {
	f, err := os.Open(fi.Name)
	if err != nil {
		return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err)
	}
	defer f.Close()

	h := sha1.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("Error ReadFile %q: %v", fi.Name, err)
	}
	// Sum appends into the zero-length slice backed by sum, filling the array.
	var sum [sha1.Size]byte
	h.Sum(sum[:0])
	return fmt.Sprintf("%v", sum), nil
}
// fnConfig pairs a key function with a label; it describes one stage of the
// dedup pipeline.
type fnConfig struct {
// name labels the stage in log output.
name string
// fn computes the comparison key used by the stage.
fn keyFn
}
// Dedup runs the duplicate-detection pipeline over rt.Files. Files are
// compared by size first, then by CRC-64, then by SHA-1, so the expensive
// keys are only computed for files that the cheaper ones could not separate.
func (rt *Runtime) Dedup() error {
	stages := []fnConfig{
		{name: "size", fn: keySize},
		{name: "crc64", fn: keyCRC64},
		{name: "sha1", fn: keySHA1},
	}
	return rt.dedupByKey(rt.Files, stages)
}
// dedupByKey groups files by the key computed with fns[0] and processes each
// group:
//
//   - a group of one is unique: the file is removed from rt.Files;
//   - a larger group is passed recursively to the remaining key functions;
//   - when no key functions remain, the group's file names are appended to
//     rt.Dups as one set of duplicates.
//
// A file whose key cannot be computed is logged and left out of every group;
// it is not removed from rt.Files. With an empty fns there is nothing to
// compare by, so the call is a no-op and returns nil instead of panicking.
// An error returned by a nested stage is propagated to the caller.
func (rt *Runtime) dedupByKey(files map[string]*FileInfo, fns []fnConfig) error {
	if len(fns) == 0 {
		return nil
	}
	current, rest := fns[0], fns[1:]
	log.Printf("Dedup by Key %s on %d files", current.name, len(files))

	buckets := map[string][]*FileInfo{}
	for _, fi := range files {
		key, err := current.fn(fi)
		if err != nil {
			log.Printf("Error get key of %q: %v", fi.Name, err)
			continue
		}
		// append on a missing key starts from a nil slice, so no
		// explicit initialization is needed.
		buckets[key] = append(buckets[key], fi)
	}

	for _, fis := range buckets {
		if len(fis) == 1 {
			log.Printf("unique: %q", fis[0].Name)
			delete(rt.Files, fis[0].Name)
			continue
		}
		if len(rest) == 0 {
			// All keys tried: these files are duplicates of each other.
			names := make([]string, 0, len(fis))
			for _, fi := range fis {
				names = append(names, fi.Name)
			}
			rt.Dups = append(rt.Dups, names)
			continue
		}
		group := make(map[string]*FileInfo, len(fis))
		for _, fi := range fis {
			group[fi.Name] = fi
		}
		if err := rt.dedupByKey(group, rest); err != nil {
			return err
		}
	}
	return nil
}
// getFolderInfo returns the FolderInfo registered in rt.Folders for path,
// creating it (and any missing ancestors) on first use. Folder names are
// stored with a trailing "/"; an empty path is treated as ".".
//
// When a folder is created it is appended to the Children of its parent.
// The parent is computed from the path without its trailing slash, because
// filepath.Dir of a path ending in "/" returns that same directory and would
// leave the folder unlinked. Recursion stops at the filesystem root "/", at
// ".", and at paths ending in ".." (whose lexical parent is not their real
// parent), so it always terminates.
func getFolderInfo(path string, rt *Runtime) *FolderInfo {
	if path == "" {
		path = "."
	}
	trimmed := strings.TrimSuffix(path, "/")
	key := trimmed + "/"
	if d, ok := rt.Folders[key]; ok {
		return d
	}
	d := &FolderInfo{Name: key}
	rt.Folders[key] = d

	// trimmed is empty only for the root "/", which has no parent.
	if trimmed == "" || filepath.Base(trimmed) == ".." {
		return d
	}
	// filepath.Dir(".") is ".", so a path that is its own parent ends the chain.
	if parent := filepath.Dir(trimmed); parent != trimmed {
		pdir := getFolderInfo(parent, rt)
		pdir.Children = append(pdir.Children, key)
	}
	return d
}
// getFileInfo looks up the FileInfo stored in rt.Files under path. If none
// exists yet, a new entry named path is created, recorded as a child of dir
// and stored in rt.Files. The existing or new entry is returned.
func getFileInfo(path string, dir *FolderInfo, rt *Runtime) *FileInfo {
	if known, found := rt.Files[path]; found {
		return known
	}
	entry := &FileInfo{Name: path}
	dir.Children = append(dir.Children, path)
	rt.Files[path] = entry
	return entry
}
// match reports whether path matches the regular expression filter.
// A filter that fails to compile is treated as fatal: the failure is logged
// and the function panics via log.Panicf.
func match(path, filter string) bool {
	matched, err := regexp.MatchString(filter, path)
	if err == nil {
		return matched
	}
	log.Panicf("match %q - %q failed: %v", path, filter, err)
	return matched
}
// ScanFS walks the file tree rooted at root and records what it finds in rt:
//
//   - paths matching an enabled entry of rt.Filters are skipped (a matching
//     directory is skipped together with its whole subtree);
//   - symlinks are recorded in rt.Links under their target and not followed;
//   - directories are registered through getFolderInfo;
//   - regular files are registered through getFileInfo with their size.
//
// Anything else (FIFOs, sockets, devices, ...) is skipped, because the dedup
// stages later read every recorded file and reading such a file can block
// indefinitely. Errors reported by the walk for a single path are logged and
// that path is skipped.
//
// Enabled filters are compiled once before the walk starts; an invalid
// expression is fatal and panics via log.Panicf. After the walk a snapshot
// is written with rt.Dump("-scanfs"), and the error returned by
// filepath.Walk, if any, is passed back to the caller.
func ScanFS(root string, rt *Runtime) error {
	log.Printf("Scanfs %q", root)

	// Compile each enabled filter once rather than once per visited path.
	filters := make([]*regexp.Regexp, 0, len(rt.Filters))
	for pattern, enabled := range rt.Filters {
		if !enabled {
			continue
		}
		re, err := regexp.Compile(pattern)
		if err != nil {
			log.Panicf("invalid filter %q: %v", pattern, err)
		}
		filters = append(filters, re)
	}
	filtered := func(path string) bool {
		for _, re := range filters {
			if re.MatchString(path) {
				return true
			}
		}
		return false
	}

	walkFn := func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			// Returning nil keeps the walk going with the remaining entries.
			log.Printf("Error walking %q: %v", path, walkErr)
			return nil
		}
		if info == nil {
			log.Printf("info is nil - %q", path)
			return nil
		}
		if filtered(path) {
			log.Printf("filtered: %q", path)
			if info.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if info.Mode()&os.ModeSymlink != 0 {
			target, err := os.Readlink(path)
			if err != nil {
				log.Printf("readlink(%q) failed: %v", path, err)
				return nil
			}
			rt.Links[target] = append(rt.Links[target], path)
			return nil
		}
		if info.IsDir() {
			getFolderInfo(path, rt)
			return nil
		}
		if !info.Mode().IsRegular() {
			log.Printf("skipping non-regular file: %q", path)
			return nil
		}
		d := getFolderInfo(filepath.Dir(path), rt)
		f := getFileInfo(path, d, rt)
		f.Size = info.Size()
		return nil
	}

	err := filepath.Walk(root, walkFn)
	rt.Dump("-scanfs")
	return err
}
// main loads the state from the config file, scans the configured roots when
// no files are recorded yet, runs duplicate detection and finally either
// writes the state back to the config file (-w, the default) or prints it to
// stdout (-w=false).
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)
	// Without Parse the -w flag is never read from the command line and
	// always keeps its default value.
	flag.Parse()

	b, err := ioutil.ReadFile(config)
	if err != nil {
		log.Fatalf("Error loading roots: %v", err)
	}
	rt := Runtime{config: config}
	if err := json.Unmarshal(b, &rt); err != nil {
		log.Fatalf("Error loading config: %v", err)
	}

	// The config may omit any of these; give each a usable empty value so
	// later writes do not hit a nil map and the JSON output has no nulls.
	if rt.Files == nil {
		rt.Files = map[string]*FileInfo{}
	}
	if rt.Folders == nil {
		rt.Folders = map[string]*FolderInfo{}
	}
	if rt.Dups == nil {
		rt.Dups = [][]string{}
	}
	if rt.Links == nil {
		rt.Links = map[string][]string{}
	}

	// Scan only when the config carries no file list from an earlier run.
	if len(rt.Files) == 0 {
		if len(rt.Roots) == 0 {
			rt.Roots = []string{"."}
		}
		for _, root := range rt.Roots {
			// A failing root is reported but does not stop the other roots.
			if err := ScanFS(root, &rt); err != nil {
				log.Printf("Error scanning %q: %v", root, err)
			}
		}
	}

	if err := rt.Dedup(); err != nil {
		log.Fatalf("Error deduplicating: %v", err)
	}

	b, err = json.MarshalIndent(rt, "", " ")
	if err != nil {
		log.Fatalf("Error generating result: %v", err)
	}
	if *writeback {
		if err := ioutil.WriteFile(config, b, 0644); err != nil {
			log.Fatalf("Error writing result: %v", err)
		}
	} else {
		fmt.Println(string(b))
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment