Skip to content

Instantly share code, notes, and snippets.

@pjox
Created August 13, 2020 14:01
Show Gist options
  • Save pjox/b92e89a1808eee4d0a1769cd0dfa8d18 to your computer and use it in GitHub Desktop.
Save pjox/b92e89a1808eee4d0a1769cd0dfa8d18 to your computer and use it in GitHub Desktop.
The deduplication script for OSCAR
package main
import (
	"bufio"
	"fmt"
	"io"
	"os"

	"github.com/cespare/xxhash"
)
// main deduplicates the lines of the file named by the first command-line
// argument and writes the result to the file named by the second argument.
//
// It exits with status 2 when the arguments are missing and with status 1
// when opening, reading, or writing either file fails.
func main() {
	if len(os.Args) != 3 {
		fmt.Fprintf(os.Stderr, "usage: %s <input> <output>\n", os.Args[0])
		os.Exit(2)
	}
	if err := dedupFile(os.Args[1], os.Args[2]); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// dedupFile opens inPath for reading, creates (or truncates) outPath, and
// streams the deduplicated lines of the former into the latter.
//
// The returned error is the first failure encountered, including a failure
// to close the output file, since a failed close can mean lost data.
func dedupFile(inPath, outPath string) (err error) {
	in, err := os.Open(inPath)
	if err != nil {
		return err
	}
	// The input is only read, so an error from closing it carries no
	// information about the result and is deliberately ignored.
	defer in.Close()

	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	return dedupLines(in, out)
}

// dedupLines copies r to w line by line, skipping every line whose 64-bit
// xxhash has already been seen. Empty lines are always copied, even when
// repeated.
//
// Only the hashes are remembered, not the lines themselves, so two different
// lines whose hashes collide are treated as duplicates.
//
// A final line that is not terminated by a newline is processed like any
// other line and is written back without a newline. Lines are hashed without
// their trailing newline so that such a final line still matches an earlier
// terminated copy of itself.
func dedupLines(r io.Reader, w io.Writer) error {
	br := bufio.NewReader(r)
	bw := bufio.NewWriter(w)
	seen := make(map[uint64]struct{})

	for {
		// ReadString may return data together with io.EOF when the input
		// does not end in a newline, so the data is handled before the
		// error decides whether to stop.
		par, err := br.ReadString('\n')
		if err != nil && err != io.EOF {
			return fmt.Errorf("reading input: %w", err)
		}

		if par != "" {
			line := par
			if line[len(line)-1] == '\n' {
				line = line[:len(line)-1]
			}
			sum := xxhash.Sum64String(line)
			_, dup := seen[sum]
			if !dup || line == "" {
				seen[sum] = struct{}{}
				if _, werr := bw.WriteString(par); werr != nil {
					return fmt.Errorf("writing output: %w", werr)
				}
			}
		}

		if err == io.EOF {
			break
		}
	}

	if err := bw.Flush(); err != nil {
		return fmt.Errorf("flushing output: %w", err)
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment