Skip to content

Instantly share code, notes, and snippets.

@pjox
pjox / dedup.go
Created August 13, 2020 14:01
The deduplication script for OSCAR
package main
import (
"bufio"
"fmt"
"os"
"github.com/cespare/xxhash"
)
import os
import fasttext
import re
# Load the fastText model stored in "lid.176.bin" once, at module import time,
# and keep it in the module-level name `lid`.
# NOTE(review): presumably this is fastText's language-identification model — confirm.
lid = fasttext.load_model("lid.176.bin")
def listdir(x):
    """Return the entries of directory ``x``, each prefixed with ``x + '/'``.

    Args:
        x: Path of the directory to list, as a string.

    Returns:
        A list with one string per name returned by ``os.listdir(x)``, in
        the same order, each built as ``x + '/' + name``.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``x`` does not exist).
    """
    # Plain '/' concatenation is kept (rather than os.path.join) so the
    # returned strings are exactly what existing callers already receive.
    return [x + '/' + fn for fn in os.listdir(x)]
def id(x):
#!/bin/sh
# Enable shell tracing: each command is printed before it is executed.
set -x
# == Swarm training (alpha release) ==
# Setup:
#
# git clone https://github.com/shawwn/gpt-2
# cd gpt-2
# git checkout dev-shard