Created
May 23, 2018 14:46
-
-
Save bobvanluijt/5df06486a1d73938b91312305ef84764 to your computer and use it in GitHub Desktop.
Creates a gzip-compressed CBOR file from a GloVe vector text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"compress/gzip" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"os" | |
"strconv" | |
"strings" | |
cbor "github.com/2tvenom/cbor" | |
) | |
// Vectors maps a token (the first field of a GloVe line) to the float
// components that follow it on that line.
type Vectors map[string][]float64

// MapOfSimilarity maps a float64 key to a string.
//
// NOTE(review): nothing in this file references the type; presumably the key
// is a similarity score and the value a word. Confirm before relying on it,
// and keep in mind that two entries with the same key overwrite each other.
type MapOfSimilarity map[float64]string
func LoadVectors(inputFile string) Vectors { | |
vectors := Vectors{} | |
// load the Glove TXT file | |
file, err := os.Open(inputFile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer file.Close() | |
scanner := bufio.NewScanner(file) | |
if err := scanner.Err(); err != nil { | |
log.Fatal(err) | |
} | |
for scanner.Scan() { | |
stringSlice := strings.Split(scanner.Text(), " ") | |
// length is complete slice minus 1 | |
for i := 1; i < (len(stringSlice) - 1); i++ { | |
// parse the string to a float | |
float, _ := strconv.ParseFloat(stringSlice[i], 64) | |
// add the float to the vector | |
vectors[stringSlice[0]] = append(vectors[stringSlice[0]], float) | |
//vectors[stringSlice[0]][i] = float | |
} | |
} | |
return vectors | |
} | |
func main() { | |
// Load vectors | |
vectors := Vectors{} | |
vectors = LoadVectors("glove.6B.300d.txt") | |
// Create encoder and marshal | |
var buffTest bytes.Buffer | |
encoder := cbor.NewEncoder(&buffTest) | |
ok, error := encoder.Marshal(vectors) | |
//check binary string | |
if !ok { | |
fmt.Errorf("Error decoding %s", error) | |
} | |
fmt.Printf("Encoding to CBOR = done") | |
// Create GZIPed version | |
var b bytes.Buffer | |
w := gzip.NewWriter(&b) | |
w.Write(buffTest.Bytes()) | |
w.Close() | |
// Create output file | |
fmt.Println("Creating output file", "vectors.cbor.gz") | |
err := ioutil.WriteFile("vectors.cbor.gz", b.Bytes(), 0644) | |
if err != nil { | |
panic(err) | |
} | |
fmt.Println("DONE") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment