Created
April 19, 2018 13:33
-
-
Save toVersus/4fc3660e234c201cedca2b22ca94eae6 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #64: Insert and index artist information on MongoDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"os"
	"time"

	mgo "gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)
type Artist struct { | |
ID int `json:"id"` | |
GID string `json:"gid"` | |
Name string `json:"name"` | |
SortName string `json:"sort_name"` | |
Area string `json:"area"` | |
Aliases []*Aliase `json:"aliases"` | |
Begin *Begin `json:"begin"` | |
End *End `json:"end"` | |
Tags []*Tag `json:"tags"` | |
Rating *Rating `json:"rating"` | |
} | |
type Artists []*Artist | |
// Aliase is one entry of an artist's "aliases" list: an alternative name
// together with its sort form.
type Aliase struct {
	Name     string `json:"name"`
	SortName string `json:"sort_name"`
}
// Begin is the year/month/date triple found under an artist's "begin" key.
// NOTE(review): Date presumably holds the day of the month; confirm against
// the input data.
type Begin struct {
	Year  int `json:"year"`
	Month int `json:"month"`
	Date  int `json:"date"`
}
// End is the year/month/date triple found under an artist's "end" key.
// NOTE(review): Date presumably holds the day of the month; confirm against
// the input data.
type End struct {
	Year  int `json:"year"`
	Month int `json:"month"`
	Date  int `json:"date"`
}
// Tag is one entry of an artist's "tags" list: a tag value paired with a count.
type Tag struct {
	Count int    `json:"count"`
	Value string `json:"value"`
}
// Rating is the object found under an artist's "rating" key: an integer value
// paired with a count.
type Rating struct {
	Count int `json:"count"`
	Value int `json:"value"`
}
func main() { | |
var filepath string | |
flag.StringVar(&filepath, "file", "", "specify a file path") | |
flag.StringVar(&filepath, "f", "", "specify a file path") | |
flag.Parse() | |
artists, err := readBSON(filepath) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
session, err := mgo.Dial("mongodb://localhost") | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
c := session.DB("MusicBrainz").C("artist") | |
size := len(artists) | |
for progress, artist := range artists { | |
err := c.Insert(artist) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
if progress%10000 == 0 { | |
fmt.Printf("%d / %d...completed\n", progress, size) | |
} | |
} | |
} | |
func readBSON(path string) (Artists, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err) | |
} | |
defer f.Close() | |
var artists Artists | |
reader := bufio.NewReader(f) | |
for { | |
artist := Artist{} | |
buf, readErr := reader.ReadBytes('\n') | |
if (readErr != nil) && (readErr != io.EOF) { | |
panic(err) | |
} | |
if err = bson.UnmarshalJSON(buf, &artist); err != nil && readErr != io.EOF { | |
fmt.Print("could not parse json file.") | |
break | |
} | |
artists = append(artists, &artist) | |
if readErr == io.EOF { | |
break | |
} | |
} | |
return artists, nil | |
} | |
func getQueryTime(query *mgo.Query) (*Artist, time.Duration, error) { | |
artist := &Artist{} | |
start := time.Now() | |
if err := query.One(&artist); err != nil { | |
return nil, 0, err | |
} | |
return artist, time.Now().Sub(start), nil | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"os" | |
"testing" | |
"github.com/go-test/deep" | |
mgo "gopkg.in/mgo.v2" | |
"gopkg.in/mgo.v2/bson" | |
) | |
// insertDocsTests is the fixture table consumed by TestGetTags.
//
// For each case, text is written to file as the input document (one JSON
// object per line, with no trailing newline), and want lists the artist IDs
// the test expects to get back, in the same order as the lines of text.
var insertDocsTests = []struct {
	name string
	file string
	text string
	want []int
}{
	{
		name: "should get the tags",
		file: "./fulltext-test.json",
		text: `{"name": "Sam James", "area": "United States", "gender": "Male", "sort_name": "James, Sam", "ended": true, "gid": "183da4be-0cb0-4e6d-ba6d-91e57b7a6780", "type": "Person", "id": 729749, "aliases": [{"name": "Sam James Vende", "sort_name": "Sam James Vende"}]}
{"name": "Norman Kolodziej", "area": "Germany", "gender": "Male", "sort_name": "Kolodziej, Norman", "ended": true, "gid": "5ff386f1-2c4e-4c1c-b5fc-668ec25e1b3e", "type": "Person", "id": 811484}
{"name": "Bass Cube", "sort_name": "Bass Cube", "ended": true, "gid": "f1568f36-152b-40da-aef3-3582636f88be", "type": "Group", "id": 6153}
{"name": "Medras", "sort_name": "Medras", "ended": true, "gid": "a7d007ec-8026-4e84-982d-b6306baa14df", "type": "Person", "id": 723542}
{"name": "Kalev Lindal", "area": "Estonia", "gender": "Male", "sort_name": "Lindal, Kalev", "ended": true, "gid": "8864f9e3-6a03-40a7-9acb-ac8386d404e7", "type": "Person", "id": 892318}
{"name": "Nick Flower", "sort_name": "Flower, Nick", "ended": true, "gid": "537b606c-d7e4-4c8a-8e39-91e22c2bf720", "type": "Person", "id": 725047}
{"name": "Danièle Forget", "sort_name": "Forget, Danièle", "ended": true, "gid": "5d6d5857-3d03-4b1b-a3ee-d789e883c1b2", "type": "Person", "id": 726135}`,
		want: []int{729749, 811484, 6153, 723542, 892318, 725047, 726135},
	},
}
func TestGetTags(t *testing.T) { | |
for _, testcase := range insertDocsTests { | |
t.Log(testcase.name) | |
f, err := os.Create(testcase.file) | |
if err != nil { | |
t.Errorf("could not create a file: %s\n %s\n", testcase.file, err) | |
} | |
f.WriteString(testcase.text) | |
f.Close() | |
artists, err := readBSON(testcase.file) | |
if err != nil { | |
t.Errorf("could not parse a JSON file: %s\n %s\n", testcase.file, err) | |
} | |
session, err := mgo.Dial("mongodb://localhost") | |
if err != nil { | |
t.Error(err) | |
} | |
db := session.DB("Testing") | |
c := db.C("artist") | |
for _, artist := range artists { | |
err := c.Insert(artist) | |
if err != nil { | |
t.Error(err) | |
} | |
} | |
t.Log("") | |
t.Log("before indexing") | |
for _, artist := range artists { | |
query := c.Find(bson.M{"name": artist.Name}) | |
a, count, err := getQueryTime(query) | |
if err != nil { | |
t.Error(err) | |
} | |
t.Logf("%s found...%d\n", a.Name, count) | |
} | |
t.Log("") | |
t.Log("after indexing") | |
results := []int{} | |
for _, artist := range artists { | |
keys := []string{"name", "aliases.name", "tags.value", "rating.value"} | |
for _, key := range keys { | |
err = c.EnsureIndexKey(key) | |
if err != nil { | |
t.Error(err) | |
} | |
} | |
query := c.Find(bson.M{"name": artist.Name}) | |
a, count, err := getQueryTime(query) | |
if err != nil { | |
t.Error(err) | |
} | |
t.Logf("%s found...%d\n", a.Name, count) | |
results = append(results, a.ID) | |
} | |
if diff := deep.Equal(results, testcase.want); diff != nil { | |
t.Error(diff) | |
} | |
if err = db.DropDatabase(); err != nil { | |
t.Errorf("could not delete database\n %s\n", err) | |
} | |
if err = os.Remove(testcase.file); err != nil { | |
t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment