Last active
October 25, 2018 10:57
-
-
Save jdkato/99de1d536f01e956647d5bbdd0f3b7fe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"encoding/json" | |
"fmt" | |
"io" | |
"io/ioutil" | |
"reflect" | |
"gopkg.in/jdkato/prose.v2" | |
) | |
// ProdigyOutput represents a single entry of Prodigy's JSON Lines output. | |
// | |
// `LabeledEntity` is a structure defined by prose that specifies where the | |
// entities are within the given `Text`. | |
type ProdigyOutput struct { | |
Text string | |
Spans []prose.LabeledEntity | |
Answer string | |
} | |
// ReadProdigy reads our JSON Lines file line-by-line, populating a | |
// slice of `ProdigyOutput` structures. | |
func ReadProdigy(jsonLines []byte) []ProdigyOutput { | |
dec := json.NewDecoder(bytes.NewReader(jsonLines)) | |
entries := []ProdigyOutput{} | |
for { | |
ent := ProdigyOutput{} | |
err := dec.Decode(&ent) | |
if err != nil { | |
if err == io.EOF { | |
break | |
} | |
panic(err) | |
} | |
entries = append(entries, ent) | |
} | |
return entries | |
} | |
// Split divides our human-annotated data set into two groups: one for training | |
// our model and one for testing it. | |
// | |
// We're using an 80-20 split here, although you may want to use a different | |
// split. | |
func Split(data []ProdigyOutput) ([]prose.EntityContext, []ProdigyOutput) { | |
cutoff := int(float64(len(data)) * 0.8) | |
train, test := []prose.EntityContext{}, []ProdigyOutput{} | |
for i, entry := range data { | |
if i < cutoff { | |
train = append(train, prose.EntityContext{ | |
Text: entry.Text, | |
Spans: entry.Spans, | |
Accept: entry.Answer == "accept"}) | |
} else { | |
test = append(test, entry) | |
} | |
} | |
return train, test | |
} | |
func main() { | |
data, err := ioutil.ReadFile("reddit_product.jsonl") | |
if err != nil { | |
panic(err) | |
} | |
train, test := Split(ReadProdigy(data)) | |
// Here, we're training a new model named PRODUCT with the training portion | |
// of our annotated data. | |
// | |
// Depending on your hardware, this should take around 1 - 3 minutes. | |
model := prose.ModelFromData("PRODUCT", prose.UsingEntities(train)) | |
// Now, let's test our model: | |
correct := 0.0 | |
for _, entry := range test { | |
// Create a document without segmentation, which isn't required for NER. | |
doc, err := prose.NewDocument( | |
entry.Text, | |
prose.WithSegmentation(false), | |
prose.UsingModel(model)) | |
if err != nil { | |
panic(err) | |
} | |
ents := doc.Entities() | |
if entry.Answer != "accept" && len(ents) == 0 { | |
// If we rejected this entity during annotation, prose shouldn't | |
// have labeled it. | |
correct++ | |
} else { | |
// Otherwise, we need to verify that we found the correct entities. | |
expected := []string{} | |
for _, span := range entry.Spans { | |
expected = append(expected, entry.Text[span.Start:span.End]) | |
} | |
if reflect.DeepEqual(expected, ents) { | |
correct++ | |
} | |
} | |
} | |
fmt.Printf("Correct (%%): %f\n", correct / float64(len(test))) | |
model.Marshal("PRODUCT") // Save the model to disk. | |
} |
Is there a way to generate the JSONL file from a set of data?
Or, how do we calculate rank, score, input_hash, etc. for the JSONL file? Please suggest.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I already found it at https://prodi.gy/assets/data/reddit_product.jsonl