@jdkato · Last active October 25, 2018

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"reflect"

	"gopkg.in/jdkato/prose.v2"
)

// ProdigyOutput represents a single entry of Prodigy's JSON Lines output.
//
// `LabeledEntity` is a structure defined by prose that specifies where the
// entities are within the given `Text`.
type ProdigyOutput struct {
	Text   string
	Spans  []prose.LabeledEntity
	Answer string
}
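
// A single line of the JSON Lines file looks roughly like this (a sketch with
// made-up text and offsets; real Prodigy output carries additional metadata
// fields, such as hashes and scores, which encoding/json simply ignores
// when decoding into the struct above):
//
//	{"text": "I love my new Kindle", "spans": [{"start": 14, "end": 20, "label": "PRODUCT"}], "answer": "accept"}
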
// ReadProdigy reads our JSON Lines file line-by-line, populating a
// slice of `ProdigyOutput` structures.
func ReadProdigy(jsonLines []byte) []ProdigyOutput {
	dec := json.NewDecoder(bytes.NewReader(jsonLines))
	entries := []ProdigyOutput{}
	for {
		ent := ProdigyOutput{}
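		// Each call to Decode consumes the next JSON value in the stream;
		// because a JSON Lines file holds one value per line, this reads
		// one annotation at a time.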
		err := dec.Decode(&ent)
		if err != nil {
			if err == io.EOF {
				break
			}
			panic(err)
		}
		entries = append(entries, ent)
	}
	return entries
}

// Split divides our human-annotated data set into two groups: one for training
// our model and one for testing it.
//
// We're using an 80-20 split here, although you may want to use a different
// split.
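//
// For example, with 1,000 annotated entries the first 800 become training
// data and the remaining 200 are held out for evaluation.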
func Split(data []ProdigyOutput) ([]prose.EntityContext, []ProdigyOutput) {
	cutoff := int(float64(len(data)) * 0.8)
	train, test := []prose.EntityContext{}, []ProdigyOutput{}
	for i, entry := range data {
		if i < cutoff {
			train = append(train, prose.EntityContext{
				Text:   entry.Text,
				Spans:  entry.Spans,
				Accept: entry.Answer == "accept"})
		} else {
			test = append(test, entry)
		}
	}
	return train, test
}

func main() {
	data, err := ioutil.ReadFile("reddit_product.jsonl")
	if err != nil {
		panic(err)
	}
	train, test := Split(ReadProdigy(data))
	// Here, we're training a new model named PRODUCT with the training portion
	// of our annotated data.
	//
	// Depending on your hardware, this should take around 1-3 minutes.
	model := prose.ModelFromData("PRODUCT", prose.UsingEntities(train))
	// Now, let's test our model:
	correct := 0.0
	for _, entry := range test {
		// Create a document without segmentation, which isn't required for NER.
		doc, err := prose.NewDocument(
			entry.Text,
			prose.WithSegmentation(false),
			prose.UsingModel(model))
		if err != nil {
			panic(err)
		}
		ents := doc.Entities()
		if entry.Answer != "accept" && len(ents) == 0 {
			// If we rejected this entity during annotation, prose shouldn't
			// have labeled it.
			correct++
		} else {
			// Otherwise, we need to verify that we found the correct entities.
			expected := []string{}
			for _, span := range entry.Spans {
				expected = append(expected, entry.Text[span.Start:span.End])
			}
			// doc.Entities() returns prose.Entity values, so collect their
			// Text fields before comparing against the expected strings.
			found := []string{}
			for _, ent := range ents {
				found = append(found, ent.Text)
			}
			if reflect.DeepEqual(expected, found) {
				correct++
			}
		}
	}
	fmt.Printf("Correct (%%): %f\n", 100*correct/float64(len(test)))
	// Save the model to disk.
	if err := model.Marshal("PRODUCT"); err != nil {
		panic(err)
	}
}
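
To reuse the saved model later, it can be read back from disk. A minimal sketch, assuming prose v2's ModelFromDisk constructor (the counterpart to Marshal) and a made-up input sentence:

package main

import (
	"fmt"

	"gopkg.in/jdkato/prose.v2"
)

func main() {
	// Load the PRODUCT model that the training program saved with Marshal.
	model := prose.ModelFromDisk("PRODUCT")
	doc, err := prose.NewDocument(
		"I love my new Kindle", // hypothetical input
		prose.WithSegmentation(false),
		prose.UsingModel(model))
	if err != nil {
		panic(err)
	}
	// Print each entity the custom model found, along with its label.
	for _, ent := range doc.Entities() {
		fmt.Println(ent.Text, ent.Label)
	}
}
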
@WhisperRain

Mr. Joseph, where is the file reddit_product.jsonl?

@vk18vk commented Oct 25, 2018

Is there a way to generate the jsonl file from a set of data? Or, how do we calculate rank, score, input_hash, etc. for the jsonl file? Please suggest.
