Skip to content

Instantly share code, notes, and snippets.

@davidefiocco
Created November 5, 2020 17:59
Show Gist options
  • Save davidefiocco/eb92fc5427d2924e52858361ce98ef1a to your computer and use it in GitHub Desktop.
Save davidefiocco/eb92fc5427d2924e52858361ce98ef1a to your computer and use it in GitHub Desktop.
Prodigy recipe to categorize differences in text
{"removed":"These results suggested that the distribution of bacterial communities was driven more by sample types than the separate caves from which samples were collected.","added":"These results suggest that the distribution of bacterial communities is driven more by sample types than the separate caves from which samples were collected.","meta":{"score":1}}
import prodigy
from prodigy.components.loaders import JSONL
# Run this recipe with (Windows-style script path shown; adjust for your shell):
#   python -m prodigy diff-textcat copyedits copyedits.jsonl -F .\diff_text.py
def add_label_to_stream(stream, label):
    """Attach a single label to every example in a stream.

    Each example is modified in place (its "label" key is set) and then
    yielded, so the function is lazy and works on arbitrarily long streams.

    Args:
        stream: Iterable of mutable mappings (task dicts).
        label: Either a plain string, or a sequence of labels (as parsed
            from the command line) of which only the first is used.

    Yields:
        The same example objects, each with "label" set.

    Raises:
        IndexError: If ``label`` is an empty sequence and the stream has at
            least one example.
    """
    for eg in stream:
        # The label from the command line is normally a list, so take the
        # first entry. A bare string is used as-is: indexing it would
        # silently keep only its first character.
        eg["label"] = label if isinstance(label, str) else label[0]
        yield eg
@prodigy.recipe(
    "diff-textcat",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
)
def copyedit(dataset, source):
    """Recipe "diff-textcat": show text differences alongside a classification block.

    Args:
        dataset: The dataset to use; passed through under the "dataset" key.
        source: Path to the source data, loaded with the ``JSONL`` loader.

    Returns:
        A dict of recipe components: the loaded "stream", the "dataset",
        the "blocks" view id, and a "config" whose "blocks" list holds a
        "diff" block followed by a "classification" block.
    """
    stream = JSONL(source)
    # NOTE(review): add_label_to_stream is defined in this file but is not
    # applied to the stream here -- confirm whether incoming tasks are
    # expected to carry a "label" for the classification block already.
    blocks = [
        {"view_id": "diff"},
        {"view_id": "classification"},
    ]
    return {
        "stream": stream,
        "dataset": dataset,
        "view_id": "blocks",
        "config": {
            "blocks": blocks,
        },
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment