# Prodigy text-classification command notes; each command is shown after a "$" prompt.
# Annotate into the "troubleshoot-sample" dataset from troubleshoot_sample.jsonl using the
# textcat.teach recipe with the en_core_web_sm model.
$ prodigy textcat.teach troubleshoot-sample en_core_web_sm troubleshoot_sample.jsonl
# Train from the "troubleshoot-sample" annotations starting from en_vectors_web_lg, with
# --eval-split 0.2 (20% of examples for evaluation); output goes to troubleshoot-sample-model.
$ prodigy textcat.batch-train troubleshoot-sample en_vectors_web_lg --output troubleshoot-sample-model --eval-split 0.2
# Create the "openshift_troubleshoot" dataset with an author and a description.
$ prodigy dataset --author msivanes openshift_troubleshoot "Dataset to classify openshift as usage or troubleshoot types."
# Import annotated_openshift_troubleshoot.jsonl into the "openshift_troubleshoot" dataset.
$ prodigy db-in openshift_troubleshoot annotated_openshift_troubleshoot.jsonl
# Remove the "openshift_troubleshoot" dataset.
# NOTE(review): destructive; presumably listed for reference rather than meant to be run
# right after the import above. Confirm before running these notes top to bottom.
$ prodigy drop openshift_troubleshoot
# Print the stream from the annotated file; `less -r` passes raw control characters through
# (presumably so coloured output survives paging).
$ prodigy textcat.print-stream annotated_openshift_troubleshoot.jsonl | less -r
# NOTE(review): the `pipe` recipe is not explained in these notes; presumably a custom or
# version-specific recipe. Confirm it exists in the installed Prodigy version.
$ prodigy pipe annotated_reddit-INSULT-textcat.jsonl
# Print the annotations stored in the "openshift_troubleshoot" dataset.
$ prodigy textcat.print-dataset openshift_troubleshoot
$ prodigy pipe --from-dataset openshift_troubleshoot | less -r
prepare_data.py
import json
def process(record_str):
    """Convert one pre-annotated JSON record into a labelled example.

    Parameters
    ----------
    record_str : str
        A single JSON document (one line of the input JSON Lines file).
        It must contain a ``text`` key; ``category`` and ``id`` are optional.
        ``category`` may be a string, a list whose first element is the
        category name, ``null``, or missing.

    Returns
    -------
    dict
        ``{"text": ..., "label": ..., "meta": {"id": ...}}`` where ``label``
        is ``"Troubleshoot"``, ``"Usage"`` or ``""`` when the category is
        absent or not recognised. ``meta.id`` is ``""`` when the record has
        no ``id``.

    Raises
    ------
    ValueError
        (``json.JSONDecodeError``) if ``record_str`` is not valid JSON.
    KeyError
        If the record has no ``text`` key.
    """
    usage_categories = ('Install', 'Configure', 'Supportability',
                        'Learn more', 'Upgrade')

    record = json.loads(record_str)
    category = record.get('category', '')

    # The category may arrive as a plain string or as a list of names.
    # Normalise to a single value first: indexing a string with [0] would
    # yield one character, which can never match a full category name.
    if isinstance(category, list):
        category = category[0] if category else ''

    if category == 'Troubleshoot':
        label = "Troubleshoot"
    elif isinstance(category, str) and category in usage_categories:
        label = "Usage"
    else:
        # Covers None, empty values, unknown names and non-string types.
        label = ""

    return {
        "text": record['text'],
        "label": label,
        "meta": {"id": record.get('id', "")},
    }
# Read the pre-annotated records (one JSON document per line), label each one
# with process(), and write all results as a single JSON array.
with open('data/preannotated_openshift.jsonl') as infile:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would
    # otherwise reject with a JSONDecodeError.
    processed_data = [process(line) for line in infile if line.strip()]

with open('data/annotated_openshift.json', 'w') as outfile:
    json.dump(processed_data, outfile)
- Convert the JSON array written by the script above into JSON Lines (one object per line), suitable for Prodigy:
# `jq -c '.[]'` prints each element of the top-level JSON array as one compact line,
# producing a JSON Lines file.
# NOTE(review): the Python script above writes data/annotated_openshift.json, while this
# command reads annotated_openshift.json from the current directory. Presumably it is run
# from inside data/; confirm.
jq -c '.[]' annotated_openshift.json > annotated_openshift_titles.jsonl
# Import the JSON Lines file into the "openshift_usage_troubleshoot" dataset.
$ prodigy db-in openshift_usage_troubleshoot annotated_openshift_titles.jsonl
# Download the spaCy model package en_core_web_md.
python -m spacy download en_core_web_md