"""Creates NER training data in spaCy format from JSON downloaded from Dataturks."""
import argparse
import json
import logging
import os
import pickle
import sys
############################################ NOTE ########################################################
#
# Creates NER training data in spaCy format from JSON downloaded from Dataturks.
#
# Outputs the spaCy training data as a pickle file which can be used during spaCy training.
#
# Run: python Dataturks_to_Spacy.py <dataturks_JSON_FilePath> <training_output_FilePath>
#
#
############################################################################################################
# Enable INFO-level logging on the root logger so logging.info() messages are not filtered out.
logging.getLogger().setLevel(logging.INFO)
def convertSingleItem(dataturks_labeled_item):
    """Convert one line of a Dataturks NER export into a spaCy training tuple.

    Args:
        dataturks_labeled_item: A JSON document (one line of the export)
            holding a 'content' string and an 'annotation' list. Each
            annotation has a 'label' (a single label or a list of labels) and
            a 'points' list whose first element carries 'start' and 'end'.

    Returns:
        A tuple ``(text, {"entities": [(start, end, label), ...]})`` on
        success, or ``None`` when the item cannot be converted (invalid JSON,
        missing keys, or an item whose 'annotation' is null).
    """
    try:
        data = json.loads(dataturks_labeled_item)
        text = data['content']
        annotations = data['annotation']
        if annotations is None:
            # Items exported without any annotation would otherwise fail with
            # an opaque "'NoneType' object is not iterable" traceback.
            logging.warning("Skipping item without annotations: %s", dataturks_labeled_item)
            return None

        entities = []
        for annotation in annotations:
            # Only a single point per text annotation is used.
            point = annotation['points'][0]
            labels = annotation['label']
            # Handle both a list of labels and a single label.
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                # Dataturks indices are inclusive [start, end]; spaCy expects
                # a half-open range [start, end), hence the + 1.
                entities.append((point['start'], point['end'] + 1, label))
        return (text, {"entities": entities})
    except Exception as e:
        # Lazy %-formatting keeps the handler from raising when the item is
        # not a str (string concatenation would fail on None/bytes).
        logging.exception("Unable to process item %s\nerror = %s", dataturks_labeled_item, e)
        return None
def main(dataturks_JSON_FilePath, training_output_FilePath):
    """Convert a Dataturks NER export file into a pickled spaCy training set.

    Each non-blank line of the input file is passed to ``convertSingleItem``;
    the successfully converted items are pickled as a list to the output file.

    Args:
        dataturks_JSON_FilePath: Path to the Dataturks export, one JSON
            document per line.
        training_output_FilePath: Path of the pickle file to write. Its
            directory must already exist; a bare filename refers to the
            current directory.

    Returns:
        None. Problems with the paths or an empty input are logged and the
        function returns early without writing the output file.
    """
    # Make sure everything is set up.
    if not os.path.exists(dataturks_JSON_FilePath):
        logging.error(
            "Please specify a valid path to dataturks JSON output file, %s doesn't exist",
            dataturks_JSON_FilePath)
        return

    # dirname() is '' for a bare filename; treat that as the current directory
    # instead of rejecting a perfectly valid output path.
    output_dir = os.path.dirname(training_output_FilePath) or os.curdir
    if not os.path.exists(output_dir):
        logging.error(
            "Please specify a valid path to output file, %s directory doesn't exist",
            output_dir)
        return

    logging.info("Converting %s ...", dataturks_JSON_FilePath)
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        # Blank lines carry no item and should not be reported as errors.
        lines = [line for line in f if line.strip()]

    if not lines:
        logging.error(
            "Please specify a valid path to dataturks JSON output file, %s is empty",
            dataturks_JSON_FilePath)
        return

    training_data = []
    for count, line in enumerate(lines, start=1):
        result = convertSingleItem(line)
        if result is not None:
            training_data.append(result)
        if count % 100 == 0:
            logging.info("%d items done ...", count)

    with open(training_output_FilePath, 'wb') as output:
        pickle.dump(training_data, output, pickle.HIGHEST_PROTOCOL)

    success = len(training_data)
    logging.info(
        "Completed: %d items done, %d items ignored due to errors",
        success, len(lines) - success)
def create_arg_parser():
    """Create and return the ArgumentParser for the command-line interface.

    Returns:
        An ``argparse.ArgumentParser`` with two positional arguments:
        ``dataturks_JSON_FilePath`` and ``training_output_FilePath``.
    """
    parser = argparse.ArgumentParser(
        description='Converts Dataturks NER output JSON file to Spacy training file format.')
    parser.add_argument(
        'dataturks_JSON_FilePath',
        help='Path to the JSON file downloaded from Dataturks.')
    parser.add_argument(
        'training_output_FilePath',
        help='Path to the file where Spacy training data will be stored as pickle output.')
    return parser
if __name__ == '__main__':
    # Read the two positional paths from the command line and run the conversion.
    parsed_args = create_arg_parser().parse_args(sys.argv[1:])
    main(parsed_args.dataturks_JSON_FilePath, parsed_args.training_output_FilePath)
This comment has been minimized.
This comment has been minimized.
I am getting the same error.
This comment has been minimized.
This comment has been minimized.
What I did was convert the annotated data to StanfordNLP format.
Afterwards, I converted the StanfordNLP data to spaCy format. It is working now.
…On Thu, 2 Apr 2020 at 10:19 PM, rahulsarkar906 ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
Same error I am getting
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3237861>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AMI6GWUWGG6Y72NPUDNYOXTRKS623ANCNFSM4LFDL5QQ>
.
|
This comment has been minimized.
This comment has been minimized.
Can you please share some sample code for doing the conversion?
This comment has been minimized.
This comment has been minimized.
You can find code for this by searching Google for "stanfordtonlp code".
…On Fri, 3 Apr 2020 at 2:40 AM, rahulsarkar906 ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
Can you please share some sample code of doing the conversion?
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3238128>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AMI6GWVDYAZEU3GZR6NTH73RKT5LZANCNFSM4LFDL5QQ>
.
|
This comment has been minimized.
This comment has been minimized.
https://medium.com/@dataturks/convert-stanford-corenlp-training-data-to-dataturks-ner-json-output-eeaab60fb7b6
First, download the data in Stanford NER format from the Dataturks website. Then convert
the Stanford NER data to Dataturks JSON.
Then pass the converted Dataturks JSON to your code.
On Fri, 3 Apr 2020 at 9:04 PM, Ramidi Aravind Reddy <
ramidiaravindreddy@gmail.com> wrote:
… U can search code in google type stanfordtonlp code.
On Fri, 3 Apr 2020 at 2:40 AM, rahulsarkar906 ***@***.***>
wrote:
> ***@***.**** commented on this gist.
> ------------------------------
>
> Can you please share some sample code of doing the conversion?
>
> —
> You are receiving this because you commented.
> Reply to this email directly, view it on GitHub
> <https://gist.github.com/97ff613967e8139e57091f9299c3a104#gistcomment-3238128>,
> or unsubscribe
> <https://github.com/notifications/unsubscribe-auth/AMI6GWVDYAZEU3GZR6NTH73RKT5LZANCNFSM4LFDL5QQ>
> .
>
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
ERROR:root:Unable to process D:\DataTurks\Resume.json
error = 'NoneType' object is not iterable
Traceback (most recent call last):
File "D:\DataturkstoSpacy.py", line 27, in convert_dataturks_to_spacy
for annotation in data['annotation']:
TypeError: 'NoneType' object is not iterable.
I am getting this error while running the code.