# konverner/create_ner_dataset.py

## create_ner_dataset.py
"""
A O
spokesman O
for O
Israel B-ORG
civil I-ORG
administration I-ORG

Samuel B-PER
Graham I-PER
said O
it O

->

X = [['A', 'spokesman', 'for', 'Israel', 'civil', 'administration'], ['Samuel', 'Graham', 'said', 'it']]
Y = [['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG'], ['B-PER', 'I-PER', 'O', 'O']]
idx2label = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
"""

def create_dataset(path):
  """Read a token/label file into parallel per-sentence lists.

  The file is read as text. Sentences are separated by a blank line
  (two consecutive newlines) and each remaining line is expected to hold
  exactly one ``token label`` pair separated by a single space.

  Args:
    path: Path of the annotated text file to read.

  Returns:
    A tuple ``(X, Y, idx2label)``: ``X`` is a list of token lists (one per
    sentence), ``Y`` is the matching list of label lists, and ``idx2label``
    lists every distinct label in order of first appearance.

  Lines that do not split into exactly two space-separated fields are
  reported on stdout and skipped.
  """
  with open(path) as f:
    sents = f.read().strip('\n').split('\n\n')

  X, Y = [], []
  idx2label = []
  for sent in sents:
    tokens, labels = [], []
    for line in sent.split('\n'):
      try:
        token, label = line.split(' ')
      except ValueError:
        # Wrong number of fields: report and skip. Only ValueError is
        # caught so that Ctrl-C / SystemExit are not silently swallowed.
        print(f"error: skip line: {line}")
        continue
      tokens.append(token)
      labels.append(label)
      if label not in idx2label:
        idx2label.append(label)
    X.append(tokens)
    Y.append(labels)

  return X, Y, idx2label

# Build the dataset and report its size and label inventory.
# NOTE(review): PATH is not defined in the visible code; it must be set before this runs.
X, Y, idx2label = create_dataset(PATH)
print(f"{len(X)} sentences with {sum(map(len, X))} tokens in total")
print(f"labels: {idx2label}")
	"""
	A O
	spokesman O
	for O
	Israel B-ORG
	civil I-ORG
	administration I-ORG

	Samuel B-PER
	Graham I-PER
	said O
	it O

	->

	X = [['A', 'spokesman', 'for', 'Israel', 'civil', 'administration'], ['Samuel', 'Graham', 'said', 'it']]
	Y = [['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG'], ['B-PER', 'I-PER', 'O', 'O']]
	idx2label = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
	"""

	def create_dataset(path):
		"""Read a token/label file into parallel per-sentence lists.

		The file is read as text. Sentences are separated by a blank line
		(two consecutive newlines) and each remaining line is expected to
		hold exactly one ``token label`` pair separated by a single space.

		Args:
			path: Path of the annotated text file to read.

		Returns:
			A tuple ``(X, Y, idx2label)``: ``X`` is a list of token lists
			(one per sentence), ``Y`` is the matching list of label lists,
			and ``idx2label`` lists every distinct label in order of first
			appearance.

		Lines that do not split into exactly two space-separated fields are
		reported on stdout and skipped.
		"""
		with open(path) as f:
			sents = f.read().strip('\n').split('\n\n')

		X, Y = [], []
		idx2label = []
		for sent in sents:
			tokens, labels = [], []
			for line in sent.split('\n'):
				try:
					token, label = line.split(' ')
				except ValueError:
					# Wrong number of fields: report and skip. Only ValueError
					# is caught so Ctrl-C / SystemExit are not swallowed.
					print(f"error: skip line: {line}")
					continue
				tokens.append(token)
				labels.append(label)
				if label not in idx2label:
					idx2label.append(label)
			X.append(tokens)
			Y.append(labels)

		return X, Y, idx2label

	# Build the dataset and report its size and label inventory.
	# NOTE(review): PATH is not defined in the visible code; it must be set before this runs.
	X, Y, idx2label = create_dataset(PATH)
	print(f"{len(X)} sentences with {sum(map(len, X))} tokens in total")
	print(f"labels: {idx2label}")