farizrahman4u/preprocess

## preprocess
import json

# Load the raw dataset rows; each element keeps its trailing newline.
with open('ner_dataset.csv', 'r') as f:
    lines = list(f)

# Discard the first two lines of the file.
# NOTE(review): presumably header rows -- confirm against the dataset, since
# a second discarded line could be real data.
for _ in range(2):
    lines.pop(0)

# Completed sentences, each stored as [words, poss, tags].
sentences = []

# Accumulators for the sentence currently being assembled.
words, poss, tags = [], [], []


def csv(x):
    """Split one CSV row into a list of field strings.

    Commas separate fields unless they appear between double quotes. The
    quote characters themselves are not part of the field, except that a
    doubled quote (``""``) inside a quoted field stands for one literal
    ``"``.

    Args:
        x: A single row of text without its line terminator.

    Returns:
        The fields in order. An empty row yields an empty list; otherwise
        every field is returned, including empty ones (so ``'a,b,'`` has
        three fields).
    """
    fields = []
    buff = []
    quoted = False
    i = 0
    n = len(x)
    while i < n:
        c = x[i]
        if c == '"':
            if quoted and i + 1 < n and x[i + 1] == '"':
                # Escaped quote inside a quoted field: keep one literal
                # quote and skip its partner.
                buff.append('"')
                i += 1
            else:
                quoted = not quoted
        elif c == ',' and not quoted:
            fields.append(''.join(buff))
            buff = []
        else:
            buff.append(c)
        i += 1
    # Always emit the last field of a non-empty row, even when it is empty;
    # otherwise a trailing empty column silently disappears.
    if x:
        fields.append(''.join(buff))
    return fields


# Group the token rows into sentences. A row that starts with "Sentence: "
# opens a new sentence; every row (including that one) contributes one
# (word, pos, tag) triple to the sentence currently being built.
for x in lines:
    # Drop only the line terminator; slicing off the last character would
    # also eat a real character from a final line that has no newline.
    x = x.rstrip('\n')
    if not x:
        # A blank line carries no token and cannot be unpacked below.
        continue
    if x.startswith("Sentence: "):
        # Flush the sentence collected so far before starting the next one.
        sentences.append([words, poss, tags])
        words = []
        poss = []
        tags = []
        # Split only once so a later ": " inside the row is not cut off.
        x = x.split(": ", 1)[1]
    # The first column (sentence number or empty) is not needed.
    _, word, pos, tag = csv(x)
    words.append(word)
    poss.append(pos)
    tags.append(tag)

# The loop only flushes when the next sentence starts, so the final
# sentence has to be stored explicitly.
sentences.append([words, poss, tags])

with open('data.json', 'w') as f:
    json.dump(sentences, f)

# Read the file back so `sentences` holds exactly what was serialized.
with open('data.json', 'r') as f:
    sentences = json.load(f)
	import json

	# Load the raw dataset rows; each element keeps its trailing newline.
	with open('ner_dataset.csv', 'r') as f:
	    lines = f.readlines()

	# Discard the first two lines of the file.
	# NOTE(review): presumably header rows -- confirm against the dataset.
	lines.pop(0)
	lines.pop(0)

	# Completed sentences, each stored as [words, poss, tags].
	sentences = []

	# Accumulators for the sentence currently being assembled.
	words = []
	poss = []
	tags = []


	def csv(x):
	    """Split one CSV row into a list of field strings.

	    Commas separate fields unless they appear between double quotes. The
	    quote characters themselves are not part of the field, except that a
	    doubled quote (``""``) inside a quoted field stands for one literal
	    ``"``.

	    Args:
	        x: A single row of text without its line terminator.

	    Returns:
	        The fields in order. An empty row yields an empty list; otherwise
	        every field is returned, including empty ones.
	    """
	    fields = []
	    buff = []
	    quoted = False
	    i = 0
	    n = len(x)
	    while i < n:
	        c = x[i]
	        if c == '"':
	            if quoted and i + 1 < n and x[i + 1] == '"':
	                # Escaped quote inside a quoted field: keep one literal
	                # quote and skip its partner.
	                buff.append('"')
	                i += 1
	            else:
	                quoted = not quoted
	        elif c == ',' and not quoted:
	            fields.append(''.join(buff))
	            buff = []
	        else:
	            buff.append(c)
	        i += 1
	    # Always emit the last field of a non-empty row, even when empty.
	    if x:
	        fields.append(''.join(buff))
	    return fields


	# Group the token rows into sentences. A row that starts with
	# "Sentence: " opens a new sentence; every row contributes one
	# (word, pos, tag) triple to the sentence currently being built.
	for x in lines:
	    # Drop only the line terminator, never a real trailing character.
	    x = x.rstrip('\n')
	    if not x:
	        # A blank line carries no token and cannot be unpacked below.
	        continue
	    if x.startswith("Sentence: "):
	        # Flush the sentence collected so far before starting the next.
	        sentences.append([words, poss, tags])
	        words = []
	        poss = []
	        tags = []
	        # Split only once so a later ": " inside the row is not cut off.
	        x = x.split(": ", 1)[1]
	    # The first column (sentence number or empty) is not needed.
	    _, word, pos, tag = csv(x)
	    words.append(word)
	    poss.append(pos)
	    tags.append(tag)

	# The loop only flushes when the next sentence starts, so the final
	# sentence has to be stored explicitly.
	sentences.append([words, poss, tags])

	with open('data.json', 'w') as f:
	    json.dump(sentences, f)

	# Read the file back so `sentences` holds exactly what was serialized.
	with open('data.json', 'r') as f:
	    sentences = json.load(f)