# jeshuamaxey/prepare_sentence_golds.py

## prepare_sentence_golds.py
import csv
from textblob import TextBlob

import sys
# Python 2 only: ``reload(sys)`` makes ``sys.setdefaultencoding`` callable
# again (the interpreter removes it from ``sys`` during start-up); the call
# then makes UTF-8 the default codec for implicit str/unicode conversions
# anywhere in this process.
reload(sys)
sys.setdefaultencoding("utf-8")

# CSV locations used by main(): the document-level gold file that is read and
# the sentence-level gold file that is written.
# NOTE(review): both values look like placeholders ("absolute/path/to/...");
# presumably they must be edited to real locations before running - confirm.
INFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: document_gold.csv"
OUTFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: sentence_gold.csv"


def get_sentences_from_document_goldfile(infile_name):
    """Split every positively-labelled document of a gold CSV into sentences.

    Reads ``infile_name`` with ``csv.DictReader``.  For each row whose
    ``label_gold`` column equals ``'yes'``, the row's ``content`` (encoded to
    ASCII with unencodable characters ignored) is handed to ``TextBlob`` and
    one record is produced per item of ``blob.sentences``.  Rows with any
    other label are skipped.

    Args:
        infile_name: Path of the document-level CSV.  The columns
            ``label_gold``, ``content`` and ``model_id`` are read.

    Returns:
        A list of dicts, one per sentence, in file order.  Each dict has the
        keys ``content`` (the sentence object yielded by TextBlob),
        ``document_id`` (a 0-based counter over the accepted documents only),
        ``source_id`` (always ``0``), ``model_id`` (copied from the source
        row), ``text_type`` (always ``'sentence'``), ``label_gold`` and
        ``label_gold_reason`` (both ``''``) and ``_golden`` (always
        ``'TRUE'``).
    """
    sentences = []
    doc_id = 0

    with open(infile_name) as infile:
        for row in csv.DictReader(infile):
            if row['label_gold'] != 'yes':
                continue

            blob = TextBlob(row['content'].encode('ascii', 'ignore'))

            for sentence in blob.sentences:
                # The record is built without rebinding ``row``: reusing the
                # row's name for the record would make every sentence after
                # the first read 'model_id' from the previous record instead
                # of from the CSV row.
                sentences.append({
                    'content': sentence,
                    'document_id': doc_id,
                    'source_id': 0,
                    'model_id': row['model_id'],
                    'text_type': 'sentence',
                    'label_gold': '',
                    'label_gold_reason': '',
                    '_golden': 'TRUE',
                })

            # Only accepted documents consume an id, so ids stay contiguous
            # even when rows are skipped.
            doc_id += 1

    return sentences


def write_sentence_samples_to_file(samples, outfile_name):
    """Write sentence records to a CSV file and print how many were written.

    The file at ``outfile_name`` is created or overwritten.  A header row is
    written first, followed by one row per record, with the columns in the
    fixed order ``document_id, source_id, model_id, text_type, content,
    label_gold, label_gold_reason, _golden``.

    Args:
        samples: Sized sequence of dicts keyed by the column names above.
        outfile_name: Path of the CSV file to write.

    Raises:
        ValueError: From ``csv.DictWriter`` if a record contains a key that
            is not one of the columns.
    """
    fieldnames = ['document_id', 'source_id', 'model_id', 'text_type', 'content', 'label_gold', 'label_gold_reason', '_golden']

    # The csv module emits its own '\r\n' row endings and needs newline
    # translation switched off on the target file; otherwise platforms that
    # translate '\n' produce '\r\r\n'.  Python 2 spells that as binary mode,
    # Python 3 as newline=''.
    if sys.version_info[0] < 3:
        outfile = open(outfile_name, 'wb')
    else:
        outfile = open(outfile_name, 'w', newline='')

    with outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(samples)

    # A single parenthesised argument prints the same text on Python 2 and 3.
    print('Written {} samples to {}'.format(len(samples), outfile_name))


def main():
    """Convert the document gold file at INFILE_NAME into OUTFILE_NAME."""
    write_sentence_samples_to_file(
        get_sentences_from_document_goldfile(INFILE_NAME),
        OUTFILE_NAME
    )


# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()