Skip to content

Instantly share code, notes, and snippets.

@jeshuamaxey
Created March 10, 2016 16:07
Show Gist options
  • Save jeshuamaxey/bee85740917274a4d0a6 to your computer and use it in GitHub Desktop.
Save jeshuamaxey/bee85740917274a4d0a6 to your computer and use it in GitHub Desktop.
Expects a document gold data file to exist. Breaks the content of the document gold data into sentences and saves them to a CSV file.
import csv
from textblob import TextBlob
import sys
# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# reload(sys) comes first because sys.setdefaultencoding is normally removed
# from the sys namespace once the interpreter has started.
reload(sys)
sys.setdefaultencoding("utf-8")
# Path of the document-level gold CSV that main() reads.
# NOTE(review): "absolute/path/to" looks like a placeholder to be replaced
# with a real location before running -- confirm.
INFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: document_gold.csv"
# Path of the sentence-level CSV that main() writes (same placeholder prefix).
OUTFILE_NAME = r"absolute/path/to/text-labelling/text-labelling/cf_files/explosion: sentence_gold.csv"
def get_sentences_from_document_goldfile(infile_name):
    """Split every gold-positive document of a CSV file into sentence samples.

    The file is read with ``csv.DictReader``. For each row whose
    ``label_gold`` column equals ``'yes'``, the ``content`` column is reduced
    to ASCII (characters that cannot be encoded are dropped) and split into
    sentences with TextBlob. One sample dict is produced per sentence; rows
    with any other ``label_gold`` value contribute nothing.

    Args:
        infile_name: Path of the document gold CSV. Its rows must provide
            the ``label_gold``, ``content`` and ``model_id`` columns.

    Returns:
        A list of dicts with the keys ``content``, ``document_id``,
        ``source_id``, ``model_id``, ``text_type``, ``label_gold``,
        ``label_gold_reason`` and ``_golden``. ``content`` holds the sentence
        object yielded by ``TextBlob(...).sentences``.

    Raises:
        KeyError: If a row lacks ``label_gold``, or a gold-positive row lacks
            ``content`` or ``model_id``.
    """
    sentences = []
    with open(infile_name) as infile:
        reader = csv.DictReader(infile)
        # NOTE(review): the script as received had lost its indentation, so
        # the position of the original ``doc_id += 1`` is ambiguous. It is
        # taken here to advance once per input row, which makes document_id
        # the zero-based row index of the source document -- confirm.
        for doc_id, row in enumerate(reader):
            if row['label_gold'] != 'yes':
                continue
            blob = TextBlob(row['content'].encode('ascii', 'ignore'))
            for sentence in blob.sentences:
                # Build each sample under its own name: the original rebound
                # the row variable here and then read 'model_id' back out of
                # the previously built sample on later sentences.
                sentences.append({
                    'content': sentence,
                    'document_id': doc_id,
                    'source_id': 0,
                    'model_id': row['model_id'],
                    'text_type': 'sentence',
                    'label_gold': '',
                    'label_gold_reason': '',
                    '_golden': 'TRUE',
                })
    return sentences
def write_sentence_samples_to_file(samples, outfile_name):
    """Write sentence samples to a CSV file and report how many were written.

    A header row is written first, followed by one row per sample. Columns
    appear in the order document_id, source_id, model_id, text_type, content,
    label_gold, label_gold_reason, _golden. After the file has been closed a
    one-line summary is printed to standard output.

    Args:
        samples: Sized collection of dicts keyed by the column names above.
        outfile_name: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` if a sample contains a key
            that is not one of the column names.
    """
    fieldnames = [
        'document_id',
        'source_id',
        'model_id',
        'text_type',
        'content',
        'label_gold',
        'label_gold_reason',
        '_golden',
    ]
    with open(outfile_name, 'w') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(samples)
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and is also valid as a Python 3 call.
    print('Written {} samples to {}'.format(len(samples), outfile_name))
def main():
    """Read the document gold file and write its sentences to the output file."""
    write_sentence_samples_to_file(
        get_sentences_from_document_goldfile(INFILE_NAME),
        OUTFILE_NAME
    )


# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment