Created
August 20, 2020 01:39
-
-
Save robgon-art/0c387cd90fc94cf23d72229b052596a5 to your computer and use it in GitHub Desktop.
Preprocess book summaries for GPT-2 training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download and unzip the CMU Book Summary Dataset
!wget -O booksummaries.tar.gz http://www.cs.cmu.edu/~dbamman/data/booksummaries.tar.gz
!tar -xf booksummaries.tar.gz
# Import support for CSV files, the JSON format, and frequency counting
import csv
import json
from collections import Counter
# Initialize the genre dictionary (combined genre label -> number of books)
genre_groups = Counter()


def _format_genres(genre_string):
    """Return the genre names from a JSON object, sorted and comma-joined.

    Only the values of the JSON object are used; its keys are ignored.
    """
    return ", ".join(sorted(json.loads(genre_string).values()))


def _truncate_plot(plot, max_chars=500):
    """Return the first part of a plot, cut at a word boundary.

    A plot that already fits in max_chars is returned unchanged, so a short
    summary does not lose its final word or gain a misleading ellipsis.
    Longer plots are clipped to max_chars, the (possibly partial) last word
    is dropped, and " ..." is appended.
    """
    if len(plot) <= max_chars:
        return plot
    return plot[:max_chars].rsplit(' ', 1)[0] + " ..."


def preprocess_summaries(input_path, output_path, max_plot_chars=500):
    """Convert tab-separated book summaries into one training line per book.

    Each written line has the form
    ``GENRE: <genres> TITLE: <title> PLOT:<plot>``.

    Args:
        input_path: Tab-separated file; column 2 is the title, column 5 the
            JSON genre object and column 6 the plot summary.
        output_path: Text file to create (overwritten if it exists).
        max_plot_chars: Maximum number of plot characters kept before the
            plot is cut at a word boundary.

    Returns:
        A Counter mapping each combined genre label to its number of books.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-empty genre field is not valid JSON.
    """
    counts = Counter()
    # Both files are managed by the with-statement so the output is flushed
    # and closed even if a row fails to parse.
    with open(input_path, newline='', encoding='utf-8') as src, \
            open(output_path, "w", encoding="utf-8") as dst:
        # The data is raw tab-separated text, not quoted CSV. QUOTE_NONE
        # keeps a field that starts with '"' intact instead of letting the
        # reader treat it as a quoted field that may swallow later rows.
        reader = csv.reader(src, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip blank or incomplete lines rather than raising IndexError
            if len(row) < 7:
                continue
            # Books without any genre information are not used
            genre_string = row[5]
            if not genre_string:
                continue
            # Parse the genres associated with the book and count the label
            genre = _format_genres(genre_string)
            counts[genre] += 1
            # Get the title and the first part of the plot
            title = row[2]
            plot = _truncate_plot(row[6], max_plot_chars)
            # Write the fields out to the output file
            dst.write(f"GENRE: {genre} TITLE: {title} PLOT:{plot}\n")
    return counts


# A notebook cell and a directly executed script both run with
# __name__ == "__main__", so the preprocessing still happens when this is run.
if __name__ == "__main__":
    genre_groups = preprocess_summaries(
        'booksummaries/booksummaries.txt', "story_plots.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment