robgon-art/preprocess.py

## preprocess.py
# Download and unzip the CMU Book Summary Dataset
# NOTE: the leading "!" is notebook (IPython/Colab) shell-escape syntax, not
# Python, so these two lines only work when run inside a notebook cell.
# wget saves the archive as booksummaries.tar.gz in the working directory.
!wget -O booksummaries.tar.gz http://www.cs.cmu.edu/~dbamman/data/booksummaries.tar.gz
# Extract the archive; the code below reads booksummaries/booksummaries.txt,
# which is presumably unpacked by this step -- TODO confirm the archive layout.
!tar -xf booksummaries.tar.gz

# Import support for CSV files and the JSON format
import csv
import json

# Initialize the genre dictionary: maps each combined genre label
# (for example "Fantasy, Science Fiction") to the number of books that carry it
genre_groups = {}

# Positions of the fields used from each tab-separated row
_TITLE_COLUMN = 2
_GENRE_COLUMN = 5
_PLOT_COLUMN = 6


def format_genres(genre_string):
  """Return the genre names stored in *genre_string* as one label.

  *genre_string* is parsed as a JSON object; only its values are used.
  The values are sorted and joined with ", ". An empty object gives "".
  Raises json.JSONDecodeError if *genre_string* is not valid JSON.
  """
  return ", ".join(sorted(json.loads(genre_string).values()))


def shorten_plot(plot, limit=500):
  """Return the first part of *plot* followed by " ...".

  The text is cut to *limit* characters and everything after the last
  space is dropped so the result does not end in a partial word. The last
  word is dropped even when *plot* is shorter than *limit*.
  """
  return plot[:limit].rsplit(' ', 1)[0] + " ..."


def build_entries(rows, genre_counts):
  """Yield one output line per usable row and tally its genre label.

  *rows* is an iterable of field lists (such as a csv.reader).
  *genre_counts* is a dict that is updated in place as rows are consumed.
  Rows without a plot column and rows with an empty genre field are skipped.
  """
  for row in rows:
    # Blank or truncated lines have too few fields; skip them instead of
    # failing with an IndexError part-way through the file
    if len(row) <= _PLOT_COLUMN:
      continue

    # Get the genre
    genre_string = row[_GENRE_COLUMN]
    if not genre_string:
      continue

    # Parse the genres associated with the book
    genre = format_genres(genre_string)

    # Add the genre to the dictionary
    genre_counts[genre] = genre_counts.get(genre, 0) + 1

    # Get the title
    title = row[_TITLE_COLUMN]

    # Get the plot, and keep the first part
    plot = shorten_plot(row[_PLOT_COLUMN])

    yield 'GENRE: ' + genre + ' TITLE: ' + title + ' PLOT:' + plot


def write_story_plots(source_path, target_path, genre_counts):
  """Read the summaries at *source_path* and write one entry per line to *target_path*.

  Both files are opened in a with-statement so they are closed even if a
  row fails to parse. *genre_counts* is updated in place.
  """
  # Create and open the output file, then process the summaries to get the
  # genre, title, and the plot summary
  with open(target_path, "w", encoding="utf-8") as plot_file, \
       open(source_path, newline='', encoding='utf-8') as f:
    # NOTE(review): csv.reader applies its default quote handling here; if
    # the dataset is plain unquoted TSV, quoting=csv.QUOTE_NONE may be more
    # faithful -- confirm against the data before changing.
    reader = csv.reader(f, delimiter='\t')
    for entry in build_entries(reader, genre_counts):
      # Write the fields out to the output file
      plot_file.write(entry + '\n')


# Run the preprocessing when executed as a script or notebook cell (both run
# with __name__ set to "__main__"), but not when this code is imported
if __name__ == "__main__":
  write_story_plots('booksummaries/booksummaries.txt', "story_plots.txt", genre_groups)
	# Download and unzip the CMU Book Summary Dataset
	# NOTE: the leading "!" is notebook (IPython/Colab) shell-escape syntax, not
	# Python, so these two lines only work when run inside a notebook cell.
	# wget saves the archive as booksummaries.tar.gz in the working directory.
	!wget -O booksummaries.tar.gz http://www.cs.cmu.edu/~dbamman/data/booksummaries.tar.gz
	# Extract the archive; the code below reads booksummaries/booksummaries.txt,
	# which is presumably unpacked by this step -- TODO confirm the archive layout.
	!tar -xf booksummaries.tar.gz

	# Import support for CSV files and the JSON format
import csv
import json

# Initialize the genre dictionary: maps each combined genre label
# (for example "Fantasy, Science Fiction") to the number of books that carry it
genre_groups = {}

# Positions of the fields used from each tab-separated row
_TITLE_COLUMN = 2
_GENRE_COLUMN = 5
_PLOT_COLUMN = 6


def format_genres(genre_string):
  """Return the genre names stored in *genre_string* as one label.

  *genre_string* is parsed as a JSON object; only its values are used.
  The values are sorted and joined with ", ". An empty object gives "".
  Raises json.JSONDecodeError if *genre_string* is not valid JSON.
  """
  return ", ".join(sorted(json.loads(genre_string).values()))


def shorten_plot(plot, limit=500):
  """Return the first part of *plot* followed by " ...".

  The text is cut to *limit* characters and everything after the last
  space is dropped so the result does not end in a partial word. The last
  word is dropped even when *plot* is shorter than *limit*.
  """
  return plot[:limit].rsplit(' ', 1)[0] + " ..."


def build_entries(rows, genre_counts):
  """Yield one output line per usable row and tally its genre label.

  *rows* is an iterable of field lists (such as a csv.reader).
  *genre_counts* is a dict that is updated in place as rows are consumed.
  Rows without a plot column and rows with an empty genre field are skipped.
  """
  for row in rows:
    # Blank or truncated lines have too few fields; skip them instead of
    # failing with an IndexError part-way through the file
    if len(row) <= _PLOT_COLUMN:
      continue

    # Get the genre
    genre_string = row[_GENRE_COLUMN]
    if not genre_string:
      continue

    # Parse the genres associated with the book
    genre = format_genres(genre_string)

    # Add the genre to the dictionary
    genre_counts[genre] = genre_counts.get(genre, 0) + 1

    # Get the title
    title = row[_TITLE_COLUMN]

    # Get the plot, and keep the first part
    plot = shorten_plot(row[_PLOT_COLUMN])

    yield 'GENRE: ' + genre + ' TITLE: ' + title + ' PLOT:' + plot


def write_story_plots(source_path, target_path, genre_counts):
  """Read the summaries at *source_path* and write one entry per line to *target_path*.

  Both files are opened in a with-statement so they are closed even if a
  row fails to parse. *genre_counts* is updated in place.
  """
  # Create and open the output file, then process the summaries to get the
  # genre, title, and the plot summary
  with open(target_path, "w", encoding="utf-8") as plot_file, \
       open(source_path, newline='', encoding='utf-8') as f:
    # NOTE(review): csv.reader applies its default quote handling here; if
    # the dataset is plain unquoted TSV, quoting=csv.QUOTE_NONE may be more
    # faithful -- confirm against the data before changing.
    reader = csv.reader(f, delimiter='\t')
    for entry in build_entries(reader, genre_counts):
      # Write the fields out to the output file
      plot_file.write(entry + '\n')


# Run the preprocessing when executed as a script or notebook cell (both run
# with __name__ set to "__main__"), but not when this code is imported
if __name__ == "__main__":
  write_story_plots('booksummaries/booksummaries.txt', "story_plots.txt", genre_groups)