Created
July 4, 2021 16:41
-
-
Save prakhar21/a28b27c3186079e8cb02a838446d015f to your computer and use it in GitHub Desktop.
reading corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs
import glob
import os
import sys
import time

import matplotlib.pyplot as plt
import requests
def book_label(path):
    """Return the display label for a book file.

    The label is the file's base name up to (but not including) the first
    underscore, e.g. ``"some/dir/Book 1_extra.txt"`` -> ``"Book 1"``.

    The base name is taken with ``os.path.basename`` instead of a fixed
    ``split("/")[2]`` index, so the result does not depend on how many
    directories precede the file name.
    """
    return os.path.basename(path).split("_")[0]


# Books present (sorted so the listing and the corpus order are stable).
books = sorted(glob.glob("data/harrypotter/*.txt"))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Available Books: \n")
for book_path in books:
    print(book_label(book_path))

# Read data from all books to single corpus variable.
# NOTE(review): `t` and `chars` are not used in this section; they are kept
# because later code may rely on them -- confirm before removing.
temp = ""
t = ""
chars = []
book_raw = []  # one decoded text per book, in the same order as `books`
for book in books:
    # codecs.open decodes the file contents as UTF-8 while reading.
    with codecs.open(book, "rb", "utf-8") as infile:
        temp = infile.read()
        book_raw.append(temp)

# Available Books:
# Book 1 - The Philosopher's Stone
# Book 2 - The Chamber of Secrets
# Book 3 - The Prisoner of Azkaban
# Book 4 - The Goblet of Fire
# Book 5 - The Order of the Phoenix
# Book 6 - The Half Blood Prince
# Book 7 - The Deathly Hallows
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment