Created
May 8, 2018 01:30
-
-
Save milesmcc/7a1e2b0ec68c725c6f9d6544e2e4421c to your computer and use it in GitHub Desktop.
[History 300] Searching the UCSB presidential statement archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from datetime import date, datetime
import json

from tqdm import tqdm
import unicodecsv as csv

# define queries
# Queries are matched against lower-cased document text, so they must be
# written in lower case to ever match.
queries = ["pentagon papers", "ellsberg", "vietnam"]

# initialize data
# Maps each query to a list of matching documents; every entry is a dict
# with the keys "text", "date" and "name".
query_results = {}
for query in queries:
    query_results[query] = []

# Number of files skipped because no date could be parsed from the filename.
continued = 0

# input/output location constants
input_directory = "/Users/Miles/Downloads/presidential_statements/"
output_directory = "/Users/Miles/Desktop/presidential_statements/"

# load and search data
print("Loading & searching data from input directory...")
for filename in tqdm(os.listdir(input_directory)):
    # Everything before the first ":" in the filename is taken as the
    # speaker name (it ends up in the SPEAKER column of the output).
    name = filename.split(":")[0]
    try:
        # The date is read from the last " - "-separated piece of the
        # filename, cut at the first "." (e.g. "... - May 8, 2018.txt").
        date_text = filename.split(" - ")[-1].split(".")[0]
        real_date = datetime.strptime(date_text, '%B %d, %Y').date()
    except ValueError:  # no date listed
        continued += 1
        continue  # better to ignore than to include potentially false data

    with open(os.path.join(input_directory, filename), "r") as infile:
        text = infile.read()

    # Lower-case once per document instead of once per query.
    lowered_text = text.lower()
    for query in queries:
        if query in lowered_text:
            query_results[query].append({
                "text": text,
                "date": real_date,
                "name": name,
            })

# user output on skipped items
print("Was forced to skip %s items." % str(continued))

# output data
# Write one CSV per query into output_directory, creating it on first use.
# (The isdir/makedirs pair is used instead of exist_ok= so the script keeps
# working on Python 2 as well as Python 3.)
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

for query in queries:
    output_path = os.path.join(output_directory, query + '.results.csv')
    # unicodecsv encodes rows itself, so the file is opened in binary mode.
    with open(output_path, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["SPEAKER", "DATE", "TEXT"])
        for result in query_results[query]:
            writer.writerow([result["name"], result["date"], result["text"]])
    print("Wrote %s documents for query '%s'" % (len(query_results[query]), query))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment