Skip to content

Instantly share code, notes, and snippets.

@milesmcc
Created May 8, 2018 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milesmcc/7a1e2b0ec68c725c6f9d6544e2e4421c to your computer and use it in GitHub Desktop.
Save milesmcc/7a1e2b0ec68c725c6f9d6544e2e4421c to your computer and use it in GitHub Desktop.
[History 300] Searching the UCSB presidential statement archive
import os
from datetime import date, datetime
import json
from tqdm import tqdm
import unicodecsv as csv
# Search a directory of presidential statement files for a fixed set of
# phrases and write one CSV of matching documents per phrase.

# define queries
# Matching below is a substring test against the lowercased document text,
# so every query must itself be written in lowercase.
queries = ["pentagon papers", "ellsberg", "vietnam"]

# initialize data
# Maps each query to the list of matching documents; every entry is a dict
# with the keys "text", "date" and "name".
query_results = {query: [] for query in queries}

# Number of files skipped because no date could be parsed from the filename.
continued = 0

# output location constants
input_directory = "/Users/Miles/Downloads/presidential_statements/"
# NOTE(review): output_directory is never used -- the CSV files below are
# written relative to the current working directory. Confirm whether they
# were meant to be written here instead.
output_directory = "/Users/Miles/Desktop/presidential_statements/"

# load and search data
print("Loading & searching data from input directory...")
for filename in tqdm(os.listdir(input_directory)):
    # The speaker name is whatever precedes the first ":" in the filename.
    name = filename.split(":")[0]

    # The date is taken from the text after the last " - ", up to the first
    # "." in that piece (e.g. "January 5, 1971").
    date_text = filename.split(" - ")[-1].split(".")[0]
    try:
        real_date = datetime.strptime(date_text, '%B %d, %Y').date()
    except ValueError:  # no date listed
        continued += 1
        continue  # better to ignore than to include potentially false data

    with open(os.path.join(input_directory, filename), "r") as infile:
        text = infile.read()

    # Lowercase once per document instead of once per query.
    lowered_text = text.lower()
    for query in queries:
        if query in lowered_text:
            query_results[query].append({
                "text": text,
                "date": real_date,
                "name": name,
            })

# user output on skipped items
print("Was forced to skip %s items." % continued)

# output data
for query in queries:
    # unicodecsv writes encoded bytes, hence the binary file mode.
    with open(query + '.results.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["SPEAKER", "DATE", "TEXT"])
        for result in query_results[query]:
            writer.writerow([result["name"], result["date"], result["text"]])
    print("Wrote %s documents for query '%s'" % (len(query_results[query]), query))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment