A tool for scraping historical news headlines from major outlets' front pages via archive.org's Wayback Machine
import requests
import csv
from bs4 import BeautifulSoup
import datetime
import re
import os
from selenium import webdriver
import pandas as pd
class Scraper:
    # selenium init
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1200x600')
    options.add_argument('headless')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=options)
    names = {
        "New York Times": "https://www.nytimes.com/",
        "CNN": "https://www.cnn.com/",
        "FOX": "https://www.foxnews.com/",
        "New York Post": "https://nypost.com/",
        "BBC": "https://www.bbc.co.uk/",
        "Washington Post": "https://www.washingtonpost.com/",
        "USA Today": "https://www.usatoday.com/",
        "Daily Mail": "https://www.dailymail.co.uk/ushome/index.html",
        "CNBC": "https://www.cnbc.com/",
        "The Guardian": "https://www.theguardian.com/us"
    }
    namesTemp = {
        "CNN": "https://www.cnn.com/US"
    }
    filename = ""
    overwriteExisting = False
    def __init__(self, filename="data.csv", overwriteExisting=False):
        self.overwriteExisting = overwriteExisting
        self.filename = filename
        print(f"Initialized NewsScraper, saving data to {filename}")
    def isHeadline(self, url):
        if url is None:
            print("URL is None")
            return False
        if "nytimes.com" in url or "cnn.com" in url or "washingtonpost.com" in url:
            # these outlets put the publication date in the article path
            pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            match = re.search(pattern, url)
            if match:
                return True
            else:
                return False
        if "foxnews.com" in url:
            pattern = r'\/([a-z]+)\/'
            oldPattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            formattedUrl = url.split("foxnews.com")[1]
            match = re.search(pattern, formattedUrl)
            oldMatch = re.search(oldPattern, url)
            for horoscope in ["Aries", "Taurus", "Gemini", "Cancer", "Leo", "Virgo", "Libra", "Scorpio",
                              "Sagittarius", "Capricorn", "Aquarius", "Pisces"]:
                if horoscope in url:
                    return False
            if not match and not oldMatch:
                return False
            elif match:
                category = match.group(1).lower()
                if category in ["politics", "world", "entertainment", "sports", "us", "food-drink"]:
                    return True
            elif oldMatch:
                return True
            else:
                return False
        if "nypost.com" in url:
            oldPattern = r'www\.nypost\.com\/p\/'
            pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            match = re.search(pattern, url)
            oldMatch = re.search(oldPattern, url)
            if match or oldMatch:
                return True
            else:
                return False
        if "bbc.com" in url or "bbc.co.uk" in url:
            pattern = r'\/([a-z]+)\/'
            formattedUrl = url.split("bbc.")[1]
            match = re.search(pattern, formattedUrl)
            if not match:
                return False
            category = match.group(1).lower()
            if category in ["news", "sport", "worklife", "travel", "future", "culture"]:
                return True
            else:
                return False
        if "usatoday.com" in url:
            if "/story/" in url:
                return True
            else:
                return False
        if "dailymail." in url:
            if "/article-" in url:
                return True
            else:
                return False
        if "cnbc.com" in url:
            if "/id/" in url:
                return True
            else:
                pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
                match = re.search(pattern, url)
                if match:
                    return True
                else:
                    return False
        if "theguardian.com" in url or "guardian.co.uk" in url:
            # god i love the guardian for having CONSISTENT WEB DESIGN
            pattern = r'/(\d{4})/([a-z]+)/(\d{2})/'
            match = re.search(pattern, url)
            if match:
                return True
            else:
                return False
        return False  # URL doesn't belong to a recognized outlet
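    # A few illustrative checks (made-up URLs that follow each outlet's patterns):
    #   isHeadline("https://www.nytimes.com/2020/01/01/us/example.html")  -> True  (dated path)
    #   isHeadline("https://www.usatoday.com/story/news/example/")        -> True  (contains /story/)
    #   isHeadline("https://www.cnn.com/about")                           -> False (no dated path)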
    def scrapeFrontpage(self, url, name, useSelenium=False):
        if useSelenium:
            self.driver.get(url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
        else:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
        headline_elements = soup.find_all("a")
        results = {}
        for headline in headline_elements:
            href = headline.get("href")
            text = headline.text.strip().replace("\n", " ").replace("\t", " ")
            if href and self.isHeadline(href):
                # strip the Wayback Machine prefix so the stored URL is the original one
                pattern = r'/web/\d+/'
                modified = href.replace("http://web.archive.org", "")
                modified = re.sub(pattern, '', modified.replace("https://web.archive.org", ""))
                if "dailymail" in href and "Comments" in text:
                    continue
                results[text] = modified
        return results
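    # e.g. an archived link like
    #   "https://web.archive.org/web/20200101000000/https://www.cnn.com/2020/01/01/us/story.html"
    # is stored with the Wayback prefix stripped:
    #   "https://www.cnn.com/2020/01/01/us/story.html"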
    def scrapeDay(self, name, day="Today"):
        # Passing no argument for 'day' will result in data being scraped for today.
        url = self.names[name]
        print(day)
        response = None
        try:
            response = requests.get(f"http://archive.org/wayback/available?url={url}&timestamp={day}")
            url = response.json()["archived_snapshots"]["closest"]["url"]
            return self.scrapeFrontpage(url, name, useSelenium=False)
        except Exception:
            print(f"Failed to scrape {name} for {day}")
            if response is not None:
                print(f"Archive Response: {response.json()}")
            return "Failed to scrape"
    def getDateRange(self, start, end):
        start = datetime.datetime.strptime(start, "%m-%d-%Y")
        end = datetime.datetime.strptime(end, "%m-%d-%Y")
        date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days)]
        # Wayback timestamps are YYYYMMDD, per https://archive.org/help/wayback_api.php
        formatted = [date.strftime("%Y%m%d") for date in date_generated]
        return formatted
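    # e.g. getDateRange("01-01-2011", "01-04-2011") -> ["20110101", "20110102", "20110103"]
    # (the end date itself is excluded)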
    def scrapeRange(self, start, end):
        dates = self.getDateRange(start, end)
        done = {}
        log = open("log.txt", "a", encoding="utf-8")
        logData = []
        # note: 'r+' requires the CSV to already exist; reading it first leaves
        # the file position at the end, so new rows are appended
        with open(self.filename, "r+", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            reader = csv.DictReader(csvfile, delimiter=",")
            data = list(reader)
            if len(data) == 0:
                writer.writerow(["Date", "Publication", "Headline", "URL"])
            for i, row in enumerate(data):
                print(i)
                done[(row["Date"], row["Publication"])] = True
            print(f"Already done: {len(done)}")
            for day in dates:
                cache = []
                logData = []
                for name in self.names:
                    print(name)
                    if (day, name) in done:
                        print(f"{name} for {day} already done")
                        continue
                    info = self.scrapeDay(name, day)
                    if info == "Failed to scrape":
                        print(f"{name} for {day} failed to scrape.")
                        logData.append(f"{name} for {day} failed to scrape.\n")
                        continue
                    if len(info) == 0:
                        print(f"{name} for {day} returned no results.")
                        logData.append(f"{name} for {day} returned no results.\n")
                        continue
                    # filtering, because dailymail really makes too many articles
                    add = list(info.items())
                    limit = 100
                    if len(add) > limit:
                        add = add[:limit]
                    for headline, url in add:
                        cache.append([day, name, headline, url])
                writer.writerows(cache)
                log.writelines(logData)
                # flush after each day so an interrupted run loses at most one day of work
                csvfile.flush()
                os.fsync(csvfile.fileno())
                log.flush()
                os.fsync(log.fileno())
        log.close()
        # print(data)
def generateDataAvailabilityCSV():
    # Format:
    # Date, New York Times, Washington Post, etc
    # 01-01-2011, True, False, etc
    data = []
    with open("data.csv", "r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        data = list(reader)
    done = {}
    for i, row in enumerate(data):
        print(i)
        done[(row["Date"], row["Publication"])] = True
    try:
        df = pd.read_csv('availability.csv')
    except FileNotFoundError:
        df = pd.DataFrame(
            columns=['Date', 'New York Times', 'CNN', 'FOX', 'New York Post', 'BBC', 'Washington Post', 'USA Today',
                     'Daily Mail', 'CNBC', 'The Guardian'])
    for date, publication in done:
        if date not in df["Date"].values:
            df = pd.concat([df,
                            pd.DataFrame([[date, False, False, False, False, False, False, False, False, False, False]],
                                         columns=df.columns)])
        df.loc[df["Date"] == date, publication] = True
    df.sort_values(by=['Date'], inplace=True)
    df.to_csv('availability.csv', index=False)
def sortCSV():
    def get_sort_value(outlet):
        # sorting_dict = {outlet: index for index, outlet in enumerate(sorting_order)}
        # sorting_order = ['New York Times', 'CNN', 'FOX', 'New York Post', 'BBC', 'Washington Post', 'USA Today',
        #                  'Daily Mail', 'CNBC', 'The Guardian']
        sorting_dict = {"New York Times": 0, "CNN": 1, "FOX": 2, "New York Post": 3, "BBC": 4, "Washington Post": 5,
                        "USA Today": 6, "Daily Mail": 7, "CNBC": 8, "The Guardian": 9}
        return sorting_dict.get(outlet, len(sorting_dict))

    df = pd.read_csv('data.csv')
    df['ViewershipValue'] = df['Publication'].map(get_sort_value)
    df.sort_values(by=['Date', 'ViewershipValue'], inplace=True)
    df.drop('ViewershipValue', axis=1, inplace=True)
    df.to_csv('headlines.csv', index=False)
# scraper = Scraper()
# scraper.scrapeRange("01-01-2011", "01-02-2013")
# sortCSV()
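A fuller run of the pipeline might look like this (a sketch with a hypothetical date range; it assumes chromedriver is on the PATH, since the class body launches a headless Chrome as soon as the class is defined, and that data.csv already exists, since scrapeRange opens it in "r+" mode):

scraper = Scraper(filename="data.csv")
scraper.scrapeRange("01-01-2020", "01-08-2020")  # MM-DD-YYYY; the end date is exclusive
generateDataAvailabilityCSV()  # per-day, per-outlet coverage grid -> availability.csv
sortCSV()  # sorted copy of data.csv -> headlines.csv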
Ignore the selenium stuff and the namesTemp dictionary; those were for testing purposes.
At some point the CNN data cut out, so I had to switch from cnn.com to cnn.com/US (the cnn.com front page wouldn't load anything with JavaScript disabled).
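To sanity-check what the Wayback Machine will serve for a given page and day, you can hit the same availability endpoint that scrapeDay uses (a minimal sketch; the URLs and date are just examples):

import requests

def closest_snapshot(url, day):
    # 'day' is a YYYYMMDD Wayback timestamp
    resp = requests.get("http://archive.org/wayback/available",
                        params={"url": url, "timestamp": day})
    snaps = resp.json().get("archived_snapshots", {})
    return snaps.get("closest", {}).get("url")

# compare the two CNN entry points for the same day:
print(closest_snapshot("https://www.cnn.com/", "20200101"))
print(closest_snapshot("https://www.cnn.com/US", "20200101"))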