@Hypercoded
Created July 17, 2023 22:23
A tool for scraping headlines from archive.org
import csv
import datetime
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
class Scraper:
    # selenium init (headless Chrome, images disabled to speed up page loads)
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1200x600')
    options.add_argument('headless')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=options)

    names = {
        "New York Times": "https://www.nytimes.com/",
        "CNN": "https://www.cnn.com/",
        "FOX": "https://www.foxnews.com/",
        "New York Post": "https://nypost.com/",
        "BBC": "https://www.bbc.co.uk/",
        "Washington Post": "https://www.washingtonpost.com/",
        "USA Today": "https://www.usatoday.com/",
        "Daily Mail": "https://www.dailymail.co.uk/ushome/index.html",
        "CNBC": "https://www.cnbc.com/",
        "The Guardian": "https://www.theguardian.com/us"
    }

    namesTemp = {
        "CNN": "https://www.cnn.com/US"
    }

    filename = ""
    overwriteExisting = False

    def __init__(self, filename="data.csv", overwriteExisting=False):
        self.overwriteExisting = overwriteExisting  # was hardcoded to False, which ignored the argument
        self.filename = filename
        print(f"Initialized NewsScraper, saving data to {filename}")
    def isHeadline(self, url):
        if url is None:
            print("URL is None")
            return False

        if "nytimes.com" in url or "cnn.com" in url or "washingtonpost.com" in url:
            # Article URLs embed the publication date as /YYYY/MM/DD/
            pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            match = re.search(pattern, url)
            if match:
                return True
            else:
                return False

        if "foxnews.com" in url:
            pattern = r'\/([a-z]+)\/'
            oldPattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            formattedUrl = url.split("foxnews.com")[1]
            match = re.search(pattern, formattedUrl)
            oldMatch = re.search(oldPattern, url)
            for horoscope in ["Aries", "Taurus", "Gemini", "Cancer", "Leo", "Virgo", "Libra", "Scorpio",
                              "Sagittarius", "Capricorn", "Aquarius", "Pisces"]:
                if horoscope in url:
                    return False
            if not match and not oldMatch:
                return False
            elif match:
                category = match.group(1).lower()
                if category in ["politics", "world", "entertainment", "sports", "us", "food-drink"]:
                    return True
            elif oldMatch:
                return True
            else:
                return False

        if "nypost.com" in url:
            oldPattern = r'www\.nypost\.com\/p\/'
            pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
            match = re.search(pattern, url)
            oldMatch = re.search(oldPattern, url)
            if match or oldMatch:
                return True
            else:
                return False

        if "bbc.com" in url or "bbc.co.uk" in url:
            pattern = r'\/([a-z]+)\/'
            formattedUrl = url.split("bbc.")[1]
            match = re.search(pattern, formattedUrl)
            if not match:
                return False
            category = match.group(1).lower()
            if category in ["news", "sport", "worklife", "travel", "future", "culture"]:
                return True
            else:
                return False

        if "usatoday.com" in url:
            if "/story/" in url:
                return True
            else:
                return False

        if "dailymail." in url:
            if "/article-" in url:
                return True
            else:
                return False

        if "cnbc.com" in url:
            if "/id/" in url:
                return True
            else:
                pattern = r'/(\d{4})/(\d{2})/(\d{2})/'
                match = re.search(pattern, url)
                if match:
                    return True
                else:
                    return False

        if "theguardian.com" in url or "guardian.co.uk" in url:
            # god i love the guardian for having CONSISTENT WEB DESIGN
            pattern = r'/(\d{4})/([a-z]+)/(\d{2})/'
            match = re.search(pattern, url)
            if match:
                return True
            else:
                return False

        # Unrecognised domain: treat as not a headline (previously fell through and returned None)
        return False
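    # Illustrative (hypothetical) URLs for the heuristics above -- not taken from scraped data:
    #   https://www.nytimes.com/2013/05/02/world/example-story.html  -> True  (date-in-path pattern)
    #   https://www.nytimes.com/section/politics                     -> False (no /YYYY/MM/DD/)
    #   https://www.usatoday.com/story/news/2013/05/02/example/1/    -> True  ("/story/" rule)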
    def scrapeFrontpage(self, url, name, useSelenium=False):
        if useSelenium:
            self.driver.get(url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
        else:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")

        headline_elements = soup.find_all("a")
        results = {}
        for headline in headline_elements:
            href = headline.get("href")
            text = headline.text.strip().replace("\n", " ").replace("\t", " ")
            if href and self.isHeadline(href):
                # Strip the Wayback Machine prefix so only the original article URL is stored,
                # e.g. "https://web.archive.org/web/20130101000000/https://example.com/..." -> "https://example.com/..."
                pattern = r'/web/\d+/'
                modified = href.replace("http://web.archive.org", "")
                modified = re.sub(pattern, '', modified.replace("https://web.archive.org", ""))
                if "dailymail" in href and "Comments" in text:
                    continue
                results[text] = modified
        return results
def scrapeDay(self, name, day="Today"):
# Passing no argument for 'day' will result in data being scraped for today.
url = self.names[name]
print(day)
try:
response = requests.get(f"http://archive.org/wayback/available?url={url}&timestamp={day}")
url = response.json()["archived_snapshots"]["closest"]["url"]
return self.scrapeFrontpage(url, name, useSelenium=False)
except:
print(f"Failed to scrape {name} for {day}")
print(f"Archive Response: {response.json()}")
return "Failed to scrape"
    def getDateRange(self, start, end):
        start = datetime.datetime.strptime(start, "%m-%d-%Y")
        end = datetime.datetime.strptime(end, "%m-%d-%Y")
        date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days)]
        formatted = [date.strftime("%Y%m%d") for date in date_generated]
        # https://archive.org/help/wayback_api.php
        return formatted
    def scrapeRange(self, start, end):
        dates = self.getDateRange(start, end)
        done = {}
        log = open("log.txt", "a", encoding="utf-8")
        logData = []
        # Note: "r+" requires self.filename to already exist (an empty file is enough).
        with open(self.filename, "r+", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile, delimiter=",")
            reader = csv.DictReader(csvfile, delimiter=",")
            data = list(reader)
            if len(data) == 0:
                writer.writerow(["Date", "Publication", "Headline", "URL"])
            for i, row in enumerate(data):
                print(i)
                done[(row["Date"], row["Publication"])] = True
            print(f"Already done: {len(done)}")
            for day in dates:
                cache = []
                logData = []
                for name in self.names:
                    print(name)
                    if (day, name) in done:
                        print(f"{name} for {day} already done")
                        continue
                    info = self.scrapeDay(name, day)
                    if info == "Failed to scrape":
                        print(f"{name} for {day} failed to scrape.")
                        logData.append(f"{name} for {day} failed to scrape.\n")
                        continue
                    if len(info) == 0:
                        print(f"{name} for {day} returned no results.")
                        logData.append(f"{name} for {day} returned no results.\n")
                        continue
                    # filtering, because dailymail really makes too many articles
                    add = list(info.items())
                    limit = 100
                    if len(add) > limit:
                        add = add[:limit]
                    for headline, url in add:
                        cache.append([day, name, headline, url])
                writer.writerows(cache)
                log.writelines(logData)
                csvfile.flush()
                os.fsync(csvfile.fileno())
                log.flush()
                os.fsync(log.fileno())
        log.close()
        # print(data)
def generateDataAvailabilityCSV():
    # Format:
    # Date, New York Times, Washington Post, etc
    # 01-01-2011, True, False, etc
    data = []
    with open("data.csv", "r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        data = list(reader)
    done = {}
    for i, row in enumerate(data):
        print(i)
        done[(row["Date"], row["Publication"])] = True
    try:
        df = pd.read_csv('availability.csv')
    except FileNotFoundError:
        df = pd.DataFrame(
            columns=['Date', 'New York Times', 'CNN', 'FOX', 'New York Post', 'BBC', 'Washington Post', 'USA Today',
                     'Daily Mail', 'CNBC', 'The Guardian'])
    for date, publication in done:
        if date not in df["Date"].values:
            df = pd.concat([df,
                            pd.DataFrame([[date, False, False, False, False, False, False, False, False, False, False]],
                                         columns=df.columns)])
        df.loc[df["Date"] == date, publication] = True
    df.sort_values(by=['Date'], inplace=True)
    df.to_csv('availability.csv', index=False)
def sortCSV():
    def get_sort_value(outlet):
        # sorting_dict = {outlet: index for index, outlet in enumerate(sorting_order)}
        # sorting_order = ['New York Times', 'CNN', 'FOX', 'New York Post', 'BBC', 'Washington Post', 'USA Today',
        #                  'Daily Mail', 'CNBC', 'The Guardian']
        sorting_dict = {"New York Times": 0, "CNN": 1, "FOX": 2, "New York Post": 3, "BBC": 4,
                        "Washington Post": 5, "USA Today": 6, "Daily Mail": 7, "CNBC": 8, "The Guardian": 9}
        return sorting_dict.get(outlet, len(sorting_dict))

    df = pd.read_csv('data.csv')
    df['ViewershipValue'] = df['Publication'].map(get_sort_value)
    df.sort_values(by=['Date', 'ViewershipValue'], inplace=True)
    df.drop('ViewershipValue', axis=1, inplace=True)
    df.to_csv('headlines.csv', index=False)


# scraper = Scraper()
# scraper.scrapeRange("01-01-2011", "01-02-2013")
# sortCSV()
@Hypercoded (Author)

Ignore the Selenium setup and the namesTemp dictionary; those were for testing purposes.

For some reason CNN data cut out at some point, so I had to switch from cnn.com to cnn.com/US (the cnn.com front page wouldn't load anything with JavaScript disabled).
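For reference, a minimal usage sketch, assuming the script above is saved as `scraper.py` (a hypothetical module name) and that an empty `data.csv` already exists, since `scrapeRange` opens it with `"r+"`:

```python
# Minimal usage sketch; names below mirror the gist, "scraper" is a hypothetical module name.
from scraper import Scraper, sortCSV

scraper = Scraper(filename="data.csv")           # data.csv must already exist (opened with "r+")
scraper.scrapeRange("01-01-2011", "01-02-2013")  # one Wayback snapshot per outlet per day
sortCSV()                                        # writes headlines.csv sorted by date and outlet
```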
