Skip to content

Instantly share code, notes, and snippets.

@antonrasmussen
Created September 17, 2022 00:46
Show Gist options
  • Save antonrasmussen/7bef1c9013458909ce9f7b736d7b05b8 to your computer and use it in GitHub Desktop.
Save antonrasmussen/7bef1c9013458909ce9f7b736d7b05b8 to your computer and use it in GitHub Desktop.
Scraping Marc Maron's WTF Dispatches
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
# Locally saved copy of the "Dispatches" index page. Every article linked
# from it is downloaded and its text appended to OUTPUT_FILE.
html_doc = "Dispatches — WTF with Marc Maron Podcast.html"
OUTPUT_FILE = 'out.txt'
# Pause after every request to avoid an angry response about making too
# many requests.
REQUEST_DELAY_SECONDS = 90


def read_index(path):
    """Return a list of ``(date_text, url)`` pairs, one per article in the index.

    The file is opened in binary mode so BeautifulSoup detects the page's
    encoding itself instead of relying on the platform default.
    """
    with open(path, 'rb') as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    entries = []
    # A single pass keeps each date paired with its own URL.
    for article in soup.find('div', id='content').find_all('article'):
        article_date = str(article.find('a').contents[0])
        # NOTE(review): the URL is cut out of the tag's string form by
        # splitting on '=' and ' ', which presumably picks the href of the
        # heading link -- confirm against the saved markup. It breaks if the
        # attribute order changes or the URL itself contains '='.
        raw_url = str(article.find('h1').contents[1]).split('=')[2].split(' ')[0]
        entries.append((article_date, raw_url.replace("\"", "")))
    return entries


def fetch_page_text(url):
    """Download *url* and return its visible text with newlines turned into spaces."""
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, "html.parser").get_text().replace('\n', ' ')


def extract_article_text(text, article_date, end_marker="Powered"):
    """Return the part of *text* from *article_date* up to *end_marker*.

    ``str.find`` returns -1 when the needle is absent; using that value as a
    slice bound silently yields an empty or truncated result. A missing date
    therefore falls back to the start of the text and a missing end marker to
    the end of the text. The end marker is searched for only after the date so
    an earlier occurrence cannot produce an empty slice.
    """
    start_loc = text.find(article_date)
    if start_loc == -1:
        start_loc = 0
    end_loc = text.find(end_marker, start_loc)
    if end_loc == -1:
        end_loc = len(text)
    # I have found that Frank's name comes up a lot, so it is stripped out.
    return text[start_loc:end_loc].replace('Frank Cappello', '')


def main():
    """Append every article listed in the saved index page to OUTPUT_FILE."""
    entries = read_index(html_doc)
    for article_number, (article_date, url) in enumerate(entries, start=1):
        # Compute the page text once and reuse it.
        text = fetch_page_text(url)
        # Explicit UTF-8 so non-ASCII article text does not fail on
        # platforms whose default encoding is narrower.
        with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
            print(extract_article_text(text, article_date), file=f)
            print("\n", file=f)
        print('Sleep start: ' + str(article_number))
        time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == "__main__":
    main()
@antonrasmussen
Copy link
Author

I sleep to avoid getting an angry response about making too many requests

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment