Skip to content

Instantly share code, notes, and snippets.

@afiaka87
Created January 29, 2021 04:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afiaka87/13ddc362e502602ff755558a31e404d9 to your computer and use it in GitHub Desktop.
Save afiaka87/13ddc362e502602ff755558a31e404d9 to your computer and use it in GitHub Desktop.
"""
# Setup:
`python3 -m pip install bs4`
# Usage:
```python3
# First edit episode_id_param and num_pages near the top of transcripts.py, then run:
python3 transcripts.py
```
"""
import urllib
import urllib.request

from bs4 import BeautifulSoup
# --- Configuration -----------------------------------------------------------
# Go to the show on foreverdreaming; the id is the number captured by
# `.*/?f=(\d+).*` in the show's URL.
episode_id_param = 104
num_pages = 7  # Enter the number of listing pages for the show.
start_param = 0  # (don't change) The first page of results is at index 0.
increment_by = 25  # (don't change) Pages are indexed via 25, 50, 75, etc.
page = "https://transcripts.foreverdreaming.org/viewforum.php?f={}&start={}"  # (don't change)
site_root = "https://transcripts.foreverdreaming.org/"  # Prefix for relative topic links.


def build_page_urls(episode_id, page_count, first=start_param, step=increment_by):
    """Return the listing-page URLs for a show.

    Args:
        episode_id: Value substituted for the `f=` query parameter.
        page_count: How many listing pages to generate; 0 yields an empty list.
        first: `start=` offset of the first page.
        step: Offset distance between consecutive pages.

    Returns:
        A list of `page_count` URLs built from the `page` template.
    """
    last = first + page_count * step
    return [page.format(episode_id, offset) for offset in range(first, last, step)]


def is_topic_link(href):
    """Return True when *href* is a relative link to a transcript topic.

    Anchors without an `href` attribute yield None, which is rejected here
    instead of raising TypeError on the substring test.
    """
    return bool(href) and "./viewtopic" in href


def topic_url(href):
    """Turn a relative `./viewtopic...` link into an absolute URL."""
    return site_root + href.replace("./", "")


def fetch(url):
    """Download *url* and return the raw response body as bytes.

    The response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read()


def collect_topic_links(listing_html):
    """Return every transcript-topic href found in one listing page."""
    soup = BeautifulSoup(listing_html, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    return [href for href in hrefs if is_topic_link(href)]


def save_transcript(href):
    """Download one transcript and write the text of its <p> elements to disk.

    The output file is named after the raw relative link, so it lands in the
    current working directory.
    NOTE(review): that name contains `?`, which Windows file systems do not
    accept in file names -- confirm before running there.
    """
    soup = BeautifulSoup(fetch(topic_url(href)), "html.parser")
    paragraphs = [elem.text for elem in soup.find_all("p")]
    # Explicit encoding so non-ASCII characters do not depend on the
    # platform's default; `with` guarantees the file is flushed and closed.
    with open(href, "w", encoding="utf-8") as out:
        print("\n".join(paragraphs), file=out)


def main():
    """Scrape every listing page, then save each linked transcript once."""
    links = []
    for url in build_page_urls(episode_id_param, num_pages):
        links.extend(collect_topic_links(fetch(url)))
    # dict.fromkeys drops repeated links while preserving first-seen order,
    # so a topic linked several times is fetched and written only once.
    for href in dict.fromkeys(links):
        save_transcript(href)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment