"""Scrape recent job-post links from progressivedatajobs.org.

Takes an optional command-line argument giving the number of listing
pages to scrape (default 1) and writes one HTML link per line to
recentposts.txt.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import sys


def gen_url(baseurl="https://www.progressivedatajobs.org/", page=1):
    # Build the URL for a given results page; page 1 is the bare base URL.
    if page == 1:
        return baseurl
    else:
        return baseurl + "page%s/" % page


def gen_list(url):
    # Fetch one listing page and pull out every post-title heading.
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    links = soup.find_all('h2', "archive__item-title")
    return [gen_html(link) for link in links]


def gen_html(link):
    # Turn a title heading into a simple HTML paragraph linking to the post.
    url = link.find('a').get('href')
    title = link.get_text().strip()
    return "<p><a href='%s'>%s</a></p>" % (url, title)


if __name__ == "__main__":
    # Optional argument: number of pages to scrape (default 1).
    if len(sys.argv) == 1:
        num_pages = 1
    else:
        num_pages = int(sys.argv[1])

    links = []
    for i in range(1, num_pages + 1):
        links = links + gen_list(gen_url(page=i))

    # Write one HTML link per line.
    with open('recentposts.txt', 'w') as f:
        for link in links:
            f.write("%s\n" % link)
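
# Quick doctest-style sanity check of the helpers, shown only as a sketch:
# the <h2> fragment below is a hypothetical snippet mimicking the site's
# "archive__item-title" markup, not a real page.
#   >>> gen_url()
#   'https://www.progressivedatajobs.org/'
#   >>> gen_url(page=2)
#   'https://www.progressivedatajobs.org/page2/'
#   >>> tag = BeautifulSoup(
#   ...     "<h2 class='archive__item-title'>"
#   ...     "<a href='http://example.org/job'>Data Analyst</a></h2>",
#   ...     'html.parser').h2
#   >>> gen_html(tag)
#   "<p><a href='http://example.org/job'>Data Analyst</a></p>"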