Created
July 7, 2017 06:48
-
-
Save bigwestern/e56a308a34410e762e1cefaff8a2df44 to your computer and use it in GitHub Desktop.
Scrape the conference panels from a nomadit.co.uk-hosted event. Output is HTML.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Usage:
#   c:\Python36\python junkcode\nomadit.py > abstracts.html
#
# or, to list the available options:
#   c:\Python36\python junkcode\nomadit.py --help
#
import argparse
import sys

import requests
from bs4 import BeautifulSoup
from jinja2 import Template
# Jinja2 template for the generated report. ``items`` is a sequence of
# (title, convenors, abstract) triples; each triple becomes one <div>, in the
# order given. The .panel-title rule styles elements carrying that class, which
# is the class the scraper looks up when it extracts a panel's title.
# The doctype keeps browsers out of quirks mode when the file is opened.
HTML = """
<!DOCTYPE html>
<html>
  <head>
    <title>Abstracts</title>
    <style>
      .panel-title {
        font-size: 2em;
        margin-top: 0.67em;
        margin-bottom: 0.67em;
        margin-left: 0;
        margin-right: 0;
        font-weight: bold;
      }
    </style>
  </head>
  <body>
    {% for title, convenors, abstract in items %}
    <div>
      {{ title }}
      {{ convenors }}
      {{ abstract }}
    </div>
    {% endfor %}
  </body>
</html>
"""
def join_url(a, b):
    """Glue two URL fragments together with exactly one slash between them.

    Trailing slashes are dropped from ``a`` and leading slashes from ``b``
    before the pieces are joined, so ``join_url("x/", "/y")`` gives ``"x/y"``.
    """
    head = a.rstrip("/")
    tail = b.lstrip("/")
    return head + "/" + tail
def get_panel_anchors(url, timeout=30):
    """Download the panel index page and return the links to each panel.

    Args:
        url: Address of the page that lists all panels.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The ``<a>`` elements carrying the ``panel_list_entry`` class, in
        document order (empty when the page has none).

    Raises:
        requests.HTTPError: If the server answers with an error status, so a
            failed download is reported instead of producing an empty report.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # find_all is the supported spelling; findAll is a deprecated alias.
    return soup.find_all("a", class_="panel_list_entry")
def get_panel_abstracts(base_url, anchors, timeout=30):
    """Fetch every linked panel page and pull out its title, convenors and abstract.

    Args:
        base_url: Conference base URL that each anchor's ``href`` is appended to.
        anchors: Link elements pointing at panel pages. Anchors without an
            ``href`` are ignored.
        timeout: Seconds to wait for each panel page before giving up.

    Returns:
        A list of ``(title, convenors, abstract)`` tuples, one per panel page
        that was downloaded successfully. Each member is the matching ``<div>``
        element, or ``None`` when the page has no such element.

    Raises:
        requests.RequestException: For connection problems and timeouts.
    """
    panels = []
    # One session for the whole crawl so the connection to the host is reused.
    with requests.Session() as session:
        for anchor in anchors:
            href = anchor.get("href")
            if not href:
                # Nothing to follow; indexing anchor["href"] would raise KeyError.
                continue
            panel_url = join_url(base_url, href)
            response = session.get(panel_url, timeout=timeout)
            if not response.ok:
                # An error page contains no panel markup. Report it on stderr
                # (stdout carries the generated HTML) and move on.
                print("Skipping %s: HTTP %s" % (panel_url, response.status_code),
                      file=sys.stderr)
                continue
            panel_soup = BeautifulSoup(response.text, 'html.parser')
            title = panel_soup.find("div", class_="panel-title")
            convenors = panel_soup.find("div", class_="panel-convenors")
            abstract = panel_soup.find("div", class_="panel-longabstract")
            panels.append((title, convenors, abstract))
    return panels
def render(panels):
    """Fill the HTML template with ``panels`` and write the page to stdout.

    ``panels`` is handed to the template as ``items``.
    """
    template = Template(HTML)
    page = template.render(items=panels)
    print(page)
def main(args):
    """Scrape all panels of the conference at ``args.base_url`` and print the report.

    The panel index lives at a fixed path below the base URL; every panel it
    links to is fetched and the collected results are rendered as HTML.
    """
    base_url = args.base_url
    index_url = join_url(base_url, '/conferencesuite.php/panels/Views/allpanels')
    panel_links = get_panel_anchors(index_url)
    render(get_panel_abstracts(base_url, panel_links))
if __name__ == '__main__':
    # Command-line entry point: the only option is the conference base URL,
    # which defaults to the "shiftingstates" event.
    arg_parser = argparse.ArgumentParser(
        description='Scrape the panels from an nomadit.co.uk hosted conference.',
    )
    arg_parser.add_argument(
        '--base-url', '-b',
        dest='base_url',
        default='http://nomadit.co.uk/shiftingstates',
        help='Set the base conference url',
    )
    args = arg_parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment