Last active
December 14, 2015 21:28
-
-
Save sgsfak/5151354 to your computer and use it in GitHub Desktop.
Extract the VPH-Toolkit tools from their web page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/data-conversion-tools.html", | |
"name": "Data Conversion Tools" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/data-fusion.html", | |
"name": "Data Fusion" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/collaborative-tools.html", | |
"name": "Collaborative Tools" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html", | |
"narrower": [ | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/solvers.html", | |
"name": "Solvers" | |
}, | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/editors.html", | |
"name": "Editors" | |
}, | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/analysis.html", | |
"name": "Analysis" | |
}, | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/converters.html", | |
"name": "Converters" | |
} | |
], | |
"name": "Modelling Tools" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/solvers.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html", | |
"name": "Modelling Tools" | |
}, | |
"name": "Solvers" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/editors.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html", | |
"name": "Modelling Tools" | |
}, | |
"name": "Editors" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/analysis.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html", | |
"name": "Modelling Tools" | |
}, | |
"name": "Analysis" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/converters.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html", | |
"name": "Modelling Tools" | |
}, | |
"name": "Converters" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/imaging.html", | |
"narrower": [ | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/imaging/development-environments%10platforms.html", | |
"name": "Development environments/platforms" | |
} | |
], | |
"name": "Imaging" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/imaging/development-environments%10platforms.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/imaging.html", | |
"name": "Imaging" | |
}, | |
"name": "Development environments/platforms" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html", | |
"narrower": [ | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources/tools-for-accessing-compute-resources.html", | |
"name": "Tools for accessing compute resources" | |
}, | |
{ | |
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources/workflow.html", | |
"name": "Workflow" | |
} | |
], | |
"name": "Compute resources" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources/tools-for-accessing-compute-resources.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html", | |
"name": "Compute resources" | |
}, | |
"name": "Tools for accessing compute resources" | |
}, | |
{ | |
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources/workflow.html", | |
"broader": { | |
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html", | |
"name": "Compute resources" | |
}, | |
"name": "Workflow" | |
} | |
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""scrape the VPH-Toolkit tools""" | |
from bs4 import BeautifulSoup | |
from urlparse import urljoin | |
import urllib | |
# First page of the VPH-Toolkit tool listing: doit() starts crawling here and
# parse_page() uses it as the base when resolving "Next" page links.
U = 'http://toolkit.vph-noe.eu/home/tools.html'
def parse_page(u):
    """Scrape one listing page of the VPH-Toolkit tool catalogue.

    Fetches the page at ``u``, walks the direct rows of the first
    ``table.sobi2Listing`` element and converts each row that carries a
    ``title_vc`` link into a tool record.

    Args:
        u: URL of the listing page to fetch.

    Returns:
        A dict with two keys:

        * ``'tools'`` -- list of dicts, one per tool, with the keys ``name``,
          ``id`` (the href of the title link), ``description``, ``tag``,
          ``category``, ``users`` and ``origin`` (always ``'vph-toolkit'``).
        * ``'next'`` -- the "Next" pagination link resolved against ``U``, or
          ``None`` when the page has no such link (or no pagination at all).

    Raises:
        IndexError, AttributeError: if the page has no listing table or a
            row's inner ``table.myview`` does not have the expected layout.
    """
    print("Retrieving %s" % (u,))
    response = urllib.urlopen(u)
    try:
        soup = BeautifulSoup(response)
    finally:
        # The document is fully parsed at this point; release the connection.
        response.close()
    print(" \xe2\x86\xb3 OK")

    tools = []
    listing = soup.select("table.sobi2Listing")[0]
    for tr in listing.findAll('tr', recursive=False):
        td = tr.find('td')
        a = td.find('a', 'title_vc') if td is not None else None
        if a is None:
            # Not a tool row (no title link): nothing to extract.
            continue
        name = a.text.strip()
        link = a.attrs['href']

        t = td.find('table', 'myview')
        sdescr = t.find('p', 'mypara_short').text.strip()
        rows = t.findAll("tr")

        # Third row, second cell: a <span> label followed by a
        # ';'-separated text node listing the users.
        g = rows[2].findAll("td")[1]
        tg = g.span.next_sibling
        users = []
        if tg:
            users = [i.strip() for i in tg.split(";")]

        # Fifth row: the first cell links the tags, the second the categories.
        tc = rows[4].findAll("td")
        # A single tag link may hold several newline-separated tags; flatten
        # them into one list.
        tags = [m.strip()
                for c in tc[0].findAll('a')
                for m in c.text.strip().split('\n')]
        cats = [c.text for c in tc[1].findAll('a')]

        tools.append({
            'name': name,
            'id': link,
            'description': sdescr,
            'tag': tags,
            'category': cats,
            'users': users,
            'origin': 'vph-toolkit',
        })

    # A listing that fits on one page may have no pagination element at all;
    # treat that the same as "no Next link".
    next_url = None
    pagination = soup.select("span.pagination")
    if pagination:
        next_link = pagination[0].find('a', attrs={'title': 'Next'})
        if next_link is not None:
            next_url = urljoin(U, next_link.attrs['href'])
    return {'tools': tools, 'next': next_url}
def download_imgs(tools, dir="imgs"):
    """Download the logo of every tool into a local directory.

    For each tool the page at ``t['id']`` is fetched and searched for an
    ``<img class="sobi2DetailsImage">`` element. When one is found the image
    is saved as ``<page name><image extension>`` inside ``dir`` and the local
    path is stored in ``t['img']``. Tools whose page has no such image are
    left untouched.

    Args:
        tools: list of tool dicts; each must have an ``'id'`` key holding the
            URL of the tool's page. The dicts are modified in place.
        dir: directory that receives the images; created if it is missing.

    Returns:
        The same ``tools`` list that was passed in.
    """
    import os

    # urlretrieve cannot write into a directory that does not exist yet.
    if not os.path.isdir(dir):
        os.makedirs(dir)

    for t in tools:
        u = t['id']
        print("Retrieving %s" % (u,))
        response = urllib.urlopen(u)
        try:
            soup = BeautifulSoup(response)
        finally:
            response.close()
        print(" \xe2\x86\xb3 OK")

        img = soup.find('img', {'class': 'sobi2DetailsImage'})
        if img is None:
            continue

        # The src attribute may be relative to the tool page.
        h = urljoin(u, img.attrs['src'])

        # File name = last path segment of the page URL without its
        # extension, plus the extension of the image URL (ignoring any
        # query string or fragment).
        page_name = os.path.splitext(u[u.rfind('/') + 1:])[0]
        img_path = h.split('#', 1)[0].split('?', 1)[0]
        fn = os.path.join(dir, page_name + os.path.splitext(img_path)[1])

        print("Downloading %s to %s" % (h, fn))
        urllib.urlretrieve(h, fn)
        t['img'] = fn
    return tools
def doit():
    """Crawl the whole tool listing, starting at ``U``.

    Parses the first page and then keeps following the ``'next'`` URL
    reported by ``parse_page`` until a page reports none.

    Returns:
        A single list with the tool records of all visited pages, in the
        order the pages were visited.
    """
    page = parse_page(U)
    collected = page['tools']
    following = page['next']
    while following:
        page = parse_page(following)
        collected.extend(page['tools'])
        following = page['next']
    return collected
def cacheem(tools):
    """Store every tool in Redis as a JSON document.

    Each tool dict is tagged in place with ``type = 'tool'`` and written
    under the key ``'wbench:tool:' + t['id']``. The Redis client is created
    with its default connection settings.

    Args:
        tools: iterable of tool dicts, each with a string ``'id'`` key.
    """
    import json
    from redis import Redis

    store = Redis()
    for tool in tools:
        tool['type'] = 'tool'
        key = 'wbench:tool:' + tool['id']
        payload = json.dumps(tool)
        store.set(key, payload)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment