Skip to content

Instantly share code, notes, and snippets.

@sgsfak
Last active December 14, 2015 21:28
Show Gist options
  • Save sgsfak/5151354 to your computer and use it in GitHub Desktop.
Save sgsfak/5151354 to your computer and use it in GitHub Desktop.
Extract the VPH-Toolkit tools from their web page
[
{
"self": "http://toolkit.vph-noe.eu/home/tools/data-conversion-tools.html",
"name": "Data Conversion Tools"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/data-fusion.html",
"name": "Data Fusion"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/collaborative-tools.html",
"name": "Collaborative Tools"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html",
"narrower": [
{
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/solvers.html",
"name": "Solvers"
},
{
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/editors.html",
"name": "Editors"
},
{
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/analysis.html",
"name": "Analysis"
},
{
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/converters.html",
"name": "Converters"
}
],
"name": "Modelling Tools"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/solvers.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html",
"name": "Modelling Tools"
},
"name": "Solvers"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/editors.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html",
"name": "Modelling Tools"
},
"name": "Editors"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/analysis.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html",
"name": "Modelling Tools"
},
"name": "Analysis"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/modelling-tools/converters.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/modelling-tools.html",
"name": "Modelling Tools"
},
"name": "Converters"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/imaging.html",
"narrower": [
{
"resource": "http://toolkit.vph-noe.eu/home/tools/imaging/development-environments%10platforms.html",
"name": "Development environments/platforms"
}
],
"name": "Imaging"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/imaging/development-environments%10platforms.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/imaging.html",
"name": "Imaging"
},
"name": "Development environments/platforms"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html",
"narrower": [
{
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources/tools-for-accessing-compute-resources.html",
"name": "Tools for accessing compute resources"
},
{
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources/workflow.html",
"name": "Workflow"
}
],
"name": "Compute resources"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources/tools-for-accessing-compute-resources.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html",
"name": "Compute resources"
},
"name": "Tools for accessing compute resources"
},
{
"self": "http://toolkit.vph-noe.eu/home/tools/compute-resources/workflow.html",
"broader": {
"resource": "http://toolkit.vph-noe.eu/home/tools/compute-resources.html",
"name": "Compute resources"
},
"name": "Workflow"
}
]
#!/usr/bin/env python
"""scrape the VPH-Toolkit tools"""
from bs4 import BeautifulSoup
from urlparse import urljoin
import urllib
# First page of the paginated tools listing; scraping starts here (see doit()).
U = 'http://toolkit.vph-noe.eu/home/tools.html'
def parse_page(u):
def normalize_tag(t):
return [m.strip() for m in t.split('\n')]
print "Retrieving %s" % (u,)
soup = BeautifulSoup(urllib.urlopen(u))
print " \xe2\x86\xb3 OK"
table = soup.select("table.sobi2Listing tr")
tools = []
for tr in soup.select("table.sobi2Listing")[0].findAll('tr', recursive=False):
td = tr.find('td')
a = td.find('a', 'title_vc')
name = a.text.strip()
link = a.attrs['href']
t = td.find('table', 'myview')
sdescr = t.find('p', 'mypara_short').text.strip()
g = t.findAll("tr")[2].findAll("td")[1]
tg = g.span.next_sibling
users = []
if tg:
users = [i.strip() for i in tg.split(";")]
tc = t.findAll("tr")[4].findAll("td")
tags = [normalize_tag(c.text.strip()) for c in tc[0].findAll('a')]
cats = [c.text for c in tc[1].findAll('a')]
tools.append({'name': name, 'id':link, 'description': sdescr, 'tag':sum(tags,[]), 'category': cats, 'users':users, 'origin':'vph-toolkit'})
next = soup.select("span.pagination")[0].find('a', attrs={'title': 'Next'})
if next:
next = urljoin(U, next.attrs['href'])
return {'tools': tools, 'next': next}
def download_imgs(tools, dir="imgs"):
"""Downloads the "logos" of the tools by visiting each tool specific
web page in the VPH-Toolkit site and locating the image. The image is saved
in the supplied directory"""
import os
for t in tools:
u = t['id']
print "Retrieving %s" % (u,)
soup = BeautifulSoup(urllib.urlopen(u))
print " \xe2\x86\xb3 OK"
img = soup.find('img', {'class': 'sobi2DetailsImage'})
if img:
h = img.attrs['src']
i = u.rfind('/')
j = u.rfind('.')
k = h.rfind('.')
fn = os.path.join(dir, u[i+1:j]+h[k:])
print "Downloading %s to %s" % (h, fn)
urllib.urlretrieve(h, fn)
t['img'] = fn
return tools
def doit():
    """Walk every listing page, starting at U, and return all tools found."""
    collected = []
    page = {'next': U}
    while page['next']:
        page = parse_page(page['next'])
        collected.extend(page['tools'])
    return collected
def cacheem(tools):
    """Caches the given tools into a local Redis database"""
    import json
    from redis import Redis
    conn = Redis()
    for tool in tools:
        # Mark the record kind, then store it JSON-serialized under a
        # 'wbench:tool:<url>' key.
        tool['type'] = 'tool'
        conn.set('wbench:tool:' + tool['id'], json.dumps(tool))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment