Skip to content

Instantly share code, notes, and snippets.

@mdaniel
Created September 23, 2013 06:22
Show Gist options
  • Save mdaniel/6666997 to your computer and use it in GitHub Desktop.
Save mdaniel/6666997 to your computer and use it in GitHub Desktop.
Outputs a textual description of the provided BBC programme URL (or local file).
#! /usr/bin/env python
import sys
from urllib import urlopen
from bs4 import BeautifulSoup
def build_xmlns_alias_map(html_tag):
    """
    Map every XML namespace declared on a tag to the alias it uses in this document.

    Only attributes whose name starts with ``xmlns:`` are considered, so a bare
    ``xmlns`` (default namespace) declaration and ordinary attributes are ignored.
    If the same namespace is declared under several aliases, the one visited
    last while iterating ``html_tag.attrs`` wins.

    :param html_tag: the `html` Tag that contains the `xmlns` attributes; only
        its ``attrs`` mapping (attribute name -> attribute value) is read
    :return: a dict whose key is the namespace and whose value is the alias in this document
    """
    prefix = 'xmlns:'
    return {
        namespace: attr_name[len(prefix):]
        for attr_name, namespace in html_tag.attrs.items()
        if attr_name.startswith(prefix)
    }
def _tag_text(tag):
    """Return ``tag.string``, or None when the lookup that produced ``tag`` found nothing."""
    return None if tag is None else tag.string


def main(argv):
    """
    Scans the provided HTML document and enumerates the MusicSegment structures found therein.

    The document named by ``argv[1]`` is fetched with ``urlopen`` and parsed with
    BeautifulSoup. The RDFa namespace aliases declared on the `html` tag are
    resolved first, because the ``typeof`` / ``property`` attribute values that
    are searched for are spelled with those document-specific aliases.

    For every music segment found below the element whose ``id`` is
    ``segments`` this prints an ``artist = ...`` line, a ``title = ...`` line,
    a ``label = ...`` line when a record label is present, and a blank line.
    Nothing is printed when the document has no ``segments`` element.

    :param argv: the command line; ``argv[1]`` is the location handed to ``urlopen``
    :raises IndexError: if ``argv`` has no element at index 1
    :raises KeyError: if one of the four expected namespaces is not declared on the `html` tag
    """
    XMLNS_DC = 'http://purl.org/dc/terms/'
    XMLNS_FOAF = 'http://xmlns.com/foaf/0.1/'
    XMLNS_MO = 'http://purl.org/ontology/mo/'
    XMLNS_PO = 'http://purl.org/ontology/po/'

    fh = urlopen(argv[1])
    try:
        markup = fh.read()
    finally:
        # release the handle even when the read fails part-way through
        fh.close()

    html = BeautifulSoup(markup).find('html', recursive=False)

    ns_alias_map = build_xmlns_alias_map(html)
    dc_ns = ns_alias_map[XMLNS_DC]
    mo_ns = ns_alias_map[XMLNS_MO]
    po_ns = ns_alias_map[XMLNS_PO]
    foaf_ns = ns_alias_map[XMLNS_FOAF]

    DcTitleNS = '%s:title' % dc_ns
    MusicSegmentNS = '%s:MusicSegment' % po_ns
    MusicArtistNS = '%s:MusicArtist' % mo_ns
    FoafNameNS = '%s:name' % foaf_ns

    segments = html.find(attrs={'id': 'segments'})
    if segments is None:
        # no segment container in this document, so there is nothing to list
        return

    for seg in segments.find_all(attrs={'typeof': MusicSegmentNS}):
        art = seg.find(attrs={'typeof': MusicArtistNS})
        art_name = None
        if art is not None:
            art_name = art.find(attrs={'property': FoafNameNS})
        if art_name is None:
            # no dedicated name element: fall back to the artist element itself
            art_name = art
        tit = seg.find(attrs={'property': DcTitleNS})

        release_label = None
        release = seg.find(attrs={'class': r'release'})
        if release is not None:
            release_label = release.find(attrs={'class': r'record-label'})

        print('artist = %s' % _tag_text(art_name))
        print('title = %s' % _tag_text(tit))
        if release_label is not None:
            print('label = %s' % release_label.string)
        print('')
# Run the scan only when executed as a script, so importing this module has no side effects.
if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment