Skip to content

Instantly share code, notes, and snippets.

@mdaniel
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdaniel/8816174 to your computer and use it in GitHub Desktop.
Save mdaniel/8816174 to your computer and use it in GitHub Desktop.
Dump discovered microdata descriptors (optionally along with their values) found in an HTML document
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
from bs4 import BeautifulSoup
def show_em_all(soup, with_values):
    """Print a CSS selector for every microdata item and property in *soup*.

    One ``.select(...)`` line is printed for each element that carries an
    ``itemtype`` attribute, followed by one tab-indented line for every
    whitespace-separated token of every ``itemprop`` attribute found
    beneath that element.

    :param soup: parsed document (or any node) exposing ``select()``.
    :param with_values: when true, each property line also shows the text
        of the element carrying the property; otherwise an empty string
        is shown.
    :raises ValueError: if a selector built from an attribute value does
        not match anything when it is run again (sanity check).
    """
    # TODO: some way of marking a nested itemtype as seen?
    # Maybe the upward tree traversal would yield a unique path.
    for item in soup.select('[itemtype]'):
        selector = '[itemtype="%s"]' % item.attrs['itemtype']
        print('.select(%r)' % selector)
        # The selector was derived from an element that exists, so it must
        # match at least once; anything else means the selector is malformed.
        if not soup.select(selector):
            raise ValueError('Quoi? %r' % selector)
        for prop in item.select('[itemprop]'):
            # The displayed value is the same for every token of this
            # element, so it is looked up once rather than once per token.
            value = prop.text if with_values else ''
            #: :type: str or unicode
            prop_names = prop.attrs['itemprop']
            for token in prop_names.split():
                # ``~=`` matches one token of a space-separated attribute.
                ip_selector = '[itemprop~="%s"]' % token
                if not item.select(ip_selector):
                    raise ValueError('Quoi? %r' % ip_selector)
                print('\t.select(%r) := %r' % (ip_selector, value))
def main(argv):
    """Parse the command line and dump the microdata of one HTML file.

    :param argv: full argument vector, program name first (the shape of
        ``sys.argv``). Recognised options are ``-v`` / ``--values``; the
        first positional argument is the path of the HTML file to read.
    :raises SystemExit: with status 2 when an option is not recognised or
        the file path is missing.
    """
    import getopt

    usage = 'usage: %s [-v|--values] FILE' % (argv[0] if argv else 'microdata')
    try:
        opts, args = getopt.getopt(argv[1:], 'v', ['values'])
    except getopt.GetoptError as err:
        print('%s\n%s' % (err, usage), file=sys.stderr)
        sys.exit(2)
    if not args:
        # Without this guard a missing path surfaced as a bare IndexError.
        print(usage, file=sys.stderr)
        sys.exit(2)

    show_values = any(name in ('-v', '--values') for name, _ in opts)
    # Read as bytes so BeautifulSoup can work out the document encoding.
    with open(args[0], 'rb') as fh:
        # Name the parser explicitly: left unspecified, bs4 picks whichever
        # parser is installed, so results can differ between machines.
        soup = BeautifulSoup(fh.read(), 'html.parser')
    show_em_all(soup, show_values)
# Run the dumper only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment