@zmwangx
Created December 27, 2018 19:43
Python script to extract URLs from HTML documents.

#!/usr/bin/env python3
"""Extract URLs from HTML documents."""
import argparse
import re
import sys
import urllib.parse

import bs4
import requests

_TAG_ATTRS = {
    'a': {'href'},
    'applet': {'code', 'archive', 'codebase'},
    'area': {'href'},
    'audio': {'src'},
    'base': {'href'},
    'blockquote': {'cite'},
    'body': {'background'},
    'button': {'formaction'},
    'del': {'cite'},
    'embed': {'src'},
    'form': {'action'},
    'frame': {'longdesc', 'src'},
    'head': {'profile'},
    'html': {'manifest'},
    'iframe': {'longdesc', 'src'},
    'img': {'longdesc', 'src'},
    'input': {'formaction', 'src'},
    'ins': {'cite'},
    'link': {'href'},
    'menuitem': {'icon'},
    'object': {'archive', 'codebase', 'data'},
    'q': {'cite'},
    'script': {'src'},
    'source': {'src'},
    'video': {'src'},
}
"""HTML tags and attributes that can hold URLs.

According to W3Schools, only the listed attributes of the listed tags
(keys) can hold URLs. Both HTML 4.01 and HTML 5 are included.

The data are scraped from w3schools.com on 2015-05-09.
"""

# disguise as Microsoft Edge, because apparently some servers are not
# thrilled to see python-requests in the UA string
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240"}


def urlgrep(pattern=None, content=None, filepath=None, url=None,
            selector=None, base=None, deduplicate=True, session=None):
"""Extract URLs matching a pattern from an HTML document.
The HTML document is either passed in full as a string (the
`content` parameter), or is read from a local file (the `filepath`
parameter), or is retrieved from a remote URL (the `url`
parameter). Only one of the these three -- the first one in
`content`, `filepath`, and `url` that is not ``None`` -- is
used. One of the three must be specified.
Scope of search can be refined with an optional CSS selector
(`selector` parameter). If specified, search is performed inside
*all* matched tags but no more. This should help greatly in noise
reduction, especially from web pages that try to offer suggestions
and whatnot at the bottom or in sidebars.
Sometimes we need a base URL to resolve relative URLs. The `url`
argument or its redirection (if any) is automatically used as base
when the document is retrieved from `url`. Otherwise, the `base`
parameter, if specified and not None, is used. If none of the above
applies, ``http://localhost`` is used as a fallback. Note that the
HTML ``<base>`` tag within the document may change the base URL, and
that is respected.
The `pattern` parameter supplies a regex pattern to match parsed
URLs. Only matching URLs are returned. If no pattern is specified,
``r"^(?!javascript:)"`` is used to exclude the "``javascript:``
scheme". You may supply an empty string if you want to include
``javascript:``.
Parameters
----------
pattern : str, optional
Regex pattern to match parsed URLs against. Default is
``r"^(?!javascript:)"``.
content : bytes or str, optional
An HTML document.
filepath : str, optional
Path to a local HTML document.
url : str, optional
URL pointing to an HTML document (note that the ``file`` scheme
is not supported by ``requests``). If no scheme is supplied in
the URL, use ``http://``.
selector : str, optional
CSS selector recognized by BeautifulSoup. If specified, search
scope is limited to all matched tags but no more.
base : str, optional
Base URL used for `content` for `filepath`. If no scheme is
supplied in the URL, use ``http://`` (for a local path, use
``file://``).
deduplicate : str, optional
Deduplicate matching URLs and only keep the first occurrence of
each unique URL. Default is ``True``.
session : requests.Session, optional
If not ``None``, make HTTP requests within this session. Default
is ``None``.
Returns
-------
matching_urls : list
List of parsed absolute URLs matching the given pattern.
Raises
------
ValueError
If content, filepath and url are all None.
OSError
If failed to open the specified file.
requests.exceptions.RequestException
If requests fail to retrieve the URL specified.
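
    Examples
    --------
    A hypothetical call; the URL, selector, and pattern below are
    placeholder values, not anything prescribed by this script::

        pdf_urls = urlgrep(pattern=r"\.pdf$",
                           url="https://example.com/docs",
                           selector="div#content")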
"""
    # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
    urlscheme = re.compile(r"^\w+://")
    base = "localhost" if base is None else base
    if not urlscheme.match(base):
        base = "http://%s" % base
    if content is not None:
        pass
    elif filepath is not None:
        with open(filepath, mode='rb') as fileobj:
            content = fileobj.read()
    elif url is not None:
        if not urlscheme.match(url):
            url = "http://%s" % url
        if session is None:
            request = requests.get(url, headers=REQUEST_HEADERS)
        else:
            request = session.get(url)
        content = request.content
        base = request.url
    else:
        raise ValueError("content, filepath and url cannot all be None")
    regex = (re.compile(pattern) if pattern is not None
             else re.compile(r"^(?!javascript:)"))
    soup = bs4.BeautifulSoup(content, "html.parser")
    # base URL might be modified by the HTML <base> tag, which must
    # reside inside <head>
    if soup.head and soup.head.base and "href" in soup.head.base.attrs:
        base = soup.head.base["href"]
    # select part of the soup with the optional selector
    selections = [soup] if selector is None else soup.select(selector)
    matching_urls = []
    for selection in selections:
        for tag in selection.descendants:
            if tag.name in _TAG_ATTRS:
                for attribute in _TAG_ATTRS[tag.name]:
                    if attribute in tag.attrs:
                        parsed_url = urllib.parse.urljoin(base, tag[attribute])
                        if regex.search(parsed_url):
                            matching_urls.append(parsed_url)
    if deduplicate:
        seen = set()
        seen_add = seen.add
        matching_urls = [url for url in matching_urls
                         if not (url in seen or seen_add(url))]
    return matching_urls
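
# Usage note (hypothetical, not part of the original gist): the `session`
# parameter lets one requests.Session be reused across several calls, e.g.
#
#     session = requests.Session()
#     for page_url in ["https://example.com/1", "https://example.com/2"]:
#         print(urlgrep(url=page_url, session=session))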


def main():
    """CLI interface."""
    description = """Parse URLs from HTML documents. When invoked with
    no URLs or files, read from stdin."""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-u", "--url", action="append",
                        help="""URL to an HTML document to be
                        parsed. This option can be specified multiple
                        times on the command line. "http://" is
                        automatically attached if the scheme is left
                        out.""")
    parser.add_argument("-s", "--selector",
                        help="""CSS selector for restricting search
                        scope. If specified, the search scope is
                        restricted to all selected tags, and no
                        more.""")
    parser.add_argument("-b", "--base",
                        help="""Base URL. Only used for files or
                        stdin. "http://" is automatically attached if
                        the scheme is left out.""")
    parser.add_argument("-p", "--pattern",
                        help="""Regexp to match against.""")
    parser.add_argument("-d", "--preserve-duplicates", action="store_true",
                        help="""Do not deduplicate URLs within a document.""")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="""Print additional information to stderr.""")
    parser.add_argument("filepaths", metavar="FILE", nargs="*",
                        help="""Files to be parsed.""")
    args = parser.parse_args()

    urls = args.url if args.url is not None else []
    selector = args.selector
    filepaths = args.filepaths
    base = args.base
    pattern = args.pattern
    deduplicate = not args.preserve_duplicates
    # be verbose only when --verbose is specified and there is more
    # than one source
    verbose = len(urls) + len(filepaths) >= 2 if args.verbose else False

    returncode = 0
    if not urls and not filepaths:
        content = sys.stdin.read()
        matching_urls = urlgrep(pattern=pattern,
                                content=content,
                                selector=selector,
                                base=base,
                                deduplicate=deduplicate)
        print('\n'.join(matching_urls))
        sys.stdout.flush()
    else:
        for url in urls:
            try:
                matching_urls = urlgrep(pattern=pattern,
                                        url=url,
                                        selector=selector,
                                        deduplicate=deduplicate)
                if verbose and matching_urls:
                    sys.stderr.write("# from '%s':\n" % url)
                    sys.stderr.flush()
                print('\n'.join(matching_urls))
                sys.stdout.flush()
            except requests.exceptions.RequestException as err:
                sys.stderr.write("error: failed to get '%s'\n" % url)
                sys.stderr.write("error: %s\n" % str(err))
                sys.stderr.flush()
                returncode = 1
        for filepath in filepaths:
            try:
                matching_urls = urlgrep(pattern=pattern,
                                        filepath=filepath,
                                        selector=selector,
                                        base=base,
                                        deduplicate=deduplicate)
                if verbose and matching_urls:
                    sys.stderr.write("# from '%s':\n" % filepath)
                    sys.stderr.flush()
                print('\n'.join(matching_urls))
                sys.stdout.flush()
            except OSError as err:
                sys.stderr.write("error: failed to open '%s'\n" % filepath)
                sys.stderr.write("error: %s\n" % str(err))
                sys.stderr.flush()
                returncode = 1
    return returncode


if __name__ == '__main__':
    sys.exit(main())
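
Usage sketch (not part of the original gist): invoked from the command line, the script reads from stdin when no URLs or files are given. The file name urlgrep.py and the URLs, selectors, and patterns below are placeholders.

    # grab PDF links from a remote page, restricting the search to a selector
    ./urlgrep.py -u example.com/docs -s 'div#content' -p '\.pdf$'

    # parse a saved page from stdin, resolving relative URLs against a base
    cat page.html | ./urlgrep.py -b https://example.com -p '^https://'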
zmwangx commented Dec 27, 2018

A pretty old script I wrote in early 2015. It has served me very well over the years.