Skip to content

Instantly share code, notes, and snippets.

@wetlife
Created September 9, 2017 02:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wetlife/c6a4784da5e6eee04c7fc7a7f6f36d56 to your computer and use it in GitHub Desktop.
Save wetlife/c6a4784da5e6eee04c7fc7a7f6f36d56 to your computer and use it in GitHub Desktop.
Find hrefs in file_path. Distinguish absolute hrefs by the presence of '://', then report the counts of relative and absolute hrefs.
from bs4 import BeautifulSoup
from requests import request
from pprint import pprint
import re
# Report every href found in an HTML file, split into absolute and relative.
#
# An href is classified as absolute when it contains '://'; every other href
# is classified as relative. The script prints both groups and their counts.

# Path of the HTML document whose href attributes are inspected.
file_path = './index.html'

# Read the whole document inside the context manager so the file handle is
# closed before parsing starts.
with open(file_path, encoding='utf8') as file_object:
    file_markup = file_object.read()

# The file imports the parser class as `BeautifulSoup`; no `bs` alias is
# defined, so the class must be referenced by its imported name.
file_soup = BeautifulSoup(file_markup, 'lxml')

# href=True matches every tag that carries an href attribute, whatever its value.
href_tags = file_soup.find_all(href=True)
hrefs = [tag['href'] for tag in href_tags]

# A plain substring test is enough for the literal '://'. Each tuple is built
# once rather than grown with repeated `+=`; tuples are kept so the printed
# representation below has the same shape.
absolute_hrefs = tuple(href for href in hrefs if '://' in href)
relative_hrefs = tuple(href for href in hrefs if '://' not in href)

print(f"absolute_hrefs: {absolute_hrefs}")
print(f"relative_hrefs: {relative_hrefs}")
print(f"{len(absolute_hrefs+relative_hrefs)} hrefs were found. {len(absolute_hrefs)} hrefs are absolute and {len(relative_hrefs)} hrefs are relative.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment