@alanboy
Created December 21, 2021 20:31
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin


#
# Add a found URL to the list of links
#
def add_url(url, href):
    # url_parsed = urlparse(href)
    # full_url = urljoin(url, href)
    # print(">>>>" + full_url)
    return
#
# Process a single URL: print its title, headings, and meta description
#
def process_url(url):
    # print("========== " + url + " ====================")
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')

    # Title
    print(soup.title.string)

    # h1/h2 elements usually carry important information
    # (find_all needs a list of tag names; the string 'h1,h2' matches nothing)
    h1_elements = soup.find_all(['h1', 'h2'])
    for el in h1_elements:
        print(el.text)

    # meta elements have the description and other keywords that are useful
    meta_elements = soup.find_all('meta')
    for el in meta_elements:
        # print(el.attrs)
        if el.has_attr('name') and el['name'] == 'description':
            print(el['content'])

    # Links (disabled for now)
    # a_elements = soup.find_all('a')
    # for el in a_elements:
    #     add_url(url, el.get("href").strip())
#
# Process the file: one URL per line; lines starting with "#" are skipped
#
with open('urls.txt') as urls_file:
    for line in urls_file:
        if line.startswith("#"):
            continue
        process_url(line.strip())
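
To try the script, a urls.txt file needs to sit next to it. The file name comes from the script itself; the URLs below are only illustrative:

    # lines starting with "#" are comments and are skipped
    https://example.com/
    https://www.python.org/

Each listed page's title, h1/h2 headings, and meta description are printed in order.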
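For completeness, a minimal sketch of what the disabled link-collection path could look like, following the intent of the commented-out lines. The found_links accumulator, the collect_links helper, and the guard for anchors without an href are assumptions, not part of the original gist:

    from urllib.parse import urljoin

    found_links = []  # hypothetical accumulator for discovered URLs

    def add_url(url, href):
        # Resolve a possibly relative href against the page URL and record it
        full_url = urljoin(url, href)
        found_links.append(full_url)

    def collect_links(url, soup):
        # Hypothetical helper: soup is the already-parsed page
        for el in soup.find_all('a'):
            href = el.get("href")
            if href:  # guard: <a> tags without an href would crash .strip()
                add_url(url, href.strip())

The guard matters because el.get("href") returns None for anchors with no href attribute, so the original commented-out call to .strip() would raise on such pages.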