Skip to content

Instantly share code, notes, and snippets.

@scmmishra
Created September 27, 2022 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scmmishra/ba5083219817f9289b63a0993800d47e to your computer and use it in GitHub Desktop.
Save scmmishra/ba5083219817f9289b63a0993800d47e to your computer and use it in GitHub Desktop.
Script to extract meta tags from every page listed in a sitemap and save them to a CSV file
from bs4 import BeautifulSoup
from pprint import pprint
import requests
import csv
# URL of the sitemap whose pages will be crawled; fill this in before running.
# (Python has no `const` keyword -- a plain assignment is all that is needed.)
sitemap_url = ''
def get_pages_to_crawl(timeout=10):
    """Return the page URLs listed in the sitemap at ``sitemap_url``.

    Every ``<url>`` entry contributes the text of its ``<loc>`` child.
    Entries without a ``<loc>`` fall back to the entry's own text.
    Surrounding whitespace is stripped and empty entries are skipped.

    Args:
        timeout: Seconds to wait for the sitemap server before giving up.

    Returns:
        A list of URL strings in the order they appear in the sitemap.

    Raises:
        requests.RequestException: If the sitemap cannot be fetched or the
            server answers with an error status.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly rather than scanning an HTML error page for <url> tags.
    response.raise_for_status()

    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")

    pages = []
    for url_tag in soup.find_all("url"):
        # A <url> entry may also hold <lastmod>, <changefreq>, <priority>;
        # only <loc> is the page address.
        loc = url_tag.find("loc")
        source = loc if loc is not None else url_tag
        address = source.get_text().strip()
        if address:
            pages.append(address)
    return pages
def get_meta_tags(page, timeout=10):
    """Download *page* and return the values of its ``<meta>`` tags.

    Args:
        page: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive page cannot stall the whole crawl.

    Returns:
        The dict produced by ``parse_meta_tags`` for the page's meta tags.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    response = requests.get(page, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    return parse_meta_tags(soup.find_all("meta"))
def parse_meta_tags(tags):
    """Collect the interesting values from a sequence of ``<meta>`` tags.

    A tag is recorded under its ``name`` attribute when that name is one of
    the default keys below. Otherwise it is recorded under its ``property``
    attribute, if it has one (any property is accepted, not only the
    defaults). Tags with neither are ignored. When several tags map to the
    same key, the last one wins.

    Args:
        tags: Iterable of tag-like objects exposing ``get(attribute)``,
            such as the result of ``soup.find_all("meta")``.

    Returns:
        A dict mapping each key to its ``content`` string. The default keys
        are always present, with ``""`` when the page does not supply them.
    """
    meta_set = {
        "description": "",
        "keywords": "",
        "author": "",
        "title": "",
        "og:title": "",
        "og:description": "",
    }
    # Snapshot the whitelist: a live ``meta_set.keys()`` view would grow as
    # property keys are inserted below and start admitting extra names.
    allowed_names = frozenset(meta_set)

    for tag in tags:
        name = tag.get("name")
        if name and name in allowed_names:
            key = name
        else:
            key = tag.get("property")
        if not key:
            continue
        content = tag.get("content")
        # Keep every value a string, even when ``content`` is missing.
        meta_set[key] = "" if content is None else content
    return meta_set
def extract_meta_data():
    """Crawl every sitemap page and write its meta tags to ``meta-data.csv``.

    Each CSV row holds the page URL in the ``page`` column followed by the
    values returned by ``get_meta_tags`` for that page. The header is the
    union of the keys seen on all pages, in first-seen order. Pages that
    lack a column get an empty cell.

    Nothing is written when the sitemap yields no pages.
    """
    rows = [
        {"page": page, **get_meta_tags(page)}
        for page in get_pages_to_crawl()
    ]
    if not rows:
        # No rows means no header can be derived; leave the disk untouched.
        return

    # Pages can expose different property keys. Taking the header from the
    # first row alone would make DictWriter reject any later row that has
    # an extra key, so gather every key that occurs.
    columns = {}
    for row in rows:
        columns.update(dict.fromkeys(row))

    # Explicit UTF-8 so non-ASCII meta content does not depend on the
    # platform's default encoding.
    with open(
        "meta-data.csv", "w", newline="", encoding="utf-8"
    ) as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=list(columns))
        dict_writer.writeheader()
        dict_writer.writerows(rows)
# Script entry point: this call sits at module level, so the crawl runs and
# meta-data.csv is written as soon as the file is executed (or imported).
extract_meta_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment