Skip to content

Instantly share code, notes, and snippets.

@sosi-deadeye
Created April 5, 2020 13:29
Show Gist options
  • Save sosi-deadeye/ac5af7258613a4b72351e43f8ae9bf90 to your computer and use it in GitHub Desktop.
Save sosi-deadeye/ac5af7258613a4b72351e43f8ae9bf90 to your computer and use it in GitHub Desktop.
Code Improvement
"""
https://pastebin.com/raw/dxUqR7RH
https://www.facebook.com/groups/PythonUserDeutschland/?multi_permalinks=2079508368861796&notif_id=1585918939113861&notif_t=group_highlights&ref=notif
"""
import csv
from contextlib import contextmanager
from datetime import datetime
import ratelimit
import requests
from bs4 import BeautifulSoup
@contextmanager
def csv_writer(filename):
    """Open *filename* for writing and yield a ``csv.writer`` for it.

    The header row is written before the writer is handed to the caller.
    The file is closed when the ``with`` block using this context manager
    exits.

    Args:
        filename: Path of the CSV file to create (an existing file is
            overwritten).

    Yields:
        A ``csv.writer`` bound to the open file.
    """
    # newline="" leaves line endings to the csv module; without it, platforms
    # that translate "\n" (Windows) produce "\r\r\n" and thus blank rows.
    # An explicit UTF-8 encoding keeps non-ASCII text writable regardless of
    # the locale's default encoding.
    with open(filename, "w", newline="", encoding="utf-8") as fd:
        writer = csv.writer(fd)
        writer.writerow(
            ("Index", "", "", "url", "meta_description_length", "meta_description_text")
        )
        yield writer
# The two arguments are passed to ratelimit as (calls, period): 20 calls per
# 60-second period. sleep_and_retry waits instead of raising once the limit
# is reached.
@ratelimit.sleep_and_retry
@ratelimit.RateLimitDecorator(20, 60)
def get_bs4(url, timeout=30):
    """Download *url* and return its body parsed with ``html.parser``.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the raw response content.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # NOTE: the HTTP status code is not checked, so error pages are parsed
    # like any other document.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, "html.parser")
def get_sitemap_urls(index_url="https://draeger-it.blog/sitemap_index.xml"):
    """Return the text of every ``<loc>`` element in a sitemap index.

    Args:
        index_url: Address of the sitemap index document. Defaults to the
            index of draeger-it.blog, so existing zero-argument calls behave
            as before.

    Returns:
        A list with the text content of each ``<loc>`` element, in document
        order.
    """
    soup = get_bs4(index_url)
    return [element.text for element in soup.find_all("loc")]
def get_urls_from_sitemaps(sitemaps):
    """Lazily yield the text of every ``<loc>`` element of each sitemap.

    Args:
        sitemaps: Iterable of sitemap addresses. Each one is fetched with
            ``get_bs4`` only when the generator reaches it.

    Yields:
        The text content of each ``<loc>`` element, sitemap by sitemap.
    """
    for address in sitemaps:
        loc_tags = get_bs4(address).find_all("loc")
        yield from (tag.text for tag in loc_tags)
def crawl_meta_description(url, index):
    """Fetch *url* and build the CSV row describing its meta description.

    Args:
        url: Address of the page to inspect.
        index: Running number of the page; written as the first column.

    Returns:
        A 6-tuple ``(index, "", "", url, length, description)`` in which
        index and length are strings. The description is the ``content``
        attribute of ``<meta name="description">``, or ``""`` when the tag
        or the attribute is missing.
    """
    meta_tag = get_bs4(url).find("meta", attrs={"name": "description"})
    description = "" if meta_tag is None else meta_tag.get("content", "")
    row = (str(index), "", "", url, str(len(description)), description)
    return row
def crawl():
    """Crawl every URL listed in the sitemaps and store the results.

    One row per URL is written to ``output.csv``. A start line and an end
    line, each with the current timestamp, are printed around every page.
    """
    # The sitemap index is fetched before the output file is opened; the URL
    # stream itself is a generator and is consumed inside the loop.
    url_stream = get_urls_from_sitemaps(get_sitemap_urls())
    with csv_writer("output.csv") as out:
        for position, url in enumerate(url_stream):
            print(f"Starte Crawling ({datetime.now().isoformat()}) {url}")
            out.writerow(crawl_meta_description(url, position))
            print(f"Ende - Crawling ({datetime.now().isoformat()}) {url}")
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    crawl()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment