Last active
May 21, 2021 08:22
-
-
Save martyglaubitz/3a7a29b30718f638281536e2110afb49 to your computer and use it in GitHub Desktop.
Script that, given the URL of a sitemap, runs a Lighthouse audit on each URL listed in it and writes the scores to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import dataclasses | |
import json | |
import io | |
import os | |
import os.path | |
import subprocess | |
import sys | |
import urllib.parse | |
import urllib.request | |
import xml.dom.minidom | |
@dataclasses.dataclass
class LighthouseResult:
    """Category scores from a single Lighthouse audit of one URL."""
    url: str  # the audited page URL
    accessibility: float  # 'accessibility' category score (Lighthouse scores are 0.0-1.0)
    best_practices: float  # 'best-practices' category score
    performance: float  # 'performance' category score
    pwa: float  # 'pwa' (progressive web app) category score
    seo: float  # 'seo' category score
def get_sitemap_urls(sitemap_url):
    """Fetch a sitemap XML document and return every URL it lists.

    Args:
        sitemap_url: URL of the sitemap; the document is expected to contain
            ``<loc>`` elements per the sitemaps.org protocol.

    Returns:
        A list of URL strings, one per ``<loc>`` element, in document order.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even if reading or decoding raises (the original leaked it).
    with urllib.request.urlopen(sitemap_url) as response:
        sitemap_xml = response.read().decode('utf-8')
    document = xml.dom.minidom.parseString(sitemap_xml)
    return [node.firstChild.nodeValue for node in document.getElementsByTagName('loc')]
def get_lighthouse_result(url: str) -> LighthouseResult:
    """Run the ``lighthouse`` CLI against *url* and return its category scores.

    Reports are cached as JSON files next to this script: if a report for the
    URL already exists on disk it is reused instead of re-running Lighthouse.

    Args:
        url: The page URL to audit.

    Returns:
        A LighthouseResult with one score per Lighthouse category.

    Raises:
        subprocess.CalledProcessError: if the lighthouse CLI exits non-zero.
        FileNotFoundError: if the lighthouse CLI is not installed.
    """
    url_parsed = urllib.parse.urlparse(url)
    # Build a filesystem-safe filename from the URL. Guard against a URL with
    # no hostname (urlparse returns None, which would break concatenation).
    # NOTE(review): the query string is ignored, so URLs differing only by
    # query share one cache file — confirm that's acceptable for this sitemap.
    hostname = url_parsed.hostname or ''
    output_filename = hostname + url_parsed.path.replace('/', '-') + '.json'
    target_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), output_filename)
    if not os.path.exists(target_path):
        # The original passed an argument LIST together with shell=True; on
        # POSIX the shell then runs only 'lighthouse' and silently drops every
        # other argument. Run without a shell and fail loudly on a non-zero
        # exit so a broken audit doesn't surface later as a missing JSON file.
        subprocess.run(
            ['lighthouse', '--output', 'json', '--output-path', target_path, url],
            check=True,
        )
    with open(target_path, encoding='utf-8') as json_file:
        result_json = json.load(json_file)
    # here is the place to extract more information from the lighthouse result
    categories = result_json['categories']
    return LighthouseResult(
        url=url,
        accessibility=categories['accessibility']['score'],
        best_practices=categories['best-practices']['score'],
        performance=categories['performance']['score'],
        pwa=categories['pwa']['score'],
        seo=categories['seo']['score'],
    )
def write_csv(lighthouse_results, csv_filepath: str):
    """Write Lighthouse results to a pipe-delimited CSV file.

    Emits a header row followed by one row per result, with columns
    url, performance, accessibility, best_practices, pwa, seo.

    Args:
        lighthouse_results: Iterable of objects exposing the six column
            names above as attributes (e.g. LighthouseResult instances).
        csv_filepath: Destination path; any existing file is overwritten.
    """
    columns = ['url', 'performance', 'accessibility', 'best_practices', 'pwa', 'seo']
    with open(csv_filepath, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns, delimiter='|', lineterminator='\n')
        writer.writeheader()
        for result in lighthouse_results:
            # One attribute lookup per column keeps the row in column order.
            writer.writerow({column: getattr(result, column) for column in columns})
if __name__ == "__main__":
    # usage: python sitemap_lighthouse.py https://your.domain/sitemap.xml lighthouse-results.csv
    # TODO: use argparse for this
    # Validate the argument COUNT before indexing sys.argv: the original read
    # sys.argv[1]/sys.argv[2] first, so a missing argument raised IndexError
    # instead of ever reaching the emptiness check.
    if len(sys.argv) < 3 or not sys.argv[1] or not sys.argv[2]:
        print('usage: python sitemap_lighthouse.py <sitemap-url> <output-csv>', file=sys.stderr)
        sys.exit(1)
    sitemap_url = sys.argv[1]
    csv_filepath = sys.argv[2]
    urls = get_sitemap_urls(sitemap_url)
    # Audit every page listed in the sitemap, then dump all scores to CSV.
    url_results = [get_lighthouse_result(url) for url in urls]
    write_csv(url_results, csv_filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment