Last active
May 21, 2021 08:22
-
-
Save martyglaubitz/3a7a29b30718f638281536e2110afb49 to your computer and use it in GitHub Desktop.
Script that, given the URL of a sitemap, runs a Lighthouse audit on each URL listed in it and writes the scores to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import dataclasses | |
import json | |
import io | |
import os | |
import os.path | |
import subprocess | |
import sys | |
import urllib.parse | |
import urllib.request | |
import xml.dom.minidom | |
@dataclasses.dataclass
class LighthouseResult:
    """Category scores from a single Lighthouse audit of one URL."""
    url: str  # the audited page URL
    accessibility: float  # 'accessibility' category score (Lighthouse scores are 0.0-1.0)
    best_practices: float  # 'best-practices' category score
    performance: float  # 'performance' category score
    pwa: float  # 'pwa' (progressive web app) category score
    seo: float  # 'seo' category score
def get_sitemap_urls(sitemap_url):
    """Fetch a sitemap XML document and return every URL it lists.

    Args:
        sitemap_url: URL of the sitemap; the document is expected to contain
            ``<loc>`` elements per the sitemaps.org protocol.

    Returns:
        A list of URL strings, one per ``<loc>`` element, in document order.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even if reading or decoding raises (the original leaked it).
    with urllib.request.urlopen(sitemap_url) as response:
        sitemap_xml = response.read().decode('utf-8')
    document = xml.dom.minidom.parseString(sitemap_xml)
    return [node.firstChild.nodeValue for node in document.getElementsByTagName('loc')]
def get_lighthouse_result(url: str) -> LighthouseResult:
    """Run the ``lighthouse`` CLI against *url* and return its category scores.

    Reports are cached as JSON files next to this script: if a report for the
    URL already exists on disk it is reused instead of re-running Lighthouse.

    Args:
        url: The page URL to audit.

    Returns:
        A LighthouseResult with one score per Lighthouse category.

    Raises:
        subprocess.CalledProcessError: if the lighthouse CLI exits non-zero.
        FileNotFoundError: if the lighthouse CLI is not installed.
    """
    url_parsed = urllib.parse.urlparse(url)
    # Build a filesystem-safe filename from the URL. Guard against a URL with
    # no hostname (urlparse returns None, which would break concatenation).
    # NOTE(review): the query string is ignored, so URLs differing only by
    # query share one cache file — confirm that's acceptable for this sitemap.
    hostname = url_parsed.hostname or ''
    output_filename = hostname + url_parsed.path.replace('/', '-') + '.json'
    target_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), output_filename)
    if not os.path.exists(target_path):
        # The original passed an argument LIST together with shell=True; on
        # POSIX the shell then runs only 'lighthouse' and silently drops every
        # other argument. Run without a shell and fail loudly on a non-zero
        # exit so a broken audit doesn't surface later as a missing JSON file.
        subprocess.run(
            ['lighthouse', '--output', 'json', '--output-path', target_path, url],
            check=True,
        )
    with open(target_path, encoding='utf-8') as json_file:
        result_json = json.load(json_file)
    # here is the place to extract more information from the lighthouse result
    categories = result_json['categories']
    return LighthouseResult(
        url=url,
        accessibility=categories['accessibility']['score'],
        best_practices=categories['best-practices']['score'],
        performance=categories['performance']['score'],
        pwa=categories['pwa']['score'],
        seo=categories['seo']['score'],
    )
def write_csv(lighthouse_results, csv_filepath: str):
    """Write Lighthouse results to a pipe-delimited CSV file.

    Emits a header row followed by one row per result, with columns
    url, performance, accessibility, best_practices, pwa, seo.

    Args:
        lighthouse_results: Iterable of objects exposing the six column
            names above as attributes (e.g. LighthouseResult instances).
        csv_filepath: Destination path; any existing file is overwritten.
    """
    columns = ['url', 'performance', 'accessibility', 'best_practices', 'pwa', 'seo']
    with open(csv_filepath, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns, delimiter='|', lineterminator='\n')
        writer.writeheader()
        for result in lighthouse_results:
            # One attribute lookup per column keeps the row in column order.
            writer.writerow({column: getattr(result, column) for column in columns})
if __name__ == "__main__":
    # usage: python sitemap_lighthouse.py https://your.domain/sitemap.xml lighthouse-results.csv
    # TODO: use argparse for this
    # Validate the argument COUNT before indexing sys.argv: the original read
    # sys.argv[1]/sys.argv[2] first, so a missing argument raised IndexError
    # instead of ever reaching the emptiness check.
    if len(sys.argv) < 3 or not sys.argv[1] or not sys.argv[2]:
        print('usage: python sitemap_lighthouse.py <sitemap-url> <output-csv>', file=sys.stderr)
        sys.exit(1)
    sitemap_url = sys.argv[1]
    csv_filepath = sys.argv[2]
    urls = get_sitemap_urls(sitemap_url)
    # Audit every page listed in the sitemap, then dump all scores to CSV.
    url_results = [get_lighthouse_result(url) for url in urls]
    write_csv(url_results, csv_filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment