🖼️ Python script to scrape a set of Ahrefs blog posts and audit each image's img_src and img_alt attributes
from lxml import html
import requests
import csv

# Blog posts to audit
urls = """https://ahrefs.com/blog/seo-team/
https://ahrefs.com/blog/seo-on-page-factors/
https://ahrefs.com/blog/content-briefs/
https://ahrefs.com/blog/keyword-clustering/
https://ahrefs.com/blog/white-hat-link-building-techniques/
https://ahrefs.com/blog/programmatic-seo/
https://ahrefs.com/blog/impact-removing-content/
https://ahrefs.com/blog/seo-audit-template/
https://ahrefs.com/blog/over-optimization/
https://ahrefs.com/blog/competitor-analysis-tools/
https://ahrefs.com/blog/singapore-seo/
https://ahrefs.com/blog/how-to-increase-organic-traffic/
https://ahrefs.com/blog/seo-newsletters/
https://ahrefs.com/blog/link-building/
https://ahrefs.com/blog/inbound-marketing/
https://ahrefs.com/blog/most-visited-websites/
https://ahrefs.com/blog/title-tag-seo/
https://ahrefs.com/blog/blogging-tips/
https://ahrefs.com/blog/seo-tutorial/""".split('\n')
# Output columns
fieldnames = [
    "url",
    "img_src",
    "img_alt",
]

data = []
s = requests.Session()  # reuse one connection across all requests

for u in urls:
    print('> accessing %s' % u)
    r = s.get(u)
    assert r.ok
    doc = html.fromstring(r.text)
    # Note: lxml elements with no children are falsy, so test against None
    assert doc is not None
    # Lazy-loaded images keep their real src/alt in <noscript> fallbacks
    imgs = doc.xpath('//img[ancestor::noscript]')
    for img in imgs:
        img_src = "".join(img.xpath('./@src'))
        img_alt = "".join(img.xpath('./@alt'))
        print(img_src, img_alt)
        row = dict(zip(fieldnames, [u, img_src, img_alt]))
        data.append(row)

# Write the results as a tab-delimited file
# (newline='' prevents blank rows on Windows)
with open('img_audit.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(data)