# sureshkumargondi/crawl_multiple_sites.py

## crawl_multiple_sites.py
from urllib.parse import urlsplit

import advertools as adv


# Crawl several sites one after another, keeping each site's output, log,
# and job state in files named after its host.

# Seed URLs; one crawl is started per entry.
sites = [
    'https://www.who.int',
    'https://www.nytimes.com',
    'https://www.washingtonpost.com',
]

for site in sites:
    # The network location (e.g. 'www.who.int') is used as the base name for
    # the output file, the log file, and the job directory of this crawl.
    domain = urlsplit(site).netloc

    adv.crawl(
        site,
        output_file=domain + '.jl',
        follow_links=True,
        custom_settings={
            'LOG_FILE': domain + '.log',
            # change this to any number of pages
            'CLOSESPIDER_PAGECOUNT': 50,
            # resume the same crawl jobs later
            'JOBDIR': domain,
        },
    )