Incremental crawling with advertools: crawl a set number of pages on each run without re-crawling the same pages.
import advertools as adv

adv.crawl(
    # start crawling from this URL (or list of URLs):
    url_list='https://en.wikipedia.org/wiki/Main_Page',
    # save the crawl output to this file:
    output_file='/home/user_name/wikipedia_en_crawl.jl',
    # should the crawler follow links?
    follow_links=True,
    # but don't follow all links, only links that match this regex:
    include_url_regex='https://en.wikipedia.org/wiki',
    custom_settings={
        # where to save the crawl job (this manages deduplication, and avoids re-crawling crawled pages):
        'JOBDIR': '/home/user_name/wikipedia_crawl_job',
        # after how many URLs should it stop crawling?
        'CLOSESPIDER_PAGECOUNT': 250,
        # where to save crawl logs:
        'LOG_FILE': '/home/user_name/wikipedia_en_crawl.log'
    })
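The output file is in JSON Lines format (one crawled page per line), so after each scheduled run you can load it with pandas to check how many pages have accumulated. A quick sketch, assuming pandas is installed and the same output path as above:

import pandas as pd

# each crawled page is stored as one JSON line; lines=True parses the .jl file
wiki_crawl = pd.read_json('/home/user_name/wikipedia_en_crawl.jl', lines=True)

# with JOBDIR handling deduplication, each URL should appear only once
print(wiki_crawl['url'].nunique(), 'pages crawled so far')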
@eliasdabbas (Author)

Create a virtual environment, let's call it virtual_env, and install advertools in it.
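
On a Linux server that might look like the following (a sketch; the paths are placeholders):

python3 -m venv /path/to/virtual_env
/path/to/virtual_env/bin/pip install advertools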

From the command line, run:

crontab -e

Then add the following line to the end of the file:

@hourly PATH=/path/to/virtual_env/bin; /path/to/virtual_env/bin/python /path/to/your_script.py

Note: Make sure you use the full path to your environment, your Python executable, and your script.

More on how to automate python scripts on a Linux server: https://bit.ly/476BSlt

In addition to @hourly, you can use @daily, @weekly, or @monthly.
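
To confirm that the scheduled runs are working, you can check the log file set in LOG_FILE above, for example:

tail /home/user_name/wikipedia_en_crawl.log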
