Example of web scraping the URLs of relevant news articles to form a dataset for NLP.
from bs4 import BeautifulSoup
import requests
from time import sleep

# select the Guardian website - Military news - and obtain a set of URLs
crawl_url = 'https://www.theguardian.com/uk/military?page='

# form a set to store the unique URLs of all articles
guardian_urls = set()

# scrape the first 10 pages
for page_no in range(1, 11):
    page = requests.get(crawl_url + str(page_no))
    soup = BeautifulSoup(page.text, 'html.parser')

    # search only for links with data-link-name set to 'article'
    news_list = soup.find_all('a', attrs={'data-link-name': 'article'})

    # add each article URL to the set, discarding duplicates
    for link in news_list:
        news_url = link.get('href')
        guardian_urls.add(news_url)

    # delay between page requests to stay polite and avoid hammering the site
    sleep(1.0)

print("{0} article URLs were obtained from {1}.".format(len(guardian_urls),
                                                        crawl_url))
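
With the URL set built, the natural next step toward an NLP dataset is fetching each article and extracting its text. Below is a minimal sketch of that step: fetch_article_text is a hypothetical helper, and it approximates the article body by concatenating the page's <p> tags, which is an assumption about the Guardian's markup; a real pipeline would likely want a more precise container selector.

from bs4 import BeautifulSoup
import requests
from time import sleep

def fetch_article_text(url):
    """Return the concatenated paragraph text of one article, or None on failure."""
    response = requests.get(url)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # assumption: the article body is reasonably approximated by all <p> tags;
    # boilerplate paragraphs (bylines, footers) may need filtering afterwards
    paragraphs = soup.find_all('p')
    return '\n'.join(p.get_text(strip=True) for p in paragraphs)

# build the dataset from the URLs collected above
articles = []
for url in guardian_urls:
    text = fetch_article_text(url)
    if text:
        articles.append({'url': url, 'text': text})
    # same politeness delay as the crawl above
    sleep(1.0)

print("Collected text for {0} of {1} articles.".format(len(articles),
                                                       len(guardian_urls)))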