# fomightez/beautiful_soup_to_mine_example_links_from_concepts_page.py

## beautiful_soup_to_mine_example_links_from_concepts_page.py
from bs4 import BeautifulSoup


# HTML page (saved locally) whose links are mined.
file_name = "concepts.html"
# Only hrefs beginning with this text are reported. The trailing "exampl"
# is kept exactly as in the original script.
start_of_example_urls = "http://www.codeskulptor.org/#exampl"

# based on http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# and http://stackoverflow.com/questions/9577216/parsing-data-stored-in-urls-via-beautifulsoup
# and http://stackoverflow.com/questions/16096754/remove-none-value-from-a-list-without-removing-the-0-value


def extract_example_urls(hrefs, prefix=start_of_example_urls):
    """Return the distinct hrefs that start with *prefix*.

    Args:
        hrefs: iterable of href strings; ``None`` entries are skipped.
        prefix: text each reported URL must start with.

    Returns:
        A list of matching URLs in first-seen order, without duplicates.
    """
    seen = set()  # O(1) duplicate check; the list keeps the original order
    matches = []
    for href in hrefs:
        if href is None or not href.startswith(prefix):
            continue
        if href in seen:
            continue
        seen.add(href)
        matches.append(href)
    return matches


def read_hrefs(path):
    """Return the href of every ``<a>`` tag in the HTML file at *path*.

    Anchors without an href attribute are skipped; an empty href is kept.
    """
    # Binary mode hands raw bytes to Beautiful Soup so that it performs the
    # encoding detection itself; `with` guarantees the file is closed.
    with open(path, "rb") as handle:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(handle, "html.parser")
    # href=True selects only the anchors that carry an href attribute.
    return [link["href"] for link in soup.find_all("a", href=True)]


def main():
    """Print each distinct example URL found in ``file_name``, one per line."""
    for example_url in extract_example_urls(read_hrefs(file_name)):
        # Single-argument call form prints identically on Python 2 and 3.
        print(example_url)


if __name__ == "__main__":
    main()