Skip to content

Instantly share code, notes, and snippets.

@shivankgtm
Created December 19, 2019 15:06
Show Gist options
  • Save shivankgtm/2df428a647b70d011d5a38644495b091 to your computer and use it in GitHub Desktop.
Save shivankgtm/2df428a647b70d011d5a38644495b091 to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
# Collect every absolute "https" link found on a set of Simple English
# Wikipedia category pages. After the loop, ``finalList`` holds the href of
# each matching anchor, in page order and document order (duplicates kept).
finalList = []
pages = [
    'https://simple.wikipedia.org/wiki/Category:Natural_resources',
    'https://simple.wikipedia.org/wiki/Category:Hydrogen_compounds',
    'https://simple.wikipedia.org/wiki/Category:Oxygen_compounds',
    'https://simple.wikipedia.org/wiki/Category:Oxides',
]

for url in pages:
    try:
        # A timeout keeps one stalled connection from hanging the script.
        response = requests.get(url, timeout=30)
        # Do not harvest links from an HTTP error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print('Skipping {}: {}'.format(url, exc))
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True drops anchors without an href attribute, so no None values
    # reach the prefix test below.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Relative links ("/wiki/...") and plain "http" links are ignored;
        # only hrefs beginning with "https" are kept.
        if href.startswith('https'):
            finalList.append(href)
    # Separator printed once per successfully processed page.
    print('********')

# Total number of https links gathered across all pages.
print(len(finalList))
# finalList now contains all the collected links.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment