Skip to content

Instantly share code, notes, and snippets.

@VitamintK
Created February 27, 2016 00:07
Show Gist options
  • Save VitamintK/348f4fbe190bdf47eeae to your computer and use it in GitHub Desktop.
Save VitamintK/348f4fbe190bdf47eeae to your computer and use it in GitHub Desktop.
# coding: utf-8
# In[54]:
from bs4 import BeautifulSoup
# In[55]:
import requests
# In[56]:
# Download the UC Irvine Wikipedia article. The timeout keeps a stalled
# connection from hanging the script indefinitely.
wikipedia = requests.get(
    "https://en.wikipedia.org/wiki/University_of_California,_Irvine",
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
wikipedia.raise_for_status()
# In[57]:
wikipedia
# In[62]:
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(wikipedia.text, "html.parser")
# In[65]:
print(soup.prettify())
# In[68]:
import re
# In[75]:
# Category page listing the articles about UC Irvine alumni.
url = "https://en.wikipedia.org/wiki/Category:University_of_California,_Irvine_alumni"
# In[77]:
# Explicit parser for reproducible results; timeout so a stalled connection
# cannot hang the script.
soup2 = BeautifulSoup(requests.get(url, timeout=30).text, "html.parser")
# In[81]:
content = soup2.find(class_="mw-content-ltr")
if content is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError(f"no element with class 'mw-content-ltr' found at {url}")
content
# In[90]:
# Keep only anchors that carry an href without a colon in it. The colon test
# drops namespaced pages (e.g. "Category:...") as well as absolute URLs.
links = [tag for tag in content.find_all("a", href=True) if ":" not in tag["href"]]
# In[95]:
from urllib.parse import urljoin
# In[97]:
# Resolve the relative hrefs against the category page URL.
links = [urljoin(url, link["href"]) for link in links]
# In[99]:
len(links)
# In[ ]:
# In[100]:
import time
# In[103]:
# Fetch and parse every collected page, pausing one second between requests
# to stay polite to the server.
for link in links:
    page = requests.get(link, timeout=30)
    # BeautifulSoup expects the markup text, not the Response object itself.
    soup = BeautifulSoup(page.text, "html.parser")
    time.sleep(1)
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment