Created
October 4, 2016 21:00
-
-
Save sladjandr/e7b231a4423df737f323cfc10b2eae7d to your computer and use it in GitHub Desktop.
Scrape all mailto links from a page - Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import xlwt | |
from bs4 import BeautifulSoup | |
# Spreadsheet that will receive the scraped addresses: one sheet named
# 'Emails', with a header cell in row 0, column 0.
wb = xlwt.Workbook()
ws = wb.add_sheet('Emails')
ws.write(0, 0, 'Emails')

# Accumulator for every address found, and the row cursor for the sheet
# (row 0 is the header, so data rows start at 1).
emailList = []
r = 0

# Replace this placeholder with the URL of the page to scrape.
urlString = 'Your page URL goes here'
#function that extracts all emails from a page you provided and stores them in a list | |
def emailExtractor(urlString):
    """Fetch the page at *urlString* and append every ``mailto:`` address
    found in its anchor tags to the module-level ``emailList``.

    Malformed hrefs (no colon, or nothing after the colon) are skipped
    individually instead of aborting the whole scan.
    """
    response = requests.get(urlString)
    soup = BeautifulSoup(response.content, 'html.parser')
    # All <a> elements whose href attribute starts with the mailto scheme.
    for anchor in soup.select('a[href^=mailto]'):
        href = anchor['href']
        # 'mailto:someone@example.com' -> keep everything after the FIRST
        # colon.  partition() never raises, unlike the old two-value
        # unpack of split(':'), which blew up (and, worse, broke out of
        # the whole loop) whenever the address part contained a colon.
        _scheme, sep, address = href.partition(':')
        if not sep or not address:
            # No colon or empty address: skip this link, keep scanning.
            continue
        emailList.append(address)
# Run the scrape, then write each collected address into column 0, one
# address per row.  Rows start at 1 because row 0 holds the header
# written during setup; enumerate replaces the manual r = r + 1 counter.
emailExtractor(urlString)
for r, email in enumerate(emailList, start=1):
    ws.write(r, 0, email)
wb.save('emails.xls')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment