Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pkutaj/6911dacb831d6af5a11f864c3ccd62e7 to your computer and use it in GitHub Desktop.
Save pkutaj/6911dacb831d6af5a11f864c3ccd62e7 to your computer and use it in GitHub Desktop.
2023-06-05-How-to-Scrape-DuckDuckGo-results-for-Email-Addresses-to-use-for-Sendgrid-later.py
import re
from urllib.parse import parse_qs, unquote, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
def _resolve_result_url(href, base_url):
    """Turn a search-result href into an absolute http(s) URL, or None."""
    if not href:
        return None
    # urljoin copes with relative and protocol-relative ("//host/path") hrefs.
    absolute = urljoin(base_url, href.strip())
    parts = urlsplit(absolute)
    # NOTE(review): DuckDuckGo's HTML endpoint appears to wrap result links in
    # a "/l/?uddg=<encoded target>" redirect; unwrap it when present so the
    # target page is fetched directly. Confirm against the live markup.
    if parts.netloc.endswith('duckduckgo.com') and parts.path.rstrip('/') == '/l':
        wrapped = parse_qs(parts.query).get('uddg')
        if wrapped:
            absolute = wrapped[0]
            parts = urlsplit(absolute)
    if parts.scheme not in ('http', 'https'):
        return None
    return absolute


def _emails_from_mailto(href):
    """Return the recipient addresses named in a ``mailto:`` href."""
    scheme, colon, rest = href.partition(':')
    if not colon or scheme.strip().lower() != 'mailto':
        return []
    # Drop "?subject=...&body=..." parameters; they are not part of the address.
    recipients = rest.split('?', 1)[0]
    # Split on the literal commas first so a percent-encoded comma stays
    # inside its address, then decode each recipient.
    decoded = (unquote(part).strip() for part in recipients.split(','))
    return [address for address in decoded if address]


def get_emails(query):
    """Collect e-mail addresses from the pages a DuckDuckGo search returns.

    The query is sent to DuckDuckGo's HTML endpoint, every element matching
    ``.result__url`` is followed, and each fetched page is scanned for
    ``mailto:`` links.

    Args:
        query: Free-text search string. It is URL-encoded before sending.

    Returns:
        A list of at most 20 unique addresses, in the order they were found.

    Raises:
        requests.RequestException: If the search request itself fails.
            Failures on individual result pages are skipped instead.
    """
    max_emails = 20
    timeout = 10  # seconds; without it a stalled host blocks forever
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    # Passing the query via ``params`` lets requests percent-encode it.
    response = requests.get(
        'https://duckduckgo.com/html/',
        params={'q': query},
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    emails = []
    for link in soup.select('.result__url'):
        if len(emails) >= max_emails:
            break
        target = _resolve_result_url(link.get('href'), response.url)
        if target is None:
            continue
        try:
            page = requests.get(target, headers=headers, timeout=timeout)
        except requests.RequestException:
            # One unreachable result must not abort the whole scrape.
            continue
        page_soup = BeautifulSoup(page.text, 'html.parser')
        for mailto in page_soup.select('a[href^=mailto]'):
            for email in _emails_from_mailto(mailto['href']):
                if email in emails:
                    continue
                emails.append(email)
                # Enforce the cap inside the page as well, so one page with
                # many addresses cannot push the result past the limit.
                if len(emails) >= max_emails:
                    return emails
    return emails
# Ask the user for a search query, scrape the results, and show what was found.
search_query = input("Enter DuckDuckGo Query: ")
emails = get_emails(search_query)
print(emails)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment