Skip to content

Instantly share code, notes, and snippets.

@DxDiagDx
Last active July 26, 2021 06:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DxDiagDx/e13f3b2b2db7827e3e6e8ba831cff28c to your computer and use it in GitHub Desktop.
Save DxDiagDx/e13f3b2b2db7827e3e6e8ba831cff28c to your computer and use it in GitHub Desktop.
Парсер email
import requests
import csv
import re
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled host.

    Returns:
        The response text, or ``False`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are handled below.
        response.raise_for_status()
    except (requests.RequestException, ValueError):
        print('Server error')
        return False
    return response.text
def write_csv(data, filename='email_on_site.csv'):
    """Append one ``url`` / ``uniq_email`` row to a CSV file.

    No header row is written; every call adds exactly one data row.

    Args:
        data: Mapping with the keys ``'url'`` and ``'uniq_email'``.
        filename: Path of the CSV file to append to (created if missing).
            Defaults to ``'email_on_site.csv'``.

    Raises:
        ValueError: If *data* contains keys other than the two columns
            (raised by ``csv.DictWriter``).
    """
    fieldnames = ['url', 'uniq_email']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'a', encoding='utf-8', newline='') as file:
        csv.DictWriter(file, fieldnames=fieldnames).writerow(data)
# Pattern for e-mail-like tokens. A raw string keeps ``\.`` from being read
# as an (invalid) string escape sequence.
_EMAIL_RE = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")


def get_email(html, url):
    """Find the unique e-mail addresses in *html* and save each one.

    Every distinct address is printed and appended to the CSV file through
    ``write_csv`` as a ``{'url': url, 'uniq_email': address}`` row.

    Args:
        html: Page source to scan.
        url: Address the page was downloaded from; stored next to each
            e-mail in the output.

    Returns:
        ``False`` when *html* is not a string (for example the ``False``
        that ``get_html`` returns on a failed download); ``None`` otherwise.
    """
    if not isinstance(html, str):
        return False

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # rows come out in the same order on every run (set order can vary).
    uniq_emails = list(dict.fromkeys(_EMAIL_RE.findall(html)))
    for uniq_email in uniq_emails:
        print(f'{uniq_email}')
        write_csv({'url': url, 'uniq_email': uniq_email})
def main():
    """Read site URLs from 'sities.csv' and collect e-mails from each page.

    Each row's 'site' value is printed with a running number, downloaded
    with ``get_html`` and, when the download succeeds, passed to
    ``get_email``.

    Raises:
        KeyError: If a row has no 'site' column.
    """
    # Site links are stored in the 'site' column of 'sities.csv'.
    with open('sities.csv', newline='', encoding='utf-8') as file:
        for n, row in enumerate(csv.DictReader(file), start=1):
            site = row['site']
            print(f"{n}. {site}")
            html = get_html(site)
            if not html:
                # get_html returns False on failure; an empty page has
                # nothing to scan either.
                continue
            get_email(html, site)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
@DxDiagDx
Copy link
Author

В функции get_email найденные почты можно сразу складывать во множество. Без всяких лишних преобразований.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment