Last active
July 26, 2021 06:24
-
-
Save DxDiagDx/e13f3b2b2db7827e3e6e8ba831cff28c to your computer and use it in GitHub Desktop.
Парсер email
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import csv | |
import re | |
def get_html(url, timeout=10):
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a host
            that accepts the connection but never answers.

    Returns:
        The response text on success, or ``False`` when the request fails
        (connection problem, timeout, HTTP error status, or invalid URL).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they take the error path.
        response.raise_for_status()
        return response.text
    except (requests.RequestException, ValueError):
        print('Server error')
        return False
def write_csv(data):
    """Append one record to ``email_on_site.csv``.

    ``data`` is expected to be a mapping carrying the keys ``url`` and
    ``uniq_email``; it is written as a single row in that column order.
    The file is opened in append mode and no header row is written.
    """
    columns = ['url', 'uniq_email']
    with open('email_on_site.csv', 'a', encoding='utf-8', newline='') as out:
        csv.DictWriter(out, fieldnames=columns).writerow(data)
def get_email(html, url):
    """Extract e-mail addresses from *html* and append them to the CSV.

    Every distinct address is printed and written, together with *url*,
    as one row via ``write_csv``. Addresses are emitted in the order in
    which they first appear in the page.

    Args:
        html: Page source as a string. Any other value (for example the
            ``False`` that ``get_html`` returns on failure) is rejected.
        url: Address of the page; stored alongside every e-mail found.

    Returns:
        ``False`` when *html* is not a string, otherwise ``None``.
    """
    # Explicit guard instead of a bare ``except`` around the regex call:
    # only the "no page to parse" case is treated as a soft failure, other
    # errors are no longer silently swallowed.
    if not isinstance(html, str):
        return False

    # Raw string so that ``\.`` reaches the regex engine unchanged.
    found = re.findall(
        r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", html
    )

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # output is deterministic (a set would yield an arbitrary order).
    for uniq_email in dict.fromkeys(found):
        print(uniq_email)
        write_csv({'url': url, 'uniq_email': uniq_email})
    return None
def main():
    """Scan every site listed in ``sities.csv`` for e-mail addresses.

    The input file must have a ``site`` column holding the page URLs.
    Each URL is printed with its 1-based position, downloaded with
    ``get_html`` and handed to ``get_email``; pages that could not be
    downloaded are skipped.
    """
    with open('sities.csv', newline='', encoding='utf-8') as file:
        for n, row in enumerate(csv.DictReader(file), start=1):
            site = row['site']
            print(f"{n}. {site}")
            html = get_html(site)
            if html is False:
                # Download failed (get_html already reported it), so
                # there is nothing to parse for this site.
                continue
            get_email(html, site)
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
В функции get_email найденные почты можно сразу складывать во множество. Без всяких лишних преобразований.