Created
July 6, 2021 16:23
-
-
Save yshalsager/94a73d0982fca7d77efba2bb1dbfc266 to your computer and use it in GitHub Desktop.
Scrapes Quran picture links from the 7a9ad website into a text file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from urllib.parse import urlparse | |
from requests import get | |
from bs4 import BeautifulSoup | |
from requests.api import head | |
def get_url_hostname(url):
    """Return the ``scheme://netloc`` prefix of *url*.

    The path, query string and fragment of *url* are discarded.
    """
    # urlparse yields a 6-field result; only the first two fields are needed.
    scheme, netloc, *_ = urlparse(url)
    return f'{scheme}://{netloc}'
def get_request_headers(url):
    """Build the HTTP request headers used when fetching *url*.

    The headers carry a Firefox 78 (Windows 10) User-Agent string, and the
    ``Referer`` is set to ``/index.php`` on the same scheme and host as *url*.

    Returns:
        A new dict mapping header names to header values.
    """
    referer = f'{get_url_hostname(url)}/index.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Referer': referer,
        'Upgrade-Insecure-Requests': '1',
    }
    return headers
def get_images_list(url):
    """Collect the absolute URLs of the ``/images/mota/`` images on a page.

    Args:
        url: Address of the page to download and scan.

    Returns:
        A list of image URLs in document order. Each one is the page's
        ``scheme://netloc`` followed by the ``src`` of an ``<img>`` tag whose
        ``src`` starts with ``/images/mota/``.
    """
    html = get(url, headers=get_request_headers(url)).text
    soup = BeautifulSoup(html, "html.parser")
    # The host prefix is the same for every image on the page; compute it once.
    hostname = get_url_hostname(url)
    image_urls = []
    for image in soup.find_all("img"):
        # Use .get() instead of indexing: an <img> without a src attribute
        # (e.g. one that only has data-src) must be skipped, not raise KeyError.
        image_url = image.get("src")
        if image_url and image_url.startswith("/images/mota/"):
            image_urls.append(f"{hostname}{image_url}")
    return image_urls
def scrap_images(url):
    """Scrape image links from every page listed at *url* into ``links.txt``.

    The page at *url* is downloaded and each element with the ``list-title``
    class is treated as a link to a sub-page. Every sub-page is scanned with
    ``get_images_list`` and the collected image URLs are written, one per
    line, to ``links.txt`` in the current working directory (overwriting any
    existing file).

    Args:
        url: Address of the listing page whose entries should be scanned.
    """
    # Local import keeps this block self-contained; the file's top-level
    # imports only bring in urlparse.
    from urllib.parse import urljoin

    response = get(url, headers=get_request_headers(url))
    soup = BeautifulSoup(response.text, "html.parser")
    # Resolve links against the site root so that plain relative hrefs keep
    # resolving to "<host>/<href>", while root-relative hrefs no longer yield
    # a doubled slash and absolute hrefs are used unchanged.
    site_root = f"{get_url_hostname(url)}/"
    images = []
    for page in soup.select('.list-title'):
        anchor = page.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A title without a usable link has no sub-page to scan.
            continue
        images += get_images_list(urljoin(site_root, href))
    # Explicit encoding so the output does not depend on the platform default.
    with open('links.txt', 'w', encoding='utf-8') as out:
        out.writelines(f"{image}\n" for image in images)
if __name__ == "__main__":
    from sys import argv

    # Use the URL given as the first command-line argument; otherwise fall
    # back to the built-in default category listing.
    if len(argv) > 1:
        url = argv[1]
    else:
        url = 'https://7a9ad.com/index.php?option=com_content&view=category&id=118&Itemid=480'
    scrap_images(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment