@yshalsager
Created July 6, 2021 16:23
Scrapes Quran picture links from the 7a9ad website into a text file.
#!/usr/bin/env python3
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from requests import get


def get_url_hostname(url):
    """Return the scheme and host of a URL, e.g. https://7a9ad.com."""
    parsed_uri = urlparse(url)
    return f'{parsed_uri.scheme}://{parsed_uri.netloc}'


def get_request_headers(url):
    """Build browser-like request headers so the site serves the full page."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Referer': f'{get_url_hostname(url)}/index.php',
        'Upgrade-Insecure-Requests': '1',
    }


def get_images_list(url):
    """Collect the absolute URLs of the Quran page images on a single page."""
    html = get(url, headers=get_request_headers(url)).text
    soup = BeautifulSoup(html, "html.parser")
    image_urls = []
    for image in soup.find_all("img"):
        image_url = image.get("src", "")
        # The Quran page scans are served from /images/mota/; skip everything else.
        if image_url.startswith("/images/mota/"):
            image_urls.append(f"{get_url_hostname(url)}{image_url}")
    return image_urls


def scrap_images(url):
    """Visit every page listed in the category and write all image links to links.txt."""
    response = get(url, headers=get_request_headers(url))
    soup = BeautifulSoup(response.text, "html.parser")
    # Each .list-title element wraps a link to one page of the category listing.
    pages = soup.select('.list-title')
    images = []
    for page in pages:
        page_url = f"{get_url_hostname(url)}/{page.a.get('href')}"
        images += get_images_list(page_url)
    with open('links.txt', 'w') as out:
        for image in images:
            out.write(image + '\n')


if __name__ == "__main__":
    from sys import argv

    # Use the category URL passed on the command line, or fall back to the default one.
    url = argv[1] if len(argv) > 1 else 'https://7a9ad.com/index.php?option=com_content&view=category&id=118&Itemid=480'
    scrap_images(url)
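
Usage: the script takes an optional category URL as its first argument and otherwise falls back to the default category hard-coded above. A minimal invocation, assuming the gist is saved as scrap_quran_links.py (the filename is just an assumption):

python3 scrap_quran_links.py 'https://7a9ad.com/index.php?option=com_content&view=category&id=118&Itemid=480'

When it finishes, links.txt in the working directory contains one image URL per line, ready to feed to a downloader.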