@OElesin
Created July 3, 2023 19:35
Covenant Nation Web Scraper to AWS S3
import os
import re
import time
from urllib.parse import unquote

import boto3
import requests
from bs4 import BeautifulSoup
from botocore.exceptions import NoCredentialsError

s3 = boto3.client('s3')
media_objects_bucket_name = 's3 bucket name' # Replace with your S3 bucket name
BASE_URL = "https://elibrary.insightsforliving.org/categories/4f8b4b73-0e68-46c2-a7fb-0828a50cc6ad?page={page_num}&q=Pastor%20Poju%20Oyemade"
page_nums = range(2, 10)
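
# Example: BASE_URL.format(page_num=2) yields the listing URL for page 2 of
# the search results for "Pastor Poju Oyemade":
# https://elibrary.insightsforliving.org/categories/4f8b4b73-0e68-46c2-a7fb-0828a50cc6ad?page=2&q=Pastor%20Poju%20Oyemade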

def extract_message_url(base_string):
    """Extract the first single-quoted substring from an onclick handler.

    Each message tile carries an onclick attribute whose handler embeds the
    download path in single quotes; return that path, or None if absent.
    """
    match = re.search(r"'([^']+)'", base_string)
    if match:
        return match.group(1)
    else:
        print("No match found.")
        return None
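
# Hypothetical example (the exact onclick markup on the page is an assumption):
# extract_message_url("location.href='/media/message.mp3'") -> '/media/message.mp3'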

def upload_message_to_s3(file_content, filename):
    """Upload raw file bytes to the configured S3 bucket under the given key."""
    s3_key = f'{filename}'
    try:
        s3.put_object(Body=file_content, Bucket=media_objects_bucket_name, Key=s3_key)
        print(f"File uploaded to S3 successfully: s3://{media_objects_bucket_name}/{s3_key}")
    except NoCredentialsError:
        print("AWS credentials not found. Make sure you have configured your AWS credentials.")

def clean_string(string):
    """Drop every character that is not alphanumeric, '.', '_' or '-'."""
    return re.sub(r'[^0-9a-zA-Z._-]', '', string)
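
# e.g. clean_string('Faith&Works.mp3') -> 'FaithWorks.mp3'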

def download_message_from_url(message_url):
    """Resolve the redirect chain for a message URL, fetch the MP3 and store it in S3."""
    # A HEAD request follows redirects cheaply so we can learn the final file URL
    response = requests.head(message_url, allow_redirects=True)
    final_url = response.url
    # Build a filename from the URL path: drop the query string, decode
    # percent-escapes and replace spaces with hyphens
    filename = unquote(os.path.basename(final_url).split('?')[0]).replace(' ', '-')
    mp3_response = requests.get(final_url)
    upload_message_to_s3(mp3_response.content, clean_string(filename))
    print(f"Downloaded {filename} successfully!")

for i in page_nums:
    URL = BASE_URL.format(page_num=i)
    # Send a GET request to the listing page
    response = requests.get(URL)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Each message tile sits in a Bootstrap grid cell
    messages = soup.find_all('div', class_='col-lg-3 col-6 mb-4')
    for message in messages:
        link_tag = message.find('div', onclick=True)
        if link_tag is None:
            continue
        download_uri = extract_message_url(link_tag.get('onclick'))
        if download_uri is None:
            continue
        full_download_url = f'https://elibrary.insightsforliving.org{download_uri}'
        download_message_from_url(full_download_url)
        # Pause between downloads to be polite to the server
        time.sleep(2)
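
A quick note on running this sketch: boto3 resolves credentials through its usual chain (environment variables, ~/.aws/credentials, or an instance role), and the bucket placeholder above must be replaced with a real bucket name first. Assuming the gist is saved as scraper.py (the filename is illustrative), something like the following should work:

    pip install requests beautifulsoup4 boto3
    python scraper.py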