# lowweihong/sgx_download_links.py

## sgx_download_links.py
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import os

def get_pdf(url, folder, filename, timeout=30):
    """
    Download the file behind an SGX link and save it to disk.

    Args:
        url: Path portion of the link; it is appended verbatim to
            'https://links.sgx.com'.
        folder: Output directory. Created, together with any missing
            parents, if it does not exist yet.
        filename: Name of the file written inside ``folder``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The response body is written to ``folder/filename``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        OSError: If the folder cannot be created or the file cannot be
            written.
    """
    based_url = 'https://links.sgx.com'
    pdf_url = based_url + url
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(pdf_url, timeout=timeout)
    # Fail on 4xx/5xx instead of saving an HTML error page as a "pdf".
    res.raise_for_status()
    # makedirs copes with nested paths and with the folder appearing between
    # an existence check and the creation call.
    os.makedirs(folder, exist_ok=True)
    # Output pdf file
    with open(os.path.join(folder, filename), 'wb') as f:
        f.write(res.content)

# For each record found, fetch its page and download the first linked file.
for _, row in df.iterrows():
    res = requests.get(row['url'])
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and bs4 does not emit a warning).
    soup = BeautifulSoup(res.text, 'html.parser')
    # folder = row['issuer_name'].replace(' ', '_')
    # subfolder = row['category_name'].replace(' ', '_')
    link = soup.find('a')
    if link is None or not link.get('href'):
        # No attached pdf file to be downloaded on specified link
        continue
    try:
        # The anchor text doubles as the output file name.
        get_pdf(link['href'], default_folder, link.text)
    except (requests.RequestException, OSError) as err:
        # Best effort: report the failed download and move on to the next
        # record, without swallowing KeyboardInterrupt or programming errors.
        print('Skipping %s: %s' % (row['url'], err))
	from pydrive.auth import GoogleAuth
	from pydrive.drive import GoogleDrive
	import requests
	import pandas as pd
	from datetime import datetime, timedelta
	from bs4 import BeautifulSoup
	import os

	def get_pdf(url, folder, filename, timeout=30):
		"""
		Download the file behind an SGX link and save it to disk.

		Args:
			url: Path portion of the link; it is appended verbatim to
				'https://links.sgx.com'.
			folder: Output directory. Created, together with any missing
				parents, if it does not exist yet.
			filename: Name of the file written inside ``folder``.
			timeout: Seconds to wait for the server before giving up.

		Returns:
			None. The response body is written to ``folder/filename``.

		Raises:
			requests.RequestException: On connection failure, timeout, or an
				HTTP error status (4xx/5xx).
			OSError: If the folder cannot be created or the file cannot be
				written.
		"""
		based_url = 'https://links.sgx.com'
		pdf_url = based_url + url
		# A timeout keeps one stalled connection from hanging the whole run.
		res = requests.get(pdf_url, timeout=timeout)
		# Fail on 4xx/5xx instead of saving an HTML error page as a "pdf".
		res.raise_for_status()
		# makedirs copes with nested paths and with the folder appearing
		# between an existence check and the creation call.
		os.makedirs(folder, exist_ok=True)
		# Output pdf file
		with open(os.path.join(folder, filename), 'wb') as f:
			f.write(res.content)

	# For each record found, fetch its page and download the first linked file.
	for _, row in df.iterrows():
		res = requests.get(row['url'])
		# Name the parser explicitly so the result does not depend on which
		# optional parsers are installed (and bs4 does not emit a warning).
		soup = BeautifulSoup(res.text, 'html.parser')
		# folder = row['issuer_name'].replace(' ', '_')
		# subfolder = row['category_name'].replace(' ', '_')
		link = soup.find('a')
		if link is None or not link.get('href'):
			# No attached pdf file to be downloaded on specified link
			continue
		try:
			# The anchor text doubles as the output file name.
			get_pdf(link['href'], default_folder, link.text)
		except (requests.RequestException, OSError) as err:
			# Best effort: report the failed download and move on to the
			# next record, without swallowing KeyboardInterrupt or
			# programming errors.
			print('Skipping %s: %s' % (row['url'], err))