Created
August 10, 2019 03:37
-
-
Save lowweihong/8d36ddccd57cc299f2b5352ae714d0a5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pydrive.auth import GoogleAuth | |
from pydrive.drive import GoogleDrive | |
import requests | |
import pandas as pd | |
from datetime import datetime, timedelta | |
from bs4 import BeautifulSoup | |
import os | |
def get_pdf(url, folder, filename):
    """
    Download the PDF behind an SGX link and write it into a directory.

    Args:
        url: Path of the PDF on ``https://links.sgx.com`` (for example the
            ``href`` of an anchor). An absolute URL is used as-is.
        folder: Directory to save into. It is created, together with any
            missing parent directories, when it does not exist yet.
        filename: Name of the output file inside ``folder``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        OSError: If the directory or the output file cannot be written.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    based_url = 'https://links.sgx.com'
    # urljoin copes with hrefs lacking a leading slash and with absolute
    # hrefs, both of which plain string concatenation would mangle.
    pdf_url = urljoin(based_url, url)

    # A timeout prevents one stalled connection from hanging the whole run;
    # 30 seconds is an arbitrary but generous bound.
    res = requests.get(pdf_url, timeout=30)
    # Fail loudly instead of saving an HTML error page under a PDF name.
    res.raise_for_status()

    # exist_ok avoids the check-then-create race and supports nested paths.
    os.makedirs(folder, exist_ok=True)

    # Output pdf file
    with open(os.path.join(folder, filename), 'wb') as f:
        f.write(res.content)
    return
# For every record in df, open its page, locate the first link on it and
# download the file that link points to into default_folder.
for _, row in df.iterrows():
    # Timeout so a single unresponsive page cannot stall the whole run.
    res = requests.get(row['url'], timeout=30)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(res.text, 'html.parser')

    link = soup.find('a')
    if link is None or link.get('href') is None:
        # No attached pdf file to be downloaded on specified link
        continue

    try:
        get_pdf(link['href'], default_folder, link.text)
    except (requests.RequestException, OSError) as exc:
        # Best effort: one failed download must not abort the remaining
        # records, but it should be visible rather than silently dropped.
        print('Skipping download for', row['url'], '-', exc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment