Created
May 12, 2018 05:00
-
-
Save armsp/53696632735afc27ad678ddc94f870e7 to your computer and use it in GitHub Desktop.
Script that automates downloads of a certain set of pdf documents from http://www.ti.com/analog-circuit/circuit-cookbook.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PDF file downloader for TI links
import os
import re
from urllib.parse import urljoin, urlsplit

import requests
import urllib3
from bs4 import BeautifulSoup
# Downloads the PDF documents linked from TI's circuit-cookbook page.
#
# Flow: fetch the index page, collect every anchor whose href starts with the
# TI literature prefix, follow each link to its landing page, read the
# <meta http-equiv="Refresh"> target from that page and save the file it
# points to into DOWNLOAD_FOLDER.

DOWNLOAD_FOLDER = os.path.join("D:", os.sep, "electronics", "TI_pdfs")
COOKBOOK_URL = 'http://www.ti.com/analog-circuit/circuit-cookbook.html'

# Dots are escaped so they only match a literal "."; https is accepted too.
LIT_LINK_PATTERN = re.compile(r"https?://www\.ti\.com/lit/")

# Matches the http-equiv attribute value regardless of letter case.
REFRESH_ATTR = re.compile(r"^refresh$", re.IGNORECASE)

# The original script skipped the first five matching links.
# NOTE(review): the reason is not recorded in the script - confirm it is
# still wanted before changing it.
SKIP_FIRST_LINKS = 5

REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request
CHUNK_SIZE = 8192     # bytes written to disk per iteration while streaming


def find_literature_links(hrefs):
    """Return the hrefs that start with the TI literature prefix.

    Order is preserved.  Missing hrefs (``None`` or empty), which anchors
    without an ``href`` attribute produce, are ignored instead of raising.
    """
    return [href for href in hrefs if href and LIT_LINK_PATTERN.match(href)]


def extract_refresh_url(content, base_url=None):
    """Return the target URL of a meta-refresh ``content`` value.

    ``content`` looks like ``"0; url=http://host/file.pdf"``.  Only the first
    ``;`` separates the delay from the target, so a URL that itself contains
    ``;`` is kept intact.  Optional quotes around the URL are removed, and a
    relative target is resolved against ``base_url`` when one is given.

    Returns ``None`` when ``content`` is missing or has no ``url=`` part.
    """
    if not content:
        return None
    _delay, separator, target = content.partition(";")
    if not separator:
        return None
    target = target.strip()
    if target[:4].lower() != "url=":
        return None
    target = target[4:].strip().strip("'\"")
    if not target:
        return None
    return urljoin(base_url, target) if base_url else target


def pdf_name_from_url(final_url):
    """Return the last path segment of ``final_url``.

    The query string and fragment are not part of the path, so they never
    end up in the file name.  The result is empty when the path ends in "/".
    """
    return urlsplit(final_url).path.rsplit("/", 1)[-1]


def download_pdf(http, link):
    """Resolve one literature link and save the file it leads to.

    ``http`` is the ``urllib3.PoolManager`` used to fetch the landing page.
    Returns the path of the written file, or ``None`` when the landing page
    exposes no usable meta-refresh target.  HTTP and file errors propagate
    to the caller.
    """
    response = requests.get(link, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    if response.history:
        print("Request was redirected")
        print(response.url)
    landing_url = response.url

    # The redirected url then uses meta refresh
    meta_response = http.request('GET', landing_url, timeout=REQUEST_TIMEOUT)
    meta_soup = BeautifulSoup(meta_response.data, "html.parser")
    meta_tag = meta_soup.find('meta', attrs={'http-equiv': REFRESH_ATTR})
    content = meta_tag.get("content") if meta_tag is not None else None
    final_url = extract_refresh_url(content, landing_url)
    if final_url is None:
        print("No meta refresh target found at %s; skipping" % landing_url)
        return None

    pdf_name = pdf_name_from_url(final_url)
    if not pdf_name:
        print("Cannot derive a file name from %s; skipping" % final_url)
        return None
    pdf_path = os.path.join(DOWNLOAD_FOLDER, pdf_name)

    # Stream the body so large files are never held in memory in one piece.
    pdf_response = requests.get(final_url, stream=True,
                                timeout=REQUEST_TIMEOUT)
    try:
        # Check the status first so an error page is never saved as a PDF.
        pdf_response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            for chunk in pdf_response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
    finally:
        pdf_response.close()
    return pdf_path


def main():
    """Download every literature PDF linked from the cookbook page."""
    # exist_ok: re-running the script must not fail on an existing folder.
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
    http = urllib3.PoolManager()
    response = http.request('GET', COOKBOOK_URL, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.data, "html.parser")
    download_links = find_literature_links(
        anchor.get('href') for anchor in soup.find_all('a'))

    for link in download_links[SKIP_FIRST_LINKS:]:
        try:
            download_pdf(http, link)
        except (requests.RequestException,
                urllib3.exceptions.HTTPError,
                OSError) as error:
            # One bad link should not abort the remaining downloads.
            print("Skipping %s: %s" % (link, error))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment