Skip to content

Instantly share code, notes, and snippets.

@armsp
Created May 12, 2018 05:00
Show Gist options
  • Save armsp/53696632735afc27ad678ddc94f870e7 to your computer and use it in GitHub Desktop.
Script that automates downloading a set of PDF documents from http://www.ti.com/analog-circuit/circuit-cookbook.html
# PDF file downloader for TI links.
#
# Scrapes http://www.ti.com/analog-circuit/circuit-cookbook.html for anchors
# pointing at http://www.ti.com/lit/..., follows each link's HTTP redirect
# and the <meta http-equiv="Refresh"> hop on the landing page, and saves the
# final PDF into DOWNLOAD_FOLDER.
from bs4 import BeautifulSoup
import requests
import urllib3
import re
import os

DOWNLOAD_FOLDER = os.path.join("D:", os.sep, "electronics", "TI_pdfs")
COOKBOOK_URL = 'http://www.ti.com/analog-circuit/circuit-cookbook.html'
# Compiled once and hoisted out of the link loop.
LIT_LINK = re.compile("http://www.ti.com/lit/")


def _collect_download_links(url):
    """Return every anchor href on *url* that matches LIT_LINK.

    href=True skips anchors without an href attribute; the original
    link.get('href') could return None and crash re.match.
    """
    page = requests.get(url)
    # Explicit parser: BeautifulSoup without one emits a warning and may
    # pick a different parser per machine.
    soup = BeautifulSoup(page.text, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)
            if LIT_LINK.match(a['href'])]


def _resolve_final_url(link):
    """Follow *link*'s redirect chain and its meta-refresh hop.

    Returns the final document URL, or None if the expected
    redirect / <meta http-equiv="Refresh"> structure is absent.
    """
    response = requests.get(link)
    if not response.history:
        return None  # not redirected: page layout changed, nothing to parse
    print("Request was redirected")
    print(response.url)
    # requests already followed the redirect, so response.text is the
    # meta-refresh landing page -- no second fetch needed.
    soup = BeautifulSoup(response.text, 'html.parser')
    meta = soup.find('meta', attrs={'http-equiv': 'Refresh'})
    if meta is None:
        return None
    # content looks like "0;url=http://...": drop the wait time before the
    # ';', then strip the leading "url=".
    _wait, _, target = meta["content"].partition(";")
    target = target.strip()
    if not target.lower().startswith("url="):
        return None
    return target[4:]


def _download_pdf(pdf_url, folder):
    """Stream *pdf_url* into *folder* (chunked, so large PDFs don't sit
    whole in memory); return the saved path."""
    pdf_name = pdf_url.split('/')[-1]
    pdf_path = os.path.join(folder, pdf_name)
    response = requests.get(pdf_url, stream=True)
    with open(pdf_path, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=8192):
            fd.write(chunk)
    return pdf_path


def main():
    # makedirs + exist_ok: plain os.mkdir raised FileExistsError on re-runs.
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
    links = _collect_download_links(COOKBOOK_URL)
    # First 5 matched links are skipped, presumably non-PDF boilerplate
    # entries on the cookbook page -- TODO confirm against the live page.
    for link in links[5:]:
        final_url = _resolve_final_url(link)
        if final_url is None:
            continue
        _download_pdf(final_url, DOWNLOAD_FOLDER)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment