Skip to content

Instantly share code, notes, and snippets.

@armsp
Created May 12, 2018 05:00
Show Gist options
  • Save armsp/53696632735afc27ad678ddc94f870e7 to your computer and use it in GitHub Desktop.
Script that automates downloading a set of PDF documents from http://www.ti.com/analog-circuit/circuit-cookbook.html
# PDF file downloader for TI links.
#
# Scrapes http://www.ti.com/analog-circuit/circuit-cookbook.html for anchors
# pointing at http://www.ti.com/lit/..., follows each link's HTTP redirect
# and the <meta http-equiv="Refresh"> hop on the landing page, and saves the
# final PDF into DOWNLOAD_FOLDER.
from bs4 import BeautifulSoup
import requests
import urllib3
import re
import os

DOWNLOAD_FOLDER = os.path.join("D:", os.sep, "electronics", "TI_pdfs")
COOKBOOK_URL = 'http://www.ti.com/analog-circuit/circuit-cookbook.html'
# Compiled once and hoisted out of the link loop.
LIT_LINK = re.compile("http://www.ti.com/lit/")


def _collect_download_links(url):
    """Return every anchor href on *url* that matches LIT_LINK.

    href=True skips anchors without an href attribute; the original
    link.get('href') could return None and crash re.match.
    """
    page = requests.get(url)
    # Explicit parser: BeautifulSoup without one emits a warning and may
    # pick a different parser per machine.
    soup = BeautifulSoup(page.text, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)
            if LIT_LINK.match(a['href'])]


def _resolve_final_url(link):
    """Follow *link*'s redirect chain and its meta-refresh hop.

    Returns the final document URL, or None if the expected
    redirect / <meta http-equiv="Refresh"> structure is absent.
    """
    response = requests.get(link)
    if not response.history:
        return None  # not redirected: page layout changed, nothing to parse
    print("Request was redirected")
    print(response.url)
    # requests already followed the redirect, so response.text is the
    # meta-refresh landing page -- no second fetch needed.
    soup = BeautifulSoup(response.text, 'html.parser')
    meta = soup.find('meta', attrs={'http-equiv': 'Refresh'})
    if meta is None:
        return None
    # content looks like "0;url=http://...": drop the wait time before the
    # ';', then strip the leading "url=".
    _wait, _, target = meta["content"].partition(";")
    target = target.strip()
    if not target.lower().startswith("url="):
        return None
    return target[4:]


def _download_pdf(pdf_url, folder):
    """Stream *pdf_url* into *folder* (chunked, so large PDFs don't sit
    whole in memory); return the saved path."""
    pdf_name = pdf_url.split('/')[-1]
    pdf_path = os.path.join(folder, pdf_name)
    response = requests.get(pdf_url, stream=True)
    with open(pdf_path, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=8192):
            fd.write(chunk)
    return pdf_path


def main():
    # makedirs + exist_ok: plain os.mkdir raised FileExistsError on re-runs.
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
    links = _collect_download_links(COOKBOOK_URL)
    # First 5 matched links are skipped, presumably non-PDF boilerplate
    # entries on the cookbook page -- TODO confirm against the live page.
    for link in links[5:]:
        final_url = _resolve_final_url(link)
        if final_url is None:
            continue
        _download_pdf(final_url, DOWNLOAD_FOLDER)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment