Download arXiv papers from "awesome" repos related to DL and CV
'''Modified from Source -----> "https://github.com/jyguo1729/web-scraping-for-PDF-file" '''
import requests
from bs4 import BeautifulSoup

def get_title(url):
    """Fetch an arXiv abstract page and return the paper title."""
    # e.g. url = 'https://arxiv.org/abs/1108.3525'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    title = soup.select_one('h1.title.mathjax').text.replace('Title:', '')
    return title.strip()
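# Quick check (illustrative paper; the selector assumes arXiv's
# 'h1.title.mathjax' abstract-page markup):
#   get_title('https://arxiv.org/abs/1512.03385')
#   -> 'Deep Residual Learning for Image Recognition'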
# Import the libraries used to query a website
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import os
import sys

# Specify the URL: first command-line argument, with a default fallback
try:
    url = sys.argv[1]
except IndexError:
    url = 'http://web.cs.ucla.edu/~yzsun/classes/2018Fall_CS145/schedule.html'

# Query the website and return the HTML to the variable 'page'
page = urllib.request.urlopen(url)
# Parse the HTML in 'page' and store it in BeautifulSoup format
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())
# Collect every anchor's text and absolute URL
all_link = soup.find_all("a")
A = []
B = []
for link in all_link:
    if not link.get('href'):  # skip anchors without an href
        continue
    A.append(link.contents[0] if link.contents else '')
    B.append(urljoin(url, link['href']))

# Save a description/link table for reference
df = pd.DataFrame(A, columns=['Description'])
df['link'] = B
dirname = os.path.dirname(__file__)
relpath = 'output'
os.makedirs(os.path.join(dirname, relpath), exist_ok=True)  # ensure the output dir exists
path = os.path.join(dirname, relpath, "output.csv")
df.to_csv(path)
for link in B:
    print('*' * 80)
    print(link)
    if 'arxiv' not in link:
        continue
    # Normalize to the PDF URL; the title always comes from the 'abs' page
    if 'abs' in link:
        title = get_title(link)
        link = link.replace('abs', 'pdf') + '.pdf'
    else:
        link = link.replace('/pdf', '/abs')
        link = link.replace('.pdf', '')
        title = get_title(link)
        link = link.replace('abs', 'pdf') + '.pdf'
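    # Illustrative normalization (hypothetical paper ID):
    #   .../abs/1512.03385      -> title from this page; download .../pdf/1512.03385.pdf
    #   .../pdf/1512.03385.pdf  -> rewritten to .../abs/1512.03385 for the title,
    #                              then back to the .pdf URL for download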
    print('download link --------------->', link)
    # Titles may contain separators that are invalid in filenames; sanitize them
    file_name = title.replace('/', '-').replace(':', ' -') + '.pdf'
    print("File name ----> ", file_name)

    # Test whether the link can be opened
    try:
        u = urllib.request.urlopen(link, timeout=500)
    except urllib.error.URLError as e:
        print(e.reason)
        continue

    # Skip the file unless the server reports a PDF content type
    meta = u.info()
    if meta['Content-Type'] != 'application/pdf':
        print(file_name, " is not a PDF file")
        continue

    # Set the absolute path for the file
    path_file_name = os.path.join(dirname, relpath, file_name)
    print("path_file_name is", path_file_name)

    # Download the file (reusing the already-open response)
    with open(path_file_name, 'wb') as f:
        try:
            f.write(u.read())
        except Exception as e:
            print("error in download ...............", e)