Download arXiv papers from "awesome" repos related to DL and CV
'''Modified from Source -----> "https://github.com/jyguo1729/web-scraping-for-PDF-file" '''
import requests
from bs4 import BeautifulSoup

def get_title(url):
    """Fetch an arXiv abstract page and return the paper title."""
    # e.g. url = 'https://arxiv.org/abs/1108.3525'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    title = soup.select_one('h1.title.mathjax').text.replace('Title:', '')
    return title.strip()
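# Quick check (illustrative paper; the selector assumes arXiv's
# 'h1.title.mathjax' abstract-page markup):
#   get_title('https://arxiv.org/abs/1512.03385')
#   -> 'Deep Residual Learning for Image Recognition'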
# Import the libraries used to query a website
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import os
import sys

# Specify the URL: first command-line argument, with a default fallback
try:
    url = sys.argv[1]
except IndexError:
    url = 'http://web.cs.ucla.edu/~yzsun/classes/2018Fall_CS145/schedule.html'

# Query the website and return the HTML to the variable 'page'
page = urllib.request.urlopen(url)
# Parse the HTML in 'page' and store it in BeautifulSoup format
soup = BeautifulSoup(page, 'html.parser')
# print(soup.prettify())
# Collect every anchor's text and absolute URL
all_link = soup.find_all("a")
A = []
B = []
for link in all_link:
    if not link.get('href'):  # skip anchors without an href
        continue
    A.append(link.contents[0] if link.contents else '')
    B.append(urljoin(url, link['href']))

# Save a description/link table for reference
df = pd.DataFrame(A, columns=['Description'])
df['link'] = B
dirname = os.path.dirname(__file__)
relpath = 'output'
os.makedirs(os.path.join(dirname, relpath), exist_ok=True)  # ensure the output dir exists
path = os.path.join(dirname, relpath, "output.csv")
df.to_csv(path)
for link in B:
    print('*' * 80)
    print(link)
    if 'arxiv' not in link:
        continue
    # Normalize to the PDF URL; the title always comes from the 'abs' page
    if 'abs' in link:
        title = get_title(link)
        link = link.replace('abs', 'pdf') + '.pdf'
    else:
        link = link.replace('/pdf', '/abs')
        link = link.replace('.pdf', '')
        title = get_title(link)
        link = link.replace('abs', 'pdf') + '.pdf'
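    # Illustrative normalization (hypothetical paper ID):
    #   .../abs/1512.03385      -> title from this page; download .../pdf/1512.03385.pdf
    #   .../pdf/1512.03385.pdf  -> rewritten to .../abs/1512.03385 for the title,
    #                              then back to the .pdf URL for download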
    print('download link --------------->', link)
    # Titles may contain separators that are invalid in filenames; sanitize them
    file_name = title.replace('/', '-').replace(':', ' -') + '.pdf'
    print("File name ----> ", file_name)

    # Test whether the link can be opened
    try:
        u = urllib.request.urlopen(link, timeout=500)
    except urllib.error.URLError as e:
        print(e.reason)
        continue

    # Skip the file unless the server reports a PDF content type
    meta = u.info()
    if meta['Content-Type'] != 'application/pdf':
        print(file_name, " is not a PDF file")
        continue

    # Set the absolute path for the file
    path_file_name = os.path.join(dirname, relpath, file_name)
    print("path_file_name is", path_file_name)

    # Download the file (reusing the already-open response)
    with open(path_file_name, 'wb') as f:
        try:
            f.write(u.read())
        except Exception as e:
            print("error in download ...............", e)