Skip to content

Instantly share code, notes, and snippets.

@vwrs
Created May 22, 2018 13:43
Show Gist options
  • Save vwrs/25a222afc1c4f1d5df97df2db236f4b0 to your computer and use it in GitHub Desktop.
Save vwrs/25a222afc1c4f1d5df97df2db236f4b0 to your computer and use it in GitHub Desktop.
NIPS paper downloader
import os
import subprocess
from urllib.parse import urljoin, urlparse

import requests
import wget
from bs4 import BeautifulSoup
# use Tor
# Both schemes are routed through one local SOCKS5 listener ('127.1' is the
# short form of 127.0.0.1) -- presumably a locally running Tor daemon.
# SOCKS proxies need requests' optional SOCKS support (`requests[socks]`).
local_proxy = 'socks5://127.1:9050'
socks_proxy = {
    'http': local_proxy,
    'https': local_proxy,
}

# Direct request, deliberately WITHOUT the proxy, so the address printed as
# "original IP" really is this machine's own public address. Sending it
# through the proxy as well would make both lines report the same address.
current_ip = requests.get(url='http://icanhazip.com/', timeout=30)
# Print the response body (the IP text), not the Response object itself.
print('original IP: ', current_ip.text.strip())

# Same lookup through the proxy: this is the address remote sites see.
# `verify` only concerns TLS certificates, so it is omitted for this
# plain-HTTP URL.
current_ip = requests.get(
    url='http://icanhazip.com/',
    proxies=socks_proxy,
    timeout=30,
)
print('Tor IP: ', current_ip.text.strip())
# start crawling
http = 'https://papers.nips.cc/'
# Seconds to wait on a page fetch before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# NOTE(review): the page fetches below do not pass `proxies=socks_proxy`, and
# wget is not proxied either, so the crawl itself does not go through Tor --
# confirm whether that is intended.


def resolve_link(page_url, href):
    """Return the absolute on-site URL for *href*, or None if it must be skipped.

    Args:
        page_url: URL of the page on which the link was found; relative
            hrefs are resolved against it.
        href: Raw ``href`` attribute value; may be None when the anchor has
            no such attribute.

    Returns:
        The absolute URL, or None for a missing/empty href, the bare ``'/'``
        home link, and any link that leaves the host of ``http``.
    """
    if not href or href == '/':
        return None
    absolute = urljoin(page_url, href)
    # Stay on the paper site: never fetch or hand a foreign host to wget.
    if urlparse(absolute).netloc != urlparse(http).netloc:
        return None
    return absolute


def list_links(page_url):
    """Fetch *page_url* and return the usable link of every list item.

    For each ``<li>`` inside each ``<ul>`` the first ``<a>`` is taken. Items
    without an anchor, and links rejected by ``resolve_link``, are skipped.
    Duplicates (nested lists are visited more than once) are dropped while
    keeping first-seen order.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html5lib')
    found = []
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            anchor = li.find('a')
            if anchor is None:
                # A list item without any link; nothing to follow.
                continue
            url = resolve_link(page_url, anchor.get('href'))
            if url is not None:
                found.append(url)
    return list(dict.fromkeys(found))


def main():
    """Crawl the index page, then each linked page, and download PDFs via wget."""
    for page_url in list_links(http):
        print(page_url)
        for target in list_links(page_url):
            # Argument list with no shell: a scraped href cannot inject shell
            # commands. -r: recursive, -nc: keep files that already exist,
            # -A pdf: accept only PDFs. The exit status is ignored so one
            # failed download does not stop the crawl.
            subprocess.run(['wget', '-r', '-nc', '-A', 'pdf', target], check=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment