# Scrape best papers site
# docker build --rm --force-rm -t <image> .
#
# docker run --rm -it -v $(pwd)/results:/root/papers <image>
# The scraper relies on Python 2 semantics (it concatenates the raw response
# body onto a str), so the base image stays on the Python 2 line.
FROM python:2-alpine

# The script imports both bs4 and requests, so both have to be installed.
# The package list must not end in a line continuation, otherwise the next
# instruction is swallowed into this RUN command.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir \
    beautifulsoup4 \
    requests

# NOTE(review): the COPY source and the installed script name were missing
# from the original instructions; `scrape.py` and `scrape-best-papers` are
# assumed names -- confirm against the build context.
COPY scrape.py /usr/local/bin/scrape-best-papers
RUN chmod +x /usr/local/bin/scrape-best-papers

# The script writes into a relative "papers" directory and the documented
# `docker run` command mounts the results volume at /root/papers, so run
# from /root to make the output land on that mount.
WORKDIR /root

# Invoke through the interpreter so the script does not depend on a shebang.
CMD ["python", "/usr/local/bin/scrape-best-papers"]
# Import python libraries.
import hashlib
import os
import random
import signal
import sys
from time import sleep
# Import external deps.
from bs4 import BeautifulSoup
import requests
# Handle control+C.
def signal_handler(sig, frame):
    """Signal callback that announces the interrupt and ends the process.

    Args:
        sig: Number of the signal that was delivered (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        SystemExit: Always, with exit status 0.
    """
    print('You pressed Ctrl+C! Exiting...')
    # Without an explicit exit the handler would just return and the program
    # would keep running, contradicting the message printed above.
    sys.exit(0)
def get_google_scholar_pdf(page):
    """Fetch a page and return the first link on it that points at a PDF.

    Args:
        page: URL of the page to fetch.

    Returns:
        The href of the first anchor whose target ends with ".pdf", or an
        empty string when the page contains no such anchor.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
    }
    response = requests.get(page, headers=headers)
    # response.content is the raw body; concatenating it onto a str relies on
    # the Python 2 interpreter this script is packaged with.
    print("page content: " + response.content)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the link with the pdf.
    for link in soup.find_all('a'):
        # Anchors without an href attribute cannot point at a PDF; skip them
        # instead of failing on the missing attribute.
        href = link.get('href')
        if href is None:
            continue
        print("href: " + href)
        if href.endswith(".pdf"):
            # Hand back the matching link itself rather than the empty default.
            return href
    return ""
def main(page):
    """Walk the first table on ``page`` and look up a PDF link for each row.

    For every non-header row, the first link found in the row's cells is
    passed to get_google_scholar_pdf() and the resulting PDF link is printed.

    Args:
        page: URL of the page that holds the table of papers.
    """
    # Make the parent directory.
    parentDir = "papers"
    if not os.path.exists(parentDir):
        os.makedirs(parentDir)

    response = requests.get(page)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the table rows with content.
    # Skip the first header row.
    for row in soup.find_all('table')[0].find_all('tr')[1:]:
        # NOTE(review): the original call on this line was truncated to
        # `'td a')`; a CSS select on the row is assumed -- confirm.
        links = row.select('td a')
        if not links:
            # Rows without any link have nothing to follow.
            continue
        href = links[0].get('href')
        if href is None:
            continue

        # Follow the href to get the PDF.
        pdf = get_google_scholar_pdf(href)
        print("pdf: " + pdf)

        # Sleep so it does not think we are a bot.
        # NOTE(review): the original delay was not visible; a random pause of
        # 1-5 seconds is an assumed value -- tune as needed.
        sleep(random.uniform(1, 5))
# Register signal_handler as the handler for SIGINT (Ctrl+C).
signal.signal(signal.SIGINT, signal_handler)
