Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrapes the best-paper-awards site (jeffhuang.com/best_paper_awards.html) for PDF links
docker build --rm --force-rm -t r.j3ss.co/scrape . 

docker run --rm -it -v $(pwd)/results:/root/papers r.j3ss.co/scrape
FROM python:2-alpine
RUN pip install \
beautifulsoup4 \
requests
COPY papers.py /usr/local/bin/
RUN chmod +x /usr/local/bin/papers.py
WORKDIR /root
CMD ["papers.py"]
#!/usr/local/bin/python
# Import python libraries.
import hashlib
import os
import random
import signal
import sys
from time import sleep
# Import external deps.
from bs4 import BeautifulSoup
import requests
# Handle control+C.
def signal_handler(sig, frame):
print('You pressed Ctrl+C! Exiting...')
sys.exit(0)
def get_google_scholar_pdf(page):
headers={
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
}
page = requests.get(page, headers=headers)
print("page content: " + page.content)
soup = BeautifulSoup(page.content, 'html.parser')
pdf = ""
# Find the link with the pdf.
for link in soup.find_all('a'):
if link is not None:
href = link['href']
print("href: " + href)
if href.endswith(".pdf"):
return pdf
return pdf
def main(page):
# Make the parent directory.
parentDir = "papers"
if not os.path.exists(parentDir):
os.makedirs(parentDir)
page = requests.get(page)
soup = BeautifulSoup(page.content, 'html.parser')
# Find all the table rows with content.
# Skip the first header row.
for row in soup.find_all('table')[0].find_all('tr')[1:]:
links = row.select('td a')
if links is not None:
link = links[0]
text = link.text
href = link['href']
# Follow the href to get the PDF.
pdf = get_google_scholar_pdf(href)
print("pdf: " + pdf)
# Sleep so it does not think we are a bot.
sleep(1)
signal.signal(signal.SIGINT, signal_handler)
main("https://jeffhuang.com/best_paper_awards.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment