Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrapes the best-paper-awards site (jeffhuang.com/best_paper_awards.html) for PDF links
docker build --rm --force-rm -t r.j3ss.co/scrape . 

docker run --rm -it -v $(pwd)/results:/root/papers r.j3ss.co/scrape
FROM python:2-alpine
RUN pip install \
beautifulsoup4 \
requests
COPY papers.py /usr/local/bin/
RUN chmod +x /usr/local/bin/papers.py
WORKDIR /root
CMD ["papers.py"]
#!/usr/local/bin/python
# Import python libraries.
import hashlib
import os
import random
import signal
import sys
from time import sleep
# Import external deps.
from bs4 import BeautifulSoup
import requests
# Handle control+C.
def signal_handler(sig, frame):
print('You pressed Ctrl+C! Exiting...')
sys.exit(0)
def get_google_scholar_pdf(page):
headers={
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
}
page = requests.get(page, headers=headers)
print("page content: " + page.content)
soup = BeautifulSoup(page.content, 'html.parser')
pdf = ""
# Find the link with the pdf.
for link in soup.find_all('a'):
if link is not None:
href = link['href']
print("href: " + href)
if href.endswith(".pdf"):
return pdf
return pdf
def main(page):
# Make the parent directory.
parentDir = "papers"
if not os.path.exists(parentDir):
os.makedirs(parentDir)
page = requests.get(page)
soup = BeautifulSoup(page.content, 'html.parser')
# Find all the table rows with content.
# Skip the first header row.
for row in soup.find_all('table')[0].find_all('tr')[1:]:
links = row.select('td a')
if links is not None:
link = links[0]
text = link.text
href = link['href']
# Follow the href to get the PDF.
pdf = get_google_scholar_pdf(href)
print("pdf: " + pdf)
# Sleep so it does not think we are a bot.
sleep(1)
signal.signal(signal.SIGINT, signal_handler)
main("https://jeffhuang.com/best_paper_awards.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment