Created
July 18, 2023 09:25
-
-
Save toosuto-r/d9317d41920cca5a94803a511c5f38f2 to your computer and use it in GitHub Desktop.
Google scraper blog snippet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM ubuntu:focal

# Provision Chrome and the Python dependencies via the setup script
COPY googleScraperSetup.sh /googleScraperSetup.sh
# Fix: 777 made the script world-writable; read/execute (755) is all that's needed
RUN chmod 755 /googleScraperSetup.sh
RUN /googleScraperSetup.sh

COPY googleScraper.py /googleScraper.py
# Capture stdout/stderr of the scraper in a log file inside the container
CMD python3 /googleScraper.py >/pyoutlog.log 2>&1
# e.g. docker run -d -v outputs:/outputs --name googleScraperDeploy googlescraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import smtplib | |
import os | |
import time | |
import random | |
import schedule | |
import logging | |
from tabulate import tabulate | |
from email.mime.multipart import MIMEMultipart | |
from email.mime.text import MIMEText | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.chrome.service import Service as ChromeService | |
from webdriver_manager.chrome import ChromeDriverManager | |
from bs4 import BeautifulSoup | |
from collections import Counter | |
# Write logs to the mounted /outputs volume so they survive container restarts
# (see the docker run example in the Dockerfile).
logging.basicConfig(
    filename='/outputs/outlogs.log',
    format='%(asctime)s: %(message)s',
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def getPage(url):
    """Fetch `url` with headless Chrome and return the rendered page source.

    Renders via Selenium so JavaScript-built markup is present in the result.
    """
    logger.info(f'Getting URL: {url}')
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')      # required when running as root in Docker
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                              options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Fix: the original never quit the driver, leaking a Chrome process on
        # every call — fatal for a service that scrapes several pages daily.
        driver.quit()
def checkDivClass(soup, d):
    """Return `d` if the divs of class `d` look like search-result cards, else None.

    A result card is expected to have at least two children, the second of
    which carries a 'data-snf' attribute.

    Fixes over the original:
    - The `chk &=` accumulator returned `d` after the *first* passing div and
      left `chk` stuck False after a 'data-snf' failure, so later iterations
      were dead code; every matching div is now validated.
    - `children[1]` can be a NavigableString, which has no `has_attr()` — the
      original raised AttributeError there; non-tag children now fail cleanly.
    """
    found_any = False
    for tag in soup.find_all("div", {'class': d}):
        found_any = True
        children = list(tag.children)
        # Should have at least 2 children (headline link + blurb)
        if len(children) < 2:
            return None
        second = children[1]
        # Second child should be a tag with the 'data-snf' attribute
        if not (hasattr(second, 'has_attr') and second.has_attr('data-snf')):
            return None
    # Preserve the original's result for an empty match set: None.
    return d if found_any else None
def getDivClass(divd, soup):
    """Return the first div class that looks like a result-card container.

    `divd` maps class strings to occurrence counts; a plausible candidate
    appears between 6 and 12 times, then is verified structurally by
    checkDivClass. Returns None when nothing qualifies.
    """
    candidates = (name for name, count in divd.items() if 6 <= count <= 12)
    for name in candidates:
        verified = checkDivClass(soup, name)
        if verified:
            return verified
    return None
def getTable(page):
    """Parse a Google results page into a list of (title, url, snippet) tuples.

    Google's result-card class names are obfuscated and change over time, so
    the container class is detected heuristically by occurrence count and
    structure (see getDivClass/checkDivClass).
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Count appearances of each reconstructed class string. Counter was
    # already imported at module level but unused — the original hand-rolled
    # this with dict.get().
    divd = Counter(
        ' '.join(x['class'])
        for x in soup.find_all('div')
        if x.has_attr('class')
    )
    divClass = getDivClass(divd, soup)
    res = []
    for card in soup.find_all('div', {'class': divClass}):
        # The first two children contain the headline link and blurb
        head, blurb = list(card.children)[:2]
        title = next(head.strings)
        url = card.find('a', href=True)['href']
        res.append((title, url, blurb.text))
    return res
def getGoogleSearch(query, limit):
    """Scrape up to `limit` Google results for `query` into a DataFrame.

    Pages through results ten at a time via the `start=` offset parameter.
    Returns a DataFrame with columns Title, URL, Snippet.
    """
    res = []
    base = 'https://google.com/search?q=' + query.replace(' ', '+') + '&start='
    for offset in range(0, limit, 10):
        page = getPage(base + str(offset))
        # Randomised pause between requests to avoid hammering Google
        time.sleep(random.uniform(5, 10))
        # Fix: was print() — route progress through the module logger so it
        # lands in /outputs/outlogs.log with the rest of the diagnostics.
        logger.info(f'Offset {offset}')
        res += getTable(page)
    return pd.DataFrame(res, columns=['Title', 'URL', 'Snippet'])
def sendResultEmail(data):
    """Email `data` (a DataFrame of results) as a text+HTML alternative message.

    Sends via Gmail over SMTP_SSL to a fixed recipient list.
    """
    subject = "DI Google Results"
    sender = "****@dataintellect.com"
    recipients = ["****@gmail.com","****@dataintellect.com"]
    # NOTE(security): hardcoded credential — should come from an environment
    # variable or secret store rather than living in the source.
    password = "hunter2"
    text = """
Hello,
This is the table of the top 50 results from google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.
{table}
"""
    html = """
<html><body><p>Hello,</p>
<p>This is the table of the top 50 results from google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.</p>
{table}
</body></html>
"""
    # Render the DataFrame once per format via tabulate
    plain_part = MIMEText(
        text.format(table=tabulate(data, headers="keys", tablefmt="grid", showindex=False)))
    html_part = MIMEText(
        html.format(table=tabulate(data, headers="keys", tablefmt="html", showindex=False)),
        'html')
    msg = MIMEMultipart("alternative", None, [plain_part, html_part])
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
        smtp_server.login(sender, password)
        smtp_server.sendmail(sender, recipients, msg.as_string())
def runAndUpdNew(query, limit):
    """Scrape, diff against the previous run, persist the union, mail the new rows.

    The outer merge's indicator column identifies rows present only in the
    current scrape ('right_only'); only those are emailed. On the very first
    run there is no history, so the full table goes out.
    """
    current = getGoogleSearch(query, limit)
    # Seed a headers-only history file on first run so the read below succeeds
    if not os.path.isfile('lastrun.csv'):
        current[:0].to_csv('lastrun.csv', index=False)
    previous = pd.read_csv('lastrun.csv')
    merged = pd.merge(previous, current, how='outer', indicator=True)
    # Persist the union (old + new) so previously-seen rows are never re-sent
    merged.drop('_merge', axis=1).to_csv('lastrun.csv', index=False)
    fresh = merged[merged._merge == 'right_only'].drop('_merge', axis=1)
    sendResultEmail(fresh)
def getDIGoogle():
    """Daily job: scrape the top 50 results for "data intellect", excluding
    the company's own site.

    Fix: the original passed a *complete* Google search URL as the query,
    which getGoogleSearch then embedded inside another search URL
    ('https://google.com/search?q=https://www.google.com/search?q=...'),
    producing a malformed request. Pass only the URL-encoded query string.
    """
    runAndUpdNew('%22data+intellect%22+-site%3A%22dataintellect.com%22', 50)
if __name__ == '__main__':
    # Simple scheduler and run on start
    # Run immediately so a fresh (or restarted) container mails the full
    # table straight away, then repeat every morning at 08:00.
    getDIGoogle()
    schedule.every().day.at("08:00").do(getDIGoogle)
    # schedule only triggers jobs when polled; poll once a second forever.
    while True:
        schedule.run_pending()
        time.sleep(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash
# Provision an Ubuntu container with Chrome and the Python packages the
# scraper needs. Run once at image build time (see Dockerfile).
apt-get update
# Install with a non-interactive frontend to get around asking for a timezone
# Otherwise, this is a dependency for Chrome
DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
apt-get install -y pip wget
# Strange version clash - libudev needs to be downgraded
apt-get install -y libudev1=249.11-0ubuntu3
# Download and install Chrome
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
apt install -y -f ./google-chrome-stable_current_amd64.deb
# Fix: webdriver-manager was missing although googleScraper.py imports
# webdriver_manager (ChromeDriverManager), which made the container crash
# on startup with ModuleNotFoundError.
pip install selenium beautifulsoup4 pandas tabulate schedule webdriver-manager
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment