Google scraper blog snippet

Three files follow: the Dockerfile for the deployment image, the scraper itself (googleScraper.py), and the environment setup script run at build time (googleScraperSetup.sh).
Dockerfile:

FROM ubuntu:focal
COPY googleScraperSetup.sh /googleScraperSetup.sh
RUN chmod 777 /googleScraperSetup.sh
RUN /googleScraperSetup.sh
COPY googleScraper.py /googleScraper.py
CMD python3 /googleScraper.py >/pyoutlog.log 2>&1
# e.g. docker run -d -v outputs:/outputs --name googleScraperDeploy googlescraper
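# For reference, a matching build command might look like this, assuming the
# three files sit together in the current directory:
# e.g. docker build -t googlescraper .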
googleScraper.py:

import pandas as pd
import smtplib
import os
import time
import random
import schedule
import logging
from tabulate import tabulate
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

logging.basicConfig(filename='/outputs/outlogs.log', format='%(asctime)s: %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

def getPage(url):
    logger.info(f'Getting URL: {url}')
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1200')
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Quit the browser each time so the long-running scheduler doesn't leak Chrome processes
        driver.quit()

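# Note: ChromeDriverManager().install() resolves a chromedriver matching the
# installed Chrome and caches it locally, so repeated getPage calls should hit
# the cache rather than re-download the driver each time.
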
def checkDivClass(soup, d):
    chk = True
    for x in soup.find_all("div", {'class': d}):
        # Should have at least 2 children
        c = list(x.children)
        chk &= 2 <= len(c)
        if not chk:
            return
        # Second child should have 'data-snf' attr
        chk &= c[1].has_attr('data-snf')
    if chk:
        return d
    return

def getDivClass(divd, soup):
    for a, b in divd.items():
        # Check there are between 6 and 12 appearances
        if 6 <= b <= 12:
            # Verify against other criteria
            if v := checkDivClass(soup, a):
                return v
    return

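# A toy illustration of the heuristic above, on hypothetical markup (the class
# name 'g-card' and the structure are made up, not real Google HTML): a class
# that repeats a result-like number of times, where each such div's second
# child carries data-snf, gets picked out as the organic-result container.
def _demoDivClassHeuristic():
    demo = '<div class="g-card"><a href="#">t</a><div data-snf="1">s</div></div>' * 7
    soup = BeautifulSoup(demo, 'html.parser')
    counts = {}
    for x in soup.find_all('div'):
        if x.has_attr('class'):
            cn = ' '.join(x['class'])
            counts[cn] = counts.get(cn, 0) + 1
    return getDivClass(counts, soup)  # -> 'g-card'
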
def getTable(page):
    soup = BeautifulSoup(page, 'html.parser')
    divd = {}
    divs = soup.find_all('div')
    for x in divs:
        if x.has_attr('class'):
            # Reconstruct div class to inspection state
            cn = ' '.join(x['class'])
            # Count them up
            divd[cn] = divd.get(cn, 0) + 1
    divClass = getDivClass(divd, soup)
    res = []
    for a in soup.find_all('div', {'class': divClass}):
        # The first two children contain the headline link and blurb
        b, c = list(a.children)[:2]
        title = next(b.strings)
        url = a.find('a', href=True)['href']
        desc = c.text
        res.append((title, url, desc))
    return res

def getGoogleSearch(query, limit):
    res = []
    q = 'https://google.com/search?q=' + query.replace(' ', '+') + '&start='
    for l in range(0, limit, 10):
        p = getPage(q + str(l))
        time.sleep(random.uniform(5, 10))
        print(f'Offset {l}')
        t = getTable(p)
        res += t
    r = pd.DataFrame(res, columns=['Title', 'URL', 'Snippet'])
    return r

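# Note: query.replace(' ', '+') only handles spaces. Queries containing quotes
# or colons should be passed already URL-encoded (as getDIGoogle does below),
# or urllib.parse.quote_plus(query) from the standard library could be used.
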
def sendResultEmail(data):
    subject = "DI Google Results"
    sender = "****@dataintellect.com"
    recipients = ["****@gmail.com", "****@dataintellect.com"]
    password = "hunter2"
    text = """
Hello,
This is the table of the top 50 results from Google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.
{table}
"""
    html = """
<html><body><p>Hello,</p>
<p>This is the table of the top 50 results from Google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.</p>
{table}
</body></html>
"""
    text = text.format(table=tabulate(data, headers="keys", tablefmt="grid", showindex=False))
    html = html.format(table=tabulate(data, headers="keys", tablefmt="html", showindex=False))
    msg = MIMEMultipart("alternative", None, [MIMEText(text), MIMEText(html, 'html')])
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
        smtp_server.login(sender, password)
        smtp_server.sendmail(sender, recipients, msg.as_string())

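# Note: for a Gmail or Google Workspace sender, `password` generally has to be
# an app password; Google no longer accepts ordinary account passwords for
# plain SMTP logins on most accounts.
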
def runAndUpdNew(query, limit):
    r = getGoogleSearch(query, limit)
    if not os.path.isfile('lastrun.csv'):
        # First run: seed an empty CSV with the right columns so the merge below works
        r[:0].to_csv('lastrun.csv', index=False)
    oldr = pd.read_csv('lastrun.csv')
    # Outer merge with indicator=True tags each row left_only/right_only/both;
    # right_only rows are results that weren't present in the previous run
    comp = pd.merge(oldr, r, how='outer', indicator=True)
    comp.drop('_merge', axis=1).to_csv('lastrun.csv', index=False)
    data = comp[comp._merge == 'right_only'].drop('_merge', axis=1)
    sendResultEmail(data)

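# A tiny illustration of the merge trick above, with made-up rows: after an
# outer merge with indicator=True, the row tagged 'right_only' exists only in
# the new results, so only it gets emailed.
#   old = pd.DataFrame({'Title': ['a'], 'URL': ['u1'], 'Snippet': ['s1']})
#   new = pd.DataFrame({'Title': ['a', 'b'], 'URL': ['u1', 'u2'], 'Snippet': ['s1', 's2']})
#   comp = pd.merge(old, new, how='outer', indicator=True)
#   comp[comp._merge == 'right_only']   # just the 'b' row
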
def getDIGoogle():
    # Pass only the URL-encoded query terms; getGoogleSearch builds the full
    # https://google.com/search?q=...&start=... URL itself
    runAndUpdNew('%22data+intellect%22+-site%3A%22dataintellect.com%22', 50)

if __name__ == '__main__':
    # Simple scheduler and run on start
    getDIGoogle()
    schedule.every().day.at("08:00").do(getDIGoogle)
    while True:
        schedule.run_pending()
        time.sleep(1)
googleScraperSetup.sh:

#! /bin/bash
apt-get update
# tzdata is a dependency for Chrome; install it with a non-interactive
# frontend so apt doesn't stop to ask for a timezone
DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
apt-get install -y pip wget
# Strange version clash – libudev needs to be downgraded
apt-get install -y libudev1=249.11-0ubuntu3
# Download and install Chrome
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
apt install -y -f ./google-chrome-stable_current_amd64.deb
# webdriver_manager is imported by the scraper, so it must be installed too
pip install selenium webdriver-manager beautifulsoup4 pandas tabulate schedule