Created
July 18, 2023 09:25
-
-
Save toosuto-r/d9317d41920cca5a94803a511c5f38f2 to your computer and use it in GitHub Desktop.
Google scraper blog snippet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM ubuntu:focal

# Provision Chrome and the Python dependencies via the setup script
COPY googleScraperSetup.sh /googleScraperSetup.sh
# Fix: 777 made the script world-writable; read/execute (755) is all that's needed
RUN chmod 755 /googleScraperSetup.sh
RUN /googleScraperSetup.sh

COPY googleScraper.py /googleScraper.py
# Capture stdout/stderr of the scraper in a log file inside the container
CMD python3 /googleScraper.py >/pyoutlog.log 2>&1
# e.g. docker run -d -v outputs:/outputs --name googleScraperDeploy googlescraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import smtplib | |
import os | |
import time | |
import random | |
import schedule | |
import logging | |
from tabulate import tabulate | |
from email.mime.multipart import MIMEMultipart | |
from email.mime.text import MIMEText | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.chrome.service import Service as ChromeService | |
from webdriver_manager.chrome import ChromeDriverManager | |
from bs4 import BeautifulSoup | |
from collections import Counter | |
# Write logs to the mounted /outputs volume so they survive container restarts
# (see the docker run example in the Dockerfile).
logging.basicConfig(
    filename='/outputs/outlogs.log',
    format='%(asctime)s: %(message)s',
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def getPage(url):
    """Fetch `url` with headless Chrome and return the rendered page source.

    Renders via Selenium so JavaScript-built markup is present in the result.
    """
    logger.info(f'Getting URL: {url}')
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')      # required when running as root in Docker
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                              options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Fix: the original never quit the driver, leaking a Chrome process on
        # every call — fatal for a service that scrapes several pages daily.
        driver.quit()
def checkDivClass(soup, d):
    """Return `d` if the divs of class `d` look like search-result cards, else None.

    A result card is expected to have at least two children, the second of
    which carries a 'data-snf' attribute.

    Fixes over the original:
    - The `chk &=` accumulator returned `d` after the *first* passing div and
      left `chk` stuck False after a 'data-snf' failure, so later iterations
      were dead code; every matching div is now validated.
    - `children[1]` can be a NavigableString, which has no `has_attr()` — the
      original raised AttributeError there; non-tag children now fail cleanly.
    """
    found_any = False
    for tag in soup.find_all("div", {'class': d}):
        found_any = True
        children = list(tag.children)
        # Should have at least 2 children (headline link + blurb)
        if len(children) < 2:
            return None
        second = children[1]
        # Second child should be a tag with the 'data-snf' attribute
        if not (hasattr(second, 'has_attr') and second.has_attr('data-snf')):
            return None
    # Preserve the original's result for an empty match set: None.
    return d if found_any else None
def getDivClass(divd, soup):
    """Return the first div class that looks like a result-card container.

    `divd` maps class strings to occurrence counts; a plausible candidate
    appears between 6 and 12 times, then is verified structurally by
    checkDivClass. Returns None when nothing qualifies.
    """
    candidates = (name for name, count in divd.items() if 6 <= count <= 12)
    for name in candidates:
        verified = checkDivClass(soup, name)
        if verified:
            return verified
    return None
def getTable(page):
    """Parse a Google results page into a list of (title, url, snippet) tuples.

    Google's result-card class names are obfuscated and change over time, so
    the container class is detected heuristically by occurrence count and
    structure (see getDivClass/checkDivClass).
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Count appearances of each reconstructed class string. Counter was
    # already imported at module level but unused — the original hand-rolled
    # this with dict.get().
    divd = Counter(
        ' '.join(x['class'])
        for x in soup.find_all('div')
        if x.has_attr('class')
    )
    divClass = getDivClass(divd, soup)
    res = []
    for card in soup.find_all('div', {'class': divClass}):
        # The first two children contain the headline link and blurb
        head, blurb = list(card.children)[:2]
        title = next(head.strings)
        url = card.find('a', href=True)['href']
        res.append((title, url, blurb.text))
    return res
def getGoogleSearch(query, limit):
    """Scrape up to `limit` Google results for `query` into a DataFrame.

    Pages through results ten at a time via the `start=` offset parameter.
    Returns a DataFrame with columns Title, URL, Snippet.
    """
    res = []
    base = 'https://google.com/search?q=' + query.replace(' ', '+') + '&start='
    for offset in range(0, limit, 10):
        page = getPage(base + str(offset))
        # Randomised pause between requests to avoid hammering Google
        time.sleep(random.uniform(5, 10))
        # Fix: was print() — route progress through the module logger so it
        # lands in /outputs/outlogs.log with the rest of the diagnostics.
        logger.info(f'Offset {offset}')
        res += getTable(page)
    return pd.DataFrame(res, columns=['Title', 'URL', 'Snippet'])
def sendResultEmail(data):
    """Email `data` (a DataFrame of results) as a text+HTML alternative message.

    Sends via Gmail over SMTP_SSL to a fixed recipient list.
    """
    subject = "DI Google Results"
    sender = "****@dataintellect.com"
    recipients = ["****@gmail.com","****@dataintellect.com"]
    # NOTE(security): hardcoded credential — should come from an environment
    # variable or secret store rather than living in the source.
    password = "hunter2"
    text = """
Hello,
This is the table of the top 50 results from google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.
{table}
"""
    html = """
<html><body><p>Hello,</p>
<p>This is the table of the top 50 results from google on DI. If you see a full table, the service has restarted or is running for the first time. If you see a small table, these are new results which have recently turned up in the top 50.</p>
{table}
</body></html>
"""
    # Render the DataFrame once per format via tabulate
    plain_part = MIMEText(
        text.format(table=tabulate(data, headers="keys", tablefmt="grid", showindex=False)))
    html_part = MIMEText(
        html.format(table=tabulate(data, headers="keys", tablefmt="html", showindex=False)),
        'html')
    msg = MIMEMultipart("alternative", None, [plain_part, html_part])
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
        smtp_server.login(sender, password)
        smtp_server.sendmail(sender, recipients, msg.as_string())
def runAndUpdNew(query, limit):
    """Scrape, diff against the previous run, persist the union, mail the new rows.

    The outer merge's indicator column identifies rows present only in the
    current scrape ('right_only'); only those are emailed. On the very first
    run there is no history, so the full table goes out.
    """
    current = getGoogleSearch(query, limit)
    # Seed a headers-only history file on first run so the read below succeeds
    if not os.path.isfile('lastrun.csv'):
        current[:0].to_csv('lastrun.csv', index=False)
    previous = pd.read_csv('lastrun.csv')
    merged = pd.merge(previous, current, how='outer', indicator=True)
    # Persist the union (old + new) so previously-seen rows are never re-sent
    merged.drop('_merge', axis=1).to_csv('lastrun.csv', index=False)
    fresh = merged[merged._merge == 'right_only'].drop('_merge', axis=1)
    sendResultEmail(fresh)
def getDIGoogle():
    """Daily job: scrape the top 50 results for "data intellect", excluding
    the company's own site.

    Fix: the original passed a *complete* Google search URL as the query,
    which getGoogleSearch then embedded inside another search URL
    ('https://google.com/search?q=https://www.google.com/search?q=...'),
    producing a malformed request. Pass only the URL-encoded query string.
    """
    runAndUpdNew('%22data+intellect%22+-site%3A%22dataintellect.com%22', 50)
if __name__ == '__main__':
    # Simple scheduler and run on start
    # Run immediately so a fresh (or restarted) container mails the full
    # table straight away, then repeat every morning at 08:00.
    getDIGoogle()
    schedule.every().day.at("08:00").do(getDIGoogle)
    # schedule only triggers jobs when polled; poll once a second forever.
    while True:
        schedule.run_pending()
        time.sleep(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash
# Provision an Ubuntu container with Chrome and the Python packages the
# scraper needs. Run once at image build time (see Dockerfile).
apt-get update
# Install with a non-interactive frontend to get around asking for a timezone
# Otherwise, this is a dependency for Chrome
DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
apt-get install -y pip wget
# Strange version clash - libudev needs to be downgraded
apt-get install -y libudev1=249.11-0ubuntu3
# Download and install Chrome
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
apt install -y -f ./google-chrome-stable_current_amd64.deb
# Fix: webdriver-manager was missing although googleScraper.py imports
# webdriver_manager (ChromeDriverManager), which made the container crash
# on startup with ModuleNotFoundError.
pip install selenium beautifulsoup4 pandas tabulate schedule webdriver-manager
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment