Skip to content

Instantly share code, notes, and snippets.

@jq2
Created February 1, 2018 07:33
Show Gist options
  • Save jq2/0426f28c0d91c3720dc264e1a7c7e065 to your computer and use it in GitHub Desktop.
[BOT] Google search scraper (Chrome/headless)
#!/usr/bin/env python
# @filename: chrome_remote_dbg.py
# @author: NullDotDEV
# @description: Download and parse Google search results from the command line.
# @last-updated: Thu Feb 1 05:31:28 -02 2018
# ===============================================================================
# HowTo: Using this script.
# Using this script is very simple, just type:
# Example: $ python chrome_remote_dbg.py > list_of_google-search-results-dataset.txt
import os,sys,requests
from selenium import webdriver
from selenium.webdriver import Chrome
base_url = 'https://duckduckgo.org'
chromium_driver_path = '/usr/lib/bin/chromium/chromedriver'
class Goo():
def __init__(self, host, port):
self._host = host
self._port = port
print('An instance of the Goo class was created\n\
Details os this instance:\n\
host:%s\n\
port:%s' % (host, port)
)
self.options = webdriver.ChromeOptions()
self.options.add_argument('--headless')
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-gpu")
#self.options.add_argument("--remote-debugging-port=9222")
self.options.add_argument("--screen-size=1200x800")
self.chrome_drv = webdriver.Chrome(executable_path='/usr/lib/chromium/chromedriver', chrome_options=self.options)
self._page_source = None
def add_option(self, option):
self.options.add_argument(option)
print('Seting up chromium webdriver options')
def get_page(self, page):
print('Starting...')
# driver = webdriver.Remote(command_executor=base_url, desired_capabilities=options.to_capabilities())
self.chrome_drv.get(page)
#print('OKAY, GET')
self._page_source = self.chrome_drv.page_source
#print('The source page lenght is: %s' % len(self._page_source))
#print('EOF')
temp_data = self.chrome_drv.find_elements_by_tag_name('h3')
span_tag = self.chrome_drv.find_elements_by_class_name('st')
ahref_urls = self.chrome_drv.find_elements_by_class_name('_Rm')
#for h3_title, span_st, href_url in zip(temp_data, span_tag, ahref_urls):
#print('TITLE:%s\nDESC:%s\nURL:%s\n\n' % (h3_title.text, span_st.text, href_url.text))
#return self._page_source
return (temp_data, span_tag, ahref_urls)
def go(self, url):
self.chrome_drv.get(url)
"""
>>> import os,sys,requests
>>> from bs4 import BeautifulSoup
>>> from selenium import webdriver
self.options = webdriver.ChromeOptions()
>>> chrome_drv = webdriver.Chrome(executable_path='chromedriver', chrome_options=options,\
service_args=['--dump-dom', 'https://httpbin.org'])
>>> chrome_drv.current_url
>>> data = chrome_drv.page_source
>>> soup = BeautifulSoup(data, 'lxml')
>>> type(data)
>>> chrome_drv.get('https://cse.google.com/?q=remix&cx=<CX_TOKEN>:<CX_TOKEN>')
>>> chrome_drv.title
>>> data = chrome_drv.page_source
>>> soup = BeautifulSoup(data, 'lxml')
>>> for anchor in soup.find_all('a'):
print(anchor.text)
#s_anchors.append(anchor)
for anchor in soup.find_all('a'):
tags = ['http://cse', 'https://code']
if anchor.has_attr('href'):
for tag in tags:
if not anchor['href'].startswith('http://cse.')\
and not anchor['href'].startswith('http://code.'):
print(anchor['href'])
#s_anchors.append(anchor)
from selenium.webdriver.common.keys import Keys
# select the search box field on google search
search_field.clear()
search_field.send_keys('inurl:google')
search_field.send_keys(Keys.ENTER)
search_field = chrome_drv.find_element_by_css_selector('#lst-ib')
# print text output if any
next_page_field.text
# google next page (2 of ???)
next_page_field = chrome_drv.find_element_by_css_selector('#nav > tbody > tr > td:nth-child(2) > a')
# google search results titles
next_page_field = chrome_drv.find_element_by_css_selector('#rso > div:nth-child(1) > div > div:nth-child(1) > div > div > h3 > a')
for i in range(2, 99):
print(chrome_drv.find_element_by_css_selector('#rso > div:nth-child(2) > div > div:nth-child(' + str(i) + ') > div > div > h3 > a').text)
#filter/map
>>> list_of_urls=[]
>>> for i in range(1,100):
list_of_urls.append(chrome_drv.find_element_by_css_selector('#rso > div > div > div:nth-child(' + str(i) + ') > div > div > div > div > div > cite').text.split(sep='...')[0].split(sep=' ')[0])
"""
def main():
host = 'localhost'
port = '9222'
base_url = 'https://google.com.br/search?q=Google+Search&num=100'
gg = Goo(host, port)
gg.go('https://google.com.br/')
g_data = gg.get_page(base_url)
for title, desc, link in zip(g_data[0], g_data[1], g_data[2]):
print('%s\n%s\n%s\n\n' % (title.text, desc.text, link.text))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment