Skip to content

Instantly share code, notes, and snippets.

@jq2
Created February 1, 2018 07:33
Show Gist options
  • Save jq2/0426f28c0d91c3720dc264e1a7c7e065 to your computer and use it in GitHub Desktop.
[BOT] Google search scraper (Chrome/headless)
#!/usr/bin/env python
# @filename: chrome_remote_dbg.py
# @author: NullDotDEV
# @description: Download and parse Google search results from the command line.
# @last-updated: Thu Feb 1 05:31:28 -02 2018
# ===============================================================================
# HowTo: Using this script.
# Using this script is very simple, just type:
# Example: $ python chrome_remote_dbg.py > list_of_google-search-results-dataset.txt
import os,sys,requests
from selenium import webdriver
from selenium.webdriver import Chrome
base_url = 'https://duckduckgo.org'
chromium_driver_path = '/usr/lib/bin/chromium/chromedriver'
class Goo():
def __init__(self, host, port):
self._host = host
self._port = port
print('An instance of the Goo class was created\n\
Details os this instance:\n\
host:%s\n\
port:%s' % (host, port)
)
self.options = webdriver.ChromeOptions()
self.options.add_argument('--headless')
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-gpu")
#self.options.add_argument("--remote-debugging-port=9222")
self.options.add_argument("--screen-size=1200x800")
self.chrome_drv = webdriver.Chrome(executable_path='/usr/lib/chromium/chromedriver', chrome_options=self.options)
self._page_source = None
def add_option(self, option):
self.options.add_argument(option)
print('Seting up chromium webdriver options')
def get_page(self, page):
print('Starting...')
# driver = webdriver.Remote(command_executor=base_url, desired_capabilities=options.to_capabilities())
self.chrome_drv.get(page)
#print('OKAY, GET')
self._page_source = self.chrome_drv.page_source
#print('The source page lenght is: %s' % len(self._page_source))
#print('EOF')
temp_data = self.chrome_drv.find_elements_by_tag_name('h3')
span_tag = self.chrome_drv.find_elements_by_class_name('st')
ahref_urls = self.chrome_drv.find_elements_by_class_name('_Rm')
#for h3_title, span_st, href_url in zip(temp_data, span_tag, ahref_urls):
#print('TITLE:%s\nDESC:%s\nURL:%s\n\n' % (h3_title.text, span_st.text, href_url.text))
#return self._page_source
return (temp_data, span_tag, ahref_urls)
def go(self, url):
self.chrome_drv.get(url)
"""
>>> import os,sys,requests
>>> from bs4 import BeautifulSoup
>>> from selenium import webdriver
self.options = webdriver.ChromeOptions()
>>> chrome_drv = webdriver.Chrome(executable_path='chromedriver', chrome_options=options,\
service_args=['--dump-dom', 'https://httpbin.org'])
>>> chrome_drv.current_url
>>> data = chrome_drv.page_source
>>> soup = BeautifulSoup(data, 'lxml')
>>> type(data)
>>> chrome_drv.get('https://cse.google.com/?q=remix&cx=<CX_TOKEN>:<CX_TOKEN>')
>>> chrome_drv.title
>>> data = chrome_drv.page_source
>>> soup = BeautifulSoup(data, 'lxml')
>>> for anchor in soup.find_all('a'):
print(anchor.text)
#s_anchors.append(anchor)
for anchor in soup.find_all('a'):
tags = ['http://cse', 'https://code']
if anchor.has_attr('href'):
for tag in tags:
if not anchor['href'].startswith('http://cse.')\
and not anchor['href'].startswith('http://code.'):
print(anchor['href'])
#s_anchors.append(anchor)
from selenium.webdriver.common.keys import Keys
# select the search box field on google search
search_field.clear()
search_field.send_keys('inurl:google')
search_field.send_keys(Keys.ENTER)
search_field = chrome_drv.find_element_by_css_selector('#lst-ib')
# print text output if any
next_page_field.text
# google next page (2 of ???)
next_page_field = chrome_drv.find_element_by_css_selector('#nav > tbody > tr > td:nth-child(2) > a')
# google search results titles
next_page_field = chrome_drv.find_element_by_css_selector('#rso > div:nth-child(1) > div > div:nth-child(1) > div > div > h3 > a')
for i in range(2, 99):
print(chrome_drv.find_element_by_css_selector('#rso > div:nth-child(2) > div > div:nth-child(' + str(i) + ') > div > div > h3 > a').text)
#filter/map
>>> list_of_urls=[]
>>> for i in range(1,100):
list_of_urls.append(chrome_drv.find_element_by_css_selector('#rso > div > div > div:nth-child(' + str(i) + ') > div > div > div > div > div > cite').text.split(sep='...')[0].split(sep=' ')[0])
"""
def main():
host = 'localhost'
port = '9222'
base_url = 'https://google.com.br/search?q=Google+Search&num=100'
gg = Goo(host, port)
gg.go('https://google.com.br/')
g_data = gg.get_page(base_url)
for title, desc, link in zip(g_data[0], g_data[1], g_data[2]):
print('%s\n%s\n%s\n\n' % (title.text, desc.text, link.text))
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment