Phyks/google_now.py

## google_now.py
#!/usr/bin/env python3
import sys
import urllib.parse

import html2text
import scrapy

from scrapy.crawler import CrawlerProcess

# Items collected by MyPipeline while the crawl runs; the script entry point
# reads the first one back once the crawl has finished.
results = []


class MyPipeline:
    """Scrapy item pipeline that stores every scraped item in ``results``."""

    def process_item(self, item, spider):
        """Record *item* as a plain dict and hand it back.

        Args:
            item: The scraped item (anything ``dict()`` accepts).
            spider: The spider that produced the item (unused).

        Returns:
            The unmodified *item*, so that any later pipeline stage still
            receives it instead of ``None``.
        """
        # Store a copy so later mutation of the item does not alter the
        # recorded result.
        results.append(dict(item))
        return item


class GoogleNowSpider(scrapy.Spider):
    """Fetch one Google results page and reduce it to a single answer.

    The page is classified by the markup it contains, and the matching
    ``parse_*`` helper builds a ``{"type": ..., "result": ...}`` dict.
    """

    name = "googlenow"

    def __init__(self, query="", *args, **kwargs):
        """Point the spider at the google.fr search URL for *query*.

        Args:
            query: Search terms; percent-encoded before being placed in
                the ``q`` parameter of the URL.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        encoded_query = urllib.parse.quote(query)
        search_url = "https://www.google.fr/search?q=%s" % encoded_query
        self.start_urls = [search_url]

    def parse(self, response):
        """Dispatch *response* to the helper matching the page layout.

        Checked in order: an ``h2.r`` element (Google calc), then any child
        of ``td#rhs_block``, then the generic fallback.
        """
        if response.css("h2.r"):
            # Google calc
            return self.parse_calc(response)
        if response.css("td#rhs_block>*"):
            return self.parse_col(response)
        return self.parse_rest(response)

    def parse_calc(self, response):
        """Return the first text node of the ``h2.r`` element.

        Indexing with ``[0]`` assumes at least one text node was selected.
        """
        text_nodes = response.css("h2.r::text")
        return {"type": "calc", "result": text_nodes[0].extract()}

    def parse_col(self, response):
        """Return the first entry of the ``td#rhs_block`` list as text.

        The selected HTML fragment is converted with ``html2text``.
        Indexing with ``[0]`` assumes the XPath selected at least one node.
        """
        first_entry = response.xpath("//td[@id='rhs_block']/ol/*[1]")[0]
        return {
            "type": "col",
            "result": html2text.html2text(first_entry.extract()),
        }

    def parse_rest(self, response):
        """Return the first entry of the ``div#ires`` list as text.

        The selected HTML fragment is converted with ``html2text``.
        Indexing with ``[0]`` assumes the XPath selected at least one node.
        """
        first_entry = response.xpath("//div[@id='ires']/ol/*[1]")[0]
        return {
            "type": "other",
            "result": html2text.html2text(first_entry.extract()),
        }


if __name__ == "__main__":
    # Script entry point: run one crawl for the query given on the command
    # line and print the extracted answer.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s QUERY" % sys.argv[0])

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        # Route scraped items through MyPipeline so they land in `results`.
        'ITEM_PIPELINES': {'__main__.MyPipeline': 1},
        'LOG_LEVEL': 'ERROR'
    })

    # All command-line words are joined into a single search query.
    process.crawl(GoogleNowSpider, query=" ".join(sys.argv[1:]))
    process.start()

    # The crawl may yield nothing (request failed, or the parse callback
    # raised); exit with a message instead of an IndexError traceback.
    if not results:
        sys.exit("No result found.")

    print(results[0]["result"])
	#!/usr/bin/env python3
	import sys
	import urllib.parse

	import html2text
	import scrapy

	from scrapy.crawler import CrawlerProcess

	# Items collected by MyPipeline while the crawl runs; the script entry
	# point reads the first one back once the crawl has finished.
	results = []


	class MyPipeline:
	    """Scrapy item pipeline that stores every scraped item in ``results``."""

	    def process_item(self, item, spider):
	        """Record *item* as a plain dict and hand it back.

	        Args:
	            item: The scraped item (anything ``dict()`` accepts).
	            spider: The spider that produced the item (unused).

	        Returns:
	            The unmodified *item*, so that any later pipeline stage
	            still receives it instead of ``None``.
	        """
	        # Store a copy so later mutation of the item does not alter the
	        # recorded result.
	        results.append(dict(item))
	        return item


	class GoogleNowSpider(scrapy.Spider):
	    """Fetch one Google results page and reduce it to a single answer.

	    The page is classified by the markup it contains, and the matching
	    ``parse_*`` helper builds a ``{"type": ..., "result": ...}`` dict.
	    """

	    name = "googlenow"

	    def __init__(self, query="", *args, **kwargs):
	        """Point the spider at the google.fr search URL for *query*.

	        Args:
	            query: Search terms; percent-encoded before being placed in
	                the ``q`` parameter of the URL.
	            *args: Forwarded to ``scrapy.Spider.__init__``.
	            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
	        """
	        super().__init__(*args, **kwargs)
	        encoded_query = urllib.parse.quote(query)
	        search_url = "https://www.google.fr/search?q=%s" % encoded_query
	        self.start_urls = [search_url]

	    def parse(self, response):
	        """Dispatch *response* to the helper matching the page layout.

	        Checked in order: an ``h2.r`` element (Google calc), then any
	        child of ``td#rhs_block``, then the generic fallback.
	        """
	        if response.css("h2.r"):
	            # Google calc
	            return self.parse_calc(response)
	        if response.css("td#rhs_block>*"):
	            return self.parse_col(response)
	        return self.parse_rest(response)

	    def parse_calc(self, response):
	        """Return the first text node of the ``h2.r`` element.

	        Indexing with ``[0]`` assumes at least one text node was selected.
	        """
	        text_nodes = response.css("h2.r::text")
	        return {"type": "calc", "result": text_nodes[0].extract()}

	    def parse_col(self, response):
	        """Return the first entry of the ``td#rhs_block`` list as text.

	        The selected HTML fragment is converted with ``html2text``.
	        Indexing with ``[0]`` assumes the XPath selected at least one node.
	        """
	        first_entry = response.xpath("//td[@id='rhs_block']/ol/*[1]")[0]
	        return {
	            "type": "col",
	            "result": html2text.html2text(first_entry.extract()),
	        }

	    def parse_rest(self, response):
	        """Return the first entry of the ``div#ires`` list as text.

	        The selected HTML fragment is converted with ``html2text``.
	        Indexing with ``[0]`` assumes the XPath selected at least one node.
	        """
	        first_entry = response.xpath("//div[@id='ires']/ol/*[1]")[0]
	        return {
	            "type": "other",
	            "result": html2text.html2text(first_entry.extract()),
	        }


	if __name__ == "__main__":
	    # Script entry point: run one crawl for the query given on the
	    # command line and print the extracted answer.
	    if len(sys.argv) < 2:
	        sys.exit("Usage: %s QUERY" % sys.argv[0])

	    process = CrawlerProcess({
	        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
	        # Route scraped items through MyPipeline so they land in `results`.
	        'ITEM_PIPELINES': {'__main__.MyPipeline': 1},
	        'LOG_LEVEL': 'ERROR'
	    })

	    # All command-line words are joined into a single search query.
	    process.crawl(GoogleNowSpider, query=" ".join(sys.argv[1:]))
	    process.start()

	    # The crawl may yield nothing (request failed, or the parse callback
	    # raised); exit with a message instead of an IndexError traceback.
	    if not results:
	        sys.exit("No result found.")

	    print(results[0]["result"])