Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
scrape google
import scrapy
import re
class QuotesSpider(scrapy.Spider):
    """Crawl Google search result pages and record each result's target URL.

    Seed search URLs are read (one per line) from ``input/test1.txt``.
    Every result link is unwrapped from Google's ``/url?q=<target>&sa=...``
    redirect form; each target is appended to ``a.csv`` and also yielded as
    an item so the configured feed export (``FEED_EXPORT_FIELDS``) actually
    receives data.
    """

    name = "microsoft"
    custom_settings = {
        # specifies exported fields and order
        'FEED_EXPORT_FIELDS': ["url"],
    }

    # Compiled once: Google wraps result links as "/url?q=<target>&sa=...".
    _TARGET_RE = re.compile(r'url\?q=(.+)&sa')

    def start_requests(self):
        """Yield one request per non-blank line of the seed file.

        Reading happens lazily at crawl time (not at import time, as the
        original class-body code did), the file is closed deterministically
        by the context manager, and blank lines no longer become empty URLs.
        """
        with open("input/test1.txt") as seeds:
            for line in seeds:
                url = line.strip()
                if url:
                    yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract result URLs from one results page and follow pagination.

        :param response: a Google results page.
        :yields: ``{"url": ...}`` items plus a request for the next page.
        """
        # `file()` is a Python-2-only builtin; `open` works on both.
        with open("a.csv", "a+") as out:
            for href in response.xpath(".//div/h3/a/@href").extract():
                # Explicitly test for the redirect wrapper instead of the
                # original broad `except Exception: pass`, which hid bugs.
                match = self._TARGET_RE.search(href)
                if match:
                    target = match.group(1)
                    out.write(target + "\n")
                    yield {"url": target}

        next_href = response.xpath(
            '//table[@id="nav"]//td[contains(@class, "b") '
            'and position() = last()]/a/@href'
        ).extract_first()
        # On the last results page there is no "next" link and
        # extract_first() returns None; the original then raised
        # TypeError on str + None. Stop paginating cleanly instead.
        if next_href:
            yield scrapy.Request(url='https://www.google.co.in/' + next_href)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment