@T31337
Created July 6, 2017 04:13
Scrapy & BeautifulSoup Based Python Spider
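
The script expects a plain-text URL list named download.txt in the working directory (readFile() skips blank lines and lines starting with '#') and writes every link it finds to spiderLog.txt. A minimal sketch of the input file and of running the script, assuming the gist is saved as spider.py (the file name itself is not given in the gist):

# download.txt -- one URL per line, '#' lines are ignored
https://example.com/

python3 spider.py
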
from bs4 import BeautifulSoup
from urllib import request
import urllib.request
import requests
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import lxml.html
import os
global footer, spiderLog, spiderDebug, logFile, fileLocation


def setupGlobals():
    global footer, spiderLog, spiderDebug, logFile, fileLocation
    spiderDebug = True
    logFile = True
    spiderLog = "spiderLog.txt"
    footer = "\n==========================\n"
    fileLocation = "./"


class Data:
    urlsList = []
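

# SpiderCrawler combines Scrapy CrawlSpider rules and callbacks (used when the spider
# runs under the Scrapy engine) with plain requests/BeautifulSoup/lxml helpers that the
# __main__ block at the bottom drives directly.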
class SpiderCrawler(CrawlSpider):
    name = "CrawlSpider"
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(LinkExtractor(allow=(r'category\.php',), deny=(r'subsection\.php',))),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(LinkExtractor(allow=(r'item\.php',)), callback='parse_item'),
    )

    def __init__(self, category=None, *args, **kwargs):
        super(SpiderCrawler, self).__init__(*args, **kwargs)
        self.start_urls = Data.urlsList
        print("Initiating SpiderCrawler...")
        self.headers = {
            'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
            'From': 'www.google.com/',
            'Accept-Encoding': ', '.join(('gzip', 'deflate')),
            'Accept': '*/*',
            'Connection': 'keep-alive',
        }
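
    # readFile(): load a URL list such as download.txt; blank lines and lines starting
    # with '#' are skipped, everything else is appended to Data.urlsList.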
    def readFile(self, file_name):
        fileHeader = "\n=====FileData=====\n"
        print(fileHeader)
        with open(file_name, "r") as f:
            for line in f:
                print(line)
                line = line.strip()
                # Skip blank lines and comment lines starting with '#'.
                if not line or line.startswith("#"):
                    continue
                if spiderDebug:
                    print("\nFound URL: " + line + "\n")
                Data.urlsList.append(line)
        urlsHeader = "\n========UrlsList==========\n"
        print(urlsHeader)
        for url in Data.urlsList:
            print(url)
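
    # Scrapy callback: parse_item() assumes item pages that expose <td id="item_id">,
    # <td id="item_name"> and <td id="item_description"> cells.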
    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        # Return a plain dict instead of a bare scrapy.Item(), which has no declared fields.
        item = {}
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers, dont_filter=True)

    def printLinks(self):
        linksHeader = "\n===========Links==========\n"
        # 'file' is the global spiderLog handle opened in __main__.
        if logFile:
            file.write(linksHeader)
        for url in Data.urlsList:
            r = requests.get(url)
            data = r.text
            soup = BeautifulSoup(data, "lxml")
            for link in soup.find_all('a'):
                href = link.get('href')
                print(href)
                if logFile and href:
                    file.write(href + "\n")
        print("\n=====================\n")

    def getLinks(self):
        # Collect the href of every <a> tag on each URL in Data.urlsList via lxml.
        links = []
        for url in Data.urlsList:
            connection = request.urlopen(url)
            dom = lxml.html.fromstring(connection.read())
            if spiderDebug:
                print("=====Links======\n")
            for link in dom.xpath('//a/@href'):
                print(link)
                if logFile:
                    file.write(link + "\n")
                links.append(link)
        return links

    def getURL(self, url):
        # Fetch the URL through a session that carries the spider's default headers.
        session = requests.Session()
        session.headers.update(self.headers)
        response = session.get(url)
        return response

    def SpiderSite(self, url):
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        return soup

    def make_soup(self, url):
        html = request.urlopen(url).read()
        return BeautifulSoup(html, "lxml")

    def get_category_links(self, section_url):
        html = request.urlopen(section_url).read()
        soup = BeautifulSoup(html, "lxml")
        # Prefix every <a href> on the page with the section URL.
        category_links = [section_url + a["href"] for a in soup.find_all("a", href=True)]
        if spiderDebug:
            print("\nCategoryLinks:\n" + str(category_links))
        return category_links

    def parse_page(self, response):
        self.log("\n\n\n We got data! \n\n\n")
        sites = response.xpath('//ol[@id="result-set"]/li')
        for site in sites:
            item = {}
            item['title'] = site.xpath('./h2/a/text()').extract()
            item['link'] = site.xpath('./h2/a/@href').extract()
            yield item

    def parse(self, response):
        # Note: overriding parse() on a CrawlSpider bypasses the rule-based link following.
        titles = response.xpath("//span[@class='html']")
        for node in titles:
            title = node.xpath("a/text()").extract()
            link = node.xpath("a/@href").extract()
            print(title, link)
            if logFile:
                file.write("{0} | {1}\n".format(title, link))
        return titles

    def get(self, url):
        with request.urlopen(url) as r:
            return r.read()

    def downloadFile(self, url, file=None):
        if not file:
            file = url.split('/')[-1]
        with open(file, 'wb') as f:
            f.write(self.get(url))

    def download(self, url, file_name):
        # open in binary mode
        with open(file_name, "wb") as f:
            # self.get() returns the raw response body as bytes
            data = self.get(url)
            # write to file
            f.write(data)

    def getData(self, url):
        file_name = os.path.expanduser('~/Downloads/web/data.txt')
        fileName = os.path.expanduser('~/Downloads/web/data_download.txt')
        response = urllib.request.urlopen(url)
        data = response.read()  # a `bytes` object
        text = data.decode('utf-8')  # a `str`; this step can't be used if data is binary
        # Download the file from `url` and save it locally under `file_name`:
        urllib.request.urlretrieve(url, file_name)
        # Download the file from `url`, save it in a temporary directory and get the
        # path to it (e.g. '/tmp/tmpb48zma.txt') in the `file_name` variable:
        file_name, headers = urllib.request.urlretrieve(url)
        # Stream the same URL to fileName in 100 kB chunks with requests.
        req = requests.get(url, stream=True)
        with open(fileName, 'wb') as f:
            for chunk in req.iter_content(100000):
                f.write(chunk)

    def extractURLS(self, url):
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        links = soup.find_all("a")
        if spiderDebug:
            print("\n===========URLS=================\n")
            for item in links:
                print(item)
            print("\n================================\n")
        return links
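

# The __main__ block below does not start the Scrapy engine: it reads download.txt,
# fetches each URL with requests via SpiderSite(), and logs every <a href> to spiderLog.txt.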
if __name__ == "__main__":
    setupGlobals()
    file = open(spiderLog, 'w')
    head = "\n=========SpiderInfo===========\n"
    print(head)
    print("Setting Up The Spider...\n")
    crawler = SpiderCrawler()
    # start_requests() is only invoked by Scrapy's engine; this script crawls manually below.
    print("Adding URLS From download.txt To The Spider Crawl List...\n")
    try:
        crawler.readFile('download.txt')
    except Exception as e:
        print("Error:\n" + str(e))
    dataHeader = "\n===========SpiderData==============\n"
    print(dataHeader)
    if logFile:
        file.write(dataHeader)
    for uri in Data.urlsList:
        urlHeader = "\n=============Base URL=================\n" + uri + "\n========================================\n"
        print(urlHeader)
        if logFile:
            file.write(urlHeader)
        links = crawler.SpiderSite(uri)
        for link in links.find_all('a'):
            href = link.get('href')
            print(href)
            if logFile and href:
                file.write(href + "\n")
    if logFile:
        file.write(footer)
    print("===========================")
    print("{0} Saved To {1}".format(file.name, fileLocation))
    print("===========================")
    file.close()
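
The Scrapy callbacks on SpiderCrawler (start_requests, parse, parse_item, and the CrawlSpider rules) only run when the spider is started by Scrapy's engine rather than by the __main__ block above; as noted in parse(), rule-based crawling also needs the parse() override removed. A minimal sketch of running it under the engine with CrawlerProcess, assuming it lives in the same module as SpiderCrawler; the example URL and LOG_LEVEL value are placeholders, not part of the gist:

from scrapy.crawler import CrawlerProcess

Data.urlsList = ["https://example.com/"]          # seed URLs (readFile() can fill this too)
process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(SpiderCrawler)                      # Scrapy instantiates the spider itself
process.start()                                   # blocks until the crawl finishes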