Last active
May 23, 2018 10:00
-
-
Save code-for-coffee/c9e3b273fec20a54e26aff47b24c5800 to your computer and use it in GitHub Desktop.
Python Web-Scraping Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
import bs4 | |
from scrapy.selector import HtmlXPathSelector | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.spiders import Rule | |
import os, nltk, pandas as pd, numpy as np, bs4, urllib, re, robobrowser, requests, csv, collections, scrapy | |
from bs4 import BeautifulSoup, NavigableString, SoupStrainer | |
def visible(element):
    """Return True if *element* (a BeautifulSoup text node) is user-visible page text.

    Used to filter the output of ``soup.findAll(text=True)``: drops text inside
    non-rendered containers (script/style/head/title), whitespace-only nodes,
    residue from HTML comments (" start", " END", IE conditional comments, ...),
    and stray markup fragments that leak through the soup.

    The argument is expected to be a BeautifulSoup ``NavigableString`` (a ``str``
    subclass with a ``.parent`` attribute).
    """
    # Text inside these containers is never rendered to the user.
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False

    # Marker strings left behind by HTML comments / IE conditional comments.
    # str.startswith accepts a tuple of prefixes -- one call replaces 8 branches.
    if element.startswith((' start', ' end', ' mobile', ' END', ' BEGIN',
                           '[if lt IE 10]', '[if lt IE 9]', '[if gt IE 8]')):
        return False
    if element.endswith('[endif]'):
        return False

    # BUGFIX: the original called unicode(), a NameError on Python 3.
    # str() is the Python 3 equivalent for these text nodes.
    text = str(element)

    # Substring junk: third-party widget text and raw markup fragments.
    for needle in ('MailChimp', '<a', '<script', '<option', '<span', '<input'):
        if needle in text:
            return False

    # Whole-string junk: whitespace-only nodes, separators, and comment labels.
    # All original patterns were fully anchored (^...$), so they are equivalent
    # to a fullmatch-style test via re.match.
    # BUGFIX: the original pattern '^ | $' was regex alternation ("starts with a
    # space" OR "is a single space"), silently discarding any text with a leading
    # space; the literal " | " separator is what was clearly intended.
    whole_string_junk = (
        r'^\n$', r'^Begin Body$', r'^End Body$',
        '^ \xa0 $', '^\xa0$',                 # non-breaking-space placeholders
        r'^>$', r'^<$', r'^ $', r'^, $',
        r'^.$',                               # any single character
        r'^ \| $',
        r'^____$', r'^___$',
        r'^Page.+Div.+End$', r'^PAGE.+CONTENT$', r'^.+Div.+End$',
        r'^end top_wording$', r'^end body$', r'^end footerLeft$',
        r'^end top$', r'^end container$', r'^end footer$',
    )
    for pattern in whole_string_junk:
        if re.match(pattern, text):
            return False

    # Deeply indented template residue (newline + two tabs + anything).
    if re.search(r'\n\t\t.+', text):
        return False

    return True
# usage scrapy runspider file.py -o test.json | |
class Scraper(scrapy.Spider):
    """Crawl arstechnica.com and print each page's visible text and outbound links.

    Run with: ``scrapy runspider file.py -o test.json``
    (Note: ``parse`` currently prints rather than yields items, so the ``-o``
    output file will be empty unless yields are added.)
    """

    inc = 1  # incrementing primary-key counter; bumped once per parsed page
    name = "scraper"
    start_urls = [
        'http://arstechnica.com'
    ]
    # NOTE: class-level mutable list -- shared across all Scraper instances.
    visited_urls = []
    allowed_domains = ['arstechnica.com']
    # BUGFIX: the original had no trailing comma, so ``rules`` was a bare Rule
    # object rather than the tuple Scrapy expects.
    # NOTE(review): ``rules`` is only honored by scrapy.spiders.CrawlSpider;
    # plain scrapy.Spider ignores it -- confirm the intended base class.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse', follow=True),
    )

    def parse(self, response):
        """Extract visible text and child URLs from *response*, logging both to stdout."""
        # self.inc at its current value is the pk for this page; advance for the next.
        self.inc = self.inc + 1
        self.visited_urls.append(response.url)

        soup = BeautifulSoup(response.body, 'html.parser')
        data = soup.findAll(text=True)
        # Keep only the nodes the visible() helper classifies as rendered text.
        visible_text = [node for node in data if visible(node)]

        children_page_urls = soup('a')
        children_pages = []
        for link in children_page_urls:
            href = link.get('href')
            if href is None:
                # BUGFIX: the original str()-wrapped a missing href, collecting
                # and printing the literal string "None".
                continue
            url = str(href)
            # Site-relative links ("/foo"): prepend the domain to make them absolute.
            if url.startswith('/'):
                url = self.start_urls[0] + url
            print(url)
            children_pages.append(url)

        # All anchor tags found on the page, as parsed by BeautifulSoup.
        print(children_page_urls)

        joined = ' '.join(visible_text)
        print('**\tSoupy Data')
        print(joined)  # <--- the visible words of the page
        print('********************************')
        print('Hey there! Update!')
        print('Here are all the URLs I visited!')
        print(self.visited_urls)
        print('********************************')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment