Python Web Scraping Example
import csv
import re

import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

def visible(element):
    """Return True when a text node is visible page copy rather than markup noise."""
    # Text inside non-rendered tags is never visible.
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    text = str(element)
    # Literal prefixes/suffix that mark template comments and IE conditionals.
    if text.startswith((' start', ' end', ' mobile', ' END', ' BEGIN',
                        '[if lt IE 10]', '[if lt IE 9]', '[if gt IE 8]')):
        return False
    if text.endswith('[endif]'):
        return False
    # Anchored patterns: whitespace-only strings and layout-comment markers.
    match_patterns = (
        r'^\n$', r'^ $', r'^, $', r'^.$', r'^ | $', r'^____$', r'^___$',
        r'^Page.+Div.+End$', r'^PAGE.+CONTENT$', r'^.+Div.+End$',
        r'^end top_wording$', r'^end body$', r'^end footerLeft$',
        r'^end top$', r'^end container$', r'^end footer$',
    )
    if any(re.match(p, text) for p in match_patterns):
        return False
    # Searched patterns: stray markup fragments and boilerplate strings.
    search_patterns = (
        r'^Begin Body$', r'^End Body$', r'^ \xa0 $', r'^\xa0$', 'MailChimp',
        r'^>$', r'^<$', r'\n\t\t.+',
        '<a', '<script', '<option', '<span', '<input',
    )
    if any(re.search(p, text) for p in search_patterns):
        return False
    return True
# usage: scrapy runspider file.py -o test.json
class Scraper(scrapy.Spider):
    inc = 1  # incrementing counter used as a pk for each parsed page
    name = "scraper"
    start_urls = [
        'http://arstechnica.com',
    ]
    visited_urls = []
    allowed_domains = ['arstechnica.com']
    # NOTE: `rules` only take effect on a CrawlSpider; a plain scrapy.Spider
    # ignores this attribute. The trailing comma makes it a one-element tuple.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse', follow=True),
    )
    def parse(self, response):
        # self.inc at its current state is the pk; increment it for the next page
        self.inc += 1
        self.visited_urls.append(response.url)
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.find_all(text=True)
        # the "if" condition filters the nodes down to visible text using the function above
        visible_text = [i for i in data if visible(i)]
        children_page_urls = soup('a')
        children_pages = []
        for link in children_page_urls:
            # if a url only links relative to the root of the domain,
            # prepend the domain name
            url = str(link.get('href'))
            if url[:1] == "/":
                url = self.start_urls[0] + url
            print(url)
            # each url is saved to a row in a CSV after this loop
            children_pages.append(url)
        # important: this is all the urls on the page, formatted
        print(children_page_urls)
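        # Save each child url to a row in a CSV, per the note in the loop above.
        # A minimal sketch; 'children_pages.csv' is an assumed filename.
        with open('children_pages.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            for child_url in children_pages:
                writer.writerow([self.inc, child_url])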
        joined = ' '.join(visible_text)
        print('**\tSoupy Data')
        # save the joined data - this is all of the text (visible words only),
        # i.e. the set of words left after the validation calls in visible()
        print(joined)  # <--- save the words
        print('********************************')
        print('Hey there! Update!')
        print('Here are all the URLs I visited!')
        print(self.visited_urls)
        print('********************************')
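        # For `scrapy runspider file.py -o test.json` to write anything, parse()
        # must yield items. A minimal sketch of one item per page (the field
        # names here are assumptions, not from the original):
        yield {
            'pk': self.inc,
            'url': response.url,
            'text': joined,
            'children': children_pages,
        }
        # Since the `rules` above are ignored on a plain scrapy.Spider, child
        # pages can be followed by hand instead. Only absolute urls are followed
        # here, and Scrapy's built-in dupe filter skips repeats:
        for child_url in children_pages:
            if child_url.startswith('http') and child_url not in self.visited_urls:
                yield scrapy.Request(child_url, callback=self.parse)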
        # earlier experiments, kept for reference:
        # for link in response.css('a'):
        #     print(link.xpath('@href'))  # .extract()
        # for node in response.css('body'):
        #     print('**\tLooping through DOM Elements\t**')
        #     print(node)
        #     print('**\tEnd Loop\t**')
        # self.inc = self.inc + 1
        # for quote in response.css('div.quote'):
        #     yield {
        #         'text': quote.css('span.text::text').extract_first(),
        #         'author': quote.xpath('span/small/text()').extract_first(),
        #     }
        # next_page = response.css('li.next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
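
# The `rules` tuple only takes effect on a CrawlSpider. A minimal sketch of
# that variant, kept commented out so it does not interfere with the spider
# above (needs `from scrapy.spiders import CrawlSpider`; the renamed callback
# is required because CrawlSpider reserves parse() for its own link-following):
#
# class CrawlingScraper(CrawlSpider):
#     name = "crawling-scraper"
#     start_urls = ['http://arstechnica.com']
#     allowed_domains = ['arstechnica.com']
#     rules = (
#         Rule(LinkExtractor(allow=()), callback='parse_page', follow=True),
#     )
#
#     def parse_page(self, response):
#         yield {'url': response.url}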