Last active
May 23, 2018 10:00
-
-
Save code-for-coffee/c9e3b273fec20a54e26aff47b24c5800 to your computer and use it in GitHub Desktop.
Python Web-Scraping Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
import bs4 | |
from scrapy.selector import HtmlXPathSelector | |
from scrapy.linkextractors import LinkExtractor | |
from scrapy.spiders import Rule | |
import os, nltk, pandas as pd, numpy as np, bs4, urllib, re, robobrowser, requests, csv, collections, scrapy | |
from bs4 import BeautifulSoup, NavigableString, SoupStrainer | |
def visible(element):
    """Return True if *element* (a BeautifulSoup text node) is user-visible page text.

    Used to filter the output of ``soup.findAll(text=True)``: drops text inside
    non-rendered containers (script/style/head/title), whitespace-only nodes,
    residue from HTML comments (" start", " END", IE conditional comments, ...),
    and stray markup fragments that leak through the soup.

    The argument is expected to be a BeautifulSoup ``NavigableString`` (a ``str``
    subclass with a ``.parent`` attribute).
    """
    # Text inside these containers is never rendered to the user.
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False

    # Marker strings left behind by HTML comments / IE conditional comments.
    # str.startswith accepts a tuple of prefixes -- one call replaces 8 branches.
    if element.startswith((' start', ' end', ' mobile', ' END', ' BEGIN',
                           '[if lt IE 10]', '[if lt IE 9]', '[if gt IE 8]')):
        return False
    if element.endswith('[endif]'):
        return False

    # BUGFIX: the original called unicode(), a NameError on Python 3.
    # str() is the Python 3 equivalent for these text nodes.
    text = str(element)

    # Substring junk: third-party widget text and raw markup fragments.
    for needle in ('MailChimp', '<a', '<script', '<option', '<span', '<input'):
        if needle in text:
            return False

    # Whole-string junk: whitespace-only nodes, separators, and comment labels.
    # All original patterns were fully anchored (^...$), so they are equivalent
    # to a fullmatch-style test via re.match.
    # BUGFIX: the original pattern '^ | $' was regex alternation ("starts with a
    # space" OR "is a single space"), silently discarding any text with a leading
    # space; the literal " | " separator is what was clearly intended.
    whole_string_junk = (
        r'^\n$', r'^Begin Body$', r'^End Body$',
        '^ \xa0 $', '^\xa0$',                 # non-breaking-space placeholders
        r'^>$', r'^<$', r'^ $', r'^, $',
        r'^.$',                               # any single character
        r'^ \| $',
        r'^____$', r'^___$',
        r'^Page.+Div.+End$', r'^PAGE.+CONTENT$', r'^.+Div.+End$',
        r'^end top_wording$', r'^end body$', r'^end footerLeft$',
        r'^end top$', r'^end container$', r'^end footer$',
    )
    for pattern in whole_string_junk:
        if re.match(pattern, text):
            return False

    # Deeply indented template residue (newline + two tabs + anything).
    if re.search(r'\n\t\t.+', text):
        return False

    return True
# usage scrapy runspider file.py -o test.json | |
class Scraper(scrapy.Spider):
    """Crawl arstechnica.com and print each page's visible text and outbound links.

    Run with: ``scrapy runspider file.py -o test.json``
    (Note: ``parse`` currently prints rather than yields items, so the ``-o``
    output file will be empty unless yields are added.)
    """

    inc = 1  # incrementing primary-key counter; bumped once per parsed page
    name = "scraper"
    start_urls = [
        'http://arstechnica.com'
    ]
    # NOTE: class-level mutable list -- shared across all Scraper instances.
    visited_urls = []
    allowed_domains = ['arstechnica.com']
    # BUGFIX: the original had no trailing comma, so ``rules`` was a bare Rule
    # object rather than the tuple Scrapy expects.
    # NOTE(review): ``rules`` is only honored by scrapy.spiders.CrawlSpider;
    # plain scrapy.Spider ignores it -- confirm the intended base class.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse', follow=True),
    )

    def parse(self, response):
        """Extract visible text and child URLs from *response*, logging both to stdout."""
        # self.inc at its current value is the pk for this page; advance for the next.
        self.inc = self.inc + 1
        self.visited_urls.append(response.url)

        soup = BeautifulSoup(response.body, 'html.parser')
        data = soup.findAll(text=True)
        # Keep only the nodes the visible() helper classifies as rendered text.
        visible_text = [node for node in data if visible(node)]

        children_page_urls = soup('a')
        children_pages = []
        for link in children_page_urls:
            href = link.get('href')
            if href is None:
                # BUGFIX: the original str()-wrapped a missing href, collecting
                # and printing the literal string "None".
                continue
            url = str(href)
            # Site-relative links ("/foo"): prepend the domain to make them absolute.
            if url.startswith('/'):
                url = self.start_urls[0] + url
            print(url)
            children_pages.append(url)

        # All anchor tags found on the page, as parsed by BeautifulSoup.
        print(children_page_urls)

        joined = ' '.join(visible_text)
        print('**\tSoupy Data')
        print(joined)  # <--- the visible words of the page
        print('********************************')
        print('Hey there! Update!')
        print('Here are all the URLs I visited!')
        print(self.visited_urls)
        print('********************************')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment