@maksimKorzh · Last active November 25, 2019
A simple class to inherit from when writing one-time scrapers. The gist has three parts: the Scraper base class (ots.py), a bare TemplateScraper skeleton, and a working AgentScraper example that collects Chrome user agent strings.
#
# Libraries
#
from bs4 import BeautifulSoup
from tabulate import *
import requests
import time
import json
import csv

# Scraper class to inherit from
class Scraper:
    # URLs to crawl
    urls = []

    # Base URL
    base_url = ''

    # The number of pages to be scraped
    page_number = 0

    # Results list
    results = []

    # Run scraper
    def run(self):
        # Loop over the range of pages to crawl
        for index in range(1, self.page_number + 1):
            # Populate URLs list with pages to crawl
            self.urls.append(self.base_url + str(index))

        # Loop over the URLs
        for url in self.urls:
            # Make HTTP GET request
            response = requests.get(url)
            print('GET: %s | Status code: %s' % (url, response.status_code))

            # Call parse method when the response is obtained
            self.parse(response)

            # 2 seconds delay to avoid torturing web sites
            time.sleep(2)

    # User's parse function to extract data
    def parse(self, response):
        pass

    # Pretty print results to console
    def print_results(self):
        # Make sure results available
        if len(self.results):
            # Results in dictionary format case
            if type(self.results[0]) == dict:
                print(tabulate([row.values() for row in self.results], self.results[0].keys(), tablefmt='fancy_grid'))

            # Results in list format case
            if type(self.results[0]) == list:
                print(tabulate(self.results, tablefmt='fancy_grid'))

    # Export results as CSV file
    def export_csv(self, filename):
        # Create file stream
        with open(filename, 'w', newline='') as csv_file:
            # Make sure results available
            if len(self.results):
                # Results in dictionary format case
                if type(self.results[0]) == dict:
                    # Create dictionary writer
                    writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())

                    # Write column names
                    writer.writeheader()

                    # Loop over results
                    for row in self.results:
                        writer.writerow(row)

                # Results in list format case
                elif type(self.results[0]) == list:
                    # Create writer
                    writer = csv.writer(csv_file)

                    # Write results
                    writer.writerows(self.results)

                # Return on unsupported results type
                else:
                    print('ERROR! Unsupported results type!')
                    return

            # Return if no results available
            else:
                print('Failed to export "%s" - no results to store!' % filename)

    # Export results in JSON format
    def export_json(self, filename):
        # Create file stream
        with open(filename, 'w') as json_file:
            # Write data in JSON format
            json_file.write(json.dumps(self.results, indent=2))
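The class above is meant to be subclassed: set base_url and page_number (or fill urls directly), override parse() to append one list or dict per record to self.results, then call run() followed by the print/export helpers. A minimal sketch of that pattern, using a hypothetical target URL and fields that are not part of the original gist:

from lib.ots import *

# Hypothetical subclass: collects <h2> headings from a made-up paginated listing
class TitleScraper(Scraper):
    base_url = 'https://example.com/articles?page='  # placeholder URL, assumption
    page_number = 3

    def parse(self, response):
        content = BeautifulSoup(response.text, 'lxml')
        # One dict per record; its keys become the table headers / CSV columns
        for heading in content.findAll('h2'):
            self.results.append({'page': response.url, 'title': heading.text.strip()})

scraper = TitleScraper()
scraper.run()
scraper.print_results()
scraper.export_csv('./data/titles.csv')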
# Import Scraper class and dependencies
from lib.ots import *

# Create TemplateScraper class inherited from Scraper class
class TemplateScraper(Scraper):
    # URLs to scrape data from (left empty in this template; set base_url and
    # page_number, or fill urls directly, before running)
    urls = []

    # Parse response for each page
    def parse(self, response):
        # Parse content (extraction logic goes here)
        content = BeautifulSoup(response.text, 'lxml')

# Create TemplateScraper instance
scraper = TemplateScraper()

# Run TemplateScraper
scraper.run()

# Pretty print results to console
scraper.print_results()

# Export extracted data to CSV file
scraper.export_csv('./data/template.csv')

# Export extracted data to JSON file
scraper.export_json('./data/template.json')
# Import Scraper class and dependencies
from ots import *

# Scraper collecting Chrome user agent strings from whatismybrowser.com
class AgentScraper(Scraper):
    # Paginated listing of Chrome user agents
    base_url = 'https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/'
    # Number of pages to crawl
    page_number = 2
    # Column names taken from the table header row
    columns = []

    # Parse response for each page
    def parse(self, response):
        # Parse content and locate the user agents table
        content = BeautifulSoup(response.text, 'lxml')
        table = content.find('table')
        rows = table.findAll('tr')

        # Grab column names from the header row on the first page only
        if response.url.split('/')[-1] == '1':
            self.columns = [header.text.strip('\n') for header in rows[0].findAll('th')]

        # Append every data row to the results list
        for row in rows:
            if len(row.findAll('td')):
                self.results.append([data.text for data in row.findAll('td')])
        # Debug output: number of table rows found on this page
        print('rows', len(rows))

# Create AgentScraper instance and crawl the pages
scraper = AgentScraper()
scraper.run()

# Prepend column names so they become the printed table header / first CSV row
scraper.results.insert(0, scraper.columns)
scraper.print_results()
scraper.export_csv('./data/user_agents.csv')
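Because AgentScraper stores each record as a plain list, export_json would dump nested arrays. A possible follow-up, hypothetical and not part of the original gist, is to re-key the rows with the captured column names so the exported JSON objects are self-describing:

# Hypothetical follow-up: turn list rows into dicts keyed by the header columns
# (results[1:] skips the header row inserted above)
keyed_results = [dict(zip(scraper.columns, row)) for row in scraper.results[1:]]
scraper.results = keyed_results
scraper.export_json('./data/user_agents.json')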