Skip to content

Instantly share code, notes, and snippets.

@4sushi
Created July 1, 2019 13:16
Show Gist options
  • Save 4sushi/330399d9a12992fe11467a2fb462ca96 to your computer and use it in GitHub Desktop.
Save 4sushi/330399d9a12992fe11467a2fb462ca96 to your computer and use it in GitHub Desktop.
Set header with Scrapely lib
# Example: sending custom HTTP request headers when using Scrapely
# Tested with Python 3.6
# pip install requests scrapely
from scrapely import Scraper
from scrapely.htmlpage import HtmlPage
import requests
def get_html_page(url, headers=None, encoding='utf-8', timeout=None):
    """Download *url* with custom request headers and wrap it in an HtmlPage.

    Args:
        url: Address of the page to download.
        headers: Optional dict of HTTP request headers (e.g. a User-Agent)
            passed to ``requests.get``.
        encoding: Codec used to decode the raw response body; also recorded
            on the returned page.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` keeps the original behaviour (no timeout).

    Returns:
        An ``HtmlPage`` built from the decoded body and the response headers.

    Raises:
        Exception: If the server answers with a status code other than 200.
        UnicodeDecodeError: If the body cannot be decoded with *encoding*.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    if r.status_code != 200:
        # Report the actual status so a failure can be diagnosed from the message.
        raise Exception(
            'Bad request: code return != 200 (got %s for %s)' % (r.status_code, url)
        )
    body = r.content.decode(encoding)
    # HtmlPage.headers describes the downloaded page, so store the headers the
    # server sent back rather than echoing the request headers.
    return HtmlPage(url=url, headers=dict(r.headers), body=body, encoding=encoding)
# Train a scraper on one annotated page, then extract the same fields from a
# second, similarly structured page.
s = Scraper()

# Request headers sent with every page download in this example.
headers = {'USER-AGENT': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0'}
encoding = 'utf-8'

# Training page and the values expected to be found in it.
url1 = 'http://pypi.python.org/pypi/w3lib/1.1'
data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'}
html_page = get_html_page(url1, headers, encoding)
s.train_from_htmlpage(html_page, data)

# Scraper.scrape(url) downloads the page itself and would ignore the custom
# headers, so fetch the page with get_html_page and hand it to scrape_page.
url2 = 'http://pypi.python.org/pypi/Django/1.3'
print(s.scrape_page(get_html_page(url2, headers, encoding)))
# Example output:
# [{'description': ['A high-level Python Web framework that encourages rapid development and clean, pragmatic design.'], 'name': ['\n Django 1.3\n ']}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment