Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Created July 30, 2020 06:28
Show Gist options
  • Save yeiichi/86cbd00861d54c64b2251238bb5b8dcc to your computer and use it in GitHub Desktop.
Save yeiichi/86cbd00861d54c64b2251238bb5b8dcc to your computer and use it in GitHub Desktop.
Beautiful Soup simmer pot
# Page fetcher part:
import requests
from bs4 import BeautifulSoup
import random
# User-agent strings, keyed by browser name.
# You can check your own User-Agent at ifconfig.me
UA_LIST = {
    'SAFARI': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.1 Safari/605.1.15'
    ),
    'FIREFOX': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
        'Gecko/20100101 Firefox/76.0'
    ),
    'CHROME': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}
# One browser name is drawn at random when the module is loaded; the
# matching string becomes the 'user-agent' request header.
my_ua = random.choice(list(UA_LIST))
my_headers = {'user-agent': UA_LIST[my_ua]}
class HtmlFetcher:
    """Download a single web page and hand it back as parsed HTML.

    Attributes:
        url: Address of the page to download.
        encoding: Text encoding forced onto the HTTP response before parsing.
        user_agent: Value of the module-level ``my_ua`` at construction time.
    """

    def __init__(self, target_url, encoding='utf_8'):
        self.url, self.encoding = target_url, encoding
        self.user_agent = my_ua

    def soup(self):
        """Fetch ``self.url`` and return it as a ``BeautifulSoup`` tree.

        The request is sent with the module-level ``my_headers``. Any
        exception raised while downloading or parsing is caught, reported
        on standard output as ``Error: <exception>``, and ``None`` is
        returned instead of a tree -- callers must check for ``None``.
        """
        # requests reads a 2-tuple timeout as (connect, read) seconds.
        connect_read_timeout = (3.05, 27)
        try:
            page = requests.get(
                self.url,
                headers=my_headers,
                timeout=connect_read_timeout,
            )
            # Override whatever encoding requests guessed from the headers.
            page.encoding = self.encoding
            parsed = BeautifulSoup(page.text, 'lxml')
        except Exception as exc:
            print(f'Error: {exc}')
            return None
        return parsed
if __name__ == '__main__':
    # Interactive entry point: ask for a URL and dump the prettified HTML.
    target_page = input('Target page URL? >> ')
    page_soup = HtmlFetcher(target_page).soup()
    # soup() prints the error and returns None when the fetch or parse
    # fails; calling .prettify() on that None would bury the real message
    # under an AttributeError traceback, so stop here with a failing
    # exit status instead.
    if page_soup is None:
        raise SystemExit(1)
    print(page_soup.prettify())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment