Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Last active July 3, 2020 00:32
Show Gist options
  • Save yeiichi/7322bcfd82c80a505d011e8815d53508 to your computer and use it in GitHub Desktop.
Save yeiichi/7322bcfd82c80a505d011e8815d53508 to your computer and use it in GitHub Desktop.
Fetch page data using User-Agent information.
# Version 1.0.1
# 2020-07-03
import requests
import random
from bs4 import BeautifulSoup
# Browser User-Agent strings, keyed by browser name.
# To see the User-Agent your own browser sends, visit ifconfig.me.
# Each value is assembled from adjacent string literals, so the layout of
# the source lines can never leak extra whitespace into the header value.
UA_LIST = {
    'SAFARI': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.1 Safari/605.1.15'
    ),
    'FIREFOX': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
        'Gecko/20100101 Firefox/76.0'
    ),
    'CHROME': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}

# One browser is picked at random when the module loads; the same choice is
# then used for every request made through ``my_headers``.
my_ua = random.choice(list(UA_LIST))
my_headers = {'user-agent': UA_LIST[my_ua]}
class HtmlFetcher:
    """Fetch a web page and parse it into a BeautifulSoup tree.

    Requests are sent with the module-level ``my_headers``, so every
    instance uses the User-Agent that was picked when the module loaded.

    Attributes:
        url: Address of the page to fetch.
        encoding: Text encoding assigned to the response before parsing.
        user_agent: Key of the chosen User-Agent (module-level ``my_ua``).
    """

    def __init__(self, target_url, encoding):
        self.url = target_url
        self.encoding = encoding
        self.user_agent = my_ua

    def soup(self):
        """Fetch ``self.url`` and return the page parsed with lxml.

        Returns:
            A ``BeautifulSoup`` object built from the response text, or
            ``None`` when the request fails (the error is printed).
        """
        # Only the network call sits inside the ``try``: it is the one
        # statement expected to fail (bad URL, DNS error, timeout, ...).
        try:
            response = requests.get(
                self.url,
                headers=my_headers,
                timeout=(3.05, 27),  # (connect, read) timeouts in seconds
            )
        except requests.exceptions.RequestException as exc:
            print(f'Error: {exc}')
            # No response exists on failure; return early instead of
            # touching an unbound ``response`` below.
            return None

        # Override the encoding before ``.text`` decodes the body.
        response.encoding = self.encoding
        return BeautifulSoup(response.text, 'lxml')
if __name__ == '__main__':
    # Ask for the page address and the encoding to apply to the response.
    url_answer = input('URL? >> ')
    encoding_answer = input('Encoding? >> ')
    fetcher = HtmlFetcher(url_answer, encoding_answer)

    # Fetch first so that anything soup() prints appears before the report.
    parsed_page = fetcher.soup()

    report = (
        '\nURL: ', fetcher.url,
        '\nUA : ', fetcher.user_agent,
        '\nEncoding used: ', fetcher.encoding,
        '\nSoup:\n', parsed_page,
    )
    print(*report)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment