Last active
January 16, 2019 16:17
-
-
Save ShaikeA/d081441c4bd85118c33a5b0f3f98bbf4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def random_header(logger):
    """Return request headers with a random User-Agent and a matching Accept.

    A Chrome or Firefox user agent is requested from ``UserAgent()`` (the
    fake-useragent package, https://pypi.org/project/fake-useragent/) with
    roughly equal probability. If that raises ``FakeUserAgentError``, the
    error is logged and one of the hard-coded fallback user agents below is
    chosen instead, so the scraper keeps working.

    Args:
        logger: Object exposing ``error(message)``; used only to record a
            fake-useragent failure.

    Returns:
        dict: ``{"User-Agent": <user agent>, "Accept": <accept header>}``.

    Raises:
        Any exception other than ``FakeUserAgentError`` raised while obtaining
        the user agent propagates unchanged to the caller.
    """
    # Accept header values keyed by browser family.
    accepts = {
        "Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
    }

    try:
        # Preferred path: let fake-useragent supply the user agent.
        ua = UserAgent()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox
    except FakeUserAgentError as error:
        # Save a message into the logs file, then fall back to a static list.
        logger.error("FakeUserAgent didn't work. Generating headers from the pre-defined set of headers. error: {}".format(error))
        # This list should be refreshed from time to time; more user agents
        # can be found at https://developers.whatismybrowser.com/.
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
        ]
        random_user_agent = random.choice(user_agents)

    # The headers are built after the try/except rather than in a ``finally``
    # clause: returning from ``finally`` would swallow unexpected exceptions,
    # and ``random_user_agent`` would be unbound there if one had occurred.
    # The Accept header must match the browser named in the User-Agent.
    if "Firefox" in random_user_agent:
        valid_accept = accepts["Firefox"]
    else:
        valid_accept = accepts["Safari, Chrome"]

    return {"User-Agent": random_user_agent, "Accept": valid_accept}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment