# NOTE: This file contains bidirectional Unicode text that may be interpreted
# or compiled differently than what appears below. To review, open the file in
# an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
import requests
from random_useragent import random_useragent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
class RequestsHelper(object):
    """HTTP request helper built around a retrying ``requests`` session.

    GET and POST requests are sent with a freshly generated desktop
    User-Agent and a timeout.  When the answer looks like a Cloudflare
    anti-DDoS challenge page, the request is repeated through ``cfscrape``.
    Finally the response encoding is forced to ``Windows-1254`` or ``utf-8``
    depending on what the body contains.
    """

    DEFAULT_TIMEOUT = 5
    DEFAULT_RETRIES = 5
    DEFAULT_BACKOFF_FACTOR = 0.3
    DEFAULT_STATUS_FORCELIST = (500, 502, 504)

    # Class-level fallback so the request methods still have a timeout on an
    # instance whose ``__init__`` was bypassed.
    timeout = DEFAULT_TIMEOUT

    def __init__(
        self,
        timeout=DEFAULT_TIMEOUT,
        retries=DEFAULT_RETRIES,
        backoff_factor=DEFAULT_BACKOFF_FACTOR,
        status_forcelist=DEFAULT_STATUS_FORCELIST,
    ):
        """
        Create the shared session and mount a retrying adapter on it

        :param timeout: Timeout in seconds passed to every request
        :param int retries: Value used for the total, read and connect
            retry limits
        :param float backoff_factor: Backoff factor handed to ``Retry``
        :param status_forcelist: HTTP status codes handed to ``Retry`` as
            ``status_forcelist``
        """
        super(RequestsHelper, self).__init__()
        self.timeout = timeout

        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)

        # One session is kept for the lifetime of the helper; the same
        # adapter (and therefore the same retry policy) serves both schemes.
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        self.session = session

    def _generate_random_useragent(self):
        """
        Generate random user-agent string

        :return: User-agent string
        """
        randomizer = random_useragent.Randomize()
        return randomizer.random_agent("desktop", "windows")

    def _is_cloudflare_protected(self, r):
        """
        Check if the response is a Cloudflare anti-DDoS challenge page

        :param object r: Response exposing ``status_code``, ``headers`` and
            ``content``
        :return: Cloudflare anti-ddos challenge status
        :rtype: bool
        """
        if r.status_code != 503:
            return False
        # Header values are compared case-insensitively so that variants
        # such as "Cloudflare" or "cloudflare-nginx" are recognised too.
        server = r.headers.get("Server", "")
        if not server.lower().startswith("cloudflare"):
            return False
        body = r.content
        return b"jschl_vc" in body and b"jschl_answer" in body

    def _apply_encoding(self, r):
        """
        Force the response encoding based on a marker in the raw body

        :param object r: Response exposing ``content`` and ``encoding``
        :return: The same response object, with ``encoding`` set
        """
        # The leading letter is left out of the marker so that both
        # "Windows-1254" and "windows-1254" match.
        if b"indows-1254" in r.content:
            r.encoding = "Windows-1254"
        else:
            r.encoding = "utf-8"
        return r

    def _request(self, method, url, **kwargs):
        """
        Send a request, retry through cfscrape if challenged, fix encoding

        :param str method: HTTP method name, e.g. ``"GET"`` or ``"POST"``
        :param str url: Endpoint or page URL
        :param kwargs: Extra arguments forwarded to the session (and to the
            cfscrape scraper when the fallback is used)
        :return: Requests object
        """
        kwargs.setdefault("timeout", self.timeout)
        headers = {"User-Agent": self._generate_random_useragent()}
        r = self.session.request(method, url, headers=headers, **kwargs)

        if self._is_cloudflare_protected(r):
            # Imported lazily: cfscrape is only needed on challenged pages.
            import cfscrape  # pip install cfscrape

            scraper = cfscrape.create_scraper()
            # The random User-Agent is deliberately not forwarded here; the
            # scraper is left to use its own headers.
            r = scraper.request(method, r.url, **kwargs)

        return self._apply_encoding(r)

    def resolve_redirection(self, url):
        """
        Resolve redirection and return final url without download page

        :param str url: URL to follow
        :return: Return final url
        """
        # A timeout is passed so an unresponsive host cannot block forever.
        r = self.session.head(url, allow_redirects=True, timeout=self.timeout)
        return r.url

    def get(self, url):
        """
        Fetch page content or make HTTP GET request

        Exceptions raised by the underlying session propagate unchanged.

        :param str url: Endpoint or page URL
        :return: Requests object
        """
        return self._request("GET", url)

    def post(self, url, data):
        """
        Submit form data or make HTTP POST request

        Exceptions raised by the underlying session propagate unchanged.

        :param str url: Endpoint or page URL
        :param dict data: Post data dictionary
        :return: Requests object
        """
        return self._request("POST", url, data=data)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.