Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Created April 15, 2024 17:32
Show Gist options
  • Save greg-randall/5c227c5106543fd61152ccdbcd98b2cc to your computer and use it in GitHub Desktop.
Try several methods of getting a webpage, starting with just a basic page download, then using lynx, then using Pyppeteer which runs Chrome.
import requests
import asyncio
from termcolor import cprint
from pyppeteer import launch
from pyppeteer_stealth import stealth
import subprocess
def page_content_vaild(page_content):
    """Return True when page_content looks like a real, usable page.

    A page is rejected when it is empty/whitespace-only, or when it contains
    any of the known error/placeholder markers (case-insensitive): a 404
    page, a bot-check page, or lynx's own default output.

    NOTE: the misspelling 'vaild' is preserved because callers use this name.
    """
    excluded_strings = ["Page Not Found", "Human Verification", "About Lynx"]
    # Empty or whitespace-only content is never valid.
    if not page_content.strip():
        return False
    lowered = page_content.lower()  # lowercase once, not per marker
    return not any(marker.lower() in lowered for marker in excluded_strings)
def basic_pull(url):
    """Fetch *url* with a plain HTTP GET (fastest, least stealthy method).

    Returns the response body as text when it passes page_content_vaild,
    otherwise False. Network errors, timeouts and HTTP error statuses all
    count as failure (caller falls back to the next method).
    """
    try:
        # A timeout prevents hanging forever on an unresponsive server.
        response = requests.get(url, timeout=30)
        # Treat HTTP errors (404, 500, ...) as failures instead of
        # validating an error page's body.
        response.raise_for_status()
        page_content = response.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP problems, not programming errors.
        return False
    if page_content_vaild(page_content):
        return page_content
    cprint("basic pull failed", "magenta")
    return False
def lynx_pull(url):
    """Fetch *url* via the lynx text browser (`lynx -source <url>`).

    Returns the page source when it passes page_content_vaild, else False.
    Failure to launch lynx (not installed), a timeout, or invalid content
    all return False so the caller can fall back to the next method.
    """
    try:
        # BUG FIX: '-source' and the URL must be separate argv entries.
        # The original passed the single argument "-source <url>", which
        # lynx does not recognize as the -source option at all.
        result = subprocess.run(
            ['lynx', '-source', url],
            stdout=subprocess.PIPE,
            timeout=60,  # don't hang forever on a stuck lynx process
        )
        # errors='replace' keeps a mostly-good page instead of raising on
        # the occasional bad byte.
        page_content = result.stdout.decode('utf-8', errors='replace')
    except (OSError, subprocess.SubprocessError):
        # OSError: lynx missing/not executable; SubprocessError: timeout etc.
        return False
    if page_content_vaild(page_content):
        return page_content
    cprint("lynx pull failed", "magenta")
    return False
def pyppeteer_pull(url):
    """Fetch *url* with headless Chrome via pyppeteer (+ stealth patches).

    Returns the fully rendered page HTML when it passes page_content_vaild,
    otherwise False. Slowest method, but handles JS-rendered pages and most
    bot checks.
    """
    async def get_page_raw():
        browser = await launch()
        try:
            page = await browser.newPage()
            await stealth(page)  # mask headless-browser fingerprints
            await page.goto(url)
            return await page.content()
        finally:
            # FIX: the original never closed the browser, leaking a Chrome
            # process on every call.
            await browser.close()

    try:
        # asyncio.run creates and tears down a fresh event loop;
        # get_event_loop() is deprecated for this use since Python 3.10.
        page_content = asyncio.run(get_page_raw())
    except Exception:
        # Browser launch/navigation failures all mean "this method failed".
        return False
    if page_content_vaild(page_content):
        return page_content
    cprint("pyppeteer pull failed", "magenta")
    return False
def get_page_content(url):
    """Try each fetch strategy in turn and return the first valid result.

    Order of attempts (rearrange to taste):
      1. basic pull  - plain HTTP GET, fastest
      2. lynx        - a real browser, more reliable
      3. pyppeteer   - full headless Chrome, slowest but most reliable

    Returns the page content string, or False when every method fails.
    """
    strategies = (
        ("basic", basic_pull),
        ("lynx", lynx_pull),
        ("pyppeteer", pyppeteer_pull),
    )
    for label, fetch in strategies:
        cprint(f"trying {label} pull", "green")
        content = fetch(url)
        if content:
            return content
    return False
if __name__ == "__main__":
    # Demo: fetch one URL and print the first method's successful result
    # (or False if every method failed). Guarded so importing this module
    # does not trigger a network fetch.
    url = "https://example.com"
    print(get_page_content(url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment