import random
import json
from urllib.parse import quote, urlparse
from typing import List, Dict

import requests
from bs4 import BeautifulSoup

API_KEY = ''
class WebshareProxyChecker:
    """Fetches and serves random proxies from the Webshare proxy API."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.proxies = []
        self.base_url = "https://proxy.webshare.io/api"

    def fetch_proxies(self):
        # Pull the proxy list from the Webshare API using token authentication.
        headers = {
            "Authorization": f"Token {self.api_key}"
        }
        response = requests.get(f"{self.base_url}/proxy/list/", headers=headers)
        if response.status_code == 200:
            self.proxies = response.json()['results']
            print(f"Fetched {len(self.proxies)} proxies")
        else:
            print(f"Failed to fetch proxies. Status code: {response.status_code}")

    def get_random_proxy(self):
        # Lazily fetch the proxy list on first use, then pick one at random.
        if not self.proxies:
            self.fetch_proxies()
        return random.choice(self.proxies) if self.proxies else None
def get_decoding_params(gn_art_id: str) -> Dict[str, str]:
    """
    Fetches decoding parameters for a given Google News article ID.

    :param gn_art_id: The Google News article ID.
    :return: A dictionary containing the signature, timestamp, and article ID.
    """
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    # The signature and timestamp needed by the batchexecute call are stored as
    # data attributes on the first div inside the article's <c-wiz> element.
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }
def decode_urls(articles: List[Dict[str, str]], proxies: Dict[str, str]) -> List[str]:
    """
    Decodes URLs from Google News articles.

    :param articles: A list of dictionaries containing article parameters.
    :param proxies: A requests-style proxies mapping, or None to connect directly.
    :return: A list of decoded URLs.
    """
    # Build one "garturlreq" RPC entry per article for the batchexecute endpoint.
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]

    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        proxies=proxies,
        data=payload,
    )
    response.raise_for_status()

    # The response is a batched envelope; the decoded URL sits at index 1 of each entry.
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]
def decode_all_urls(encoded_urls: List[str], proxy_checker: WebshareProxyChecker) -> List[str]:
    """
    Decodes encoded Google News URLs to their original form using a proxy.

    :param encoded_urls: A list of encoded URLs (a single URL string is also accepted).
    :param proxy_checker: An instance of WebshareProxyChecker to fetch proxies.
    :return: A list of decoded URLs.
    :raises Exception: If the input is not a list of URLs.
    """
    if isinstance(encoded_urls, str):
        encoded_urls = [encoded_urls]
    if not isinstance(encoded_urls, list):
        raise Exception("Parameter is wrong; a list of URLs is expected")

    # Get a random proxy and build a requests-style proxies mapping from it.
    proxy = proxy_checker.get_random_proxy()
    proxies = {
        "http": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}",
        "https": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}"
    } if proxy else None

    # The article ID is the last path segment of the encoded news.google.com URL.
    articles_params = [
        get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls
    ]
    decoded_urls = decode_urls(articles_params, proxies)
    return decoded_urls
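# A minimal optional extension (not part of the original gist): the batchexecute
# endpoint is frequently rate limited, so it can help to retry decode_all_urls a few
# times, drawing a fresh random proxy on each attempt. The helper name, attempt count,
# and the exception type caught below are illustrative assumptions.
def decode_all_urls_with_retries(encoded_urls: List[str],
                                 proxy_checker: WebshareProxyChecker,
                                 attempts: int = 3) -> List[str]:
    last_error = None
    for _ in range(attempts):
        try:
            return decode_all_urls(encoded_urls, proxy_checker)
        except requests.RequestException as err:
            last_error = err  # try again with a different random proxy
    raise last_error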
if __name__ == '__main__':
    web_proxy = WebshareProxyChecker(API_KEY)
    encoded_urls = [
        "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
        "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
    ]
    decoded_urls = decode_all_urls(encoded_urls, web_proxy)
    print(decoded_urls)