import random
import json
from urllib.parse import quote, urlparse
from typing import List, Dict

import requests
from bs4 import BeautifulSoup

API_KEY = ''
class WebshareProxyChecker:
    """Fetches and serves random proxies from the Webshare proxy API."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.proxies = []
        self.base_url = "https://proxy.webshare.io/api"

    def fetch_proxies(self):
        # Pull the proxy list from the Webshare API using token authentication.
        headers = {
            "Authorization": f"Token {self.api_key}"
        }
        response = requests.get(f"{self.base_url}/proxy/list/", headers=headers)
        if response.status_code == 200:
            self.proxies = response.json()['results']
            print(f"Fetched {len(self.proxies)} proxies")
        else:
            print(f"Failed to fetch proxies. Status code: {response.status_code}")

    def get_random_proxy(self):
        # Lazily fetch the proxy list on first use, then pick one at random.
        if not self.proxies:
            self.fetch_proxies()
        return random.choice(self.proxies) if self.proxies else None
def get_decoding_params(gn_art_id: str) -> Dict[str, str]:
    """
    Fetches decoding parameters for a given Google News article ID.

    :param gn_art_id: The Google News article ID.
    :return: A dictionary containing the signature, timestamp, and article ID.
    """
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    # The signature and timestamp needed by the batchexecute call are stored as
    # data attributes on the first div inside the article's <c-wiz> element.
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }
def decode_urls(articles: List[Dict[str, str]], proxies: Dict[str, str]) -> List[str]:
    """
    Decodes URLs from Google News articles.

    :param articles: A list of dictionaries containing article parameters.
    :param proxies: A requests-style proxies mapping, or None to connect directly.
    :return: A list of decoded URLs.
    """
    # Build one "garturlreq" RPC entry per article for the batchexecute endpoint.
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]

    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        proxies=proxies,
        data=payload,
    )
    response.raise_for_status()

    # The response is a batched envelope; the decoded URL sits at index 1 of each entry.
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]
def decode_all_urls(encoded_urls: List[str], proxy_checker: WebshareProxyChecker) -> List[str]:
    """
    Decodes encoded Google News URLs to their original form using a proxy.

    :param encoded_urls: A list of encoded URLs (a single URL string is also accepted).
    :param proxy_checker: An instance of WebshareProxyChecker to fetch proxies.
    :return: A list of decoded URLs.
    :raises Exception: If the input is not a list of URLs.
    """
    if isinstance(encoded_urls, str):
        encoded_urls = [encoded_urls]
    if not isinstance(encoded_urls, list):
        raise Exception("Parameter is wrong; a list of URLs is expected")

    # Get a random proxy and build a requests-style proxies mapping from it.
    proxy = proxy_checker.get_random_proxy()
    proxies = {
        "http": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}",
        "https": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}"
    } if proxy else None

    # The article ID is the last path segment of the encoded news.google.com URL.
    articles_params = [
        get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls
    ]
    decoded_urls = decode_urls(articles_params, proxies)
    return decoded_urls
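# A minimal optional extension (not part of the original gist): the batchexecute
# endpoint is frequently rate limited, so it can help to retry decode_all_urls a few
# times, drawing a fresh random proxy on each attempt. The helper name, attempt count,
# and the exception type caught below are illustrative assumptions.
def decode_all_urls_with_retries(encoded_urls: List[str],
                                 proxy_checker: WebshareProxyChecker,
                                 attempts: int = 3) -> List[str]:
    last_error = None
    for _ in range(attempts):
        try:
            return decode_all_urls(encoded_urls, proxy_checker)
        except requests.RequestException as err:
            last_error = err  # try again with a different random proxy
    raise last_error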
if __name__ == '__main__':
    web_proxy = WebshareProxyChecker(API_KEY)
    encoded_urls = [
        "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
        "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
    ]
    decoded_urls = decode_all_urls(encoded_urls, web_proxy)
    print(decoded_urls)