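"""
Decode Google News RSS article URLs back to the original publisher URLs,
routing the decoding request through a random Webshare proxy.

Requires: requests, beautifulsoup4, lxml, and a Webshare API token.
"""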
import json
import random
from typing import Dict, List
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup

API_KEY = ''  # Webshare API token

class WebshareProxyChecker:
    """Fetches the Webshare proxy list and hands out random proxies."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.proxies = []
        self.base_url = "https://proxy.webshare.io/api"

    def fetch_proxies(self):
        """Load the proxy list from the Webshare API into self.proxies."""
        headers = {"Authorization": f"Token {self.api_key}"}
        response = requests.get(f"{self.base_url}/proxy/list/", headers=headers)
        if response.status_code == 200:
            self.proxies = response.json()['results']
            print(f"Fetched {len(self.proxies)} proxies")
        else:
            print(f"Failed to fetch proxies. Status code: {response.status_code}")

    def get_random_proxy(self):
        """Return a random proxy, fetching the list first if it is empty."""
        if not self.proxies:
            self.fetch_proxies()
        return random.choice(self.proxies) if self.proxies else None
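
# Note (assumption): this targets Webshare's legacy /api/proxy/list/ endpoint,
# whose results expose a "ports" dict (used below as proxy['ports']['http']).
# The newer v2 endpoint (/api/v2/proxy/list/) returns a flat "port" field
# instead, so both spots would need adjusting if you migrate.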

def get_decoding_params(gn_art_id: str) -> Dict[str, str]:
    """
    Fetches decoding parameters for a given Google News article ID.

    :param gn_art_id: The Google News article ID.
    :return: A dictionary containing the signature, timestamp, and article ID.
    """
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # The article page embeds the signature and timestamp needed by the
    # batchexecute endpoint as data attributes on the first <div> in <c-wiz>.
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }
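
# A sketch of the dict get_decoding_params returns (values are illustrative
# placeholders, not real tokens):
# {"signature": "<data-n-a-sg>", "timestamp": "<data-n-a-ts>", "gn_art_id": "CBMi..."}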

def decode_urls(articles: List[Dict[str, str]], proxies) -> List[str]:
    """
    Decodes URLs from Google News articles.

    :param articles: A list of dictionaries containing article parameters.
    :param proxies: A requests-style proxies dict, or None for a direct connection.
    :return: A list of decoded URLs.
    """
    # One "garturlreq" RPC entry per article, keyed by the Fbv4je RPC ID.
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]
    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        proxies=proxies,
        data=payload,
    )
    response.raise_for_status()
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]
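
# Note (assumption about the response envelope): batchexecute replies open
# with an anti-XSSI prefix separated from the JSON payload by a blank line,
# which is why the parser above splits on "\n\n" and drops the trailing
# bookkeeping entries; this undocumented format may change without notice.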

def decode_all_urls(encoded_urls: List[str], proxy_checker: WebshareProxyChecker) -> List[str]:
    """
    Decodes all encoded URLs to their original form using a proxy.

    :param encoded_urls: A list of encoded URLs (a single URL string is also accepted).
    :param proxy_checker: An instance of WebshareProxyChecker to fetch proxies.
    :return: A list of decoded URLs.
    :raises TypeError: If the input is not a list of URLs.
    """
    if isinstance(encoded_urls, str):
        encoded_urls = [encoded_urls]
    if not isinstance(encoded_urls, list):
        raise TypeError("Parameter is wrong; a list of URLs is expected")

    # Get a random proxy and build a requests-style proxies dict from it.
    proxy = proxy_checker.get_random_proxy()
    proxies = {
        "http": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}",
        "https": f"http://{proxy['username']}:{proxy['password']}@{proxy['proxy_address']}:{proxy['ports']['http']}",
    } if proxy else None

    # The Google News article ID is the last path segment of the RSS URL.
    articles_params = [
        get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls
    ]
    return decode_urls(articles_params, proxies)

if __name__ == '__main__':
    web_proxy = WebshareProxyChecker(API_KEY)
    encoded_urls = [
        "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
        "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
    ]
    decoded_urls = decode_all_urls(encoded_urls, web_proxy)
    print(decoded_urls)