/**
 * This magically uses the batchexecute protocol. It's not documented, but it works.
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2024 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
const fetchDecodedBatchExecute = (id: string) => {
  const s =
    '[[["Fbv4je","[\\"garturlreq\\",[[\\"en-US\\",\\"US\\",[\\"FINANCE_TOP_INDICES\\",\\"WEB_TEST_1_0_0\\"],null,null,1,1,\\"US:en\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],\\"en-US\\",\\"US\\",1,[2,3,4,8],1,0,\\"655000234\\",0,0,null,0],\\"' +
    id +
    '\\"]",null,"generic"]]]';

  return fetch("https://news.google.com/_/DotsSplashUi/data/batchexecute?" + "rpcids=Fbv4je", {
    headers: {
      "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
      Referrer: "https://news.google.com/"
    },
    body: "f.req=" + encodeURIComponent(s),
    method: "POST"
  })
    .then(e => e.text())
    .then(s => {
      const header = '[\\"garturlres\\",\\"';
      const footer = '\\",';
      if (!s.includes(header)) {
        throw new Error("header not found: " + s);
      }
      const start = s.substring(s.indexOf(header) + header.length);
      if (!start.includes(footer)) {
        throw new Error("footer not found");
      }
      const url = start.substring(0, start.indexOf(footer));
      return url;
    });
};
/**
 * Google News started generating encoded, internal URLs for RSS items, e.g.
 * https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en
 *
 * This script decodes such URLs back into the original ones. For example, the URL
 * https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5
 *
 * contains
 * https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
 *
 * The path segment after articles/ is Base64-encoded binary data.
 *
 * The format is the following:
 * <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>]
 *
 * <prefix>    - 0x08, 0x13, 0x22
 * <suffix>    - 0xd2, 0x01, 0x00 (sometimes missing??)
 * <len bytes> - a single byte such as 0x40, or sometimes two bytes such as 0x81 0x01
 *
 * Further example URLs:
 * https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5
 * https://news.google.com/rss/articles/CBMidkFVX3lxTFB1QmFsSi1Zc3dLQkpNLThKTXExWXBGWlE0eERJQ2hLRENIOFJzRTlsRnM1NS1Hc2FlbjdIMlZ3eWNQa0JqeVYzZGs1Y0hKaUtTUko2dmJabUtVMWZob0lNSFNCa3NLQ05ROGh4cVZfVTYyUDVxc2c?oc=5
 *
 * FIXME: What happens if the URL is more than 255 bytes??
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2022 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
export const decodeGoogleNewsUrl = async (sourceUrl: string) => {
  const url = new URL(sourceUrl);
  const path = url.pathname.split("/");
  if (
    url.hostname === "news.google.com" &&
    path.length > 1 &&
    path[path.length - 2] === "articles"
  ) {
    const base64 = path[path.length - 1];
    let str = atob(base64);
    const prefix = Buffer.from([0x08, 0x13, 0x22]).toString("binary");
    if (str.startsWith(prefix)) {
      str = str.substring(prefix.length);
    }
    const suffix = Buffer.from([0xd2, 0x01, 0x00]).toString("binary");
    if (str.endsWith(suffix)) {
      str = str.substring(0, str.length - suffix.length);
    }
    // One or two bytes to skip
    const bytes = Uint8Array.from(str, c => c.charCodeAt(0));
    const len = bytes.at(0)!;
    if (len >= 0x80) {
      str = str.substring(2, len + 2);
    } else {
      str = str.substring(1, len + 1);
    }
    if (str.startsWith("AU_yqL")) {
      // New-style encoding, introduced in July 2024. Not yet known how to decode offline.
      const url = await fetchDecodedBatchExecute(base64);
      return url;
    }
    return str;
  } else {
    return sourceUrl;
  }
};
@DevBey yes, I use datacenter proxies. Here is my plan:
Proxy details:
- 250 proxy servers
- 36 countries
- 1,000 GB bandwidth / month
Refreshes:
- Proxy list automatically refreshes every 7 days
- 1/1 on-demand proxy list refreshes available
- 10 proxy replacements available
I tried using proxies but am still getting 429's. Maybe this is a dumb question, but does one have to use the same proxy to query Google News that one uses to decode the received URL?
What I was doing was using a proxy for the request to https://news.google.com/_/DotsSplashUi/data/batchexecute. I rotated the proxies but very quickly started getting 429's. I tried the Webshare and Brightdata services. For Webshare, I cycled through their 10 free proxies with a 15 second sleep between each request. For Brightdata, I used their default cycling scheme, trying both Datacenter and Residential proxies.
Could someone who is successfully using proxies post a sketch of the method they use?
@vincenzon, could you let me know how many links you need to process? I don't process hundreds of links per second or even per minute. Right now, I use one proxy per 2 requests (the batch call and the id lookup). Maybe Google blocks that free proxy list? I previously had a list of 100 proxies and saw failed requests. It may also be related to the proxies' countries or your HTTP client. You can try https://github.com/apify/got-scraping/ if you use JS/TS.
P.S. Maybe you can share your code; it would help to find the issue.
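In Python/requests terms, "one proxy per 2 requests" just means routing both the article-page fetch (which carries the signature/timestamp attributes) and the batchexecute call through the same proxies mapping. A minimal sketch, assuming a placeholder proxy URL:

```python
import requests

# Placeholder proxy URL; substitute one from your own provider.
PROXY = "http://user:pass@proxy-host:8080"
PROXIES = {"http": PROXY, "https": PROXY}

def fetch_article_page(gn_art_id: str) -> str:
    # Request 1: the article page that carries the signature/timestamp attributes.
    resp = requests.get(
        f"https://news.google.com/articles/{gn_art_id}",
        proxies=PROXIES,
        timeout=30,
    )
    resp.raise_for_status()
    return resp.text

def post_batchexecute(payload: str) -> str:
    # Request 2: the batchexecute call, sent through the same proxy.
    resp = requests.post(
        "https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers={"Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"},
        data=payload,
        proxies=PROXIES,
        timeout=30,
    )
    resp.raise_for_status()
    return resp.text
```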
@sviatoslav-lebediev
Ouch, yeah, then datacenter proxies probably won't work for us, as we make close to 40k requests twice a day, in batches of around 15-20 minutes.
We didn't have this issue with Geonode's residential proxies until the new Google decoding method, where our proxy usage jumped to 2-4 GB a day. Now it's just too expensive.
Can somebody share an example of the payload you are sending to "https://news.google.com/_/DotsSplashUi/data/batchexecute"? I am not sure if my [s and 's are correct in my string creation. I get a 400 error. I assume the JSON is wrong. Thank you.
const articlesReq = [
  'Fbv4je',
  `["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"${article.gn_art_id}",${article.timestamp},"${article.signature}"]`,
];

// YOU HAVE ASKED FOR THIS ?
console.log('json', JSON.stringify([[articlesReq]]));
console.log('req', new URLSearchParams({ 'f.req': JSON.stringify([[articlesReq]]) }).toString());

const response = await axios.post(
  'https://news.google.com/_/DotsSplashUi/data/batchexecute',
  new URLSearchParams({ 'f.req': JSON.stringify([[articlesReq]]) }).toString(),
  {
    headers: { 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8' },
    httpAgent: httpsAgent,
    httpsAgent: httpsAgent,
  },
);
json
[[["Fbv4je","[\"garturlreq\",[[\"X\",\"X\",[\"X\",\"X\"],null,null,1,1,\"US:en\",null,1,null,null,null,null,null,0,1],\"X\",\"X\",1,[1,1,1],1,1,null,0,0,null,0],\"CBMiWkFVX3lxTE5oeWxmMktPRUhtZElHdFUzVjVvd1ROenJ3ZlVnTjdmQWp5TC1odzBONjl3dFR5Q3hyWDVYbWctNnQ3UEJIenBPaUFVc3lyZGFkNUZCcWE2MXR1Z9IBX0FVX3lxTE5vSDlocENUdXo5ZGcwc3VtUkIzd3pnRno1eUNLZUt5N3dCbHZ0cVNPd3Z3YVo5bHEyNlA0VWVOOWZtTWQzbDExV0RqcUNkcEJFWWY2RS1jY0w0UkZGVHNV\",1725891265,\"ATR1dL_XLq-dZDbrw-RmmfSzfjKw\"]"]]]
req
f.req=%5B%5B%5B%22Fbv4je%22%2C%22%5B%5C%22garturlreq%5C%22%2C%5B%5B%5C%22X%5C%22%2C%5C%22X%5C%22%2C%5B%5C%22X%5C%22%2C%5C%22X%5C%22%5D%2Cnull%2Cnull%2C1%2C1%2C%5C%22US%3Aen%5C%22%2Cnull%2C1%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C0%2C1%5D%2C%5C%22X%5C%22%2C%5C%22X%5C%22%2C1%2C%5B1%2C1%2C1%5D%2C1%2C1%2Cnull%2C0%2C0%2Cnull%2C0%5D%2C%5C%22CBMiWkFVX3lxTE5oeWxmMktPRUhtZElHdFUzVjVvd1ROenJ3ZlVnTjdmQWp5TC1odzBONjl3dFR5Q3hyWDVYbWctNnQ3UEJIenBPaUFVc3lyZGFkNUZCcWE2MXR1Z9IBX0FVX3lxTE5vSDlocENUdXo5ZGcwc3VtUkIzd3pnRno1eUNLZUt5N3dCbHZ0cVNPd3Z3YVo5bHEyNlA0VWVOOWZtTWQzbDExV0RqcUNkcEJFWWY2RS1jY0w0UkZGVHNV%5C%22%2C1725891186%2C%5C%22ATR1dL8hcaHFmzP5O4mgzNSWnlQU%5C%22%5D%22%5D%5D%5D
@sviatoslav-lebediev This was very helpful and I could fix my code! Thank you very much!
@sviatoslav-lebediev the code I am using is below. It starts returning 429's pretty quickly.
The main loop is:
def get_news_urls(news, proxies):
    prx_at = 0
    for n in news:
        try:
            prx_at, proxy = get_next_proxy(proxies, prx_at)
            if proxy is not None:
                url = decode_google_news_url(n['url'], proxy['url'])
                n['actual_url'] = url
                print(url)
                sleep(15)
        except Exception as e:
            if str(e).startswith('429 Client Error'):
                proxy['valid'] = False
The get_next_proxy function returns the next proxy from a list in the Webshare case; for Brightdata it is just the BD URL, which does the rotation for you under the hood (at least that's my understanding). The list of news objects is obtained using the gnews package, and the URLs look like:
https://news.google.com/rss/articles/CBMimwFBVV95cUxQTFNqdl84NzFwZkxYNy1lbnVVWnN4X2xqdDlFVzJwNjIxdFI5c0trUnJ3R3prMlg1cVBERVNERDFDNExtQVlVVzBaUjluZ25uUk10NDk3QTBJbmpjbnA1X0tZTUlIaFlQVEJ1eHR4WFlaYkpfaVFUOFg4YlVnZm1RcFlZWDF3Q1d0M3MtLVlkMVpveW1oY244d19wQdIBowFBVV95cUxNUGJwWDRIRHdCRVZtSkVBTnRVelQ1NlVOSjQtSjRlblBOb280NE1mQ2VHYmZlZkF1LXY0TGNUNGxTS2RsaUotT3NKdWVseXdfY2M5c1JmNDNWdXpZMTRSb3RJV0l0VDNGRHJHTjRzNF9iYjNBRlBOMXEtUXlzTnVmYmxNVFZJSmlzOVFzZWpIWjdsX0lIZUFicThZS1cyd05EXzV3?oc=5&hl=en-US&gl=US&ceid=US:en
https://news.google.com/rss/articles/CBMikAFBVV95cUxNOVVNUjcxVS1uajdnZV9wUWxyaVB2S2kxQXdhTkNtdm1qcllkVVczUEp4MjZtSUl5eGJMdm9haC1xWWlPUUNpWndDeDE3NmcwYnR4UUV5TDZfdHNNM1dGbjdOU0o1YlViM1c1MDNMdXp6b21yZ2dqaE1jNnJudjZvYWVGSzVQdWtNdXREcmVSOVjSAaIBQVVfeXFMTU1wUUJTclp5Q2VNVVRGTlJFNW9hbkdjTThuWVFQeklFWFZWY2RvbHctYjVCTWxjZVBrWVZxYzQtUWhKM2lGUzdackZUS01lRVdPbS1vaWwzcjJZX3Q4bnFUbnloSjhhTjVRX1dzbXdiWHo3RzJXN2cwS2hGSERPR25hN3kyck96WWpzNThLbzJSOGlIYTQzRkZhVmRNdkktTXpR?oc=5&hl=en-US&gl=US&ceid=US:en
The decode_google_news_url function is taken from above. I think my only change was to add a header in an attempt to get it to work.
def decode_google_news_url(source_url, proxy_url):
    article = get_decoding_params(urlparse(source_url).path.split("/")[-1])
    articles_req = [
        "Fbv4je",
        f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{article["gn_art_id"]}",{article["timestamp"]},"{article["signature"]}"]',
    ]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers={'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'},
        data=f"f.req={quote(json.dumps([[articles_req]]))}",
        proxies={'http': proxy_url, 'https': proxy_url}
    )
    response.raise_for_status()
    return json.loads(json.loads(response.text.split("\n\n")[1])[:-2][0][2])[1]


def get_decoding_params(gn_art_id):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    }
    response = requests.get(
        f"https://news.google.com/articles/{gn_art_id}", headers=headers
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }
PYTHON SOLUTION
I don't know the exact algorithm used by Google to encode/decode the URLs, but I found a way to decode them using reverse engineering by inspecting the requests made by the browser in the redirection chain.
pip install beautifulsoup4 lxml
import json
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup


def get_decoding_params(gn_art_id):
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }


def decode_urls(articles):
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]
    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        data=payload,
    )
    response.raise_for_status()
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]


# Example usage
encoded_urls = [
    "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
    "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
]

articles_params = [get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls]
decoded_urls = decode_urls(articles_params)
print(decoded_urls)
Works great, and to reduce or get rid of 429 errors, I changed
response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
to
response = requests.get(f"https://news.google.com/rss/articles/{gn_art_id}")
I wanted to do the same. Can you check how many links we can get before we receive a 429 error?
@vincenzon How many links do you need to process? How many proxies do you have? How many requests per second/minute do you make?
Yes, the new challenge is to work around the request limits...
So it's making multiple requests? It seems like Google really doesn't want us to decode URLs. What are the chances they block us?
We can work around this with proxies; there are many free proxies available on the market. I am using Webshare for proxies. I implemented the solution in Python; you can try it, just insert the API key (code in Python).
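Their script itself isn't shown above, but the rotation part of such a setup can be as small as cycling through a proxy list and passing the result to requests' proxies= argument. A minimal sketch; the proxy URLs below are placeholders, not anything Webshare-specific:

```python
from itertools import cycle

# Placeholder proxy URLs; in practice these would be loaded from your
# provider's dashboard or API rather than hard-coded.
PROXY_URLS = [
    "http://user:pass@proxy-1.example.com:8080",
    "http://user:pass@proxy-2.example.com:8080",
]
_pool = cycle(PROXY_URLS)

def next_proxies():
    # Each call rotates to the next proxy and returns the mapping that
    # requests expects for its proxies= argument.
    proxy = next(_pool)
    return {"http": proxy, "https": proxy}
```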
A lot slower, but this is what I used for a while (it uses Selenium to access the URL and capture the redirection; ignore the usage of ValueError, it's just laziness lol):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait


def get_correct_url(url):
    if not url.startswith("https://news.google.com"):
        return url

    # Setup Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")  # Disable sandboxing for Docker
    chrome_options.add_argument("--ignore-certificate-errors")
    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-features=VizDisplayCompositor")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    driver = webdriver.Chrome(service=Service(), options=chrome_options)

    final_url = url
    try:
        # Open the URL with Selenium and wait for the redirect away from news.google.com
        driver.get(url)
        WebDriverWait(driver, 60).until(lambda driver: not driver.current_url.startswith("https://news.google.com"))
        final_url = driver.current_url
        if "google.com/sorry" in final_url:
            raise ValueError("Caught 429 into google")
        if "news.google" in final_url:
            raise ValueError("Couldn't parse the new url")
    finally:
        # Close the browser
        driver.quit()
    return final_url
Enjoy new updates. Thanks for the solution @iamatef
- You can install this package using pip:
pip install googlenewsdecoder
- You can upgrade this package using pip (upgrade to latest version):
pip install googlenewsdecoder --upgrade
from googlenewsdecoder import new_decoderv1


def main():
    interval_time = 5  # default interval is None, if not specified
    source_urls = [
        "https://news.google.com/read/CBMilgFBVV95cUxOM0JJaFRwV2dqRDk5dEFpWmF1cC1IVml5WmVtbHZBRXBjZHBfaUsyalRpa1I3a2lKM1ZnZUI4MHhPU2sydi1nX3JrYU0xWjhLaHNfU0N6cEhOYVE2TEptRnRoZGVTU3kzZGJNQzc2aDZqYjJOR0xleTdsemdRVnJGLTVYTEhzWGw4Z19lR3AwR0F1bXlyZ0HSAYwBQVVfeXFMTXlLRDRJUFN5WHg3ZTI0X1F4SjN6bmFIck1IaGxFVVZyOFQxdk1JT3JUbl91SEhsU0NpQzkzRFdHSEtjVGhJNzY4ZTl6eXhESUQ3XzdWVTBGOGgwSmlXaVRmU3BsQlhPVjV4VWxET3FQVzJNbm5CUDlUOHJUTExaME5YbjZCX1NqOU9Ta3U?hl=en-US&gl=US&ceid=US%3Aen",
        "https://news.google.com/read/CBMiiAFBVV95cUxQOXZLdC1hSzFqQVVLWGJVZzlPaDYyNjdWTURScV9BbVp0SWhFNzZpSWZxSzdhc0tKbVlHMU13NmZVOFdidFFkajZPTm9SRnlZMWFRZ01CVHh0dXU0TjNVMUxZNk9Ibk5DV3hrYlRiZ20zYkIzSFhMQVVpcTFPc00xQjhhcGV1aXM00gF_QVVfeXFMTmtFQXMwMlY1el9WY0VRWEh5YkxXbHF0SjFLQVByNk1xS3hpdnBuUDVxOGZCQXl1QVFXaUVpbk5lUGgwRVVVT25tZlVUVWZqQzc4cm5MSVlfYmVlclFTOUFmTHF4eTlfemhTa2JKeG14bmNabENkSmZaeHB4WnZ5dw?hl=en-US&gl=US&ceid=US%3Aen",
    ]

    for url in source_urls:
        try:
            decoded_url = new_decoderv1(url, interval=interval_time)
            if decoded_url.get("status"):
                print("Decoded URL:", decoded_url["decoded_url"])
            else:
                print("Error:", decoded_url["message"])
        except Exception as e:
            print(f"Error occurred: {e}")

    # Output: decoded_url - {'status': True, 'decoded_url': 'https://healthdatamanagement.com/articles/empowering-the-quintuple-aim-embracing-an-essential-architecture/'}


if __name__ == "__main__":
    main()
Wow... thank you.
Is there a way to use it via PHP?
How could we go about that?
@maks-outsource You just need to re-implement it with the same logic... It can work in any language.
According to an LLM:
<?php

function get_decoding_params($gn_art_id) {
    $url = "https://news.google.com/articles/$gn_art_id";
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $response = curl_exec($ch);
    if (curl_errno($ch)) {
        throw new Exception('Curl error: ' . curl_error($ch));
    }
    curl_close($ch);

    // Load the response into DOMDocument
    $dom = new DOMDocument();
    @$dom->loadHTML($response);
    $xpath = new DOMXPath($dom);
    $div = $xpath->query("//c-wiz/div")->item(0);

    return [
        "signature" => $div->getAttribute("data-n-a-sg"),
        "timestamp" => $div->getAttribute("data-n-a-ts"),
        "gn_art_id" => $gn_art_id,
    ];
}

function decode_urls($articles) {
    $articles_reqs = [];
    foreach ($articles as $art) {
        // The inner payload mirrors the Python version:
        // ["garturlreq", [params...], gn_art_id, timestamp, signature]
        $articles_reqs[] = [
            "Fbv4je",
            json_encode([
                "garturlreq",
                [
                    ["X", "X", ["X", "X"], null, null, 1, 1, "US:en", null, 1, null, null, null, null, null, 0, 1],
                    "X", "X", 1, [1, 1, 1], 1, 1, null, 0, 0, null, 0
                ],
                $art["gn_art_id"],
                (int) $art["timestamp"],
                $art["signature"]
            ])
        ];
    }

    $payload = "f.req=" . urlencode(json_encode([$articles_reqs]));
    $headers = [
        "Content-Type: application/x-www-form-urlencoded;charset=UTF-8",
    ];

    $ch = curl_init("https://news.google.com/_/DotsSplashUi/data/batchexecute");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    $response = curl_exec($ch);
    if (curl_errno($ch)) {
        throw new Exception('Curl error: ' . curl_error($ch));
    }
    curl_close($ch);

    $responseParts = explode("\n\n", $response);
    $decoded = json_decode($responseParts[1], true);

    return array_map(function($res) {
        return json_decode($res[2], true)[1];
    }, array_slice($decoded, 0, -2));
}

// Example usage
$encoded_urls = [
    "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
    "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
];

$articles_params = [];
foreach ($encoded_urls as $url) {
    $gn_art_id = basename(parse_url($url, PHP_URL_PATH));
    $articles_params[] = get_decoding_params($gn_art_id);
}

$decoded_urls = decode_urls($articles_params);
print_r($decoded_urls);
?>
I'm having a problem with the order of the URLs.
Basically, I get the URLs from a previous array of Article objects; when decoding, I need the results to be in the same order as that Article array. But when they come back from /batchexecute, the order is lost. Does anyone have a solution for this? For now I may have to skip batching and run it one by one...
FYI, my current implementation is below; tips are welcome.
def get_decoding_params(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    div = soup.select_one("c-wiz > div")
    gn_art_id = urlparse(url).path.split("/")[-1]
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }


def decode_urls(articles):
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]
    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        data=payload,
    )
    response.raise_for_status()
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]


def decode_batch_urls(urls):
    articles_params = [get_decoding_params(url) for url in urls]
    decoded_urls = decode_urls(articles_params)
    return decoded_urls


def decode_single_url(url):
    articles_params = [get_decoding_params(url)]
    decoded_urls = decode_urls(articles_params)
    return decoded_urls[0]
I didn't find any key or ID-like field that I could use to re-map the ordering, sadly.
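One way to keep the mapping deterministic, at the cost of one batchexecute request per URL (the "run one by one" option mentioned above), is to decode each URL individually with the decode_single_url helper from the snippet above and key the results by the encoded URL. A minimal sketch:

```python
def decode_preserving_order(urls):
    # One request per URL: slower, but each decoded URL maps unambiguously
    # back to the encoded Google News URL it came from.
    return {url: decode_single_url(url) for url in urls}


# Usage: decoded[encoded_google_news_url] -> original article URL
# decoded = decode_preserving_order(encoded_urls)
```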
Hello @sviatoslav-lebediev,
We used to use residential proxies at around 2.5/GB pricing, but that is now out of the window, since with the new mechanism we are using too much bandwidth.
https://www.webshare.io/features/datacenter-proxy
Can you please confirm whether datacenter proxies work well for Google? Is this the kind you are talking about?
Do they rotate the IP pool, or is it static?