-
-
Save huksley/bc3cb046157a99cd9d1517b32f91a99e to your computer and use it in GitHub Desktop.
/**
 * This magically uses batchexecute protocol. It's not documented, but it works.
 *
 * Posts the Base64 article id to the Google News `batchexecute` endpoint
 * (RPC id `Fbv4je`) and returns the URL found in the `garturlres` part of the
 * response text.
 *
 * @param id - Base64 id taken from the last path segment of a
 *   news.google.com article URL.
 * @returns The decoded article URL.
 * @throws Error when the HTTP status is not OK, or when the response does not
 *   contain the expected `garturlres` header/footer markers.
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2024 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
const fetchDecodedBatchExecute = async (id: string): Promise<string> => {
  // The RPC argument is a JSON document that is itself embedded as a string in
  // the outer JSON envelope. JSON.stringify yields the same bytes as the
  // hand-escaped template while also escaping `id` correctly.
  const rpcArgument = JSON.stringify([
    "garturlreq",
    [
      [
        "en-US",
        "US",
        ["FINANCE_TOP_INDICES", "WEB_TEST_1_0_0"],
        null,
        null,
        1,
        1,
        "US:en",
        null,
        180,
        null,
        null,
        null,
        null,
        null,
        0,
        null,
        null,
        [1608992183, 723341000]
      ],
      "en-US",
      "US",
      1,
      [2, 3, 4, 8],
      1,
      0,
      "655000234",
      0,
      0,
      null,
      0
    ],
    id
  ]);
  const s = JSON.stringify([[["Fbv4je", rpcArgument, null, "generic"]]]);

  const response = await fetch(
    "https://news.google.com/_/DotsSplashUi/data/batchexecute?" + "rpcids=Fbv4je",
    {
      headers: {
        "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
        // The HTTP header name is spelled "Referer" (single "r").
        Referer: "https://news.google.com/"
      },
      body: "f.req=" + encodeURIComponent(s),
      method: "POST"
    }
  );
  if (!response.ok) {
    throw new Error("batchexecute request failed with HTTP status " + response.status);
  }

  const text = await response.text();
  const header = '[\\"garturlres\\",\\"';
  const footer = '\\",';
  const headerAt = text.indexOf(header);
  if (headerAt === -1) {
    throw new Error("header not found: " + text);
  }
  const start = headerAt + header.length;
  const end = text.indexOf(footer, start);
  if (end === -1) {
    throw new Error("footer not found");
  }

  // The URL sits inside a JSON string nested in another JSON string, so
  // characters such as "=" arrive as escaped \u003d sequences; turn them back
  // into the characters they stand for.
  return text
    .substring(start, end)
    .replace(/\\+u([0-9a-fA-F]{4})/g, (_match, hex: string) =>
      String.fromCharCode(parseInt(hex, 16))
    );
};
/**
 * Google News started generating encoded, internal URLs for RSS items
 * https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en
 *
 * This script decodes such URLs into the original ones, for example the URL
 * https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5
 *
 * contains this
 * https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
 *
 * In the path after articles/ goes Base64 (URL-safe alphabet) encoded binary data
 *
 * Format is the following:
 * <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>]
 *
 * <prefix> - 0x08, 0x13, 0x22
 * <suffix> - 0xd2, 0x01, 0x00 (sometimes missing??)
 * <len bytes> - formatted as 0x40 or 0x81 0x01 sometimes; read here as a
 *               little-endian base-128 varint (7 payload bits per byte, high
 *               bit set on every byte except the last), so URLs longer than
 *               255 bytes are handled as well
 *
 *
 * https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5
 * https://news.google.com/rss/articles/CBMidkFVX3lxTFB1QmFsSi1Zc3dLQkpNLThKTXExWXBGWlE0eERJQ2hLRENIOFJzRTlsRnM1NS1Hc2FlbjdIMlZ3eWNQa0JqeVYzZGs1Y0hKaUtTUko2dmJabUtVMWZob0lNSFNCa3NLQ05ROGh4cVZfVTYyUDVxc2c?oc=5
 * https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5
 *
 * @param sourceUrl - Absolute URL; anything that is not a news.google.com
 *   ".../articles/<id>" URL is returned unchanged.
 * @returns The decoded article URL. New-style ids (payload starting with
 *   "AU_yqL") are resolved through fetchDecodedBatchExecute. When the id holds
 *   no decodable payload, sourceUrl itself is returned.
 * @throws TypeError when sourceUrl is not a valid absolute URL (from `new URL`).
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2022 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
export const decodeGoogleNewsUrl = async (sourceUrl: string) => {
  const url = new URL(sourceUrl);
  const path = url.pathname.split("/");
  const base64 = path[path.length - 1];
  if (
    url.hostname !== "news.google.com" ||
    path.length < 2 ||
    path[path.length - 2] !== "articles" ||
    !base64
  ) {
    return sourceUrl;
  }

  // Buffer's "base64" decoder also accepts the URL-safe alphabet ("-", "_")
  // and missing padding, both of which occur in these ids.
  let bytes: Buffer = Buffer.from(base64, "base64");

  const prefix = Buffer.from([0x08, 0x13, 0x22]);
  if (bytes.subarray(0, prefix.length).equals(prefix)) {
    bytes = bytes.subarray(prefix.length);
  }
  const suffix = Buffer.from([0xd2, 0x01, 0x00]);
  if (
    bytes.length >= suffix.length &&
    bytes.subarray(bytes.length - suffix.length).equals(suffix)
  ) {
    bytes = bytes.subarray(0, bytes.length - suffix.length);
  }

  // Read the varint length prefix: low 7 bits of each byte, least significant
  // group first, stopping at the first byte without the continuation bit.
  let length = 0;
  let offset = 0;
  for (let shift = 0; offset < bytes.length && shift < 35; shift += 7) {
    const byte = bytes.readUInt8(offset++);
    length += (byte & 0x7f) * 2 ** shift;
    if (byte < 0x80) {
      break;
    }
  }

  const decoded = bytes.subarray(offset, offset + length).toString("utf8");
  if (decoded === "") {
    // Nothing usable inside the id; hand the caller back what they gave us.
    return sourceUrl;
  }
  if (decoded.startsWith("AU_yqL")) {
    // New style encoding, introduced in July 2024. Not yet known how to decode offline.
    return fetchDecodedBatchExecute(base64);
  }
  return decoded;
};
@huksley After more than 100 requests, we get banned.
Please find a way to solve this, bro...
Hi @huksley ! Firstly I would like to thank you for the solution.
And @Glyphosate69 for the version in Python.
I get the following output with decode_google_news_url
:
https://www.rondoniadinamica.com/noticias/2024/07/deputado-alan-queiroz-destina-emenda-parlamentar-para-microrevestimento-asfaltico-em-alto-alegre-dos-parecis,195210.shtm
@Glyphosate69, thanks, the converted decoder works well.
@huksley @Glyphosate69 It works! Thank you!
I could use some help porting this solution to Kotlin. I'm getting a 400; I think it's related to the body structure. Can anyone see why these are wrong? I've tried these two options.
RESPONSE: 400
METHOD: HttpMethod(value=POST)
FROM: https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je
COMMON HEADERS
-> accept-ch: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-WoW64, Sec-CH-UA-Form-Factors, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version
-> alt-svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
-> cache-control: no-cache, no-store, max-age=0, must-revalidate
-> content-security-policy: require-trusted-types-for 'script';report-uri /_/DotsSplashUi/cspreport
-> content-type: application/json; charset=utf-8
-> cross-origin-opener-policy: same-origin-allow-popups
-> date: Thu, 25 Jul 2024 16:58:10 GMT
-> expires: Mon, 01 Jan 1990 00:00:00 GMT
-> p3p: CP="This is not a P3P policy! See g.co/p3phelp for more info."
-> permissions-policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-wow64=*, ch-ua-form-factors=*, ch-ua-platform=*, ch-ua-platform-version=*
-> pragma: no-cache
-> server: ESF
-> set-cookie: NID=516=leQJMqD4H9VakZVMCpCU0NFlrhGJNGECMSpfUVSUj5kqdfyHHaluQNey8BloNXewA3GnthVyvQqnrXTqNhQjHZbFq3_Zhwy-_Qq-rPu2MRa6PhwzyAKr02aZNM55NYryyoWhdz10wXPmKSr81eBrpXm-_pNgdCCijlTWlbG0vjk; expires=Fri, 24-Jan-2025 16:58:10 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=none; GN_PREF=W251bGwsIkNBSVNEQWlpaUlxMUJoQ3ctSTJwQXciXQ__; Expires=Fri, 24-Jan-2025 04:58:10 GMT; Path=/; Secure
-> strict-transport-security: max-age=31536000
-> vary: Sec-Fetch-Dest, Sec-Fetch-Mode, Sec-Fetch-Site
-> x-content-type-options: nosniff
-> x-frame-options: SAMEORIGIN
-> x-xss-protection: 0
BODY Content-Type: application/json; charset=utf-8
BODY START
)]}'
[["er",null,null,null,null,400,null,null,null,3],["di",27],["af.httprm",27,"8080489730126530367",45]]
BODY END
REQUEST: https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je
METHOD: HttpMethod(value=POST)
COMMON HEADERS
-> Accept: */*
-> Accept-Charset: UTF-8
-> User-Agent: Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.3
CONTENT HEADERS
-> Content-Length: 690
-> Content-Type: text/plain; charset=UTF-8
BODY Content-Type: text/plain; charset=UTF-8
BODY START
{"f.req":[[["Fbv4je","["garturlreq",[["en-US","US",["FINANCE_TOP_INDICES","WEB_TEST_1_0_0"],null,null,1,1,"US:en",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],"en-US","US",1,[2,3,4,8],1,0,"655000234",0,0,null,0],"CBMimgFBVV95cUxPNXItT09qVkV0QjQ1STFPcTVqYWpQOHFSV0s0ZFdVN3ZGOHplTzdOUDFRb0RWSVZJTVNBazZVcjNIb1VPLTRVNGgxbXA0YUhUV2lMX2dBSEJnU0pIcjFUVmY5Znc3OU9BZVVSMVJ1VTBBMkdmTnVQRkpSb203UDZLVnhaWGRfY2xRWE5FNWY3VEdobUNNYmRSS2dn0gGfAUFVX3lxTFBaa3otTDNkUW9yMjNnODFrN3dHUS1kdXJhalZfMG1RUTVsc20xVWdla2dXY1JxbzJwNlU2N1dpS1luSndmV05ibkZnUW5UbkxlNl9FY3JiRXVSX2NzUi1PcHAyb0RPMkRzRDVQbzR4Q0VRM2dmazhzU2J6VWRjcVM2NmNjZTktUnFVaHhsa2ZQaHdlZDY0VUtuRnhmS1h5aw"]",null,"generic"]]]}
BODY END
REQUEST: https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je
METHOD: HttpMethod(value=POST)
COMMON HEADERS
-> Accept: */*
-> Accept-Charset: UTF-8
-> User-Agent: Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.3
CONTENT HEADERS
-> Content-Length: 554
-> Content-Type: text/plain; charset=UTF-8
BODY Content-Type: text/plain; charset=UTF-8
BODY START
f.req=[[["Fbv4je","["garturlreq",[["en-US","US",["FINANCE_TOP_INDICES","WEB_TEST_1_0_0"],null,null,1,1,"US:en",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],"en-US","US",1,[2,3,4,8],1,0,"655000234",0,0,null,0],"CBMi2gFBVV95cUxNa1BKamdzMDZXRVd3cmZGV1daV2tzdzhINTRmeXg2UDJNc3p5NXZYSDc1TEJIbXlXOUt5N3dYdGVxNU8zbG9vUkdpMFI5aGNhcVZGMzV3cTZLZ0VQM1dVRGt5clBvdk9Bb2I1eW9SM0xwNkhmQ3o2ZTloZm5LeXdFNlhBWms4cnJvT0VhYXltU0NER1lkb21lbXRTemwzUVdhM3dSYVJlejNLM2tXd2xLeTBhOXFyaHZOTFlkZUx2c3ZHWkxPM041a3pzZS0tMzRWSVZwQ0lDQUh2UQ"]",null,"generic"]]]
BODY END
PHP Using Multi Curl
<?php
/**
- This magically uses batchexecute protocol. It's not documented, but it works.
- Licensed under: MIT License
- Copyright (c) 2024 Ruslan Gainutdinov
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
*/
/**
 * Resolves several Google News article ids in parallel through the
 * batchexecute endpoint using curl_multi.
 *
 * @param array $ids Base64 article ids (array values are used; keys are ignored).
 * @return array Map of id => decoded URL, or null for an id whose response did
 *               not contain the expected "garturlres" payload.
 */
function fetchDecodedBatchExecute($ids) {
    $multi = curl_multi_init();
    $channels = [];
    foreach ($ids as $id) {
        $s = '[[["Fbv4je","[\\"garturlreq\\",[[\\"en-US\\",\\"US\\",[\\"FINANCE_TOP_INDICES\\",\\"WEB_TEST_1_0_0\\"],null,null,1,1,\\"US:en\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],\\"en-US\\",\\"US\\",1,[2,3,4,8],1,0,\\"655000234\\",0,0,null,0],\\"' . $id . '\\"]",null,"generic"]]]';
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, "https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, "f.req=" . urlencode($s));
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "Content-Type: application/x-www-form-urlencoded;charset=utf-8",
            "Referer: https://news.google.com/"
        ]);
        curl_multi_add_handle($multi, $ch);
        $channels[$id] = $ch;
    }

    // Drive all transfers. curl_multi_select() waits for socket activity so the
    // loop does not spin at 100% CPU while the requests are in flight.
    $running = null;
    do {
        $status = curl_multi_exec($multi, $running);
        if ($running && curl_multi_select($multi, 1.0) === -1) {
            // select() could not wait on anything; back off briefly instead of spinning.
            usleep(1000);
        }
    } while ($running && $status === CURLM_OK);

    $header = '[\\"garturlres\\",\\"';
    $footer = '\\",';
    $results = [];
    foreach ($channels as $id => $ch) {
        $response = (string) curl_multi_getcontent($ch);
        // Detach the handle before inspecting the body so the failure paths
        // below cannot skip the cleanup.
        curl_multi_remove_handle($multi, $ch);

        $results[$id] = null;
        $headerAt = strpos($response, $header);
        if ($headerAt === false) {
            continue;
        }
        $start = $headerAt + strlen($header);
        $end = strpos($response, $footer, $start);
        if ($end === false) {
            continue;
        }
        $results[$id] = substr($response, $start, $end - $start);
    }
    curl_multi_close($multi);
    return $results;
}
/**
- Google News started generate encoded, internal URLs for RSS items
- https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en
- This script decodes URLs into original one, for example URL
- https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5
- contains this
- https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
- In path after articles/ goes Base64 encoded binary data
- Format is the following:
- <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>]
- <prefix> - 0x08, 0x13, 0x22
- <suffix> - 0xd2, 0x01, 0x00 (sometimes missing??)
- <len bytes> - formatted as 0x40 or 0x81 0x01 sometimes
- Licensed under: MIT License
- Copyright (c) 2022 Ruslan Gainutdinov
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
*/
/**
 * Decodes one or many Google News article URLs into the original article URLs.
 *
 * Old-style ids are decoded locally; ids whose payload starts with "AU_yqL"
 * are collected and resolved together through fetchDecodedBatchExecute().
 * URLs that are not news.google.com ".../articles/<id>" URLs, and URLs whose
 * id cannot be resolved, are returned unchanged.
 *
 * @param string|array $sourceUrls A URL string or an array of URL strings.
 * @return string|array The decoded URL when exactly one URL was supplied,
 *                      otherwise a map of source URL => decoded URL.
 * @throws InvalidArgumentException When the input is neither a string nor an array.
 */
function decodeGoogleNewsUrls($sourceUrls) {
    // Convert single URL to array if a string is provided
    if (is_string($sourceUrls)) {
        $sourceUrls = [$sourceUrls];
    }
    // Check if $sourceUrls is now an array
    if (!is_array($sourceUrls)) {
        throw new InvalidArgumentException("Input must be a URL string or an array of URL strings");
    }

    $decodedUrls = [];
    // Base64 id => list of source URLs that carry that id.
    $batchExecuteIds = [];
    foreach ($sourceUrls as $sourceUrl) {
        $url = parse_url($sourceUrl);
        // parse_url() returns false for malformed input and omits absent components.
        $host = is_array($url) ? ($url['host'] ?? '') : '';
        $path = explode('/', is_array($url) ? ($url['path'] ?? '') : '');
        $segments = count($path);
        if ($host !== 'news.google.com' || $segments < 2 || $path[$segments - 2] !== 'articles') {
            $decodedUrls[$sourceUrl] = $sourceUrl;
            continue;
        }

        $base64 = $path[$segments - 1];
        // Ids use the URL-safe Base64 alphabet; base64_decode() expects the standard one.
        $str = (string) base64_decode(strtr($base64, '-_', '+/'));
        $prefix = "\x08\x13\x22";
        if (strpos($str, $prefix) === 0) {
            $str = (string) substr($str, strlen($prefix));
        }
        $suffix = "\xd2\x01\x00";
        if (substr($str, -strlen($suffix)) === $suffix) {
            $str = (string) substr($str, 0, -strlen($suffix));
        }

        // Length prefix is a base-128 varint: 7 bits per byte, least significant
        // group first, high bit set on every byte except the last.
        $len = 0;
        $offset = 0;
        $total = strlen($str);
        for ($shift = 0; $offset < $total && $shift < 35; $shift += 7) {
            $byte = ord($str[$offset++]);
            $len |= ($byte & 0x7f) << $shift;
            if ($byte < 0x80) {
                break;
            }
        }
        $str = (string) substr($str, $offset, $len);

        if ($str === '') {
            // Nothing decodable in the id; keep the original URL.
            $decodedUrls[$sourceUrl] = $sourceUrl;
        } elseif (strpos($str, 'AU_yqL') === 0) {
            $batchExecuteIds[$base64][] = $sourceUrl;
        } else {
            $decodedUrls[$sourceUrl] = $str;
        }
    }

    if (!empty($batchExecuteIds)) {
        // fetchDecodedBatchExecute() keys its results by id, so map each id
        // back to the source URL(s) it came from.
        $batchResults = fetchDecodedBatchExecute(array_map('strval', array_keys($batchExecuteIds)));
        foreach ($batchExecuteIds as $base64 => $urlsForId) {
            $result = $batchResults[$base64] ?? null;
            foreach ($urlsForId as $sourceUrl) {
                $decodedUrls[$sourceUrl] = $result ?? $sourceUrl;
            }
        }
    }

    // If input was a single URL, return a single result
    if (count($sourceUrls) === 1) {
        return reset($decodedUrls);
    }
    return $decodedUrls;
}
// Example usage.

// 1) Decode one URL passed as a plain string.
$exampleUrl = 'https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5';
$exampleResult = decodeGoogleNewsUrls($exampleUrl);
echo "Decoded single URL: ", $exampleResult, "\n";

// 2) Decode a list of URLs in one call; the result is keyed by source URL.
$exampleList = [
    'https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5',
    'https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5',
    'https://example.com/regular-url',
];
$exampleResults = decodeGoogleNewsUrls($exampleList);
print_r($exampleResults);
/**
- Licensed under: MIT License
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
*/
// Enjoy ! (and wrap a class around it)
/**
 * Decodes a Google News article URL into the original article URL.
 *
 * The id is the last path segment and must follow an "articles" segment. It
 * is URL-safe Base64 for: prefix 0x08 0x13 0x22, a varint length, then the
 * URL bytes. Payloads starting with "AU_yqL" are resolved through
 * fetchDecodedBatchExecute. Any other URL is returned unchanged.
 *
 * @param url absolute URL to decode
 * @return the decoded URL, or {@code url} itself when it is not a Google News
 *         article URL
 * @throws Exception if {@code url} is malformed or the id is not valid Base64
 */
private static String decodeUrl(String url) throws Exception {
    URL surl = new URL(url);
    String[] paths = surl.getPath().split("/");
    // Locate the id relative to the end of the path so both
    // /rss/articles/<id> and /__i/rss/rd/articles/<id> are recognised.
    if (!Objects.equals(surl.getHost(), "news.google.com")
            || paths.length < 2
            || !"articles".equals(paths[paths.length - 2])) {
        return url;
    }

    String id = paths[paths.length - 1];
    byte[] decodedBytes = Base64.getUrlDecoder().decode(id);

    // Work on the raw bytes: converting to a String first would depend on the
    // platform charset and corrupt bytes above 0x7f.
    int pos = 0;
    if (decodedBytes.length >= 3
            && decodedBytes[0] == 0x08 && decodedBytes[1] == 0x13 && decodedBytes[2] == 0x22) {
        pos = 3;
    }

    // Varint length: 7 bits per byte, least significant group first; the high
    // bit marks "more bytes follow". Capped at 4 bytes so the int cannot overflow.
    int length = 0;
    for (int shift = 0; pos < decodedBytes.length && shift < 28; shift += 7) {
        int b = decodedBytes[pos++] & 0xff;
        length |= (b & 0x7f) << shift;
        if (b < 0x80) {
            break;
        }
    }

    int end = Math.min(decodedBytes.length, pos + length);
    String decoded = new String(decodedBytes, pos, Math.max(0, end - pos),
            java.nio.charset.StandardCharsets.UTF_8);

    if (decoded.startsWith("AU_yqL")) {
        // New style encoding: cannot be decoded offline, ask Google.
        return fetchDecodedBatchExecute(id);
    }
    return decoded;
}
/**
 * Resolves a new-style Google News article id through the batchexecute
 * endpoint.
 *
 * @param id Base64 article id (last path segment of the article URL)
 * @return the decoded URL, or {@code null} when the request or the response
 *         parsing fails (the failure is logged)
 */
private static String fetchDecodedBatchExecute(String id) {
    String s = "[[[\"Fbv4je\",\"[\\\"garturlreq\\\",[[\\\"en-US\\\",\\\"US\\\",[\\\"FINANCE_TOP_INDICES\\\",\\\"WEB_TEST_1_0_0\\\"],null,null,1,1,\\\"US:en\\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],\\\"en-US\\\",\\\"US\\\",1,[2,3,4,8],1,0,\\\"655000234\\\",0,0,null,0],\\\"" + id + "\\\"]\",null,\"generic\"]]]";
    HttpURLConnection conn = null;
    try {
        URL url = new URL("https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je");
        conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
        conn.setRequestProperty("Referer", "https://news.google.com/");
        conn.setDoOutput(true);
        // The body is declared as form-urlencoded, so the value must be percent-encoded.
        String data = "f.req=" + java.net.URLEncoder.encode(s, "UTF-8");
        try (OutputStream os = conn.getOutputStream()) {
            byte[] input = data.getBytes("utf-8");
            os.write(input, 0, input.length);
        }
        int responseCode = conn.getResponseCode();
        if (responseCode != 200) {
            throw new IOException("Failed to fetch data from Google. Response code: " + responseCode);
        }
        String response;
        try (Scanner scanner = new Scanner(conn.getInputStream(), "UTF-8")) {
            // "\\A" matches the start of input, so a single token is the whole body.
            scanner.useDelimiter("\\A");
            response = scanner.hasNext() ? scanner.next() : "";
        }
        String header = "[\\\"garturlres\\\",\\\"";
        String footer = "\\\",";
        int headerAt = response.indexOf(header);
        if (headerAt < 0) {
            throw new IOException("header not found");
        }
        int start = headerAt + header.length();
        int end = response.indexOf(footer, start);
        if (end < 0) {
            throw new IOException("footer not found");
        }
        return response.substring(start, end);
    }
    catch (Exception e) {
        log.error("fetchDecodedBatchExecute: {}", e.getMessage());
        return null;
    }
    finally {
        if (conn != null) {
            conn.disconnect();
        }
    }
}
@caiolivf fixed
@nandhadeva if you get errors after a certain number of requests, you need to limit the frequency of fetch. You can find many solutions online.
// New style encoding, introduced in July 2024. Not yet known how to decode offline.
// if (str.startsWith("AU_yqL")) {
/**
 * Converts URL-safe Base64 text into standard Base64: restores the "="
 * padding that was removed from the end and maps "-" to "+" and "_" to "/".
 *
 * @param base64 - URL-safe Base64 text, with or without padding.
 * @returns The same data in the standard Base64 alphabet, padded to a
 *   multiple of four characters.
 */
function decode(base64: string) {
  // Add removed at end '='; nothing is added when the length is already a
  // multiple of four.
  const padding = (4 - (base64.length % 4)) % 4;
  return (base64 + "=".repeat(padding))
    .replace(/-/g, "+") // Convert '-' to '+'
    .replace(/_/g, "/"); // Convert '_' to '/'
}
// Print the standard-Base64 form of a sample new-style payload.
console.log(decode("AU_yqLOmYvNelzH6X4XJXfMLHdIeqX7gOM53gxEnFFaiz07MbXLMh_PMHavykLxbL5ULxWEG93ufMQUDB1ih_FrczLDqfUljv1W8QYvJd0zs2fpKQNoXRF1Jf-v4bylz-QGbiTBdVUO0Ue-5v6dN2pekzd0A0cNypBdsJBE1nuSTuuJOG01Lag82y3-JVTFgyebjjf2mPVvunJxxIgKeG4oyJMt1ZIRejLpmvxlqofSDyfgCE8EXA1ApdQ"));
// => valid base64 AU/yqLOmYvNelzH6X4XJXfMLHdIeqX7gOM53gxEnFFaiz07MbXLMh/PMHavykLxbL5ULxWEG93ufMQUDB1ih/FrczLDqfUljv1W8QYvJd0zs2fpKQNoXRF1Jf+v4bylz+QGbiTBdVUO0Ue+5v6dN2pekzd0A0cNypBdsJBE1nuSTuuJOG01Lag82y3+JVTFgyebjjf2mPVvunJxxIgKeG4oyJMt1ZIRejLpmvxlqofSDyfgCE8EXA1ApdQ==
// which is https://www.dailymail.co.uk/news/article-2789981/chinese-restaurant-syndrome-old-wives-tale-heston-blumenthal-says-msg-really-important-element-taste-not-bad-you.html
It seems to be valid Base64 after some demangling, and is probably protobuf-encoded or compressed underneath, but right now I can't find any algorithm that decodes that data properly.
Warning: file_get_contents(https://news.google.com/rss/search?q=COVID+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8): Failed to open stream: HTTP request failed! HTTP/1.1 503 Service Unavailable in /home/vonwallace/public_html/news/newsget.php on line 208
Could not get results for: https://news.google.com/rss/search?q=COVID+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8
curl -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" -L "https://news.google.com/rss/search?q=Cybersecurity+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8"
<title>Sorry...</title><style> body { font-family: verdana, arial, sans-serif; background-color: #fff; color: #000; }</style>Sorry... |
We're sorry...
... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.
I had to take my news feeds offline because of this. If anyone knows a fix, let me know.
Hey everyone check out my website for IT/cyber security experts with a touch of faith.
https://vonwallace.com
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
Yes but now I am blocked I wonder how long that will remain in effect
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
i confirm, it is working
This script made too many requests to google got blocked, anyone know how long that will remain in effect
This script made too many requests to google got blocked, anyone know how long that will remain in effect
Same situation here.
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
I confirm. hope it lasts
Hi,
I tried the code below, which was pasted above for the new encoding format, but it is not working for news links that redirect to YouTube. Can anyone please help me?
import requests
import base64
def fetch_decoded_batch_execute(id):
    """Resolve a new-style Google News article id via the batchexecute endpoint.

    Args:
        id: Base64 article id (last path segment of the article URL).

    Returns:
        The decoded article URL.

    Raises:
        Exception: If the HTTP status is not 200 or the response does not
            contain the expected ``garturlres`` header/footer markers.
    """
    import json
    import re

    # The RPC argument is a JSON document embedded as a *string* inside the
    # outer JSON envelope; json.dumps applies the required escaping of the
    # inner quotes.
    rpc_argument = json.dumps(
        [
            "garturlreq",
            [
                [
                    "en-US", "US", ["FINANCE_TOP_INDICES", "WEB_TEST_1_0_0"],
                    None, None, 1, 1, "US:en", None, 180,
                    None, None, None, None, None, 0, None, None,
                    [1608992183, 723341000],
                ],
                "en-US", "US", 1, [2, 3, 4, 8], 1, 0, "655000234", 0, 0, None, 0,
            ],
            id,
        ],
        separators=(",", ":"),
    )
    s = json.dumps([[["Fbv4je", rpc_argument, None, "generic"]]], separators=(",", ":"))

    headers = {
        "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
        "Referer": "https://news.google.com/"
    }
    response = requests.post(
        "https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je",
        headers=headers,
        data={"f.req": s}
    )
    if response.status_code != 200:
        raise Exception("Failed to fetch data from Google.")

    text = response.text
    header = '[\\"garturlres\\",\\"'
    footer = '\\",'
    if header not in text:
        raise Exception(f"Header not found in response: {text}")
    start = text.split(header, 1)[1]
    if footer not in start:
        raise Exception("Footer not found in response.")
    url = start.split(footer, 1)[0]
    # The URL is nested inside two levels of JSON strings, so characters such
    # as "=" arrive as escaped \u003d sequences; turn them back into characters.
    return re.sub(r"\\+u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), url)
def decode_google_news_url(source_url):
    """Decode a Google News article URL into the original article URL.

    The id after ``articles/`` is URL-safe Base64 for: prefix 08 13 22, a
    varint length, then the URL bytes (optionally followed by d2 01 00).
    Payloads starting with ``AU_yqL`` are resolved through
    ``fetch_decoded_batch_execute``.

    Args:
        source_url: URL to decode.

    Returns:
        The decoded URL, or ``source_url`` unchanged when it is not a
        news.google.com article URL or its id holds no payload.
    """
    from urllib.parse import urlparse

    url = urlparse(source_url)
    path = url.path.split("/")
    if url.hostname != "news.google.com" or len(path) < 2 or path[-2] != "articles":
        return source_url

    base64_str = path[-1]
    # Restore exactly the padding that was stripped from the id.
    decoded = base64.urlsafe_b64decode(base64_str + "=" * (-len(base64_str) % 4))

    prefix = b"\x08\x13\x22"
    if decoded.startswith(prefix):
        decoded = decoded[len(prefix):]
    suffix = b"\xd2\x01\x00"
    if decoded.endswith(suffix):
        decoded = decoded[:-len(suffix)]

    # Length prefix is a base-128 varint: 7 bits per byte, least significant
    # group first, high bit set on every byte except the last.
    length = 0
    offset = 0
    for shift in range(0, 35, 7):
        if offset >= len(decoded):
            break
        byte = decoded[offset]
        offset += 1
        length |= (byte & 0x7F) << shift
        if byte < 0x80:
            break

    decoded_str = decoded[offset:offset + length].decode("utf-8", errors="replace")
    if not decoded_str:
        return source_url
    if decoded_str.startswith("AU_yqL"):
        return fetch_decoded_batch_execute(base64_str)
    return decoded_str
# Example usage
if __name__ == "__main__":
    source_url = 'https://news.google.com/rss/articles/CBMiVkFVX3lxTE1KbVBoUnRqcVpXbk9YSnJoM3BuTFJTQ3NlSDliN0hzYk11Z29TWF9sMmY3eG8tdTNfMExCQ0tMcFlVTTdLcDJibjhMY1ZUXzNmWlVQd3JR?oc=5'
    decoded_url = decode_google_news_url(source_url)
    print(decoded_url)
import requests
import base64
def fetch_decoded_batch_execute(id):
    """Resolve a new-style Google News article id via the batchexecute endpoint.

    Args:
        id: Base64 article id (last path segment of the article URL).

    Returns:
        The decoded article URL.

    Raises:
        Exception: If the HTTP status is not 200 or the response does not
            contain the expected ``garturlres`` header/footer markers.
    """
    import json
    import re

    # The RPC argument is a JSON document embedded as a *string* inside the
    # outer JSON envelope; json.dumps applies the required escaping of the
    # inner quotes.
    rpc_argument = json.dumps(
        [
            "garturlreq",
            [
                [
                    "en-US", "US", ["FINANCE_TOP_INDICES", "WEB_TEST_1_0_0"],
                    None, None, 1, 1, "US:en", None, 180,
                    None, None, None, None, None, 0, None, None,
                    [1608992183, 723341000],
                ],
                "en-US", "US", 1, [2, 3, 4, 8], 1, 0, "655000234", 0, 0, None, 0,
            ],
            id,
        ],
        separators=(",", ":"),
    )
    s = json.dumps([[["Fbv4je", rpc_argument, None, "generic"]]], separators=(",", ":"))

    headers = {
        "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
        "Referer": "https://news.google.com/"
    }
    response = requests.post(
        "https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je",
        headers=headers,
        data={"f.req": s}
    )
    if response.status_code != 200:
        raise Exception("Failed to fetch data from Google.")

    text = response.text
    header = '[\\"garturlres\\",\\"'
    footer = '\\",'
    if header not in text:
        raise Exception(f"Header not found in response: {text}")
    start = text.split(header, 1)[1]
    if footer not in start:
        raise Exception("Footer not found in response.")
    url = start.split(footer, 1)[0]
    # The URL is nested inside two levels of JSON strings, so characters such
    # as "=" arrive as escaped \u003d sequences; turn them back into characters.
    return re.sub(r"\\+u([0-9a-fA-F]{4})", lambda m: chr(int(m.group(1), 16)), url)
def decode_google_news_url(source_url):
    """Decode a Google News article URL into the original article URL.

    The id after ``articles/`` is URL-safe Base64 for: prefix 08 13 22, a
    varint length, then the URL bytes (optionally followed by d2 01 00).
    Payloads starting with ``AU_yqL`` are resolved through
    ``fetch_decoded_batch_execute``.

    Args:
        source_url: URL to decode.

    Returns:
        The decoded URL, or ``source_url`` unchanged when it is not a
        news.google.com article URL or its id holds no payload.
    """
    from urllib.parse import urlparse

    url = urlparse(source_url)
    path = url.path.split("/")
    if url.hostname != "news.google.com" or len(path) < 2 or path[-2] != "articles":
        return source_url

    base64_str = path[-1]
    # Restore exactly the padding that was stripped from the id.
    decoded = base64.urlsafe_b64decode(base64_str + "=" * (-len(base64_str) % 4))

    prefix = b"\x08\x13\x22"
    if decoded.startswith(prefix):
        decoded = decoded[len(prefix):]
    suffix = b"\xd2\x01\x00"
    if decoded.endswith(suffix):
        decoded = decoded[:-len(suffix)]

    # Length prefix is a base-128 varint: 7 bits per byte, least significant
    # group first, high bit set on every byte except the last.
    length = 0
    offset = 0
    for shift in range(0, 35, 7):
        if offset >= len(decoded):
            break
        byte = decoded[offset]
        offset += 1
        length |= (byte & 0x7F) << shift
        if byte < 0x80:
            break

    decoded_str = decoded[offset:offset + length].decode("utf-8", errors="replace")
    if not decoded_str:
        return source_url
    if decoded_str.startswith("AU_yqL"):
        return fetch_decoded_batch_execute(base64_str)
    return decoded_str
# Example usage
if __name__ == "__main__":
    source_url = 'https://news.google.com/rss/articles/CBMiVkFVX3lxTE4zaGU2bTY2ZGkzdTRkSkJ0cFpsTGlDUjkxU2FBRURaTWU0c3QzVWZ1MHZZNkZ5Vzk1ZVBnTDFHY2R6ZmdCUkpUTUJsS1pqQTlCRzlzbHV3?oc=5'
    decoded_url = decode_google_news_url(source_url)
    print(decoded_url)
Not able to decode youtube links HELP ME PLEASE
@huksley
I am not able to decode links redirecting to youtube
@Stevespear426 / @eternityready2 / @stuartskelton / @dylanpyle / @Stevespear426
Please help me I tried given python code but not working :(
@VishruthBharadwaj With my version of this I get back https://www.youtube.com/watch?v\\u003d-r2aoxoUsPk\\
. so it seems to work for me. There is a QPS limit, and you are usually locked out for about 30 minutes.
Can you please paste your latest code which you're using.
@stuartskelton
Can you please paste your latest code which you're using.
@VishruthBharadwaj my code is a Perl port of the code above. @Glyphosate69 Python version works unedited. it returns https://www.youtube.com/watch?v\\u003d-r2aoxoUsPk
where \\u003d
is =
so the final URL is https://www.youtube.com/watch?v=-r2aoxoUsPk
Thanks a lot @stuartskelton
did they change it back?
What is QPS? Which server? @ebinezerp