-
-
Save huksley/bc3cb046157a99cd9d1517b32f91a99e to your computer and use it in GitHub Desktop.
/**
 * This magically uses the batchexecute protocol. It's not documented, but it works.
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2024 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
const fetchDecodedBatchExecute = (id: string) => { | |
const s = | |
'[[["Fbv4je","[\\"garturlreq\\",[[\\"en-US\\",\\"US\\",[\\"FINANCE_TOP_INDICES\\",\\"WEB_TEST_1_0_0\\"],null,null,1,1,\\"US:en\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],\\"en-US\\",\\"US\\",1,[2,3,4,8],1,0,\\"655000234\\",0,0,null,0],\\"' + | |
id + | |
'\\"]",null,"generic"]]]'; | |
return fetch("https://news.google.com/_/DotsSplashUi/data/batchexecute?" + "rpcids=Fbv4je", { | |
headers: { | |
"Content-Type": "application/x-www-form-urlencoded;charset=utf-8", | |
Referrer: "https://news.google.com/" | |
}, | |
body: "f.req=" + encodeURIComponent(s), | |
method: "POST" | |
}) | |
.then(e => e.text()) | |
.then(s => { | |
const header = '[\\"garturlres\\",\\"'; | |
const footer = '\\",'; | |
if (!s.includes(header)) { | |
throw new Error("header not found: " + s); | |
} | |
const start = s.substring(s.indexOf(header) + header.length); | |
if (!start.includes(footer)) { | |
throw new Error("footer not found"); | |
} | |
const url = start.substring(0, start.indexOf(footer)); | |
return url; | |
}); | |
}; | |
/**
 * Google News started generating encoded, internal URLs for RSS items
 * https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en
 *
 * This script decodes those URLs back into the original one; for example the URL
 * https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5
 *
 * contains this:
 * https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
 *
 * In the path after articles/ goes Base64-encoded binary data.
 *
 * The format is the following:
 * <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>]
 *
 * <prefix> - 0x08, 0x13, 0x22
 * <suffix> - 0xd2, 0x01, 0x00 (sometimes missing??)
 * <len bytes> - formatted as 0x40, or sometimes as two bytes, e.g. 0x81 0x01
 *
 *
 * https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5
 * https://news.google.com/rss/articles/CBMidkFVX3lxTFB1QmFsSi1Zc3dLQkpNLThKTXExWXBGWlE0eERJQ2hLRENIOFJzRTlsRnM1NS1Hc2FlbjdIMlZ3eWNQa0JqeVYzZGs1Y0hKaUtTUko2dmJabUtVMWZob0lNSFNCa3NLQ05ROGh4cVZfVTYyUDVxc2c?oc=5
 * https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5
 *
 * FIXME: What will happen if the URL is more than 255 bytes??
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2022 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
export const decodeGoogleNewsUrl = async (sourceUrl: string) => { | |
const url = new URL(sourceUrl); | |
const path = url.pathname.split("/"); | |
if ( | |
url.hostname === "news.google.com" && | |
path.length > 1 && | |
path[path.length - 2] === "articles" | |
) { | |
const base64 = path[path.length - 1]; | |
let str = atob(base64); | |
const prefix = Buffer.from([0x08, 0x13, 0x22]).toString("binary"); | |
if (str.startsWith(prefix)) { | |
str = str.substring(prefix.length); | |
} | |
const suffix = Buffer.from([0xd2, 0x01, 0x00]).toString("binary"); | |
if (str.endsWith(suffix)) { | |
str = str.substring(0, str.length - suffix.length); | |
} | |
// One or two bytes to skip | |
const bytes = Uint8Array.from(str, c => c.charCodeAt(0)); | |
const len = bytes.at(0)!; | |
if (len >= 0x80) { | |
str = str.substring(2, len + 2); | |
} else { | |
str = str.substring(1, len + 1); | |
} | |
if (str.startsWith("AU_yqL")) { | |
// New style encoding, introduced in July 2024. Not yet known how to decode offline. | |
const url = await fetchDecodedBatchExecute(base64); | |
return url; | |
} | |
return str; | |
} else { | |
return sourceUrl; | |
} | |
}; |
curl -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" -L "https://news.google.com/rss/search?q=Cybersecurity+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8"
<title>Sorry...</title><style> body { font-family: verdana, arial, sans-serif; background-color: #fff; color: #000; }</style>Sorry... |
We're sorry...
... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.
Had to take my news feeds offline because of this if anyone knows a fix let me know
Hey everyone check out my website for IT/cyber security experts with a touch of faith.
https://vonwallace.com
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
Yes but now I am blocked I wonder how long that will remain in effect
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
i confirm, it is working
This script made too many requests to google got blocked, anyone know how long that will remain in effect
This script made too many requests to google got blocked, anyone know how long that will remain in effect
Same situation here.
Looks like the Google News RSS feeds went back to the old way this morning. My old parsing is working again.
I confirm. hope it lasts
Hi,
I tried below code which was pasted above for new decode format but for news links which has youtube redirect it's not working please help me anyone
import requests
import base64
def fetch_decoded_batch_execute(id):
s = (
'[[["Fbv4je","[\"garturlreq\",[[\"en-US\",\"US\",[\"FINANCE_TOP_INDICES\",\"WEB_TEST_1_0_0\"],'
'null,null,1,1,\"US:en\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],'
'\"en-US\",\"US\",1,[2,3,4,8],1,0,\"655000234\",0,0,null,0],\"' +
id +
'\"]",null,"generic"]]]'
)
headers = {
"Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
"Referer": "https://news.google.com/"
}
response = requests.post(
"https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je",
headers=headers,
data={"f.req": s}
)
if response.status_code != 200:
raise Exception("Failed to fetch data from Google.")
text = response.text
header = '[\\"garturlres\\",\\"'
footer = '\\",'
if header not in text:
raise Exception(f"Header not found in response: {text}")
start = text.split(header, 1)[1]
if footer not in start:
raise Exception("Footer not found in response.")
url = start.split(footer, 1)[0]
return url
def decode_google_news_url(source_url):
url = requests.utils.urlparse(source_url)
path = url.path.split("/")
if url.hostname == "news.google.com" and len(path) > 1 and path[-2] == "articles":
base64_str = path[-1]
decoded_bytes = base64.urlsafe_b64decode(base64_str + '==')
decoded_str = decoded_bytes.decode('latin1')
prefix = b'\x08\x13\x22'.decode('latin1')
if decoded_str.startswith(prefix):
decoded_str = decoded_str[len(prefix):]
suffix = b'\xd2\x01\x00'.decode('latin1')
if decoded_str.endswith(suffix):
decoded_str = decoded_str[:-len(suffix)]
bytes_array = bytearray(decoded_str, 'latin1')
length = bytes_array[0]
if length >= 0x80:
decoded_str = decoded_str[2:length+1]
else:
decoded_str = decoded_str[1:length+1]
if decoded_str.startswith("AU_yqL"):
return fetch_decoded_batch_execute(base64_str)
return decoded_str
else:
return source_url
Example usage
if __name__ == "__main__":
source_url = 'https://news.google.com/rss/articles/CBMiVkFVX3lxTE1KbVBoUnRqcVpXbk9YSnJoM3BuTFJTQ3NlSDliN0hzYk11Z29TWF9sMmY3eG8tdTNfMExCQ0tMcFlVTTdLcDJibjhMY1ZUXzNmWlVQd3JR?oc=5'
decoded_url = decode_google_news_url(source_url)
print(decoded_url)
import requests
import base64
def fetch_decoded_batch_execute(id):
s = (
'[[["Fbv4je","[\"garturlreq\",[[\"en-US\",\"US\",[\"FINANCE_TOP_INDICES\",\"WEB_TEST_1_0_0\"],'
'null,null,1,1,\"US:en\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],'
'\"en-US\",\"US\",1,[2,3,4,8],1,0,\"655000234\",0,0,null,0],\"' +
id +
'\"]",null,"generic"]]]'
)
headers = {
"Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
"Referer": "https://news.google.com/"
}
response = requests.post(
"https://news.google.com/_/DotsSplashUi/data/batchexecute?rpcids=Fbv4je",
headers=headers,
data={"f.req": s}
)
if response.status_code != 200:
raise Exception("Failed to fetch data from Google.")
text = response.text
header = '[\\"garturlres\\",\\"'
footer = '\\",'
if header not in text:
raise Exception(f"Header not found in response: {text}")
start = text.split(header, 1)[1]
if footer not in start:
raise Exception("Footer not found in response.")
url = start.split(footer, 1)[0]
return url
def decode_google_news_url(source_url):
url = requests.utils.urlparse(source_url)
path = url.path.split("/")
if url.hostname == "news.google.com" and len(path) > 1 and path[-2] == "articles":
base64_str = path[-1]
decoded_bytes = base64.urlsafe_b64decode(base64_str + '==')
decoded_str = decoded_bytes.decode('latin1')
prefix = b'\x08\x13\x22'.decode('latin1')
if decoded_str.startswith(prefix):
decoded_str = decoded_str[len(prefix):]
suffix = b'\xd2\x01\x00'.decode('latin1')
if decoded_str.endswith(suffix):
decoded_str = decoded_str[:-len(suffix)]
bytes_array = bytearray(decoded_str, 'latin1')
length = bytes_array[0]
if length >= 0x80:
decoded_str = decoded_str[2:length+2]
else:
decoded_str = decoded_str[1:length+1]
if decoded_str.startswith("AU_yqL"):
return fetch_decoded_batch_execute(base64_str)
return decoded_str
else:
return source_url
Example usage
if __name__ == "__main__":
source_url = 'https://news.google.com/rss/articles/CBMiVkFVX3lxTE4zaGU2bTY2ZGkzdTRkSkJ0cFpsTGlDUjkxU2FBRURaTWU0c3QzVWZ1MHZZNkZ5Vzk1ZVBnTDFHY2R6ZmdCUkpUTUJsS1pqQTlCRzlzbHV3?oc=5'
decoded_url = decode_google_news_url(source_url)
print(decoded_url)
Not able to decode youtube links HELP ME PLEASE
@huksley
I am not able to decode links redirecting to youtube
@Stevespear426 / @eternityready2 / @stuartskelton / @dylanpyle / @Stevespear426
Please help me I tried given python code but not working :(
@VishruthBharadwaj With my version of this I get back https://www.youtube.com/watch?v\\u003d-r2aoxoUsPk\\
. so it seems to work for me. There is a QPS limit, and you are usually locked out for about 30 minutes.
Can you please paste your latest code which you're using.
@stuartskelton
Can you please paste your latest code which you're using.
@VishruthBharadwaj my code is a Perl port of the code above. @Glyphosate69 Python version works unedited. it returns https://www.youtube.com/watch?v\\u003d-r2aoxoUsPk
where \\u003d
is =
so the final URL is https://www.youtube.com/watch?v=-r2aoxoUsPk
Thanks a lot @stuartskelton
did they change it back?
Not working the old method
The old method is broken again
Can confirm... broken again.
@huksley old method is not working....
Here is another option
https://www.bing.com/news/search?q=wordpress&format=rss&count=24
Url is extractable
I will be switching over.
It does not give you a max of 100 results, but thats okay as that is normally older less interesting news sometimes. and most people will not scroll that far.
I would rather have 24 trending stories than 100 random ones over the last 30 days or so.
https://www.bing.com/news/search?q=wordpress&format=rss&count=24
A lot of the articles don't have meta images to scrape though. Kinda circles back to the same problem for me... no preview images
Multi treaded curl to grab the og:image off of the site with the article in the background
Claud ai will show you how to write it or I can provide you some php code
I wrote MS asking for permission to use their feed. Still waiting for a reply.
Multi treaded curl to grab the og:image off of the site with the article in the background Claud ai will show you how to write it or I can provide you some php code
What I'm saying is many of those Bing articles don't have og:images.
'\u003d' should be replaced with '='
url = url.split('\u003d').join('=');
Do you have any ideas or solutions?
Currently, the old method is completely broken.
How do we get direct links to sources in Google News?
Warning: file_get_contents(https://news.google.com/rss/search?q=COVID+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8): Failed to open stream: HTTP request failed! HTTP/1.1 503 Service Unavailable in /home/vonwallace/public_html/news/newsget.php on line 208
Could not get results for: https://news.google.com/rss/search?q=COVID+when:30d&hl=en-US&gl=US&ceid=US:en&ie=utf-8