/**
 * This magically uses the batchexecute protocol. It's not documented, but it works.
 *
 * Licensed under: MIT License
 *
 * Copyright (c) 2024 Ruslan Gainutdinov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
const fetchDecodedBatchExecute = (id: string) => {
  const s =
    '[[["Fbv4je","[\\"garturlreq\\",[[\\"en-US\\",\\"US\\",[\\"FINANCE_TOP_INDICES\\",\\"WEB_TEST_1_0_0\\"],null,null,1,1,\\"US:en\\",null,180,null,null,null,null,null,0,null,null,[1608992183,723341000]],\\"en-US\\",\\"US\\",1,[2,3,4,8],1,0,\\"655000234\\",0,0,null,0],\\"' +
    id +
    '\\"]",null,"generic"]]]';
  return fetch("https://news.google.com/_/DotsSplashUi/data/batchexecute?" + "rpcids=Fbv4je", {
    headers: {
      "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
      // The HTTP header is spelled "Referer"
      Referer: "https://news.google.com/"
    },
    body: "f.req=" + encodeURIComponent(s),
    method: "POST"
  })
    .then(e => e.text())
    .then(s => {
      // The decoded URL comes back inside an escaped JSON envelope:
      // ...[\"garturlres\",\"<url>\",...
      const header = '[\\"garturlres\\",\\"';
      const footer = '\\",';
      if (!s.includes(header)) {
        throw new Error("header not found: " + s);
      }
      const start = s.substring(s.indexOf(header) + header.length);
      if (!start.includes(footer)) {
        throw new Error("footer not found");
      }
      const url = start.substring(0, start.indexOf(footer));
      return url;
    });
};
/**
 * Google News started generating encoded, internal URLs for RSS items
 * https://news.google.com/rss/search?q=New%20York%20when%3A30d&hl=en-US&gl=US&ceid=US:en
 *
 * This script decodes such URLs back into the original ones. For example, the URL
 * https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5
 *
 * contains
 * https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
 *
 * The path segment after articles/ is Base64-encoded binary data.
 *
 * The format is the following:
 * <prefix> <len bytes> <URL bytes> <len bytes> <amp URL bytes> [<suffix>]
 *
 * <prefix> - 0x08, 0x13, 0x22
 * <suffix> - 0xd2, 0x01, 0x00 (sometimes missing??)
 * <len bytes> - a protobuf varint length prefix: one byte (e.g. 0x40) for URLs
 *               shorter than 128 bytes, two bytes (e.g. 0x81 0x01) otherwise
 *
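 * Worked example: for the TechCrunch item above, the Base64 payload decodes to
 * 0x08 0x13 0x22 (prefix), then 0x48 (length = 72), then the 72 URL bytes of
 * the techcrunch.com address, then the 0xd2 0x01 0x00 suffix, with no
 * <amp URL> section.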
* | |
* https://news.google.com/rss/articles/CBMiqwFBVV95cUxNMTRqdUZpNl9hQldXbGo2YVVLOGFQdkFLYldlMUxUVlNEaElsYjRRODVUMkF3R1RYdWxvT1NoVzdUYS0xSHg3eVdpTjdVODQ5cVJJLWt4dk9vZFBScVp2ZmpzQXZZRy1ncDM5c2tRbXBVVHVrQnpmMGVrQXNkQVItV3h4dVQ1V1BTbjhnM3k2ZUdPdnhVOFk1NmllNTZkdGJTbW9NX0k5U3E2Tkk?oc=5 | |
* https://news.google.com/rss/articles/CBMidkFVX3lxTFB1QmFsSi1Zc3dLQkpNLThKTXExWXBGWlE0eERJQ2hLRENIOFJzRTlsRnM1NS1Hc2FlbjdIMlZ3eWNQa0JqeVYzZGs1Y0hKaUtTUko2dmJabUtVMWZob0lNSFNCa3NLQ05ROGh4cVZfVTYyUDVxc2c?oc=5 | |
* | |
 * FIXME: What happens if the URL is longer than 255 bytes??
* | |
* Licensed under: MIT License | |
* | |
* Copyright (c) 2022 Ruslan Gainutdinov | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included | |
* in all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
* SOFTWARE. | |
*/ | |
export const decodeGoogleNewsUrl = async (sourceUrl: string) => {
  const url = new URL(sourceUrl);
  const path = url.pathname.split("/");
  if (
    url.hostname === "news.google.com" &&
    path.length > 1 &&
    path[path.length - 2] === "articles"
  ) {
    const base64 = path[path.length - 1];
    let str = atob(base64);
    const prefix = Buffer.from([0x08, 0x13, 0x22]).toString("binary");
    if (str.startsWith(prefix)) {
      str = str.substring(prefix.length);
    }
    const suffix = Buffer.from([0xd2, 0x01, 0x00]).toString("binary");
    if (str.endsWith(suffix)) {
      str = str.substring(0, str.length - suffix.length);
    }
    // One or two length bytes to skip: the length prefix is a single varint byte
    // for URLs shorter than 128 bytes, two bytes otherwise. For the 128..255 range
    // the first varint byte happens to equal the decoded length, so it can be used
    // directly.
    const bytes = Uint8Array.from(str, c => c.charCodeAt(0));
    const len = bytes[0];
    if (len >= 0x80) {
      str = str.substring(2, len + 2);
    } else {
      str = str.substring(1, len + 1);
    }
    if (str.startsWith("AU_yqL")) {
      // New-style encoding, introduced in July 2024. Not yet known how to decode offline.
      const url = await fetchDecodedBatchExecute(base64);
      return url;
    }
    return str;
  } else {
    return sourceUrl;
  }
};
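For reference, a minimal usage sketch (assuming Node 18+, where fetch, atob, and Buffer are all available globally), using the TechCrunch example from the header comment:

// Decode an old-style URL entirely offline; no network request is made here
const main = async () => {
  const decoded = await decodeGoogleNewsUrl(
    "https://news.google.com/__i/rss/rd/articles/CBMiSGh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyMi8xMC8yNy9uZXcteW9yay1wb3N0LWhhY2tlZC1vZmZlbnNpdmUtdHdlZXRzL9IBAA?oc=5"
  );
  console.log(decoded); // https://techcrunch.com/2022/10/27/new-york-post-hacked-offensive-tweets/
};
main().catch(console.error);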
This is not unsanctioned use or anything like that. They have a publicly available RSS feed, and it is supposed to contain the article link, not some obfuscated tracking URL. If there were some authentication or a paid plan for it, that would also be okay, but let's not pretend this is done so that users (us) have a better experience.
To address the issue of request limitations with Google's API, I implemented the following solution:

- Switched to the Bing News API, which offers 1,000 free requests per month, ample for our needs.
- Set up a daily cron job that:
  - submits 6 predefined search terms to the Bing API
  - stores the results in a PostgreSQL database
- Modified the user-facing news retrieval process:
  - news is now pulled from our database instead of via live API calls
  - this eliminates API usage entirely when users access news

While the news may be up to a day old, this trade-off is acceptable given the benefits of reliable access and reduced API dependency. This solution effectively manages the API limitations while ensuring consistent news availability for users.
Here is an example. This is how I grab the API results into the DB:
`<?php
function newsgetf($searchterms, $maxnumberarticles)
{
date_default_timezone_set('GMT');
// Database connection parameters
$host = 'localhost';
$dbname = 'bingnews';
$user = '';
$password = '';
// Connect to the PostgreSQL database
$dbconn = pg_connect("host=$host dbname=$dbname user=$user password=$password");
if (!$dbconn) {
die("Connection failed: " . pg_last_error());
}
$subscription_key = "";
$endpoint = "https://api.bing.microsoft.com/v7.0/news/search";
// Prepare the SQL statements
$check_duplicate_sql = "SELECT url FROM news_articles WHERE url = $1";
$upsert_sql = "INSERT INTO news_articles (url, name, date_published, description, provider, og_image, search_terms, insertion_timestamp)
VALUES ($1, $2, $3, $4, $5, $6, ARRAY[$7], $8)
ON CONFLICT (url)
DO UPDATE SET
name = EXCLUDED.name,
date_published = EXCLUDED.date_published,
description = EXCLUDED.description,
provider = EXCLUDED.provider,
og_image = EXCLUDED.og_image,
search_terms = array_append(news_articles.search_terms, $7),
insertion_timestamp = EXCLUDED.insertion_timestamp";
foreach ($searchterms as $searchterm) {
$query = urlencode($searchterm);
$url = $endpoint . "?q=" . $query . "&count=" . $maxnumberarticles . "&mkt=en-US";
$headers = [
"Ocp-Apim-Subscription-Key: $subscription_key"
];
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$response = curl_exec($ch);
if ($response === false) {
echo "Error fetching results for '$searchterm': " . curl_error($ch) . "\n";
continue;
}
curl_close($ch);
$result = json_decode($response, true);
if (isset($result['value'])) {
$values = $result['value'];
// Fetch metadata for each article
$mh = curl_multi_init();
$handles = [];
foreach ($values as $key => $item) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $item['url']);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
curl_multi_add_handle($mh, $ch);
$handles[$key] = $ch;
}
$running = null;
do {
curl_multi_exec($mh, $running);
} while ($running);
foreach ($handles as $key => $ch) {
$page_content = curl_multi_getcontent($ch);
if ($page_content !== false && !empty($page_content)) {
$dom_obj = new DOMDocument();
@$dom_obj->loadHTML($page_content, LIBXML_NOWARNING | LIBXML_NOERROR);
foreach ($dom_obj->getElementsByTagName('meta') as $meta) {
if ($meta->getAttribute('property') == 'og:image') {
$values[$key]['og_image'] = $meta->getAttribute('content');
}
if ($meta->getAttribute('property') == 'og:description') {
$values[$key]['og_description'] = $meta->getAttribute('content');
}
}
}
curl_multi_remove_handle($mh, $ch);
curl_close($ch);
}
curl_multi_close($mh);
// Insert or update each article in the database
$inserted_count = 0;
$updated_count = 0;
$duplicate_count = 0;
$error_count = 0;
$current_timestamp = date('Y-m-d H:i:s');
foreach ($values as $item) {
    // Check if the article already exists.
    // Note: because duplicates are skipped here, the upsert's ON CONFLICT DO UPDATE
    // branch (including the search_terms array_append) effectively never runs.
    $check_result = pg_query_params($dbconn, $check_duplicate_sql, [$item['url']]);
    if (pg_num_rows($check_result) > 0) {
        $duplicate_count++;
        continue;
    }
$result = pg_query_params($dbconn, $upsert_sql, [
$item['url'],
$item['name'],
$item['datePublished'],
$item['og_description'] ?? $item['description'],
$item['provider'][0]['name'] ?? '',
$item['og_image'] ?? ($item['image']['originalImg'] ?? ($item['image']['thumbnail']['contentUrl'] ?? '')),
$searchterm,
$current_timestamp
]);
if ($result) {
$affected_rows = pg_affected_rows($result);
if ($affected_rows == 1) {
$inserted_count++;
} else {
$updated_count++;
}
} else {
echo "Error inserting/updating article for '$searchterm': " . pg_last_error($dbconn) . "\n";
$error_count++;
}
}
echo "For search term '$searchterm':\n";
echo " Inserted: $inserted_count, Updated: $updated_count, Duplicates: $duplicate_count, Errors: $error_count\n";
} else {
echo "No results found for: $searchterm\n";
}
}
// Close the database connection
pg_close($dbconn);
}
// Example usage
$search_terms = ['CHRISTIAN REVIVAL JESUS', 'PERSECUTED CHRISTIAN JESUS','Cybersecurity','DevOps','Technology advances or AI or machine learning or quantum computing or IoT'];
$max_articles_per_term = 100;
newsgetf($search_terms, $max_articles_per_term);`
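For completeness, the daily cron job can be a single crontab entry (the paths here are hypothetical and assume the script above is saved as bingnews_fetch.php):

0 6 * * * /usr/bin/php /var/www/cron/bingnews_fetch.php >> /var/log/bingnews.log 2>&1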
Here is how I render what is in the DB to HTML:
`<?php
// Global variable to control Pica usage
$USE_PICA = false;
function newsgetf($searchterm, $maxnumberarticles)
{
global $USE_PICA;
date_default_timezone_set('GMT');
// Hardcoded entries
$values = [
[
'name' => 'About - Dallas-Fort Worth Church Eleven 32 - Non-Denominational Church - Allen, TX',
'url' => 'https://churcheleven32.com/about/',
'date_published' => date('D, d M Y H:i:s T'),
'description' => 'Church Eleven32 is a non-denominational church in the Dallas-Fort Worth area. We are dedicated to being a place for all people to know God. Learn more about our church here.',
'provider' => 'Church Eleven32',
'og_image' => 'https://churcheleven32.com/wp-content/uploads/2023/07/30981495541_52ff1465af_b.jpg'
],
[
'name' => 'St Sava Orthodox Church - Allen, TX',
'url' => 'https://stsavaoca.org/',
'date_published' => date('D, d M Y H:i:s T'),
'description' => 'Discover the roots of Christianity with the Orthodox Church, a faith tradition tracing back nearly 2,000 years to Jesus Christ and His Apostles. Orthodox Christianity preserves the original teachings, practices, and sacraments established by the early Church, offering a direct connection to apostolic times. With a rich history spanning continents and cultures, Orthodoxy emphasizes right worship and belief, carefully guarding the truth passed down through Holy Scripture and Sacred Tradition. Experience the depth and authenticity of this ancient yet living faith as you explore the origins of Christian spirituality.',
'provider' => 'St Sava Orthodox Church',
'og_image' => 'https://stsavaoca.org/s/img/wp-content/uploads/2024/03/StSavaChurch.jpg.webp'
]
];
// Database connection parameters
$host = 'localhost';
$dbname = 'bingnews';
$user = '';
$password = '';
// Connect to the PostgreSQL database
$dbconn = pg_connect("host=$host dbname=$dbname user=$user password=$password");
if (!$dbconn) {
die("Connection failed: " . pg_last_error());
}
// Prepare the SQL query
$query = "SELECT * FROM news_articles WHERE $1 = ANY(search_terms) ORDER BY date_published DESC LIMIT $2";
// Execute the query
$result = pg_query_params($dbconn, $query, array($searchterm, $maxnumberarticles));
if (!$result) {
echo "<h1>Error fetching results: " . pg_last_error($dbconn) . "</h1>";
pg_close($dbconn);
return;
}
$db_values = pg_fetch_all($result);
// Merge hardcoded entries with database results
if ($db_values) {
$values = array_merge($values, $db_values);
}
if ($values) {
echo "<h1><i class=\"fas fa-newspaper\" style=\"color: #3498db; margin-right: 10px;\"></i> Found " . count($values) . " News Articles for \"" . htmlspecialchars($searchterm, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\"</h1>";
echo "<div class=\"posts__container\">";
foreach ($values as $key => $item) {
$protocol = "https://";
$CurPageURL = $protocol . $_SERVER['HTTP_HOST'] . $_SERVER['REQUEST_URI'];
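// Note: the JSON-LD below is assembled via string concatenation with htmlspecialchars(),
// which keeps the markup safe but leaves HTML entities inside the JSON string values;
// building a PHP array and passing it to json_encode() would produce cleaner JSON-LD.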
echo "
<script type=\"application/ld+json\">
{
\"@context\": \"https://schema.org/\",
\"@type\": \"NewsArticle\",
\"mainEntityOfPage\": {
\"@type\": \"WebPage\",
\"@id\": \"" . htmlspecialchars($CurPageURL, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\"
},
\"headline\": \"" . trim(htmlspecialchars($item['name'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8', false)) . "\",
\"description\": \"" . trim(htmlspecialchars($item['description'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8', false)) . "\",
\"image\": \"" . htmlspecialchars($item['og_image'] ?? '', ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\",
\"author\": {
\"@type\": \"Organization\",
\"name\": \"" . htmlspecialchars($item['provider'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\"
},
\"publisher\": {
\"@type\": \"Organization\",
\"name\": \"" . htmlspecialchars($item['provider'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\",
\"logo\": {
\"@type\": \"ImageObject\",
\"url\": \"\"
}
},
\"datePublished\": \"" . date("Y-m-d", strtotime($item['date_published'])) . "\"
}
</script>";
echo "<div class=\"div__post\">";
echo "<div class=\"time\"><time datetime=\"" . date("Y-m-d", strtotime($item['date_published'])) . "T" . date("H:i:s", strtotime($item['date_published'])) . "\">" . htmlspecialchars($item['date_published'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . " [#" . ($key + 1) . "]" . "</time></div>";
$image_url = $item['og_image'] ?? '';
if ($image_url) {
echo "<div class=\"center\"><img loading=\"lazy\" src=\"" . htmlspecialchars($image_url, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\" class=\"pica-resize\" style=\"width: 100%; height: auto;\" onerror=\"this.style.display='none'\" alt=\"" . trim(htmlspecialchars($item['name'], ENT_NOQUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8', false)) . "\"/></div>";
}
echo "<a href=\"" . htmlspecialchars($item['url'], ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "\" target=\"_blank\" class=\"underline-on-hover\">" . htmlspecialchars($item['name'], ENT_NOQUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8', false) . "</a>";
echo "<div class=\"description__block\">" . htmlspecialchars($item['description'], ENT_NOQUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8', false) . "<br /></div></div>";
}
echo "</div>";
// Add Pica library and custom JavaScript
echo "
<script src=\"https://cdnjs.cloudflare.com/ajax/libs/pica/9.0.1/pica.min.js\"></script>
<script>
document.addEventListener('DOMContentLoaded', function() {
const usePica = " . ($USE_PICA ? 'true' : 'false') . ";
if (!usePica) {
document.querySelectorAll('img.pica-resize').forEach(img => {
img.style.width = '100%';
img.style.height = 'auto';
});
return;
}
const pica = window.pica({
features: ['js', 'wasm', 'ww']
});
const resizeImage = (img) => {
const container = img.parentElement;
const containerWidth = container.clientWidth;
// Create a temporary image to get the natural aspect ratio
const tempImg = new Image();
tempImg.src = img.src;
tempImg.onload = () => {
const aspectRatio = tempImg.naturalWidth / tempImg.naturalHeight;
const targetHeight = containerWidth / aspectRatio;
const canvas = document.createElement('canvas');
canvas.width = containerWidth;
canvas.height = targetHeight;
pica.resize(tempImg, canvas, {
quality: 3,
alpha: true,
unsharpAmount: 80,
unsharpRadius: 0.6,
unsharpThreshold: 2
})
.then(result => pica.toBlob(result, 'image/png', 1.0))
.then(blob => {
const resizedUrl = URL.createObjectURL(blob);
img.src = resizedUrl;
img.style.width = '100%';
img.style.height = 'auto';
})
.catch(err => {
console.error('Pica error:', err);
img.style.width = '100%';
img.style.height = 'auto';
});
};
};
const observer = new ResizeObserver(entries => {
entries.forEach(entry => {
const img = entry.target.querySelector('img.pica-resize');
if (img) {
resizeImage(img);
}
});
});
document.querySelectorAll('.center').forEach(container => {
const img = container.querySelector('img.pica-resize');
if (img) {
observer.observe(container);
img.addEventListener('load', () => resizeImage(img));
}
});
});
</script>";
echo "<hr/><div class=\"center\"><b><i><a href=\"https://www.biblegateway.com/passage/?search=1+John+4%3A7-21&version=NIV\" target=\"_blank\">God Is Love - 1 John 4:7-21</a></i></b></div></div></body></html>";
} else {
echo "<h1>No results found for: " . htmlspecialchars($searchterm, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8') . "</h1>";
}
// Close the database connection
pg_close($dbconn);
}
// Example usage
// $USE_PICA = true; // Set this to false to disable Pica image resizing
// $searchterm = 'CHRISTIAN REVIVAL JESUS';
// $max_articles = 10;
// newsgetf($searchterm, $max_articles);
?>`
Here is the SQL for the DB tables:
`-- Table: public.news_articles
-- DROP TABLE IF EXISTS public.news_articles;
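-- Note: this is pg_dump/pgAdmin-style DDL; it assumes the sequence
-- news_articles_id_seq already exists (create it first with:
-- CREATE SEQUENCE news_articles_id_seq;).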
CREATE TABLE IF NOT EXISTS public.news_articles
(
id integer NOT NULL DEFAULT nextval('news_articles_id_seq'::regclass),
url text COLLATE pg_catalog."default" NOT NULL,
name text COLLATE pg_catalog."default" NOT NULL,
date_published timestamp without time zone NOT NULL,
description text COLLATE pg_catalog."default",
provider character varying(255) COLLATE pg_catalog."default",
og_image text COLLATE pg_catalog."default",
search_terms text[] COLLATE pg_catalog."default",
insertion_timestamp timestamp without time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT news_articles_pkey PRIMARY KEY (id),
CONSTRAINT news_articles_url_key UNIQUE (url)
)
TABLESPACE pg_default;
ALTER TABLE IF EXISTS public.news_articles
OWNER to postgres;
GRANT ALL ON TABLE public.news_articles TO coviduser;
GRANT ALL ON TABLE public.news_articles TO postgres;
-- Index: idx_date_published
-- DROP INDEX IF EXISTS public.idx_date_published;
CREATE INDEX IF NOT EXISTS idx_date_published
ON public.news_articles USING btree
(date_published ASC NULLS LAST)
TABLESPACE pg_default;
-- Index: idx_insertion_timestamp
-- DROP INDEX IF EXISTS public.idx_insertion_timestamp;
CREATE INDEX IF NOT EXISTS idx_insertion_timestamp
ON public.news_articles USING btree
(insertion_timestamp ASC NULLS LAST)
TABLESPACE pg_default;
-- Index: idx_search_terms
-- DROP INDEX IF EXISTS public.idx_search_terms;
CREATE INDEX IF NOT EXISTS idx_search_terms
ON public.news_articles USING gin
(search_terms COLLATE pg_catalog."default")
TABLESPACE pg_default;
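-- Note: the render script's "WHERE $1 = ANY(search_terms)" predicate cannot use this
-- GIN index; rewriting the predicate as "search_terms @> ARRAY[$1]" would let it.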
-- Index: idx_url
-- DROP INDEX IF EXISTS public.idx_url;
CREATE INDEX IF NOT EXISTS idx_url
ON public.news_articles USING btree
(url COLLATE pg_catalog."default" ASC NULLS LAST)
TABLESPACE pg_default;`
It appears that Google may be utilising a combination of Base64 encoding, Protocol Buffers, and AES256 encryption, although I am not entirely certain of this.
To elaborate: when decoding the URL-safe Base64 into a Protocol Buffer using the message structure below, it occasionally produces one or two hashes that begin with 'AU_yqL'. This prefix could either be the start of the hash itself or of a URL, such as https://.
The message structure in Protocol Buffer is defined as follows:
message Article {
  optional int32 version = 1;
  optional string unknown4 = 4;
  optional string unknown26 = 26;
}
Does anyone have insights into whether AES256 encryption is indeed being used in this context, or any guidance on how to decrypt it?
Original Hash: CBMiR0FVX3lxTE5fVEhXSDU2YnFhcDk1bGR6RjUtemduZllBb0QwQ2tHVHprVTc4SzFvU3dkUjJaZFNFZ250eHhxblhkc0ZKMmdV
Hash output: AU_yqLN_THWH56bqap95ldzF5-zgnfYAoD0CkGTzkU78K1oSwdR2ZdSEgntxxqnXdsFJ2gU
Hash normalized: AU/yqLN/THWH56bqap95ldzF5+zgnfYAoD0CkGTzkU78K1oSwdR2ZdSEgntxxqnXdsFJ2gU
What should be the result: http://www.jazzandbeyond.com.au/
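The Base64 and Protocol Buffer layers of that analysis can be checked offline; whether the token itself is AES output cannot. A minimal Node sketch (Node 16+, whose Buffer understands the "base64url" encoding) that extracts the AU_yqL token from the original hash above:

// Pull the opaque token out of the article id
// (protobuf wire format: 0x08 0x13, then field 4: 0x22 <len> <token bytes>)
const id =
  "CBMiR0FVX3lxTE5fVEhXSDU2YnFhcDk1bGR6RjUtemduZllBb0QwQ2tHVHprVTc4SzFvU3dkUjJaZFNFZ250eHhxblhkc0ZKMmdV";
const bytes = Buffer.from(id, "base64url");
const len = bytes[3]; // 0x47 = 71, a single-byte varint length
const token = bytes.subarray(4, 4 + len).toString("utf8");
console.log(token); // AU_yqLN_THWH56bqap95ldzF5-zgnfYAoD0CkGTzkU78K1oSwdR2ZdSEgntxxqnXdsFJ2gU

This only confirms that 'AU_yqL' is the start of an opaque token stored in field 4; it says nothing about how the token itself is produced.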
Pretty sure this solution just broke
def busted :(
@huksley this request solution is broken
Unfortunately, the solution that @huksley proposed no longer works. It produces errors like this: [["wrb.fr","Fbv4je",null,null,null,[3],"generic"],["di",10],["af.httprm",10,"2111786207358723693",9]]
@huksley surprisingly the request solution is not working. Can you help fix it?
Yes, they have added a few more parameters at the end of the payload; they look like encrypted values:
,1725004324,\"ATR1dL9qAQrN8uy3dkKVSj-G9RHc\"]",null,"generic"]]]&at=AEtveWhZ98E6YWfBFsXKcv6oDg_O:1725004324622&
(A comparison image was attached: the right side shows the previous payload, the left side the current one.)
Any solutions?
Tried to decode it, but no luck
payload = (
    '[[["Fbv4je","[\"garturlreq\",[[\"en-US\",\"US\",[\"FINANCE_TOP_INDICES\",\"WEB_TEST_1_0_0\"],null,null,1,1,\"US:en\",null,360,null,null,null,null,null,0,null,null,[1677434405,738601000]],\"en-US\",\"US\",1,[2,3,4,8],1,0,\"668194412\",0,0,null,0],\"'
    + code
    + '\",1725016444,\"ATR1dL9R_yt7riBAiulU9qaZcXAJ\"]",null,"generic"]]]'
)
For 1725016444 it's a Unix timestamp. Finding solution for ATR1dL9R_yt7riBAiulU9qaZcXAJ
Will you update your python package as well if you find the solution?
trying and waiting for @huksley
PYTHON SOLUTION
I don't know the exact algorithm used by Google to encode/decode the URLs, but I found a way to decode them using reverse engineering by inspecting the requests made by the browser in the redirection chain.
pip install requests beautifulsoup4 lxml
import json
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup


def get_decoding_params(gn_art_id):
    # The article page embeds a per-article signature and timestamp,
    # which the batchexecute endpoint now requires
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }


def decode_urls(articles):
    articles_reqs = [
        [
            "Fbv4je",
            f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{art["gn_art_id"]}",{art["timestamp"]},"{art["signature"]}"]',
        ]
        for art in articles
    ]

    payload = f"f.req={quote(json.dumps([articles_reqs]))}"
    headers = {"content-type": "application/x-www-form-urlencoded;charset=UTF-8"}
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers=headers,
        data=payload,
    )
    response.raise_for_status()

    # Skip the )]}' guard chunk, then unwrap the per-article envelopes
    return [json.loads(res[2])[1] for res in json.loads(response.text.split("\n\n")[1])[:-2]]


# Example usage
encoded_urls = [
    "https://news.google.com/rss/articles/CBMipgFBVV95cUxPWV9fTEI4cjh1RndwanpzNVliMUh6czg2X1RjeEN0YUctUmlZb0FyeV9oT3RWM1JrMGRodGtqTk1zV3pkNEpmdGNxc2lfd0c4LVpGVENvUDFMOEJqc0FCVVExSlRrQmI3TWZ2NUc4dy1EVXF4YnBLaGZ4cTFMQXFFM2JpanhDR3hoRmthUjVjdm1najZsaFh4a3lBbDladDZtVS1FMHFn?oc=5",
    "https://news.google.com/rss/articles/CBMi3AFBVV95cUxOX01TWDZZN2J5LWlmU3hudGZaRDh6a1dxUHMtalBEY1c0TlJSNlpieWxaUkxUU19MVTN3Y1BqaUZael83d1ctNXhaQUtPM0IyMFc4R3VydEtoMmFYMWpMU1Rtc3BjYmY4d3gxZHlMZG5NX0s1RmR2ZXI5YllvdzNSd2xkOFNCUTZTaEp3b0IxZEJZdVFLUDBNMC1wNGgwMGhjRG9HRFpRZU5BMFVIYjZCOWdWcHI1YzdoVHFWYnZSOEFwQ0NubGx3Rzd0SHN6OENKMXZUcHUxazA5WTIw?hl=en-US&gl=US&ceid=US%3Aen",
]

articles_params = [get_decoding_params(urlparse(url).path.split("/")[-1]) for url in encoded_urls]
decoded_urls = decode_urls(articles_params)
print(decoded_urls)
Man, you are a genius. I never thought of it that way. I was checking with HTTP Toolkit; my bad, I never noticed. Which tool do you use, HTTP Debugger? Can I implement it in my module? https://github.com/SSujitX/google-news-url-decoder
import json
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup


def get_decoding_params(gn_art_id):
    response = requests.get(f"https://news.google.com/articles/{gn_art_id}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    div = soup.select_one("c-wiz > div")
    return {
        "signature": div.get("data-n-a-sg"),
        "timestamp": div.get("data-n-a-ts"),
        "gn_art_id": gn_art_id,
    }


def decode_google_news_url(source_url):
    article = get_decoding_params(urlparse(source_url).path.split("/")[-1])
    articles_req = [
        "Fbv4je",
        f'["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"{article["gn_art_id"]}",{article["timestamp"]},"{article["signature"]}"]',
    ]
    response = requests.post(
        url="https://news.google.com/_/DotsSplashUi/data/batchexecute",
        headers={"content-type": "application/x-www-form-urlencoded;charset=UTF-8"},
        data=f"f.req={quote(json.dumps([[articles_req]]))}",
    )
    response.raise_for_status()
    return json.loads(json.loads(response.text.split("\n\n")[1])[:-2][0][2])[1]


# Example usage
encoded = "https://news.google.com/rss/articles/CBMiwwFBVV95cUxPdEpINnp6em8wMkZnSndsLTlmUkRlWjRyeDlGS1E1WHRmX0E2QXo0S0ZxZ2FCeUkzMnRYRm9wZEE4RGE5bzZnZGdFZUw2VWRSQ0pfcG9WQ1JyWDg3cGVZMFd2Vk4zUDhWSF8tMm45TTdsLVJLdGtLUjB6QlJSWlNfU0gwOEdzY3RtakJFTDB2bzdrUXdnZVRaWGZhVUZjWmdiNXdXX1FyODY5RnBXYTVLUFRHYUJvY25TQzhRQWNydHctR1E?oc=5&hl=en-US&gl=US&ceid=US:en"
decoded_url = decode_google_news_url(encoded)
# prints https://www.forbes.com/sites/digital-assets/2024/08/29/from-polymarket-predictions-to-press-on-nails-crypto-moves-mainstream/
print(decoded_url)
TY @jacoboisaza !! For anyone who wants a solution that takes in a single url instead of an array of URLs...
so it’s making multiple requests? seems like google really doesn’t want us to decode urls. what are the chances they block us?
Updated on Repo
pip install googlenewsdecoder --upgrade
from googlenewsdecoder import new_decoderv1


def main():
    interval_time = 5  # default interval is 1 sec, if not specified
    source_url = "https://news.google.com/read/CBMi2AFBVV95cUxPd1ZCc1loODVVNHpnbFFTVHFkTG94eWh1NWhTeE9yT1RyNTRXMVV2S1VIUFM3ZlVkVjl6UHh3RkJ0bXdaTVRlcHBjMWFWTkhvZWVuM3pBMEtEdlllRDBveGdIUm9GUnJ4ajd1YWR5cWs3VFA5V2dsZnY1RDZhVDdORHRSSE9EalF2TndWdlh4bkJOWU5UMTdIV2RCc285Q2p3MFA4WnpodUNqN1RNREMwa3d5T2ZHS0JlX0MySGZLc01kWDNtUEkzemtkbWhTZXdQTmdfU1JJaXY?hl=en-US&gl=US&ceid=US%3Aen"
    try:
        decoded_url = new_decoderv1(source_url, interval=interval_time)
        if decoded_url.get("status"):
            print("Decoded URL:", decoded_url["decoded_url"])
        else:
            print("Error:", decoded_url["message"])
    except Exception as e:
        print(f"Error occurred: {e}")
    # Output: Decoded URL: https://healthdatamanagement.com/articles/empowering-the-quintuple-aim-embracing-an-essential-architecture/


if __name__ == "__main__":
    main()
Yes, the new challenge is to hack the requests limits...
It's open source, so take it, mix it, and share it.
I used Chrome DevTools to track the redirections and inspect the payloads and responses. After many attempts at cleaning and simplifying the process, I discovered this minimal approach that works well in Python.
Is there a PHP solution?
I converted all the requests using GPT and checked the code, but nothing works :( I tested the test.php file with those functions through a browser.
I'm already wondering: maybe I shouldn't run this code from the browser? But I don't see why that would matter.
I converted this to PHP and got it working, but Google throttles it and then blocks you. So I signed up for the Bing News API; I pull the news articles once a day, insert them into a DB, and query the DB when the user pulls up the page.
Can someone convert the new solution to javascript? I will update the gist afterwards, thank you!
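In the meantime, here is a minimal, untested TypeScript sketch of the Python approach above (assuming Node 18+ with a global fetch; the function names here are made up, and the data-n-a-sg / data-n-a-ts attributes are scraped with regexes instead of an HTML parser):

const getDecodingParams = async (gnArtId: string) => {
  const res = await fetch(`https://news.google.com/articles/${gnArtId}`);
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  const html = await res.text();
  // The article page embeds the parameters as data attributes on a <div> inside <c-wiz>
  const signature = html.match(/data-n-a-sg="([^"]+)"/)?.[1];
  const timestamp = html.match(/data-n-a-ts="([^"]+)"/)?.[1];
  if (!signature || !timestamp) throw new Error("decoding params not found");
  return { signature, timestamp, gnArtId };
};

const decodeGoogleNewsUrlNew = async (sourceUrl: string) => {
  const gnArtId = new URL(sourceUrl).pathname.split("/").pop()!;
  const { signature, timestamp } = await getDecodingParams(gnArtId);
  const articleReq = [
    "Fbv4je",
    `["garturlreq",[["X","X",["X","X"],null,null,1,1,"US:en",null,1,null,null,null,null,null,0,1],"X","X",1,[1,1,1],1,1,null,0,0,null,0],"${gnArtId}",${timestamp},"${signature}"]`,
  ];
  const res = await fetch("https://news.google.com/_/DotsSplashUi/data/batchexecute", {
    method: "POST",
    headers: { "content-type": "application/x-www-form-urlencoded;charset=UTF-8" },
    body: "f.req=" + encodeURIComponent(JSON.stringify([[articleReq]])),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  const text = await res.text();
  // Skip the )]}' guard chunk, then unwrap the nested JSON envelopes
  const envelope = JSON.parse(text.split("\n\n")[1]);
  return JSON.parse(envelope[0][2])[1] as string;
};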
When I call _/DotsSplashUi/data/batchexecute?rpcids=Fbv4je, the answer does not include "garturlres". Is anyone else having this problem? Was there already an update in July that causes this?
@ruthvik92 As best as I can guess, they switched from (somewhat secretly) encoding the canonical article URLs inside the returned Google News URL to simply not encoding them inside the URL at all; I suspect the mapping is now just stored separately on their servers, and the info in the new URLs is merely an identifier pointing to that data. Therefore, to get the original URL, you have to make a request to Google's servers.
Hard to say why. Maybe it's because this is unsanctioned use they want to prevent; maybe it's an internal change that happened to affect outside users. Personally though, faking sketchy API requests to get the data pushed this across the threshold from "clever hack" to "clearly something they don't want you doing", so I've switched to using and paying for the Bing News API.