Skip to content

Instantly share code, notes, and snippets.

@pfeilbr
Last active May 14, 2022 23:15
Show Gist options
  • Save pfeilbr/24e18abf1ae051a90589ae16488cadb8 to your computer and use it in GitHub Desktop.
Save pfeilbr/24e18abf1ae051a90589ae16488cadb8 to your computer and use it in GitHub Desktop.
fetch all aws directory api metadata (arch diagrams, products, blog posts, builders library articles, etc.)
// fetch all aws directory api metadata
(async () => {
// Directory IDs passed as the `item.directoryId` search parameter.
// Each ID is wrapped as `{ directoryId }` because callers read `directory.directoryId`.
const directories = [
  "event-content",
  "amazon-redwood",
  "aws-products",
  "free-tier-products",
  "blog-posts",
  "whats-new",
  "security-bulletins",
  "media-resources",
].map((directoryId) => ({ directoryId }));
/**
 * Log a value to the console as pretty-printed JSON (2-space indent).
 *
 * @param {*} o - Value to serialize with JSON.stringify and print.
 */
const l = (o) => {
  const serialized = JSON.stringify(o, null, 2);
  console.log(serialized);
};
/**
 * Return a promise that resolves after a timer of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
const sleep = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
/**
 * Request a URL with the global `fetch` and parse the response body as JSON.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} When the response status is outside the 2xx range.
 */
const fetchJSON = async (url) => {
  const resp = await fetch(url);
  // Fail loudly on HTTP errors rather than handing an error payload to callers
  // that expect `items` / `metadata`.
  if (!resp.ok) {
    throw new Error(`fetchJSON("${url}") failed with HTTP ${resp.status}`);
  }
  return resp.json();
};
/**
 * Fetch the default search response for a directory (no `size` or `page`
 * parameter); callers read its `metadata` field.
 *
 * @param {string} directoryId - Value for the `item.directoryId` query parameter.
 * @returns {Promise<*>} Whatever `fetchJSON` resolves to for the search URL.
 */
const fetchDirectoryMetadata = async (directoryId) => {
  // Build the query with searchParams so a directoryId containing reserved
  // characters (&, =, #, spaces) is escaped instead of corrupting the request.
  const metadataURL = new URL("https://aws.amazon.com/api/dirs/items/search");
  metadataURL.searchParams.set("item.directoryId", directoryId);
  metadataURL.searchParams.set("item.locale", "en_US");
  return fetchJSON(metadataURL.href);
};
/**
 * Fetch the pages of a directory one at a time, in order.
 *
 * @param {string} directoryId - Value for the `item.directoryId` query parameter.
 * @param {{metadata: {count: number, pageCount: number}}} metadata - Search
 *   metadata; `count` is sent as the `size` parameter and `pageCount` bounds
 *   the number of pages requested.
 * @returns {Promise<Array>} The responses whose `items` array was non-empty.
 */
const fetchDirectoryContent = async (directoryId, metadata) => {
  const { count, pageCount } = metadata.metadata;
  const pages = [];
  // A counting loop (instead of Array(pageCount)) runs zero times for a NaN or
  // negative pageCount rather than throwing "RangeError: Invalid array length".
  for (let pageIndex = 0; pageIndex < pageCount; pageIndex += 1) {
    const url = new URL("https://aws.amazon.com/api/dirs/items/search");
    url.searchParams.set("item.directoryId", directoryId);
    url.searchParams.set("size", String(count));
    url.searchParams.set("item.locale", "en_US");
    url.searchParams.set("page", String(pageIndex));
    const data = await fetchJSON(url.href);
    if (data.items.length > 0) {
      pages.push(data);
    } else if (pageIndex < pageCount - 1) {
      // An empty page before the last expected one means the API stopped early.
      // Appears max page is 1000; this is based on
      // "directoryId=blog-posts,action=break,pageIndex=1000,pageIndexes.length=2077".
      l(`directoryId=${directoryId},action=break,pageIndex=${pageIndex},pageIndexes.length=${pageCount}`);
      break;
    }
  }
  return pages;
};
/**
 * Fetch a directory's metadata, derive how many pages it spans, then fetch
 * the pages.
 *
 * @param {string} directoryId - Directory to download.
 * @returns {Promise<*>} Whatever `fetchDirectoryContent` resolves to.
 */
const fetchDirectory = async (directoryId) => {
  const metadata = await fetchDirectoryMetadata(directoryId);
  const { count, totalHits } = metadata.metadata;
  // Guard the division: a zero `count` would make pageCount NaN or Infinity.
  // Treat that as "no pages".
  metadata.metadata.pageCount = count > 0 ? Math.ceil(totalHits / count) : 0;
  return fetchDirectoryContent(directoryId, metadata);
};
/**
 * Ask for a save location via `window.showSaveFilePicker()` and write a small
 * JSON document ({"hello": "world"}, 2-space indent) to the chosen file.
 *
 * @returns {Promise<void>} Resolves once the writable stream has been closed.
 */
const saveFile = async () => {
  // Obtain a file handle, then a writable stream for it.
  const fileHandle = await window.showSaveFilePicker();
  const stream = await fileHandle.createWritable();

  const payload = JSON.stringify({ hello: 'world' }, null, 2);
  await stream.write(new Blob([payload], { type: 'application/json' }));

  // Closing the stream finishes the write.
  await stream.close();
};
/**
 * Entry point: download the selected directories and log one summary line
 * per directory (page count, item count, items per page).
 *
 * Errors are caught and printed with console.log.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  try {
    // NOTE(review): slice(0, 1) restricts the run to the first directory —
    // presumably a debugging limit; confirm before widening.
    for (const directory of directories.slice(0, 1)) {
      const data = await fetchDirectory(directory.directoryId);
      const totalItems = data.reduce((sum, page) => sum + page.items.length, 0);
      // With no pages there is no data[0] to read; report 0 instead of throwing.
      const itemsPerPage = data.length > 0 ? data[0].metadata.count : 0;
      l(`directory.directoryId=${directory.directoryId},totalPages=${data.length},totalItems=${totalItems},itemsPerPage=${itemsPerPage}`);
    }
  } catch (e) {
    console.log(e);
  }
};
await main();
})();
/*
var params = {
Bucket: 'STRING_VALUE', // required
ContinuationToken: 'STRING_VALUE',
Delimiter: 'STRING_VALUE',
EncodingType: url,
ExpectedBucketOwner: 'STRING_VALUE',
FetchOwner: true || false,
MaxKeys: 'NUMBER_VALUE',
Prefix: 'STRING_VALUE',
RequestPayer: requester,
StartAfter: 'STRING_VALUE'
};
s3.listObjectsV2(params, function(err, data) {
if (err) console.log(err, err.stack); // an error occurred
else console.log(data); // successful response
});
*/
/*
const oldMain = async () => {
const baseURL = `https://aws.amazon.com/api/dirs/items/search?item.directoryId=whitepapers&sort_by=item.additionalFields.sortDate&sort_order=desc&size=9&item.locale=en_US&tags.id=GLOBAL%23content-type%23reference-arch-diagram&page=`
let page = 0;
let more = true;
let items = [];
while (more) {
const resp = await fetch(baseURL + `${page}`)
const data = await resp.json();
console.log(data);
more = data.metadata.count > 0;
if (more) {
items.push(...data.items);
page++;
}
}
console.log(`pages=${page},items.length=${items.length}`);
const output = JSON.stringify(items, null, 2);
console.log(output);
}
*/
/*
# TODO
* all blogs (<https://aws.amazon.com/blogs/>)
* template URL - "https://aws.amazon.com/api/dirs/items/search?item.directoryId=blog-posts&sort_by=item.additionalFields.createdDate&sort_order=desc&size=10&item.locale=en_US&page=1"
* all events content (<https://aws.amazon.com/events/events-content>) page=0...N
* template URL "https://aws.amazon.com/api/dirs/items/search?item.directoryId=event-content&sort_by=item.dateCreated&sort_order=desc&size=12&item.locale=en_US&tags.id=GLOBAL%23language%23english&page=1"
* Builders Library - https://aws.amazon.com/api/dirs/items/search?item.directoryId=amazon-redwood&sort_by=item.additionalFields.customSort&sort_order=asc&size=24&item.locale=en_US
* whats new, item.directoryId=whats-new
* item.directoryId=security-bulletins
* item.directoryId=aws-products
* item.directoryId=blog-posts
* item.directoryId=media-resources
* item.directoryId=free-tier-products
* use <https://lunrjs.com/> for searching
* step fn processing logic - need to figure out how to not download everything each run. way to download only new or changed items since last run
* define work by getting metadata for the number of results via https://...?item.directoryId=${directoryId}&item.locale=en_US&page=0. generate a single sqs message for each unique URL
* use returned `metadata.count` for `size` query string parameter
* &sort_by=item.[dateCreated|dateUpdated]&sort_order=desc
* lambda subscription to SQS. process sequentially. set batch size to >1 initially to see if there is throttling. can always set batch size to 1
---
```sh
# source: <https://gist.github.com/garystafford/37442d8fd8dde388f50856c6a2900b0d>
# One-liner to retrieve a list of all AWS products from aws.amazon.com/products sorted by product category (requires jq). Worked as of 2022-01-03. Page format tends to change a lot...
curl --silent --compressed \
'https://aws.amazon.com/api/dirs/items/search?item.directoryId=aws-products&sort_by=item.additionalFields.productCategory&sort_order=asc&size=500&item.locale=en_US' \
| jq -r '.items[].item | .additionalFields.productCategory + " | " + .additionalFields.productName' \
| sort
```
* <https://github.com/tycarac/aws-documents> - good reference project that "Downloads AWS documents, currently whitepapers, from AWS documentation website."
* <https://github.com/nragusa/aws-newrelease-slack> - An AWS CDK application that sends AWS new service and feature release announcements to a Slack channel of your choice
querying various content types in the directory by tags.id querystring value
"contentType": "AWS Solution", tags.id=GLOBAL%23content-type%23solution
"contentType": "Pattern", tags.id=GLOBAL%23content-type%23pattern
"contentType": "Reference Architecture Diagram", tags.id=GLOBAL%23content-type%23reference-arch-diagram
"contentType": "Guide", tags.id=GLOBAL%23content-type%23tech-guide
"contentType": multi-valued, tags.id=GLOBAL%23content-type%23video
"contentType": "Whitepaper", tags.id=GLOBAL%23content-type%23whitepaper
-- general response shape
{
"items": [],
"metadata": {
"count": 0,
"totalHits": 299
},
"fieldTypes": {
"updateDate": "Date",
"imageSrcUrl": "URL",
"featureFlag": "Text",
"description": "LongText",
"sortDate": "Date",
"docTitle": "Text",
"primaryURL": "URL",
"datePublished": "Date",
"publishedText": "Text",
"footerInfoSubtext": "Text",
"subHeadline": "Text",
"enableShare": "Boolean",
"category": "Text",
"contentType": "Text"
}
}
--- example item (items[0].item)
{
"item": {
"id": "whitepapers#image-moderation-chatbot",
"locale": "en_US",
"directoryId": "whitepapers",
"name": "image-moderation-chatbot",
"author": "julicoll",
"createdBy": "julicoll",
"lastUpdatedBy": "julicoll",
"numImpressions": 0,
"score": 0,
"dateCreated": "2019-06-25T17:21:57+0000",
"dateUpdated": "2021-07-29T17:01:46+0000",
"additionalFields": {
"datePublished": "2018-12-05",
"publishedText": "December 2018",
"description": "Shows you how to build a serverless chatbot on AWS that monitors your chat channels and removes images containing suggestive or explicit content.<p><a href=\"https://github.com/awslabs/lambda-refarch-imagemoderationchatbot?did=wp_card&trk=wp_card\" target=\"_blank\" rel=\"noopener\">Code</a></p><p class=\"m-subheadline\">Media Services | Serverless</p>",
"docTitle": "Image Moderation Chatbot",
"sortDate": "2018-12-05",
"enableShare": "1",
"contentType": "Reference Architecture Diagram",
"primaryURL": "https://github.com/awslabs/lambda-refarch-imagemoderationchatbot?did=wp_card&trk=wp_card"
}
},
"tags": [
{
"id": "GLOBAL#content-type#reference-arch-diagram",
"locale": "en_US",
"tagNamespaceId": "GLOBAL#content-type",
"name": "Reference Architecture Diagram",
"description": "Reference Architecture Diagram",
"createdBy": "jenbar",
"lastUpdatedBy": "jenbar",
"dateCreated": "2020-04-29T05:19:31+0000",
"dateUpdated": "2022-02-03T03:31:09+0000"
},
{
"id": "GLOBAL#methodology#serverless",
"locale": "en_US",
"tagNamespaceId": "GLOBAL#methodology",
"name": "Serverless",
"description": "Serverless",
"createdBy": "jenbar",
"lastUpdatedBy": "jenbar",
"dateCreated": "2020-06-05T07:06:34+0000",
"dateUpdated": "2022-02-03T03:32:11+0000"
},
{
"id": "GLOBAL#tech-category#media-services",
"locale": "en_US",
"tagNamespaceId": "GLOBAL#tech-category",
"name": "Media Services",
"description": "Media Services",
"createdBy": "jarfaa",
"lastUpdatedBy": "jenbar",
"dateCreated": "2020-07-17T03:06:10+0000",
"dateUpdated": "2022-02-03T03:35:28+0000"
}
]
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment