Skip to content

Instantly share code, notes, and snippets.

@keithrozario
Created May 9, 2020 04:39
Show Gist options
  • Save keithrozario/865c33b27f06e1c3d82021794e913628 to your computer and use it in GitHub Desktop.
Save keithrozario/865c33b27f06e1c3d82021794e913628 to your computer and use it in GitHub Desktop.
AWS "This is My Architecture" Scraper
import requests
import json
import csv
# Search endpoint and query parameters used to list the
# "this-is-my-architecture" directory, sorted by air date (descending) and
# requesting 100 items per page.
base_url = "https://aws.amazon.com/api/dirs/items/search"
params = {
    "item.directoryId": "this-is-my-architecture",
    "sort_by": "item.additionalFields.airDate",
    "sort_order": "desc",
    "size": 100,
    "item.locale": "en_US",
}
posts = list()
# Not the best, but iterate over at most the first 100 pages and stop early at
# the first page whose metadata reports a count of zero.
for page_num in range(0, 100):
    params['page'] = page_num
    # Without a timeout, requests waits forever on a stalled connection.
    http_response = requests.get(base_url, params=params, timeout=30)
    # Fail with a clear HTTPError on 4xx/5xx instead of a confusing JSON
    # decoding error or KeyError further down.
    http_response.raise_for_status()
    response = http_response.json()
    posts.extend(response['items'])
    if response['metadata']['count'] == 0:
        break
print(f"Found {len(posts)} posts on this is my architecture")
# Flatten the fetched posts into three row lists (one per output CSV).
# post_id is simply the position of the post in `posts`; it is the key that
# links category and tag rows back to their post row.
tag_data, category_data, post_data = list(), list(), list()
for post_id, post in enumerate(posts):
    fields = post['item']['additionalFields']
    air_date = fields['airDate']
    post_data.append({
        "post_id": post_id,
        "description": fields['description'],
        "date": air_date,
        "headline": fields['headline'],
        "url": fields['headlineUrl'],
    })

    # 'category' is a '|'-separated string. A post without it contributes no
    # category rows (the original ignored the KeyError); empty segments from
    # stray separators are skipped rather than emitted as blank categories.
    raw_categories = fields.get('category') or ''
    for cat in (part.strip() for part in raw_categories.split('|')):
        if not cat:
            continue
        category_data.append({
            "post_id": post_id,
            "category": cat,
            "date": air_date,
        })

    # 'youtubeTags' is a ','-separated string. It is treated as optional, in
    # the same way as 'category', so one post without tags does not abort the
    # whole run. Tags are lower-cased and de-duplicated; dict.fromkeys keeps
    # first-seen order so the generated CSV is identical from run to run
    # (iterating a set of strings is not order-stable across interpreter runs).
    raw_tags = fields.get('youtubeTags') or ''
    cleaned_tags = (part.strip().lower() for part in raw_tags.split(','))
    for tag in dict.fromkeys(name for name in cleaned_tags if name):
        tag_data.append({
            "post_id": post_id,
            "tag": tag,
            "date": air_date,
        })
# Write each row list to its own CSV file with a header row and every field
# quoted. The three outputs differ only in file name, column order and rows,
# so they share a single writer loop.
for csv_path, fieldnames, rows in (
    ('tags.csv', ['post_id', 'date', 'tag'], tag_data),
    ('categories.csv', ['post_id', 'date', 'category'], category_data),
    ('post.csv', ['post_id', 'date', 'headline', 'description', 'url'], post_data),
):
    # newline='' is what the csv module documents for files it writes to.
    # An explicit UTF-8 encoding keeps the output independent of the
    # platform's locale and avoids UnicodeEncodeError on non-ASCII text.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment