Skip to content

Instantly share code, notes, and snippets.

@jerlendds
Last active August 26, 2021 00:08
Show Gist options
  • Save jerlendds/37747e4bdf90582a1f5198b8cacba0f6 to your computer and use it in GitHub Desktop.
Save jerlendds/37747e4bdf90582a1f5198b8cacba0f6 to your computer and use it in GitHub Desktop.
Retrieve all Google Custom Search Engine (CSE) links from one or more start.me pages by passing in a list of page ids
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from typing import List
import requests
import json
def get_startme_links(page_ids: List[str] = None) -> List[dict]:
    """Collect Google CSE links from one or more start.me pages.

    For every page id, ``https://start.me/p/<page_id>.json`` is downloaded and
    the widgets found in the columns of the page's first section are scanned.
    Each widget that has items becomes one category entry; links inside it
    whose URL starts with a Google CSE prefix are recorded, all other links
    are only counted.

    The collected categories are printed as JSON, followed by the totals of
    CSE and non-CSE URLs, and the categories are also returned.

    Args:
        page_ids: start.me page ids (the ``<id>`` part of
            ``https://start.me/p/<id>``).

    Returns:
        A list of dicts of the form
        ``{"cse_category": <widget title>, "cse_links": [<link>, ...]}`` where
        each link is ``{"title": ..., "url": ..., "description": ""}``.

    Raises:
        ValueError: if ``page_ids`` is ``None``.
        requests.HTTPError: if start.me answers with an error status.
        requests.Timeout: if start.me does not answer in time.
    """
    if page_ids is None:
        raise ValueError(f"expected list of start.me ids, received: {page_ids}")

    # Both prefixes are matched with str.startswith, built once for all links.
    cse_prefixes = ("http://cse.google.", "https://cse.google")

    all_cse = []
    cse_count = 0
    not_cse_count = 0

    for page_id in page_ids:
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(f"https://start.me/p/{page_id}.json", timeout=30)
        response.raise_for_status()
        data = response.json()

        # Only the first section of the page is inspected.
        columns = data["page"]["sections"][0]["columns"]
        for column in columns:
            for widget in column.get("widgets") or []:
                items = widget.get("items")
                if not items:
                    # Skip just this widget; later widgets in the same column
                    # may still contain links.
                    continue

                cse_set = {
                    "cse_category": widget.get("title"),
                    "cse_links": [],
                }
                all_cse.append(cse_set)

                for link in items.get("links") or []:
                    url = link.get("url")
                    # A link without a URL cannot be a CSE; count it as such
                    # instead of failing on it.
                    if isinstance(url, str) and url.startswith(cse_prefixes):
                        cse_count += 1
                        cse_set["cse_links"].append({
                            "title": link.get("title", ""),
                            "url": url,
                            "description": "",
                        })
                    else:
                        not_cse_count += 1

    print(json.dumps(all_cse, indent=5))
    print(f"\n\n Total CSE urls: {cse_count}\n",
          f"Total urls that are NOT CSEs: {not_cse_count}\n\n")
    return all_cse
# start.me page ids that are scanned for Google CSE links when this file is
# run as a script.
known_cse_sources = [
    "b5ynOQ",
    "EL84Km",
    "L1rEYQ",
    "8ynloB",
    "b5Aow7",
    "b56G5Q",
    "Wp1kpe",
    "BnBb5v",
    "b59RMv",
    "GEQXv7",
    "m65arv",
    "6rAJbo",
    "ZeDvrP",
]

if __name__ == "__main__":
    # Guarded so that importing this module does not trigger network requests.
    get_startme_links(page_ids=known_cse_sources)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment