Last active
August 26, 2021 00:08
-
-
Save jerlendds/37747e4bdf90582a1f5198b8cacba0f6 to your computer and use it in GitHub Desktop.
Retrieve all Google Custom Search Engine (CSE) links from start.me pages by passing in a list of page ids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from typing import List | |
import requests | |
import json | |
# URL prefixes that mark a bookmark as a CSE link (both are 18 characters,
# matching the fixed-width slice comparison this replaces).
_CSE_URL_PREFIXES = ("http://cse.google.", "https://cse.google")


def get_startme_links(page_ids: List[str] = None, *,
                      timeout: float = 10.0) -> List[dict]:
    """Collect the cse.google links bookmarked on start.me pages.

    For every id, ``https://start.me/p/{page_id}.json`` is downloaded and the
    widgets found in the columns of the page's first section are scanned.
    Each widget that has items contributes one entry of the form
    ``{"cse_category": <widget title>, "cse_links": [...]}``, where every
    link is ``{"title": ..., "url": ..., "description": ""}``.

    The collected entries are printed as JSON, followed by a summary of how
    many URLs were and were not CSE links, and are also returned.

    Args:
        page_ids: start.me page ids to scan.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The list of category entries described above.

    Raises:
        ValueError: If ``page_ids`` is ``None``.
        requests.HTTPError: If a page request returns an error status.
    """
    if page_ids is None:
        raise ValueError(f"expected list of start.me ids, received: {page_ids}")

    all_cse = []
    cse_count = 0
    not_cse_count = 0

    for page_id in page_ids:
        response = requests.get(f"https://start.me/p/{page_id}.json",
                                timeout=timeout)
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError.
        response.raise_for_status()
        # Only the first section of the page is inspected.
        columns = response.json()["page"]["sections"][0]["columns"]

        for column in columns:
            for widget in column.get("widgets") or []:
                items = widget.get("items")
                if not items:
                    # Skip just this widget; the rest of the column may
                    # still hold bookmarks.
                    continue

                cse_set = {
                    "cse_category": widget.get("title"),
                    "cse_links": [],
                }
                all_cse.append(cse_set)

                for link in items.get("links") or []:
                    # A missing url is counted as a non-CSE link.
                    url = link.get("url") or ""
                    if url.startswith(_CSE_URL_PREFIXES):
                        cse_count += 1
                        cse_set["cse_links"].append({
                            "title": link.get("title"),
                            "url": url,
                            "description": "",
                        })
                    else:
                        not_cse_count += 1

    print(json.dumps(all_cse, indent=5))
    print(f"\n\n Total CSE urls: {cse_count}\n",
          f"Total urls that are NOT CSEs: {not_cse_count}\n\n")
    return all_cse
# start.me page ids that are passed to get_startme_links below.
known_cse_sources = [
    "b5ynOQ", "EL84Km", "L1rEYQ", "8ynloB", "b5Aow7",
    "b56G5Q", "Wp1kpe", "BnBb5v", "b59RMv", "GEQXv7",
    "m65arv", "6rAJbo", "ZeDvrP",
]

# Runs at module level: executing (or importing) this file processes every
# page id listed above.
get_startme_links(known_cse_sources)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment