-
-
Save tech234a/44cf6d692f6c1c17807bacc26a5485dc to your computer and use it in GitHub Desktop.
Chrome Web Store Discovery Script - all 58 regions, all item types, and single language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chrome Web Store Discovery Script
# By tech234a, October 2020
# This script works by accessing the "infiniteWall" of items that appears after
# scrolling past all of the collections within the homepage for an item type.
# Even infinity comes to an end.

# CONFIGURATION

# Item types to crawl.
# Possible values: extensions, themes, apps, app/3-games, collection/[collection name]
CATEGORIES = ["extensions", "themes", "apps", "app/3-games"]

# Language code (a single language is used for every request).
# Possible values from docs: https://developer.chrome.com/webstore/i18n#localeTable
# "ar", "am", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "en_GB", "en_US", "es", "es_419", "et", "fa", "fi", "fil", "fr", "gu",
# "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "ml", "mr", "ms", "nl", "no", "pl", "pt_BR", "pt_PT", "ro", "ru",
# "sk", "sl", "sr", "sv", "sw", "ta", "te", "th", "tr", "uk", "vi", "zh_CN", "zh_TW"
LANGUAGE = 'en'

# Country codes (2 letters) to crawl, one job per entry.
# Possible values from Chrome Developer Dashboard.
# Also: "001" for "worldwide", which refers to items that are listed in all
# countries, so it is actually the smallest subset of items.
COUNTRYCODES = [
    "AR", "AU", "AT", "BE", "BR", "BG", "CA", "CL", "CN", "CO", "CU", "CZ", "DK", "EC", "EG", "EE", "FI", "FR", "DE", "GR",
    "HK", "HU", "IN", "ID", "IE", "IL", "IT", "JP", "LT", "MY", "MX", "MA", "NL", "NZ", "NO", "PA", "PE", "PH", "PL", "PT", "RO", "RU",
    "SA", "SG", "SK", "ZA", "ES", "SE", "CH", "TW", "TH", "TR", "AE", "UA", "GB", "US", "VE", "VN", "001",
]

# Number of items to get per page: max 210, but the UI normally requests 96
CNTPAGE = 210
# SCRIPT | |
# SCRIPT
from json import loads
from queue import Empty
from queue import Queue
from threading import Thread
from time import sleep

from requests import session

# One HTTP session shared by every worker thread (note: this name shadows the
# `requests` package name on purpose; workers call `requests.post`).
requests = session()
# Every item ID found so far, across all countries and categories.
discoveredids = set()
# Work queue of country codes consumed by the worker threads.
jobs = Queue()
def _fetch_page(country_code, category, token=None, send_token=False):
    """Fetch one "infiniteWall" page and return ``(items, next_token)``.

    Args:
        country_code: value sent as the ``gl`` query parameter.
        category: value sent as the ``category`` query parameter.
        token: pagination token taken from the previous response.
        send_token: when true, ``token`` is sent as the ``token`` parameter.

    Returns:
        A tuple of the item list (``parsed[1][1]``) and the pagination token
        for the following page (``parsed[1][4]``).
    """
    params = [
        ('hl', LANGUAGE),
        ('gl', country_code),
        ('pv', '20201016'),
        ('requestedCounts', 'infiniteWall:' + str(CNTPAGE) + ':0:true'),
    ]
    if send_token:
        # This pagination token isn't random, but it's provided in the
        # previous response, so it is passed along anyway.
        params.append(('token', token))
    params.append(('category', category))
    response = requests.post('https://chrome.google.com/webstore/ajax/item', params=params)
    # The first six characters of the body are skipped before JSON parsing
    # (presumably a non-JSON guard prefix -- confirm against a live response).
    parsed = loads(response.text[6:])
    return parsed[1][1], parsed[1][4]


def doit():
    """Worker: take country codes from ``jobs`` and collect item IDs.

    For each country code, every category in ``CATEGORIES`` is paged through
    and the first field of each returned item (its ID) is added to the shared
    ``discoveredids`` set. The worker returns once the queue is empty.
    """
    while True:
        # Take a job without blocking. A separate ``empty()`` check followed
        # by a blocking ``get()`` lets two workers race for the last job and
        # leaves the loser blocked forever.
        try:
            country_code = jobs.get_nowait()
        except Empty:
            return
        for category in CATEGORIES:
            token = None
            request_number = 0
            while True:
                request_number += 1
                print("Total discovered:", len(discoveredids))
                print("Request number:", request_number)
                # The initial request carries no token; each later request
                # sends the token returned by the one before it.
                items, token = _fetch_page(
                    country_code, category, token, send_token=request_number > 1
                )
                # There's a lot of other metadata that could be collected here
                # (title, short description, author, price, average rating,
                # category, icon URL, and more); for now only the ID is kept.
                for item in items:
                    discoveredids.add(item[0])
                # A short page marks the end of the listing. The initial
                # request is never treated as the last page, so at least two
                # requests are always made per category.
                if request_number > 1 and len(items) != CNTPAGE:
                    break
# Queue one job per country code before any worker starts, so a worker that
# finds the queue empty can safely conclude there is no work left.
for country_code in COUNTRYCODES:
    jobs.put(country_code)

# Start 20 worker threads that drain the queue concurrently.
threads = []
for _ in range(20):
    worker = Thread(target=doit)
    worker.start()
    threads.append(worker)

# Wait for every worker. The list must not be modified while iterating over
# it: removing entries inside the loop skips every other thread, leaving half
# of them unjoined. With all threads joined, no extra sleep is needed.
for worker in threads:
    worker.join()

print("FINAL NUMBER OF DISCOVERED ITEMS:", len(discoveredids))
# Write one ID per line, sorted; the context manager flushes and closes the file.
with open("out.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(sorted(discoveredids)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment