Skip to content

Instantly share code, notes, and snippets.

@tech234a
Last active January 30, 2021 23:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tech234a/44cf6d692f6c1c17807bacc26a5485dc to your computer and use it in GitHub Desktop.
Save tech234a/44cf6d692f6c1c17807bacc26a5485dc to your computer and use it in GitHub Desktop.
Chrome Web Store Discovery Script - all 58 regions, all item types, and single language
# Chrome Web Store Discovery Script
# By tech234a, October 2020
# This script works by accessing the "infiniteWall" of items that appears after scrolling past all of the collections within
# the homepage for an item type. Even infinity comes to an end.
# CONFIGURATION

# Item types to crawl; every country is crawled once per entry.
# Accepted values: extensions, themes, apps, app/3-games, collection/[collection name]
CATEGORIES = [
    "extensions",
    "themes",
    "apps",
    "app/3-games",
]

# Language code sent with every request (a single language for the whole run).
# Values listed in the docs (https://developer.chrome.com/webstore/i18n#localeTable):
#   ar, am, bg, bn, ca, cs, da, de, el, en, en_GB, en_US, es, es_419, et, fa, fi, fil, fr, gu,
#   he, hi, hr, hu, id, it, ja, kn, ko, lt, lv, ml, mr, ms, nl, no, pl, pt_BR, pt_PT, ro, ru,
#   sk, sl, sr, sv, sw, ta, te, th, tr, uk, vi, zh_CN, zh_TW
LANGUAGE = "en"

# Two-letter country codes to crawl, as offered by the Chrome Developer Dashboard.
# The final entry, "001", means "worldwide": items listed in all countries, which
# makes it the smallest subset of items rather than the largest.
COUNTRYCODES = [
    "AR", "AU", "AT", "BE", "BR", "BG", "CA", "CL", "CN", "CO",
    "CU", "CZ", "DK", "EC", "EG", "EE", "FI", "FR", "DE", "GR",
    "HK", "HU", "IN", "ID", "IE", "IL", "IT", "JP", "LT", "MY",
    "MX", "MA", "NL", "NZ", "NO", "PA", "PE", "PH", "PL", "PT",
    "RO", "RU", "SA", "SG", "SK", "ZA", "ES", "SE", "CH", "TW",
    "TH", "TR", "AE", "UA", "GB", "US", "VE", "VN",
    "001",
]

# Items requested per page: 210 is the maximum, although the store UI normally asks for 96.
CNTPAGE = 210
# SCRIPT
from json import loads
from queue import Queue
from threading import Thread
from time import sleep

from requests import session

# A single HTTP session used for every request. Note that this name is the
# session object, not the ``requests`` module itself.
requests = session()

# Every item ID found so far, across all countries and categories.
discoveredids = set()

# Country codes still waiting to be picked up by a worker thread.
jobs = Queue()
def _fetch_page(country, category, token=None):
    """Request one page of the "infiniteWall" listing.

    Args:
        country: Country code sent as the ``gl`` parameter.
        category: Item type sent as the ``category`` parameter.
        token: Pagination token taken from the previous page's response, or
            ``None`` for the first page (no ``token`` parameter is sent then).

    Returns:
        A ``(item_ids, next_token)`` tuple: the first field of every item on
        the page, and the token to pass when requesting the following page.
    """
    params = [
        ('hl', LANGUAGE),
        ('gl', country),
        ('pv', '20201016'),  # NOTE(review): looks like a store version/date stamp - confirm
        ('requestedCounts', 'infiniteWall:' + str(CNTPAGE) + ':0:true'),
    ]
    if token is not None:
        # The pagination token isn't random, but it is provided by the
        # previous response, so it is passed along as-is.
        params.append(('token', token))
    params.append(('category', category))

    response = requests.post('https://chrome.google.com/webstore/ajax/item', params=params)
    # The first six characters are dropped before JSON decoding; presumably a
    # non-JSON guard prefix on the response - confirm against a live reply.
    parsed = loads(response.text[6:])
    # Each item carries much more metadata (title, short description, author,
    # price, average rating, category, icon URL, ...); only the ID is kept.
    item_ids = [item[0] for item in parsed[1][1]]
    return item_ids, parsed[1][4]


def doit():
    """Worker loop: crawl every category for each country code taken from ``jobs``.

    Country codes are pulled from the ``jobs`` queue until it is empty. For each
    one, every entry of ``CATEGORIES`` is paged through and the item IDs found
    are added to ``discoveredids``. Paging for a category stops at the first
    page that returns a number of items different from ``CNTPAGE``.

    Returns:
        None. Results are accumulated in the module-level ``discoveredids`` set.
    """
    from queue import Empty

    while True:
        # Take the job without blocking: checking ``jobs.empty()`` and then
        # calling a blocking ``get()`` lets another thread grab the last job
        # in between, which would leave this worker waiting forever.
        try:
            country = jobs.get_nowait()
        except Empty:
            return

        for category in CATEGORIES:
            totalreqs = 0
            token = None  # no token yet: the first request starts the listing
            while True:
                totalreqs += 1
                print("Total discovered:", len(discoveredids))
                print("Request number:", totalreqs)
                item_ids, token = _fetch_page(country, category, token)
                discoveredids.update(item_ids)
                # A page that is not completely full marks the end of the
                # listing; this applies to the first page as well, so no
                # follow-up request is sent for a short first page.
                if len(item_ids) != CNTPAGE:
                    break
# Queue one job per country code before any worker starts.
for country_code in COUNTRYCODES:
    jobs.put(country_code)

# Fan the jobs out over a fixed pool of 20 worker threads.
threads = []
for _ in range(20):
    worker = Thread(target=doit)
    worker.start()
    threads.append(worker)

# Wait for every worker. The list must not be modified while it is being
# iterated: removing the current element makes the loop skip the next one, so
# only every other thread would be joined. Once all joins have returned no
# worker is still running, so no extra grace-period sleep is needed.
for worker in threads:
    worker.join()

print("FINAL NUMBER OF DISCOVERED ITEMS:", len(discoveredids))

# Write the sorted IDs one per line; the context manager makes sure the file is
# flushed and closed.
with open("out.txt", "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(sorted(discoveredids)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment