Skip to content

Instantly share code, notes, and snippets.

@2ndBillingCycle
Last active February 19, 2020 08:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save 2ndBillingCycle/930f209e5932f0a7f87380fa3e00b2ff to your computer and use it in GitHub Desktop.
Save 2ndBillingCycle/930f209e5932f0a7f87380fa3e00b2ff to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
# Original code from @Teknikal_Domain: https://teknikaldomain.me/code/automating-algolia-search-indexing/#that-program
# Code changed without permission; all rights reserved Teknikal_Domain
import json
import sys
from pathlib import Path
from algoliasearch.search_client import SearchClient
# NOTE: the f-strings below are a syntax error before Python 3.6, so an older
# interpreter rejects this file at compile time and never reaches this check.
assert sys.version_info >= (3, 6), "Python 3.6 or greater needed for f-strings"

# Create the Algolia client from the two credential placeholders and select
# the index that the rest of the script operates on.
print("Connecting")
client = SearchClient.create("XXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
index = client.init_index("blog_index")

print("Reading local data")
# json.loads, not json.load: read_text() returns a str, whereas json.load()
# expects a file-like object and would fail calling .read() on the string.
# The encoding is pinned because JSON is UTF-8 and the platform default
# encoding is not guaranteed to be.
local_dat = json.loads(
    Path("teknikaldomain.me/tekpro-blog/public/index.json").read_text(
        encoding="utf-8"
    )
)

# Object ID auto-generation is switched off: every local record is expected to
# carry its own "objectID", which the dead-record cross-check later in this
# script also relies on.
print(f"Saving {len(local_dat)} records to index")
index.save_objects(local_dat, {"autoGenerateObjectIDIfNotExist": False})
print("Done")
# Second phase: remove records that exist in the index but no longer exist in
# the local data ("dead" records).
print("Checking index for dead records")
print("Reading index data")
# Materialise the browse iterator so the records can be counted and reused.
# NOTE(review): the empty query is presumably meant to match every record in
# the index; confirm against the Algolia browse documentation.
idx_dat = list(index.browse_objects({"query": ""}))
print(f"Got {len(idx_dat)} records")
if not idx_dat:
    # Nothing came back from the index, so there is nothing to prune.
    print("Done")
    sys.exit(0)

# The below could help determine if you'll need multithreading
# python -m timeit -s "set1 = set(f'{x:0>256}' for x in range(1000000)); set2 = set(f'{x:0>256}' for x in range(1000001, 2000000))" "set1-set2"
# Or try
# from timeit import timeit
# set1 = set(f"{num:0>256}" for num in range(0 , 1000000))
# set2 = set(f"{num:0>256}" for num in range(1000001, 2000000))
# print(f"comparison took {timeit(lambda: set1 - set2, number=10)/10} seconds on average")
print("Now cross-checking datasets")
# NOTE: this product is only a loose upper bound; the set difference below
# needs roughly one hash lookup per index record, not one per pair.
print(f"This may need a maximum of {len(idx_dat) * len(local_dat)} computations")

# Compare by objectID only: anything the index has that the local data lacks
# is considered dead.
local_recs = {local_rec["objectID"] for local_rec in local_dat}
index_recs = {rec["objectID"] for rec in idx_dat}
dead_list = list(index_recs - local_recs)

if dead_list:
    for object_id in dead_list:
        print(f"Dead record: {object_id}")
    print(f"Pruning {len(dead_list)} records")
    # Delete all dead IDs in a single call rather than one call per record.
    index.delete_objects(dead_list)
else:
    print("Index up to date")
print("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment