Skip to content

Instantly share code, notes, and snippets.

@esnosy
Created August 22, 2023 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save esnosy/de0925eade2066814dd2fd12c419f7a3 to your computer and use it in GitHub Desktop.
Binary search on disk: NumPy sort and search over memory-mapped keyword hashes.
import mmap
import os
from multiprocessing import Pool
from struct import Struct

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Native-endian signed 64-bit packer ("q") used to serialize keyword hashes to disk.
long_long_packer = Struct("q")
chrome_options = webdriver.ChromeOptions()
# Chrome's legacy --headless flag is deprecated; "--headless=new" is the supported mode:
# https://web.archive.org/web/20230814145816/https://www.selenium.dev/blog/2023/headless-is-going-away/
chrome_options.add_argument("--headless=new")
# Resolve (and, if needed, download) a matching chromedriver binary at import time.
# NOTE(review): this is a network side effect at module import — confirm that is intended.
chrome_path = ChromeDriverManager().install()
def crawl(url: str):
driver = webdriver.Chrome(
service=ChromeService(chrome_path), options=chrome_options
)
driver.get(url)
# Index page content
clean_text = BeautifulSoup(driver.page_source, "lxml").get_text(
separator=" ", strip=True
)
print(clean_text)
with open("hashed_keywords.bin", "wb") as file:
for keyword in clean_text.split():
keyword_hash = hash(keyword.lower())
data = long_long_packer.pack(keyword_hash)
file.write(data)
# Sort on disk
with open("hashed_keywords.bin", "r+b") as file:
mm = mmap.mmap(file.fileno(), 0)
arr = np.ndarray.__new__(
np.ndarray, buffer=mm, dtype=np.int64, shape=(mm.size() // 8,)
)
arr.sort(kind="heapsort")
# Search for a keyword
with open("hashed_keywords.bin", "r+b") as file:
mm = mmap.mmap(file.fileno(), 0)
arr = np.ndarray.__new__(
np.ndarray, buffer=mm, dtype=np.int64, shape=(mm.size() // 8,)
)
query = hash("google")
i = arr.searchsorted(query)
print(arr[i] == query)
# Find all links
link_elements = driver.find_elements(By.TAG_NAME, "a")
for e in link_elements:
try:
link = e.get_attribute("href")
print(link)
except Exception:
continue
driver.quit()
# Entry-point guard is required: with the "spawn" start method (the default on
# Windows and macOS) multiprocessing re-imports this module in every worker,
# and an unguarded Pool would recursively try to spawn more pools.
if __name__ == "__main__":
    # Crawl the sites concurrently, one worker process per URL.
    with Pool(4) as p:
        p.map(crawl, ["https://www.google.com", "https://www.bing.com"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment