Created
August 22, 2023 19:41
-
-
Save esnosy/de0925eade2066814dd2fd12c419f7a3 to your computer and use it in GitHub Desktop.
Binary search with NumPy: sorting and searching hashed keywords on disk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mmap
import os
from multiprocessing import Pool
from struct import Struct

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Pre-compiled packer for one native 8-byte signed integer ("q") — used to
# serialize each keyword hash to disk.
long_long_packer = Struct("q")
chrome_options = webdriver.ChromeOptions()
# "--headless=new" is Chrome's replacement for the legacy --headless flag:
# https://web.archive.org/web/20230814145816/https://www.selenium.dev/blog/2023/headless-is-going-away/
chrome_options.add_argument("--headless=new")
# NOTE(review): install() may download chromedriver (network I/O) and runs at
# import time, so every spawned Pool worker repeats it — confirm acceptable.
chrome_path = ChromeDriverManager().install()
def crawl(url: str) -> None:
    """Crawl *url* with headless Chrome: print the page's visible text,
    build a sorted on-disk index of hashed keywords, binary-search it for
    one query, and print every link on the page.

    Side effects: writes an index file in the working directory and prints
    to stdout. The index is a flat array of native int64 values, one
    ``hash()`` per whitespace-separated keyword.
    """
    driver = webdriver.Chrome(
        service=ChromeService(chrome_path), options=chrome_options
    )
    try:
        driver.get(url)

        # Index page content: strip markup down to visible text.
        clean_text = BeautifulSoup(driver.page_source, "lxml").get_text(
            separator=" ", strip=True
        )
        print(clean_text)

        # One index file per worker process. The original shared
        # "hashed_keywords.bin" between all Pool workers, so concurrent
        # crawls raced on the same file.
        # NOTE(review): hash() is salted per process (PYTHONHASHSEED), so
        # the stored hashes are only comparable within this same process.
        index_path = f"hashed_keywords_{os.getpid()}.bin"
        with open(index_path, "wb") as file:
            for keyword in clean_text.split():
                file.write(long_long_packer.pack(hash(keyword.lower())))

        # Sort on disk and search, in a single mapping: heapsort works in
        # place, so the mmap'd file itself ends up sorted.
        if os.path.getsize(index_path):  # mmap rejects empty files
            with open(index_path, "r+b") as file:
                mm = mmap.mmap(file.fileno(), 0)
                arr = np.frombuffer(mm, dtype=np.int64)
                arr.sort(kind="heapsort")

                # Search for a keyword.
                query = hash("google")
                i = int(arr.searchsorted(query))
                # searchsorted returns len(arr) when query is greater than
                # every element — the original indexed out of bounds then.
                print(i < arr.size and bool(arr[i] == query))

                # Drop the buffer export before closing the map, otherwise
                # mmap.close() raises BufferError.
                del arr
                mm.flush()
                mm.close()

        # Find all links (best effort: elements can go stale mid-loop).
        for element in driver.find_elements(By.TAG_NAME, "a"):
            try:
                print(element.get_attribute("href"))
            except Exception:
                continue
    finally:
        # The original leaked the browser if anything above raised.
        driver.quit()
if __name__ == "__main__":
    # The guard is required with multiprocessing: under the "spawn" start
    # method (Windows/macOS default) each Pool worker re-imports this
    # module, and an unguarded Pool(4) would recursively spawn workers.
    with Pool(4) as p:
        p.map(crawl, ["https://www.google.com", "https://www.bing.com"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment