@cjnghn
Created April 16, 2023 07:30
Python web scraping with a simple file-based cache
import hashlib
from pathlib import Path
from typing import Optional

import requests
class Cache:
    """File-based cache that stores one response body per URI."""

    def __init__(
        self,
        directory: Path = Path.home() / ".cache/websites",
    ) -> None:
        self.directory = directory
        directory.mkdir(parents=True, exist_ok=True)

    def get(self, uri: str) -> Optional[str]:
        """Return the cached body for `uri`, or None on a cache miss."""
        path = self.cache_path(uri)
        if not path.exists():
            return None
        return path.read_bytes().decode("utf-8")

    def put(self, uri: str, val: str) -> None:
        """Store `val` as the cached body for `uri`."""
        path = self.cache_path(uri)
        path.write_bytes(val.encode("utf-8"))

    def cache_path(self, uri: str) -> Path:
        """Map a URI to a stable file name using its SHA-256 hash."""
        hashed = hashlib.sha256(uri.encode()).hexdigest()
        return self.directory / hashed
if __name__ == "__main__":
    cache = Cache()
    url = "https://wikipedia.com"
    response = requests.get(url)   # fetch the page once, store it, then read it back from the cache
    cache.put(url, response.text)
    print(cache.get(url))
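
A possible extension, not part of the original gist: a small helper that consults the cache before touching the network, so repeated runs reuse the stored body instead of re-downloading it. The name fetch_cached and the timeout value are illustrative assumptions.

def fetch_cached(url: str, cache: Cache) -> str:
    """Return the body for `url`, fetching and caching it only on a miss."""
    cached = cache.get(url)
    if cached is not None:
        return cached                      # cache hit: no network request
    response = requests.get(url, timeout=10)
    response.raise_for_status()            # fail loudly instead of caching an error page
    cache.put(url, response.text)
    return response.text

# First call downloads the page; later calls read from ~/.cache/websites instead.
# body = fetch_cached("https://wikipedia.com", Cache())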