Created
April 16, 2023 07:30
-
-
Save cjnghn/5b39a05ededa05216e964a0b4ceddcf1 to your computer and use it in GitHub Desktop.
Python scraping with cache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pickle | |
import requests | |
import hashlib | |
from pathlib import Path | |
from typing import Callable, Optional | |
class Cache:
    """Tiny on-disk cache that stores text keyed by URI.

    Each entry is stored as a UTF-8 encoded file inside ``directory``; the
    file name is the SHA-256 hex digest of the URI.
    """

    def __init__(
        self,
        directory: Path = Path.home() / Path(".cache/websites")
    ) -> None:
        """Create the cache rooted at *directory*, creating it if needed.

        Args:
            directory: Folder holding the cache files. The default is
                computed once, when the class is defined.
        """
        self.directory = directory
        directory.mkdir(parents=True, exist_ok=True)

    def get(self, uri: str) -> Optional[str]:
        """Return the cached text for *uri*, or ``None`` on a cache miss.

        Args:
            uri: Key the value was stored under.

        Returns:
            The stored text decoded as UTF-8, or ``None`` when no cache file
            exists for *uri*.
        """
        path = self.cache_path(uri)
        try:
            raw = path.read_bytes()
        except FileNotFoundError:
            # A missing file is a plain cache miss, not an error.
            return None
        return raw.decode('utf-8')

    def put(self, uri: str, val: str) -> None:
        """Store *val* under *uri*, overwriting any existing entry.

        Args:
            uri: Key to store the value under.
            val: Text to cache; written to disk encoded as UTF-8.
        """
        path = self.cache_path(uri)
        # The cache directory may have been removed since construction, so
        # recreate the whole chain of parents rather than only the last one.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(val.encode('utf-8'))

    def cache_path(self, uri: str) -> Path:
        """Return the file path used to cache *uri*.

        Args:
            uri: Key to map to a file path.

        Returns:
            ``self.directory`` joined with the SHA-256 hex digest of *uri*.
        """
        hashed = hashlib.sha256(uri.encode()).hexdigest()
        return Path(self.directory / hashed)
if __name__ == "__main__":
    # Demo: download a page, store it in the cache, then print the cached copy.
    cache = Cache()
    url = "https://wikipedia.com"
    # Without a timeout, requests can block indefinitely on an unresponsive host.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors so an error page is never cached as content.
    data.raise_for_status()
    cache.put(url, data.text)
    print(cache.get(url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment