Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save evantancy/c1a1defc7ff4030595fb67ae8820585a to your computer and use it in GitHub Desktop.
Save evantancy/c1a1defc7ff4030595fb67ae8820585a to your computer and use it in GitHub Desktop.
LangChain Directory Loader that respects .gitignore files
"""Loading logic for loading documents from a git directory respecting .gitignore files."""
import logging
import fnmatch
from pathlib import Path
from typing import List, Type, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
# Loader classes accepted as the ``loader_cls`` argument of GitDirectoryLoader.
FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]

# Name the logger after the module rather than the file path so that it takes
# part in the dotted logger hierarchy and can be configured by package name.
logger = logging.getLogger(__name__)
def _load_gitignore_patterns(dir_path: Path):
gitignore_path = dir_path / ".gitignore"
if gitignore_path.is_file():
lines = gitignore_path.read_text().splitlines()
return [line.strip() for line in lines if line.strip() and not line.startswith('#')]
return []
class GitDirectoryLoader(BaseLoader):
    """Load documents from a directory, skipping entries listed in .gitignore files.

    Patterns are collected from the ``.gitignore`` of the root directory and,
    when recursing, from each sub-directory's own ``.gitignore``; a
    sub-directory inherits every pattern of its ancestors.

    Matching is a simplification of git's rules: each pattern is compared
    with ``fnmatch`` against the bare name of a file or directory. A trailing
    ``/`` restricts a pattern to directories. Negation (``!``), anchoring
    (leading ``/``) and patterns containing a path separator in the middle
    are not interpreted.
    """

    def __init__(
        self,
        path: str,
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        recursive: bool = False,
    ):
        """Configure the loader.

        Args:
            path: Root directory to load from.
            silent_errors: If true, a failure while loading a file is logged
                as a warning and the file is skipped; otherwise the exception
                propagates.
            load_hidden: If true, sub-directories whose name starts with
                ``.`` are traversed as well. Hidden *files* are not filtered
                by this flag.
            loader_cls: Loader class instantiated with each file path.
            recursive: If true, descend into sub-directories; otherwise only
                the files directly inside ``path`` are loaded.
        """
        self.path = path
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.silent_errors = silent_errors
        self.recursive = recursive

    @staticmethod
    def _is_ignored(name: str, patterns: List[str], is_dir: bool) -> bool:
        """Return True when ``name`` matches one of the ignore ``patterns``.

        A pattern ending in ``/`` applies to directories only and is compared
        without the trailing slash.
        """
        for pattern in patterns:
            if pattern.endswith("/"):
                if not is_dir:
                    continue
                pattern = pattern.rstrip("/")
            if fnmatch.fnmatch(name, pattern):
                return True
        return False

    def _load(self, p: Path, gitignore_patterns: List[str]) -> List[Document]:
        """Load every non-ignored file in ``p``, recursing when enabled.

        Args:
            p: Directory to scan.
            gitignore_patterns: Patterns in effect for ``p`` (its own plus
                those inherited from parent directories).

        Returns:
            The documents produced by ``loader_cls`` for each loaded file.

        Raises:
            Exception: Whatever ``loader_cls`` raises, unless
                ``silent_errors`` is set.
        """
        docs: List[Document] = []
        for entry in p.iterdir():
            if entry.is_file():
                if self._is_ignored(entry.name, gitignore_patterns, is_dir=False):
                    continue
                try:
                    docs.extend(self.loader_cls(str(entry)).load())
                except Exception as error:
                    if not self.silent_errors:
                        raise
                    logger.warning(error)
            elif entry.is_dir() and self.recursive:
                # Skip hidden directories unless the load_hidden flag is set.
                if entry.name.startswith(".") and not self.load_hidden:
                    continue
                # Honour ignore patterns for directories too, so that ignored
                # trees (e.g. ``node_modules`` or ``build/``) are not walked.
                if self._is_ignored(entry.name, gitignore_patterns, is_dir=True):
                    continue
                nested_patterns = gitignore_patterns + _load_gitignore_patterns(entry)
                docs.extend(self._load(entry, nested_patterns))
        return docs

    def load(self) -> List[Document]:
        """Load documents from ``self.path`` using its .gitignore patterns."""
        root = Path(self.path)
        return self._load(root, _load_gitignore_patterns(root))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment