Skip to content

Instantly share code, notes, and snippets.

@justinrfenn
Created March 31, 2023 19:16
Show Gist options
  • Save justinrfenn/a8a3599b26b53334126e398443dfe156 to your computer and use it in GitHub Desktop.
Save justinrfenn/a8a3599b26b53334126e398443dfe156 to your computer and use it in GitHub Desktop.
LangChain Directory Loader that respects .gitignore files
"""Loading logic for loading documents from a git directory respecting .gitignore files."""
import logging
import fnmatch
from pathlib import Path
from typing import List, Type, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
# Loader classes accepted for the ``loader_cls`` argument of GitDirectoryLoader.
FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader],
    Type[TextLoader],
    Type[BSHTMLLoader],
]
# Name the logger after the module (not the file path) so it takes part in the
# dotted logger hierarchy and can be configured by package name.
logger = logging.getLogger(__name__)
def _load_gitignore_patterns(dir_path: Path) -> List[str]:
    """Return the ignore patterns declared in ``dir_path/.gitignore``.

    Each line is stripped of surrounding whitespace; blank lines and comment
    lines (first non-blank character is ``#``) are dropped. The remaining
    lines are returned in file order, otherwise unmodified.

    Args:
        dir_path: Directory that may contain a ``.gitignore`` file.

    Returns:
        The list of pattern strings, or an empty list when the directory has
        no ``.gitignore`` file.
    """
    gitignore_path = dir_path / ".gitignore"
    if not gitignore_path.is_file():
        return []
    # Decode explicitly instead of relying on the platform default encoding;
    # "utf-8-sig" drops a leading BOM and errors="replace" keeps one stray
    # byte from aborting the whole load.
    text = gitignore_path.read_text(encoding="utf-8-sig", errors="replace")
    stripped_lines = (line.strip() for line in text.splitlines())
    # Test the *stripped* line so that indented comments are skipped as well.
    return [line for line in stripped_lines if line and not line.startswith("#")]
class GitDirectoryLoader(BaseLoader):
    """Load documents from a directory while respecting ``.gitignore`` files.

    Every regular file that is not ignored is handed to ``loader_cls`` and the
    resulting documents are concatenated. Patterns are read from the
    ``.gitignore`` of the root directory and, when recursing, from the
    ``.gitignore`` of each visited sub-directory (nested patterns only apply
    inside the directory that declares them).

    Pattern matching is an approximation of git's rules built on ``fnmatch``:

    * a pattern without ``/`` is matched against the entry name at any depth;
    * a pattern containing ``/`` (other than a trailing one) is matched
      against the path relative to the directory that declared it;
    * a trailing ``/`` restricts the pattern to directories;
    * a leading ``!`` re-includes an entry; the last matching pattern wins;
    * ``**/`` may also match zero directories.

    Unlike git, ``*`` in ``fnmatch`` can also match ``/``, so path patterns
    may match slightly more than git would.
    """

    def __init__(
        self,
        path: str,
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        recursive: bool = False,
    ):
        """Store the loader configuration.

        Args:
            path: Root directory to load from.
            silent_errors: When True, an exception raised while loading a file
                is logged as a warning and the file is skipped; otherwise the
                exception propagates.
            load_hidden: When False, directories whose name starts with ``.``
                are not descended into. Files are not affected by this flag.
            loader_cls: Class instantiated with the file path (as ``str``)
                whose ``load()`` result is collected.
            recursive: When True, sub-directories are traversed as well.
        """
        self.path = path
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.silent_errors = silent_errors
        self.recursive = recursive

    @staticmethod
    def _rebase_patterns(patterns: List[str], prefix: str) -> List[str]:
        """Make path patterns of a nested ``.gitignore`` relative to the root.

        Patterns that contain a ``/`` are anchored to the directory declaring
        them, so they are prefixed with ``prefix`` (that directory's path
        relative to the root, ending in ``/``). Name-only patterns are
        returned unchanged.
        """
        rebased = []
        for raw in patterns:
            negation = "!" if raw.startswith("!") else ""
            body = raw[len(negation):]
            if "/" in body.rstrip("/"):
                body = prefix + body.lstrip("/")
            rebased.append(negation + body)
        return rebased

    @staticmethod
    def _is_ignored(
        name: str, rel_path: str, is_dir: bool, patterns: List[str]
    ) -> bool:
        """Return True when the entry is excluded by ``patterns``.

        Args:
            name: Final path component of the entry.
            rel_path: ``/``-separated path of the entry relative to the root.
            is_dir: Whether the entry is a directory.
            patterns: Ignore patterns in declaration order.
        """
        ignored = False
        for raw in patterns:
            negated = raw.startswith("!")
            pattern = raw[1:] if negated else raw
            if pattern.endswith("/"):
                # A trailing slash means "directories only".
                if not is_dir:
                    continue
                pattern = pattern.rstrip("/")
            if not pattern:
                continue
            if "/" in pattern:
                anchored = pattern.lstrip("/")
                # Second candidate lets "**/" stand for zero directories.
                candidates = (anchored, anchored.replace("**/", ""))
                matched = any(fnmatch.fnmatch(rel_path, c) for c in candidates)
            else:
                matched = fnmatch.fnmatch(name, pattern)
            if matched:
                # Last matching pattern decides; "!" patterns re-include.
                ignored = not negated
        return ignored

    def _load(
        self, p: Path, gitignore_patterns: List[str], _rel_prefix: str = ""
    ) -> List[Document]:
        """Load the non-ignored files below ``p``.

        Args:
            p: Directory to scan.
            gitignore_patterns: Patterns in effect for ``p``.
            _rel_prefix: Path of ``p`` relative to the root, ending in ``/``
                (empty for the root itself). Internal; used for recursion.

        Returns:
            The documents produced by ``loader_cls`` for every loaded file.
        """
        docs: List[Document] = []
        # Sort so the order of the returned documents is reproducible.
        for entry in sorted(p.iterdir()):
            rel_path = _rel_prefix + entry.name
            if entry.is_file():
                if self._is_ignored(entry.name, rel_path, False, gitignore_patterns):
                    continue
                try:
                    docs.extend(self.loader_cls(str(entry)).load())
                except Exception as error:
                    if not self.silent_errors:
                        raise
                    logger.warning(error)
            elif self.recursive and entry.is_dir():
                # Ignore hidden directories unless the load_hidden flag is set
                if entry.name.startswith(".") and not self.load_hidden:
                    continue
                # Ignored directories are pruned entirely, as git does.
                if self._is_ignored(entry.name, rel_path, True, gitignore_patterns):
                    continue
                child_prefix = rel_path + "/"
                nested = self._rebase_patterns(
                    _load_gitignore_patterns(entry), child_prefix
                )
                docs.extend(
                    self._load(entry, gitignore_patterns + nested, child_prefix)
                )
        return docs

    def load(self) -> List[Document]:
        """Load all non-ignored documents found under ``self.path``."""
        root = Path(self.path)
        return self._load(root, _load_gitignore_patterns(root))
@ht55ght55
Copy link

Nice one! What are you using to parse code into embeddable chunks? An AST? The basic text splitter?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment