Skip to content

Instantly share code, notes, and snippets.

@shenli
Created February 10, 2024 17:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shenli/36f7514a809535a76d9042fa2a15e58a to your computer and use it in GitHub Desktop.
Save shenli/36f7514a809535a76d9042fa2a15e58a to your computer and use it in GitHub Desktop.
A CSV file reader that lets the caller choose which column supplies the document text
"""
Based on Simple CSV reader. https://github.com/run-llama/llama-hub/blob/2c95b021246b54b0542bf9ed9289828cc9da6654/llama_hub/file/simple_csv/base.py
A parser for tabular data files.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class CaseCSVReader(BaseReader):
    """CSV reader that builds one Document per row from a chosen text column.

    For every data row, the value of the caller-selected column becomes the
    document text and the remaining columns become document metadata.

    Args:
        encoding (str): Encoding used to open the file.
            utf-8 by default.
        concat_rows (bool): Stored on the instance but not consulted by
            ``load_data``, which always creates one Document per row.
            True by default.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        encoding: str = "utf-8",
        **kwargs: Any
    ) -> None:
        """Store the reader options and forward everything else to the base.

        Args:
            *args: Positional arguments passed through to ``BaseReader``.
            concat_rows: Kept on the instance as ``_concat_rows``.
            encoding: Text encoding used when opening CSV files.
            **kwargs: Keyword arguments passed through to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._encoding = encoding

    def load_data(
        self, file: Path, text_field: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a CSV file into one Document per data row.

        Args:
            file: Path of the CSV file; its first row is used as the header.
            text_field: Name of the column whose value becomes the document
                text (surrounding whitespace is stripped).
            extra_info: Optional metadata added to every document. On a key
                collision with a CSV column, the ``extra_info`` value wins.

        Returns:
            The documents in file order. A file without data rows yields an
            empty list.

        Raises:
            KeyError: If a row is read and ``text_field`` is not one of the
                CSV columns.
        """
        import csv

        shared_metadata = extra_info or {}
        docs = []
        # newline="" is what the csv module expects, so that newlines embedded
        # in quoted fields are parsed correctly.
        with open(file, "r", encoding=self._encoding, newline="") as fp:
            for row in csv.DictReader(fp):
                if text_field not in row:
                    raise KeyError(
                        f"text field {text_field!r} is not a column of {file}"
                    )
                # DictReader fills columns missing from a short row with None;
                # treat that as empty text instead of failing on .strip().
                text = (row.pop(text_field) or "").strip()
                # Pass a single merged mapping instead of both `metadata` and
                # `extra_info`, so neither source can be silently dropped.
                # NOTE(review): the two keywords appear to be aliases for the
                # same Document field in llama_index - confirm.
                metadata = {**row, **shared_metadata}
                docs.append(Document(text=text, metadata=metadata))
        return docs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment