Created
February 10, 2024 17:50
-
-
Save shenli/36f7514a809535a76d9042fa2a15e58a to your computer and use it in GitHub Desktop.
A CSV file reader that lets the caller choose which column supplies the document text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Based on the Simple CSV reader: https://github.com/run-llama/llama-hub/blob/2c95b021246b54b0542bf9ed9289828cc9da6654/llama_hub/file/simple_csv/base.py
A parser for tabular data files.
"""
from pathlib import Path | |
from typing import Any, Dict, List, Optional | |
from llama_index.readers.base import BaseReader | |
from llama_index.readers.schema.base import Document | |
class CaseCSVReader(BaseReader):
    """CSV reader that builds one Document per row from a caller-chosen text column.

    The column named by ``text_field`` becomes the Document text; the
    remaining columns of the row become the Document metadata.

    Args:
        encoding (str): Encoding used to open the file.
            utf-8 by default.
        concat_rows (bool): Accepted and stored for signature compatibility
            with the simple CSV reader this class is based on. ``load_data``
            does not consult it: a Document is always created for each row.
            True by default.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        encoding: str = "utf-8",
        **kwargs: Any
    ) -> None:
        """Store the reader options; remaining arguments go to the base reader."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._encoding = encoding

    def load_data(
        self, file: Path, text_field: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a CSV file into one Document per data row.

        Args:
            file: Path of the CSV file; its first row is used as the header.
            text_field: Header name of the column whose value becomes the
                Document text. If the column is missing, or a row is too
                short to contain it, the text is the empty string.
            extra_info: Optional metadata added to every Document. On a key
                collision with a CSV column, the ``extra_info`` value wins.

        Returns:
            The Documents, in file order.
        """
        import csv

        shared_metadata = extra_info or {}
        docs = []
        # newline="" lets the csv module handle line endings itself, so quoted
        # fields containing embedded newlines are parsed correctly.
        with open(file, "r", encoding=self._encoding, newline="") as fp:
            for row in csv.DictReader(fp):
                # pop() with a default tolerates a missing column, where the
                # former `del row[text_field]` raised KeyError.
                raw_text = row.pop(text_field, None)
                # DictReader pads short rows with None, which has no .strip().
                text = (raw_text or "").strip()
                # NOTE(review): `extra_info` appears to be an alias of
                # `metadata` on Document, so passing both lets one shadow the
                # other - confirm against the installed llama_index version.
                # Merging into a single dict keeps both sources.
                metadata = {**row, **shared_metadata}
                docs.append(Document(text=text, metadata=metadata))
        return docs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment