LangChain tool for preprocessing text data, version 1.
import spacy
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from langchain.tools import BaseTool
from typing import Any, Optional, Union, List
from langchain.callbacks.manager import CallbackManagerForToolRun, AsyncCallbackManagerForToolRun
class DataPreprocessingTool(BaseTool):
    name: str = "DataPreprocessingTool"
    description: str = "A tool for preprocessing and structuring unstructured data."
    nlp: Any = None  # Declared as a field so the pydantic-based BaseTool allows assignment in __init__
    structured_data: Any = None  # Most recently structured DataFrame, returned by get_data()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.nlp = spacy.load("en_core_web_sm")  # Loading spaCy NLP model
    def _run(
        self,
        unstructured_data: Union[str, List[str]],
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> pd.DataFrame:
        # Main function to orchestrate the preprocessing and structuring steps
        structured_data = self.structure_data(unstructured_data)
        normalized_data = self.normalize_data(structured_data)
        enriched_data = self.enrich_data(normalized_data)
        self.structured_data = enriched_data  # Keep a reference for get_data()
        return enriched_data
    async def _arun(
        self,
        unstructured_data: Union[str, List[str]],
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> pd.DataFrame:
        # Asynchronous version of the _run method
        # _run is synchronous, so it is called directly rather than awaited
        return self._run(unstructured_data, run_manager)
    def tokenize(self, text: str) -> List[str]:
        # Tokenizes the text
        doc = self.nlp(text)
        tokens = [token.text for token in doc]
        return tokens

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        # Removes stopwords from the tokenized text
        filtered_tokens = [token for token in tokens if not self.nlp.vocab[token].is_stop]
        return filtered_tokens

    def stem_or_lemmatize(self, tokens: List[str]) -> List[str]:
        # Lemmatizes the tokenized text
        lemmatized_tokens = [self.nlp(token)[0].lemma_ for token in tokens]
        return lemmatized_tokens

    def clean_text(self, text: str) -> str:
        # Performs general cleaning like lowercasing, punctuation removal, etc.
        doc = self.nlp(text.lower())
        cleaned_text = " ".join([token.text for token in doc if not token.is_punct])
        return cleaned_text

    def vectorize_text(self, text: str) -> List[float]:
        # Converts text to a numerical vector
        doc = self.nlp(text)
        vector = doc.vector.tolist()
        return vector

    def extract_keywords(self, text: str) -> List[str]:
        # Extracts keywords from the text
        # Assume keywords are the noun chunks in the text
        doc = self.nlp(text)
        keywords = [chunk.text for chunk in doc.noun_chunks]
        return keywords

    def extract_entities(self, text: str) -> List[str]:
        # Extracts named entities from the text
        doc = self.nlp(text)
        entities = [ent.text for ent in doc.ents]
        return entities
    def structure_data(self, data: Union[str, List[str]]) -> pd.DataFrame:
        # Transforms unstructured data into a structured format
        # Assume each item in data is a text document
        if isinstance(data, str):
            data = [data]  # Treat a single string as a one-document list rather than iterating its characters
        structured_data_list = []
        for item in data:
            cleaned_text = self.clean_text(item)
            tokens = self.tokenize(cleaned_text)
            lemmatized_tokens = self.stem_or_lemmatize(tokens)
            vector = self.vectorize_text(cleaned_text)
            keywords = self.extract_keywords(cleaned_text)
            entities = self.extract_entities(cleaned_text)
            structured_data_list.append({
                'cleaned_text': cleaned_text,
                'tokens': lemmatized_tokens,
                'vector': vector,
                'keywords': keywords,
                'entities': entities
            })
        structured_data = pd.DataFrame(structured_data_list)
        return structured_data
    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        # Normalizes data (e.g., scaling features to a standard range)
        scaler = MinMaxScaler()
        # Assume data has a 'vector' column with numerical values
        data['vector'] = list(scaler.fit_transform(data['vector'].tolist()))
        return data

    def enrich_data(self, data: pd.DataFrame) -> pd.DataFrame:
        # Enriches data with additional information
        # Example: Adding a column for the number of tokens in each document
        data['num_tokens'] = data['tokens'].apply(len)
        return data

    def save_data(self, data: pd.DataFrame, path: str):
        # Saves the structured data to a file
        data.to_csv(path, index=False)
    def get_data(self) -> pd.DataFrame:
        # Returns the structured data for further processing
        # The structured data is stored on the instance by the most recent _run call
        return self.structured_data

# ... (further implementation details and logic to be added later)
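
A minimal usage sketch, assuming the en_core_web_sm model has been installed (python -m spacy download en_core_web_sm); the sample documents and output filename are placeholders:

# Usage sketch: instantiate the tool and preprocess a small batch of documents.
# _run is called directly here for illustration; inside a LangChain agent the
# framework would invoke the tool through its standard run interface.
if __name__ == "__main__":
    tool = DataPreprocessingTool()
    docs = [
        "Apple is looking at buying a U.K. startup for $1 billion.",
        "San Francisco considers banning sidewalk delivery robots.",
    ]
    df = tool._run(docs)
    # Inspect the structured output: cleaned text, noun-chunk keywords,
    # named entities, and the token count added by enrich_data
    print(df[['cleaned_text', 'keywords', 'entities', 'num_tokens']])
    tool.save_data(df, "structured_data.csv")  # placeholder output path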