LangChain tool for preprocessing text data. Version one million nine-hundred and fifty-two 😂 jk, version 1
import spacy
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from langchain.tools import BaseTool
from typing import Any, Optional, Union, List
from langchain.callbacks.manager import CallbackManagerForToolRun, AsyncCallbackManagerForToolRun

class DataPreprocessingTool(BaseTool):
    name: str = "DataPreprocessingTool"
    description: str = "A tool for preprocessing and structuring unstructured data."
    nlp: Any = None  # Declared as a field so the pydantic-based BaseTool allows assignment in __init__

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.nlp = spacy.load("en_core_web_sm")  # Load the spaCy NLP model

    def _run(
        self,
        unstructured_data: Union[str, List[str]],
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> pd.DataFrame:
        # Main entry point: orchestrates the preprocessing and structuring steps
        structured_data = self.structure_data(unstructured_data)
        normalized_data = self.normalize_data(structured_data)
        enriched_data = self.enrich_data(normalized_data)
        return enriched_data

    async def _arun(
        self,
        unstructured_data: Union[str, List[str]],
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> pd.DataFrame:
        # Asynchronous version of _run; the underlying steps are synchronous,
        # so delegate directly (the result is a DataFrame, not an awaitable)
        return self._run(unstructured_data, run_manager)

    def tokenize(self, text: str) -> List[str]:
        # Tokenizes the text
        doc = self.nlp(text)
        tokens = [token.text for token in doc]
        return tokens

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        # Removes stopwords from the tokenized text
        filtered_tokens = [token for token in tokens if not self.nlp.vocab[token].is_stop]
        return filtered_tokens

    def stem_or_lemmatize(self, tokens: List[str]) -> List[str]:
        # Lemmatizes the tokenized text
        lemmatized_tokens = [self.nlp(token)[0].lemma_ for token in tokens]
        return lemmatized_tokens

    def clean_text(self, text: str) -> str:
        # Performs general cleaning such as lowercasing and punctuation removal
        doc = self.nlp(text.lower())
        cleaned_text = " ".join([token.text for token in doc if not token.is_punct])
        return cleaned_text

    def vectorize_text(self, text: str) -> List[float]:
        # Converts text to a numerical vector
        doc = self.nlp(text)
        vector = doc.vector.tolist()
        return vector

    def extract_keywords(self, text: str) -> List[str]:
        # Extracts keywords from the text, treating noun chunks as keywords
        doc = self.nlp(text)
        keywords = [chunk.text for chunk in doc.noun_chunks]
        return keywords

    def extract_entities(self, text: str) -> List[str]:
        # Extracts named entities from the text
        doc = self.nlp(text)
        entities = [ent.text for ent in doc.ents]
        return entities

    def structure_data(self, data: Union[str, List[str]]) -> pd.DataFrame:
        # Transforms unstructured data into a structured format;
        # each item in data is treated as a text document
        if isinstance(data, str):
            data = [data]  # Wrap a single document so iteration is per-document, not per-character
        structured_data_list = []
        for item in data:
            cleaned_text = self.clean_text(item)
            tokens = self.tokenize(cleaned_text)
            lemmatized_tokens = self.stem_or_lemmatize(tokens)
            vector = self.vectorize_text(cleaned_text)
            keywords = self.extract_keywords(cleaned_text)
            entities = self.extract_entities(cleaned_text)
            structured_data_list.append({
                'cleaned_text': cleaned_text,
                'tokens': lemmatized_tokens,
                'vector': vector,
                'keywords': keywords,
                'entities': entities
            })
        structured_data = pd.DataFrame(structured_data_list)
        return structured_data

    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        # Normalizes data (e.g., scaling features to a standard range)
        scaler = MinMaxScaler()
        # The 'vector' column holds equal-length numerical vectors, so it can be scaled as a matrix
        data['vector'] = list(scaler.fit_transform(data['vector'].tolist()))
        return data

    def enrich_data(self, data: pd.DataFrame) -> pd.DataFrame:
        # Enriches data with additional information,
        # e.g. a column for the number of tokens in each document
        data['num_tokens'] = data['tokens'].apply(len)
        return data

    def save_data(self, data: pd.DataFrame, path: str):
        # Saves the structured data to a file
        data.to_csv(path, index=False)

    def get_data(self) -> pd.DataFrame:
        # Returns the structured data for further processing
        # (assumes structured data has been saved in a class attribute)
        return self.structured_data

    # ... (further implementation details and logic to be added later)
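
A minimal usage sketch, not part of the original Gist: it assumes en_core_web_sm has been downloaded (python -m spacy download en_core_web_sm), uses made-up sample documents, and calls _run directly for illustration; inside an agent the tool would normally be invoked through BaseTool.run.

# Usage sketch (hypothetical sample documents; assumes the spaCy model is installed)
if __name__ == "__main__":
    tool = DataPreprocessingTool()
    sample_docs = [
        "Apple is looking at buying a U.K. startup for $1 billion.",
        "LangChain tools wrap arbitrary Python logic so agents can call it.",
    ]
    df = tool._run(sample_docs)  # direct call for demonstration; agents would go through tool.run(...)
    print(df[["cleaned_text", "keywords", "entities", "num_tokens"]])
    tool.save_data(df, "structured_data.csv")  # hypothetical output path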