This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.pipeline import Textractor

# sections=True is passed so the result can be consumed piece by piece;
# the loop below iterates over it and prints each piece under a marker.
textractor = Textractor(sections=True)

# Install [pipeline-data] extra to support extracting text from docx/pdf/xlsx
for section in textractor("https://github.com/neuml/txtai"):
    print(f"\n[SECTION]\n{section}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from datetime import timedelta | |
from datasets import load_dataset | |
from txtai import LLM | |
from txtai.pipeline import Labels, HFTrainer | |
def prompt(text): | |
text = f""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.pipeline import Translation

# Load pipeline
translate = Translation()

# Run translations
languages = ["fr", "es", "de", "hi", "ja"]
for language in languages:
    # Round trip: English -> target language, then back to English so the
    # two English sentences can be compared.
    text = translate("The sky is blue, the stars are far", language)
    english = translate(text, "en")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install txtai with torch cpu
pip install txtai torch==2.3.1+cpu \
  -f https://download.pytorch.org/whl/torch_stable.html

# Install llama.cpp
# NOTE(review): torch above is the CPU-only wheel while llama.cpp is built
# with CUDA enabled here - confirm this mix is intended.
# NOTE(review): newer llama.cpp releases appear to use -DGGML_CUDA=on instead
# of -DLLAMA_CUDA=on - verify against the llama-cpp-python version installed.
CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

from txtai.pipeline import LLM, Summary, Textractor
from txtai.workflow import Task, Workflow

# Extract text from HTML, ignore boilerplate text
textractor = Textractor(lines=True, join=True, minlength=100)
text = textractor("https://github.com/neuml/txtai")

# Summarization with standard models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################
# Data functions
##################################
import re | |
from datasets import load_dataset | |
def clean(text): | |
text = text.replace("\n", " ").strip() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import AutoTokenizer

from txtai.pipeline import Tokenizer

# Split using built-in Python method
# str.split() with no arguments only breaks on whitespace, so the emoji stay
# attached to their neighboring words and the Chinese sentence (which has no
# spaces) comes back as a single token.
print("Create embeddings for text".split())
print("🚀Create embeddings for text⭐".split())
print("为文本创建嵌入".split())

# Remove stop words
tokenizer = Tokenizer(stopwords=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################
# Data functions
##################################
import re | |
from datasets import load_dataset | |
def clean(text): | |
text = text.replace("\n", " ").strip() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai import LLM

# The three assignments below each rebind `llm`; they are shown as
# alternative ways to construct an LLM, and only the last one stays bound.

# Hugging Face models
llm = LLM("google/gemma-2-9b")

# llama.cpp models automatically downloaded from HF HUB
llm = LLM("bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf")

# Models served via APIs (OpenAI / Claude / Ollama)
llm = LLM("gpt-4o")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from langchain.chains import create_retrieval_chain | |
from langchain.chains.combine_documents import create_stuff_documents_chain | |
from langchain_community.document_loaders import DirectoryLoader | |
from langchain_community.vectorstores import FAISS | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
system = ( | |
"You are an assistant for question-answering tasks. " |
Newer | Older