Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
from txtai.pipeline import Textractor
# Build a Textractor with sections=True so that calling it returns the
# extracted text as an iterable of sections rather than one string.
textractor = Textractor(sections=True)

# Install [pipeline-data] extra to support extracting text from docx/pdf/xlsx
for section in textractor("https://github.com/neuml/txtai"):
    # Print a marker before each section so the boundaries are visible
    print(f"\n[SECTION]\n{section}")
import time
from datetime import timedelta
from datasets import load_dataset
from txtai import LLM
from txtai.pipeline import Labels, HFTrainer
def prompt(text):
text = f"""
from txtai.pipeline import Translation
# Load pipeline
translate = Translation()

# Target languages, given as language codes
languages = ["fr", "es", "de", "hi", "ja"]

# Run translations: translate the English sentence into each target language,
# then translate that result back into English as a round trip.
for language in languages:
    text = translate("The sky is blue, the stars are far", language)
    english = translate(text, "en")
# Install txtai with torch cpu
# The "+cpu" local version tag pins the CPU build of torch; -f adds the
# PyTorch wheel listing as an extra place for pip to look for that build.
pip install txtai torch==2.3.1+cpu \
-f https://download.pytorch.org/whl/torch_stable.html
# Install llama.cpp
# Installed through the llama-cpp-python package. CMAKE_ARGS is set only for
# this command and hands -DLLAMA_CUDA=on to the package's build
# (presumably enabling CUDA support - confirm against the llama-cpp-python docs).
CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
import time
from txtai.pipeline import LLM, Summary, Textractor
from txtai.workflow import Task, Workflow
# Configure text extraction for an HTML page. Per the txtai Textractor options:
# lines=True tokenizes by line, minlength=100 drops short elements (this is
# what filters out boilerplate), and join=True joins what is left back together.
textractor = Textractor(
    lines=True,
    join=True,
    minlength=100,
)

# Fetch the page and extract its text
text = textractor("https://github.com/neuml/txtai")
# Summarization with standard models
##################################
# Data functions
##################################
import re
from datasets import load_dataset
def clean(text):
text = text.replace("\n", " ").strip()
from transformers import AutoTokenizer
from txtai.pipeline import Tokenizer
# Split using built-in Python method
# str.split() with no arguments only breaks on whitespace: the emoji stay
# attached to their neighbouring words, and the Chinese sentence, which
# contains no spaces, comes back as a single token.
print(
    "Create embeddings for text".split(),
    "🚀Create embeddings for text⭐".split(),
    "为文本创建嵌入".split(),
    sep="\n",
)
# Remove stop words
tokenizer = Tokenizer(stopwords=True)
##################################
# Data functions
##################################
import re
from datasets import load_dataset
def clean(text):
text = text.replace("\n", " ").strip()
from txtai import LLM
# Three alternative ways to construct an LLM. Each assignment rebinds `llm`,
# so if all three run, only the last instance stays bound to the name.

# Hugging Face models, referenced by model id
llm = LLM("google/gemma-2-9b")
# llama.cpp models, referenced as <repository>/<file>.gguf and downloaded
# automatically from the Hugging Face Hub
llm = LLM("bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf")
# Models served via APIs (OpenAI / Claude / Ollama), referenced by model name
llm = LLM("gpt-4o")
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
system = (
"You are an assistant for question-answering tasks. "