@davidmezzetti
Created July 6, 2024 20:49
from transformers import AutoTokenizer
from txtai.pipeline import Tokenizer

# Split using Python's built-in str.split (splits on whitespace only)
print("Create embeddings for text".split())
print("🚀Create embeddings for text⭐".split())
print("为文本创建嵌入".split())

# Remove stop words
tokenizer = Tokenizer(stopwords=True)
print(tokenizer("Create embeddings for text"))

# Unicode Standard Annex #29 tokenization, like Apache Lucene's standard tokenizer
tokenizer = Tokenizer()
print(tokenizer("Create embeddings for text"))
print(tokenizer("🚀Create embeddings for text⭐"))
print(tokenizer("为文本创建嵌入"))

# BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
print(tokenizer.tokenize("Create embeddings for text"))
print(tokenizer.tokenize("🚀Create embeddings for text⭐"))

# BERT tokenizer for Chinese
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
print(tokenizer.tokenize("为文本创建嵌入"))