Skip to content

Instantly share code, notes, and snippets.

@btnguyen2k
Created May 15, 2023 04:54
Show Gist options
  • Save btnguyen2k/2d418b899a3673cd7b10c68ab39075db to your computer and use it in GitHub Desktop.
Save btnguyen2k/2d418b899a3673cd7b10c68ab39075db to your computer and use it in GitHub Desktop.
Normalize input text for embeddings or not?
package main
import (
"fmt"
"os"
"regexp"
"strings"
"github.com/btnguyen2k/oaiaux"
)
// Patterns used to derive the normalized text variants. They are compiled
// once at package initialization rather than on every CompareEmbeddings call.
var (
	// reLineBreaks matches one or more consecutive CR/LF characters.
	reLineBreaks = regexp.MustCompile(`[\r\n]+`)
	// reNonWord matches runs of characters that are neither letters, digits,
	// nor one of the punctuation marks ' " - _ , ; : . ? ! /
	reNonWord = regexp.MustCompile(`[^\p{L}\d\'\"\-\_\,\;\:\.\?\!\/]+`)
)

// CompareEmbeddings prints one Markdown table row for model to stdout.
//
// The row contains the model name, the length of the query's embedding, and,
// for each normalized variant of text, the value returned by calling Cosine on
// the query embedding with the variant's embedding (presumably the cosine
// similarity; confirm against the oaiaux documentation).
//
// Variants, in output order:
//   - Origin:   text with surrounding whitespace trimmed
//   - Lower:    lower-cased text
//   - NoNL:     line breaks replaced by a single space
//   - NoNLNW:   every run of characters matched by reNonWord replaced by a space
//   - NormFull: NoNLNW, lower-cased
//
// The function panics if an embeddings request reports an error or returns no
// data.
func CompareEmbeddings(client oaiaux.Client, model, query, text string) {
	// The non-word replacement is shared by the last two variants.
	stripped := reNonWord.ReplaceAllString(text, " ")
	variants := []struct {
		name, text string
	}{
		{name: "Origin", text: strings.TrimSpace(text)},
		{name: "Lower", text: strings.TrimSpace(strings.ToLower(text))},
		{name: "NoNL", text: strings.TrimSpace(reLineBreaks.ReplaceAllString(text, " "))},
		{name: "NoNLNW", text: strings.TrimSpace(stripped)},
		{name: "NormFull", text: strings.TrimSpace(strings.ToLower(stripped))},
	}

	input := &oaiaux.EmbeddingsInput{
		Model: model,
		Input: query,
	}
	outputQuery := client.Embeddings(input)
	if outputQuery.Error != nil {
		panic(outputQuery.Error)
	}
	// Guard the Data[0] accesses below so an empty result yields a clear
	// message instead of an index-out-of-range panic.
	if len(outputQuery.Data) == 0 {
		panic(fmt.Sprintf("model %s: embeddings result for the query contains no data", model))
	}
	queryEmbedding := outputQuery.Data[0].Embedding
	fmt.Printf("|%30s|%5d|", model, len(queryEmbedding))

	for _, v := range variants {
		// The same input value is reused; only its Input field changes.
		input.Input = v.text
		output := client.Embeddings(input)
		if output.Error != nil {
			panic(output.Error)
		}
		if len(output.Data) == 0 {
			panic(fmt.Sprintf("model %s: embeddings result for variant %s contains no data", model, v.name))
		}
		cosine := queryEmbedding.Cosine(output.Data[0].Embedding)
		fmt.Printf("%.6f|", cosine)
	}
	fmt.Println()
}
// main creates an OpenAI client from the OPENAI_API_KEY and
// OPENAI_ORGANIZATION_ID environment variables, prints a Markdown table
// header, and then calls CompareEmbeddings once per embedding model to print
// that model's row. It panics if the client cannot be created.
func main() {
	// Fixed query and document used for every model.
	const query = "what are use cases of Markdown?"
	const text = `# Why Use Markdown?
You might be wondering why people use Markdown instead of a WYSIWYG editor. Why write with Markdown when you can press buttons in an interface to format your text? As it turns out, there are several reasons why people use Markdown instead of WYSIWYG editors.
- Markdown can be used for everything. People use it to create websites, documents, notes, books, presentations, email messages, and technical documentation.
- Markdown is portable. Files containing Markdown-formatted text can be opened using virtually any application. If you decide you don’t like the Markdown application you’re currently using, you can import your Markdown files into another Markdown application. That’s in stark contrast to word processing applications like Microsoft Word that lock your content into a proprietary file format.
- Markdown is platform independent. You can create Markdown-formatted text on any device running any operating system.
- Markdown is future proof. Even if the application you’re using stops working at some point in the future, you’ll still be able to read your Markdown-formatted text using a text editing application. This is an important consideration when it comes to books, university theses, and other milestone documents that need to be preserved indefinitely.
- Markdown is everywhere. Websites like Reddit and GitHub support Markdown, and lots of desktop and web-based applications support it.`

	apiKey := os.Getenv("OPENAI_API_KEY")
	orgID := os.Getenv("OPENAI_ORGANIZATION_ID")
	client, err := oaiaux.NewClient(
		oaiaux.PlatformOpenAI,
		oaiaux.Option{Key: oaiaux.OptOpenAIApiKey, Value: apiKey},
		oaiaux.Option{Key: oaiaux.OptOpenAIOrganization, Value: orgID},
	)
	if err != nil {
		panic(err)
	}

	// Models to compare, in the order their rows are printed.
	modelNames := []string{
		"text-embedding-ada-002",
		"text-similarity-ada-001",
		"text-similarity-babbage-001",
		"text-similarity-curie-001",
		"text-similarity-davinci-001",
		"text-search-ada-doc-001",
		"text-search-ada-query-001",
		"text-search-babbage-doc-001",
		"text-search-babbage-query-001",
		"text-search-curie-doc-001",
		"text-search-curie-query-001",
		"text-search-davinci-doc-001",
		"text-search-davinci-query-001",
	}

	// Markdown table header and column-alignment row.
	fmt.Print("|Model|Dimensions|Origin|Lower|NoNL|NoNLNW|NormFull|\n")
	fmt.Print("|-----|---------:|-----:|----:|---:|-----:|-------:|\n")

	for i := range modelNames {
		CompareEmbeddings(client, modelNames[i], query, text)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment