Created
May 15, 2023 04:54
-
-
Save btnguyen2k/2d418b899a3673cd7b10c68ab39075db to your computer and use it in GitHub Desktop.
Normalize input text for embeddings or not?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"os" | |
"regexp" | |
"strings" | |
"github.com/btnguyen2k/oaiaux" | |
) | |
func CompareEmbeddings(client oaiaux.Client, model, query, text string) { | |
textOrigin := strings.TrimSpace(text) | |
textLower := strings.TrimSpace(strings.ToLower(text)) | |
textNoNL := strings.TrimSpace(regexp.MustCompile(`[\r\n]+`).ReplaceAllString(text, " ")) | |
textNoNLNW := strings.TrimSpace(regexp.MustCompile(`[^\p{L}\d\'\"\-\_\,\;\:\.\?\!\/]+`).ReplaceAllString(text, " ")) | |
textNormFull := strings.TrimSpace(strings.ToLower(regexp.MustCompile(`[^\p{L}\d\'\"\-\_\,\;\:\.\?\!\/]+`).ReplaceAllString(text, " "))) | |
allText := []struct { | |
name, text string | |
}{ | |
{name: "Origin", text: textOrigin}, | |
{name: "Lower", text: textLower}, | |
{name: "NoNL", text: textNoNL}, | |
{name: "NoNLNW", text: textNoNLNW}, | |
{name: "NormFull", text: textNormFull}, | |
} | |
input := &oaiaux.EmbeddingsInput{ | |
Model: model, | |
Input: query, | |
} | |
outputQuery := client.Embeddings(input) | |
if outputQuery.Error != nil { | |
panic(outputQuery.Error) | |
} | |
fmt.Printf("|%30s|%5d|", model, len(outputQuery.Data[0].Embedding)) | |
for _, el := range allText { | |
input.Input = el.text | |
output := client.Embeddings(input) | |
if output.Error != nil { | |
panic(output.Error) | |
} | |
cosine := outputQuery.Data[0].Embedding.Cosine(output.Data[0].Embedding) | |
fmt.Printf("%.6f|", cosine) | |
} | |
fmt.Println() | |
} | |
func main() { | |
clientOpenAI, err := oaiaux.NewClient(oaiaux.PlatformOpenAI, | |
oaiaux.Option{Key: oaiaux.OptOpenAIApiKey, Value: os.Getenv("OPENAI_API_KEY")}, | |
oaiaux.Option{Key: oaiaux.OptOpenAIOrganization, Value: os.Getenv("OPENAI_ORGANIZATION_ID")}, | |
) | |
if err != nil { | |
panic(err) | |
} | |
query := "what are use cases of Markdown?" | |
text := `# Why Use Markdown? | |
You might be wondering why people use Markdown instead of a WYSIWYG editor. Why write with Markdown when you can press buttons in an interface to format your text? As it turns out, there are several reasons why people use Markdown instead of WYSIWYG editors. | |
- Markdown can be used for everything. People use it to create websites, documents, notes, books, presentations, email messages, and technical documentation. | |
- Markdown is portable. Files containing Markdown-formatted text can be opened using virtually any application. If you decide you don’t like the Markdown application you’re currently using, you can import your Markdown files into another Markdown application. That’s in stark contrast to word processing applications like Microsoft Word that lock your content into a proprietary file format. | |
- Markdown is platform independent. You can create Markdown-formatted text on any device running any operating system. | |
- Markdown is future proof. Even if the application you’re using stops working at some point in the future, you’ll still be able to read your Markdown-formatted text using a text editing application. This is an important consideration when it comes to books, university theses, and other milestone documents that need to be preserved indefinitely. | |
- Markdown is everywhere. Websites like Reddit and GitHub support Markdown, and lots of desktop and web-based applications support it.` | |
models := []string{ | |
"text-embedding-ada-002", | |
"text-similarity-ada-001", "text-similarity-babbage-001", "text-similarity-curie-001", "text-similarity-davinci-001", | |
"text-search-ada-doc-001", "text-search-ada-query-001", | |
"text-search-babbage-doc-001", "text-search-babbage-query-001", | |
"text-search-curie-doc-001", "text-search-curie-query-001", | |
"text-search-davinci-doc-001", "text-search-davinci-query-001", | |
} | |
fmt.Printf("|Model|Dimensions|Origin|Lower|NoNL|NoNLNW|NormFull|\n") | |
fmt.Printf("|-----|---------:|-----:|----:|---:|-----:|-------:|\n") | |
for _, model := range models { | |
CompareEmbeddings(clientOpenAI, model, query, text) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment