package main
// POC for a pre-loaded search engine. The following code implements a search
// component based on document files, i.e. an ELK-style system.
// The DataSet object must be initialized first (pre-loaded, in order to map
// text to document(s)); in real life we would need to sync the object and
// update it when new documents arrive.
// Currently this is a single-threaded app.
// The DataSet object is a singleton instance (thread-safe creation) to support
// multiple clients. Object updates are not thread safe yet!
// Time complexity: a single-word search is O(d), d being the size of that
// word's document set; a k-word search is roughly O(k*L), L being the size of
// the largest set (see evalSearch).
import (
	"fmt"
	"strings"
	"sync"
	"time"

	core "github.com/mrsiano/search-engine/core"
)
// Docs (defined in the core package) is the representation of text in
// document files:
//
//	type Docs map[int]bool
//
// Init is an interface that makes sure any DataSet object is initialized
// first; the init runs some pre-processing stages.
type Init interface {
	initializeDataSet()
}
// DataSet is the actual search engine object.
// recentSearches is a cache of recent search history.
type DataSet struct {
	documents      map[string]core.Docs
	recentSearches map[string][]int
}
var (
	instance         *DataSet
	once             sync.Once
	tempMultiTextMap = make(map[int]bool)
)
// initializeDataSet runs the pre-processing stage:
// scan all documents and map text to doc ids.
func (dataset *DataSet) initializeDataSet() {
	fmt.Println("documents and cache pre-load...")
	dataset.documents = map[string]core.Docs{}
	dataset.recentSearches = make(map[string][]int)
}
// AddDocuments adds a new keyword and its document map to documents.
func (dataset *DataSet) AddDocuments(keyword string, docs core.Docs) {
	dataset.documents[keyword] = docs
}
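// The header notes that object updates are not yet thread safe. A minimal
// sketch of what a guarded update could look like, using a package-level
// sync.RWMutex (docsMu and AddDocumentsSafe are hypothetical names, not part
// of the original design; readers such as Search would take docsMu.RLock()
// in a fully guarded version):
var docsMu sync.RWMutex

// AddDocumentsSafe is a hypothetical mutex-guarded variant of AddDocuments.
func (dataset *DataSet) AddDocumentsSafe(keyword string, docs core.Docs) {
	docsMu.Lock()
	defer docsMu.Unlock()
	dataset.documents[keyword] = docs
}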
// GetInstance creates a singleton instance shared by multiple clients.
func GetInstance() *DataSet {
	once.Do(func() {
		instance = &DataSet{}
		if instance.documents == nil {
			instance.initializeDataSet()
		}
	})
	return instance
}
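// Usage note: sync.Once guarantees the function passed to once.Do runs exactly
// once, even when GetInstance is called from multiple goroutines, so instance
// creation itself is safe; only later document updates remain unguarded.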
// cachedResults returns previously stored results for a keyword, if any.
func (dataset *DataSet) cachedResults(keyword string) []int {
	if res := dataset.recentSearches[keyword]; res != nil {
		return res
	}
	return nil
}
// storeSearch stores historical search keywords and their results.
func (dataset *DataSet) storeSearch(key string, res []int) {
	dataset.recentSearches[key] = res
}
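// The recentSearches cache grows without bound as distinct keywords accumulate.
// A minimal sketch of a size-capped variant (maxRecent and storeSearchCapped
// are hypothetical, not part of the original design):
const maxRecent = 1024

func (dataset *DataSet) storeSearchCapped(key string, res []int) {
	if len(dataset.recentSearches) >= maxRecent {
		for k := range dataset.recentSearches {
			delete(dataset.recentSearches, k) // evict one arbitrary entry.
			break
		}
	}
	dataset.recentSearches[key] = res
}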
// getTheLargestDoc returns the word whose document set is the largest; the
// multi-word search uses it as the baseline for intersection.
func (dataset *DataSet) getTheLargestDoc(words []string) string {
	largest := words[0]
	lenOfDocs := len(dataset.documents[words[0]])
	for i := 1; i < len(words); i++ {
		tempLenOfDocs := len(dataset.documents[words[i]])
		if tempLenOfDocs > lenOfDocs {
			lenOfDocs = tempLenOfDocs
			largest = words[i]
		}
	}
	return largest
}
// evalSearch intersects a word's document set with tempMultiTextMap; on the
// first call it only populates tempMultiTextMap with the baseline set.
func (dataset *DataSet) evalSearch(word string) (res []int) {
	if dataMap := dataset.documents[word]; dataMap != nil {
		// TODO: get rid of the res array and return a multi-text map.
		if len(tempMultiTextMap) == 0 {
			tempMultiTextMap = dataMap // populate tempMultiTextMap with the baseline set.
		} else {
			for k := range tempMultiTextMap {
				if dataMap[k] { // the doc id exists in both sets; keep it.
					res = append(res, k)
				}
			}
		}
	}
	return res
}
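// For reference, a stateless equivalent of the intersection evalSearch performs
// against the shared tempMultiTextMap (intersect is a hypothetical helper, not
// called by Search):
func intersect(base, other core.Docs) (res []int) {
	for k := range base {
		if other[k] { // keep only doc ids present in both sets.
			res = append(res, k)
		}
	}
	return res
}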
// Search looks up the relevant indexes in DataSet.documents
// for a given keyword.
func (dataset *DataSet) Search(keyword string) (res []int) {
	defer func() { tempMultiTextMap = nil }()
	fmt.Println("Searching " + keyword + "...")
	words := strings.Split(strings.ToLower(keyword), " ")
	// First, look in the cache.
	if exist := dataset.recentSearches[keyword]; exist != nil {
		return exist
	}
	// Evaluate the search; single-word and multi-word keywords are handled
	// differently, based on the number of words.
	if len(words) == 1 {
		data := dataset.documents[words[0]]
		for key := range data {
			res = append(res, key)
		}
	} else {
		// Check which of the words holds the largest document set and use it
		// as the baseline; evaluate the search against that baseline.
		largest := dataset.getTheLargestDoc(words)
		res = dataset.evalSearch(largest) // populates tempMultiTextMap; no results yet.
		for _, word := range words {
			if word == largest {
				// The baseline word was already evaluated above; skip it.
				continue
			}
			res = append(res, dataset.evalSearch(word)...)
		}
	}
	dataset.storeSearch(keyword, res) // store results in the cache.
	return res
}
// unixMilli converts a time.Time to Unix milliseconds.
func unixMilli(t time.Time) int64 {
	return t.Round(time.Millisecond).UnixNano() / (int64(time.Millisecond) / int64(time.Nanosecond))
}
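// Note: since Go 1.17 the standard library offers t.UnixMilli(), which makes
// a helper like the one above unnecessary.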
// UseFakeData populates the search engine with mock data.
func (dataset *DataSet) UseFakeData() {
	// scan and retrieve 1M rows in ~500ms.
	fmt.Println("Use fake data")
	scaleSize := 6000000
	docs := make(core.Docs, scaleSize) // pre-size the map for 6M "cat" docs.
	for i := 0; i < scaleSize; i++ {
		docs[i] = true
	}
	dataset.documents["cat"] = docs

	scaleSize = 1000000
	docs = make(core.Docs, scaleSize) // pre-size the map for 1M "dog" docs.
	for i := 0; i < scaleSize; i++ {
		docs[i] = true
	}
	dataset.documents["dog"] = docs
}
func main() {
	// Dry testing for the search engine.
	var key string
	d1 := GetInstance()
	d1.UseFakeData()
	key = "dog"
	fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items")

	d2 := GetInstance()
	searchTime := unixMilli(time.Now())
	key = "dog cat"
	fmt.Println("searching: ", key, "found: ", len(d2.Search(key)), "items")
	fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms")

	d3 := GetInstance()
	key = "dog"
	fmt.Println("searching: ", key, "found: ", len(d3.Search(key)), "items")

	// Retrieve results from the cache.
	fmt.Println(" *** search from cache ***")
	searchTime = unixMilli(time.Now())
	key = "dog cat"
	fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items")
	fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms")
}
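// Illustrative output, assuming the mock data above ("dog" maps to 1M doc ids,
// "cat" to 6M, and their intersection is the 1M shared ids); timings are
// machine dependent and left blank here:
//
//	documents and cache pre-load...
//	Use fake data
//	Searching dog...
//	searching:  dog found:  1000000 items
//	Searching dog cat...
//	searching:  dog cat found:  1000000 items
//	Search time ... ms
//	Searching dog...
//	searching:  dog found:  1000000 items
//	 *** search from cache ***
//	Searching dog cat...
//	searching:  dog cat found:  1000000 items
//	Search time ... ms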