package main
// POC for a pre-loaded search engine. The following code implements a search
// component based on document files, i.e. an ELK-style system.
// The DataSet object must be initialized first (pre-loaded, in order to map
// text to document(s)); in real life we would need to sync the object and
// update it when new documents arrive.
// Currently this is a single-threaded app.
// The DataSet object is a singleton instance (thread-safe creation) to support
// multiple clients. Object updates are not thread safe yet!
// Time complexity: a single-word search is O(d), d being the size of that
// word's document set; a k-word search is roughly O(k*L), L being the size of
// the largest set (see evalSearch).
import (
	"fmt"
	"strings"
	"sync"
	"time"

	core "github.com/mrsiano/search-engine/core"
)
// Docs (defined in the core package) is the representation of text in
// document files:
//
//	type Docs map[int]bool
//
// Init is an interface that makes sure any DataSet object is initialized
// first; the init runs some pre-processing stages.
type Init interface {
	initializeDataSet()
}
// DataSet is the actual search engine object.
// recentSearches is a cache of recent search history.
type DataSet struct {
	documents      map[string]core.Docs
	recentSearches map[string][]int
}
var (
	instance         *DataSet
	once             sync.Once
	tempMultiTextMap = make(map[int]bool)
)
// initializeDataSet runs the pre-processing stage:
// scan all documents and map text to doc ids.
func (dataset *DataSet) initializeDataSet() {
	fmt.Println("documents and cache pre-load...")
	dataset.documents = map[string]core.Docs{}
	dataset.recentSearches = make(map[string][]int)
}
// AddDocuments adds a new keyword and its document map to documents.
func (dataset *DataSet) AddDocuments(keyword string, docs core.Docs) {
	dataset.documents[keyword] = docs
}
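// The header notes that object updates are not yet thread safe. A minimal
// sketch of what a guarded update could look like, using a package-level
// sync.RWMutex (docsMu and AddDocumentsSafe are hypothetical names, not part
// of the original design; readers such as Search would take docsMu.RLock()
// in a fully guarded version):
var docsMu sync.RWMutex

// AddDocumentsSafe is a hypothetical mutex-guarded variant of AddDocuments.
func (dataset *DataSet) AddDocumentsSafe(keyword string, docs core.Docs) {
	docsMu.Lock()
	defer docsMu.Unlock()
	dataset.documents[keyword] = docs
}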
// GetInstance creates a singleton instance shared by multiple clients.
func GetInstance() *DataSet {
	once.Do(func() {
		instance = &DataSet{}
		if instance.documents == nil {
			instance.initializeDataSet()
		}
	})
	return instance
}
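// Usage note: sync.Once guarantees the function passed to once.Do runs exactly
// once, even when GetInstance is called from multiple goroutines, so instance
// creation itself is safe; only later document updates remain unguarded.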
// cachedResults returns previously stored results for a keyword, if any.
func (dataset *DataSet) cachedResults(keyword string) []int {
	if res := dataset.recentSearches[keyword]; res != nil {
		return res
	}
	return nil
}
// storeSearch stores historical search keywords and their results.
func (dataset *DataSet) storeSearch(key string, res []int) {
	dataset.recentSearches[key] = res
}
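// The recentSearches cache grows without bound as distinct keywords accumulate.
// A minimal sketch of a size-capped variant (maxRecent and storeSearchCapped
// are hypothetical, not part of the original design):
const maxRecent = 1024

func (dataset *DataSet) storeSearchCapped(key string, res []int) {
	if len(dataset.recentSearches) >= maxRecent {
		for k := range dataset.recentSearches {
			delete(dataset.recentSearches, k) // evict one arbitrary entry.
			break
		}
	}
	dataset.recentSearches[key] = res
}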
// getTheLargestDoc returns the word whose document set is the largest; the
// multi-word search uses it as the baseline for intersection.
func (dataset *DataSet) getTheLargestDoc(words []string) string {
	largest := words[0]
	lenOfDocs := len(dataset.documents[words[0]])
	for i := 1; i < len(words); i++ {
		tempLenOfDocs := len(dataset.documents[words[i]])
		if tempLenOfDocs > lenOfDocs {
			lenOfDocs = tempLenOfDocs
			largest = words[i]
		}
	}
	return largest
}
// evalSearch intersects a word's document set with tempMultiTextMap; on the
// first call it only populates tempMultiTextMap with the baseline set.
func (dataset *DataSet) evalSearch(word string) (res []int) {
	if dataMap := dataset.documents[word]; dataMap != nil {
		// TODO: get rid of the res array and return a multi-text map.
		if len(tempMultiTextMap) == 0 {
			tempMultiTextMap = dataMap // populate tempMultiTextMap with the baseline set.
		} else {
			for k := range tempMultiTextMap {
				if dataMap[k] { // the doc id exists in both sets; keep it.
					res = append(res, k)
				}
			}
		}
	}
	return res
}
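// For reference, a stateless equivalent of the intersection evalSearch performs
// against the shared tempMultiTextMap (intersect is a hypothetical helper, not
// called by Search):
func intersect(base, other core.Docs) (res []int) {
	for k := range base {
		if other[k] { // keep only doc ids present in both sets.
			res = append(res, k)
		}
	}
	return res
}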
// Search looks up the relevant indexes in DataSet.documents
// for a given keyword.
func (dataset *DataSet) Search(keyword string) (res []int) {
	defer func() { tempMultiTextMap = nil }()
	fmt.Println("Searching " + keyword + "...")
	words := strings.Split(strings.ToLower(keyword), " ")
	// First, look in the cache.
	if exist := dataset.recentSearches[keyword]; exist != nil {
		return exist
	}
	// Evaluate the search; single-word and multi-word keywords are handled
	// differently, based on the number of words.
	if len(words) == 1 {
		data := dataset.documents[words[0]]
		for key := range data {
			res = append(res, key)
		}
	} else {
		// Check which of the words holds the largest document set and use it
		// as the baseline; evaluate the search against that baseline.
		largest := dataset.getTheLargestDoc(words)
		res = dataset.evalSearch(largest) // populates tempMultiTextMap; no results yet.
		for _, word := range words {
			if word == largest {
				// The baseline word was already evaluated above; skip it.
				continue
			}
			res = append(res, dataset.evalSearch(word)...)
		}
	}
	dataset.storeSearch(keyword, res) // store results in the cache.
	return res
}
// unixMilli converts a time.Time to Unix milliseconds.
func unixMilli(t time.Time) int64 {
	return t.Round(time.Millisecond).UnixNano() / (int64(time.Millisecond) / int64(time.Nanosecond))
}
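// Note: since Go 1.17 the standard library offers t.UnixMilli(), which makes
// a helper like the one above unnecessary.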
// UseFakeData populates the search engine with mock data.
func (dataset *DataSet) UseFakeData() {
	// scan and retrieve 1M rows in ~500ms.
	fmt.Println("Use fake data")
	scaleSize := 6000000
	docs := make(core.Docs, scaleSize) // pre-size the map for 6M "cat" docs.
	for i := 0; i < scaleSize; i++ {
		docs[i] = true
	}
	dataset.documents["cat"] = docs

	scaleSize = 1000000
	docs = make(core.Docs, scaleSize) // pre-size the map for 1M "dog" docs.
	for i := 0; i < scaleSize; i++ {
		docs[i] = true
	}
	dataset.documents["dog"] = docs
}
func main() {
	// Dry testing for the search engine.
	var key string
	d1 := GetInstance()
	d1.UseFakeData()
	key = "dog"
	fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items")

	d2 := GetInstance()
	searchTime := unixMilli(time.Now())
	key = "dog cat"
	fmt.Println("searching: ", key, "found: ", len(d2.Search(key)), "items")
	fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms")

	d3 := GetInstance()
	key = "dog"
	fmt.Println("searching: ", key, "found: ", len(d3.Search(key)), "items")

	// Retrieve results from the cache.
	fmt.Println(" *** search from cache ***")
	searchTime = unixMilli(time.Now())
	key = "dog cat"
	fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items")
	fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms")
}
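// Illustrative output, assuming the mock data above ("dog" maps to 1M doc ids,
// "cat" to 6M, and their intersection is the 1M shared ids); timings are
// machine dependent and left blank here:
//
//	documents and cache pre-load...
//	Use fake data
//	Searching dog...
//	searching:  dog found:  1000000 items
//	Searching dog cat...
//	searching:  dog cat found:  1000000 items
//	Search time ... ms
//	Searching dog...
//	searching:  dog found:  1000000 items
//	 *** search from cache ***
//	Searching dog cat...
//	searching:  dog cat found:  1000000 items
//	Search time ... ms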