Last active
December 2, 2018 19:26
-
-
Save mrsiano/a694437fc3194826255ec862b2a7c467 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// POC for a pre-loaded search engine; the following code implements a search component
// based on document files, i.e. an ELK-like system.
// The DataSet object must be initialized first (pre-load, in order to map text to document(s)); in
// real life we would need to sync the object and update it when new documents arrive.
// Currently this is a single-threaded app.
// The DataSet object is a singleton instance (thread safe to obtain), to support multiple clients.
// Object updates are not thread safe yet!
// time complexity:
import ( | |
"fmt" | |
"strings" | |
"sync" | |
"time" | |
core "github.com/mrsiano/search-engine/core" | |
) | |
// // Docs is representation of text in Document files. | |
// type Docs map[int]bool | |
// Init is the interface every dataset object must satisfy so that its
// pre-processing stage runs before the object is used for searching.
// NOTE(review): the method name carries a typo ("initalize"); renaming it
// would touch every implementer, so it is kept and documented here instead.
type Init interface {
	initalizeDataSet()
}
// DataSet is the actual search engine object.
// recentSearches is a cached object for search history.
type DataSet struct {
	// documents maps a keyword to the set of document ids containing it.
	documents map[string]core.Docs
	// recentSearches caches previous Search results, keyed by the raw
	// (case-sensitive) keyword passed to Search.
	recentSearches map[string][]int
}
var (
	// instance is the lazily created singleton DataSet (see GetInstance).
	instance *DataSet
	// once guards the one-time initialization of instance.
	once sync.Once
	// tempMultiTextMap is package-level scratch state used by evalSearch to
	// intersect doc sets during multi-word searches; Search resets it via a
	// defer. Being shared mutable state, concurrent Search calls would race.
	tempMultiTextMap = make(map[int]bool)
)
// pre-processing | |
// scan all documents and map text to doc id. | |
func (dataset *DataSet) initalizeDataSet() { | |
fmt.Println("documents and cache pre-load...") | |
dataset.documents = map[string]core.Docs{} | |
dataset.recentSearches = make(map[string][]int) | |
} | |
// AddDocuments adds a keyword and its document map to the index.
// NOTE(review): this overwrites any existing doc set for keyword, and is
// not guarded by a lock — unsafe to call concurrently with Search.
func (dataset *DataSet) AddDocuments(keyword string, docs core.Docs) {
	dataset.documents[keyword] = docs
}
// GetInstance crates a singleton instance for multiple clients. | |
func GetInstance() *DataSet { | |
once.Do(func() { | |
instance = &DataSet{} | |
if instance.documents == nil { | |
instance.initalizeDataSet() | |
} | |
}) | |
return instance | |
} | |
func (dataset *DataSet) cachedResults(keyword string) []int { | |
if dataexists := dataset.recentSearches[keyword]; dataexists != nil { | |
return dataexists | |
} | |
return nil | |
} | |
// storeSearch stores historical search keywords and their results, so a
// repeated search can be answered from recentSearches without re-scanning
// the index. An existing entry for key is overwritten.
func (dataset *DataSet) storeSearch(key string, res []int) {
	dataset.recentSearches[key] = res
}
func (dataset *DataSet) getTheLargestDoc(words []string) string { | |
largest := words[0] | |
lenOfDocs := len(dataset.documents[words[0]]) | |
for i := 1; i < len(words); i++ { | |
tempLenOfDocs := len(dataset.documents[words[i]]) | |
if tempLenOfDocs > lenOfDocs { | |
lenOfDocs = tempLenOfDocs | |
largest = words[i] | |
} | |
} | |
return largest | |
} | |
// evalSearch intersects the doc set mapped to word with the doc ids already
// collected in the package-level tempMultiTextMap, appending each shared id
// to res.
//
// NOTE(review): on the first call of a multi-word search (tempMultiTextMap
// empty) this only seeds tempMultiTextMap with word's doc map and returns a
// nil res — the caller relies on subsequent calls to produce ids. The seed
// assignment aliases dataset.documents[word] rather than copying it, so the
// deferred `tempMultiTextMap = nil` in Search merely drops the reference;
// any write through tempMultiTextMap before that would mutate the index.
// Confirm this protocol against Search before changing either side.
func (dataset *DataSet) evalSearch(word string) (res []int) {
	if dataMap := dataset.documents[word]; dataMap != nil {
		// TODO: get rid off the res array, and return multitext map.
		if len(tempMultiTextMap) == 0 {
			tempMultiTextMap = dataMap // populate the tempMultiTextMap.
		} else {
			for k := range tempMultiTextMap {
				if dataMap[k] == true { // doc id present in both sets: keep it.
					res = append(res, k)
				} else { // not in this word's doc set: skip.
					continue
				}
			}
		}
	}
	return res
}
// Search will search relevant indexes in DataSet.documents based on a given
// keyword and return the matching doc ids.
//
// A single-word keyword returns every doc id mapped to that word. Multi-word
// keywords are intersected against the word with the largest doc set (the
// baseline), via evalSearch and the shared tempMultiTextMap.
// NOTE(review): results are cached under the original, case-sensitive
// keyword even though matching is done on the lowercased words — "Dog" and
// "dog" get separate cache entries. For 3+ words each non-baseline word is
// intersected with the baseline independently and the results appended, so
// duplicates are possible; verify before relying on multi-word counts.
func (dataset *DataSet) Search(keyword string) (res []int) {
	// Reset the shared scratch map so the next search starts clean.
	defer func() { tempMultiTextMap = nil }()
	fmt.Println("Searching " + keyword + "...")
	words := strings.Split(strings.ToLower(keyword), " ")
	// first, search in cache.
	if exist := dataset.recentSearches[keyword]; exist != nil {
		return exist
	}
	// evaluate search; used differently for single vs multi text keywords.
	if len(words) == 1 {
		data := dataset.documents[words[0]]
		for key := range data {
			res = append(res, key)
		}
	} else {
		// check which of the keywords holds the largest set, and use it as baseline.
		// evaluate the search against the baseline.
		largest := dataset.getTheLargestDoc(words)
		// First evalSearch call seeds tempMultiTextMap and returns nil.
		res = dataset.evalSearch(largest)
		for _, word := range words { // fetch word.
			if word == largest {
				// the baseline word was already consumed above.
				continue
			}
			res = append(res, dataset.evalSearch(word)...)
		}
	}
	dataset.storeSearch(keyword, res) // store results in cache.
	return res
}
func unixMilli(t time.Time) int64 { | |
return t.Round(time.Millisecond).UnixNano() / (int64(time.Millisecond) / int64(time.Nanosecond)) | |
} | |
// UseFakeData will populate the search engine with mock data. | |
func (dataset *DataSet) UseFakeData() { | |
// scan and retrive 1M rows in ~500ms. | |
fmt.Println("Use fake data") | |
scaleSize := 6000000 | |
docs := make(core.Docs, scaleSize) | |
for i := 0; i < scaleSize; i++ { | |
docs[i] = true | |
} | |
dataset.documents["cat"] = docs | |
scaleSize = 1000000 | |
docs = make(core.Docs, scaleSize) | |
for i := 0; i < scaleSize; i++ { | |
docs[i] = true | |
} | |
dataset.documents["dog"] = docs | |
} | |
func main() { | |
// dry testing for the search engine | |
var key string | |
d1 := GetInstance() | |
//TODO: use fake data. | |
d1.UseFakeData() | |
key = "dog" | |
fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items") | |
d2 := GetInstance() | |
searchTime := unixMilli(time.Now()) | |
key = "dog cat" | |
fmt.Println("searching: ", key, "found: ", len(d2.Search(key)), "items") | |
fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms") | |
d3 := GetInstance() | |
key = "dog" | |
fmt.Println("searching: ", key, "found: ", len(d3.Search(key)), "items") | |
// retrive results from cache | |
fmt.Println(" *** search from cache ***") | |
searchTime = unixMilli(time.Now()) | |
key = "dog cat" | |
fmt.Println("searching: ", key, "found: ", len(d1.Search(key)), "items") | |
fmt.Println("Search time", unixMilli(time.Now())-searchTime, "ms") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment