Created
June 16, 2016 20:46
-
-
Save avinoamr/0961495ed737c36e93e2866fb70854f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* External Map is a wrapper around Go's built-in map types to support growing | |
* the number of keys beyond the available memory by dumping parts of it to | |
* disk. | |
* | |
* It starts out as normal in-memory map, and a predefined threshold for | |
* its maximum capacity (Max), which is the maximum number of items the map will | |
* contain. When Max is reached, a Split operation takes place that cuts | |
* the map in half - keeping ~Max/2 items in-memory and saving the rest | |
* to disk. As a side-effect, the Split also maintains a small in-memory | |
* dictionary of all available files and which key-range they contain in order | |
* to allow lookups. | |
* | |
* When a lookup occurs, if it can't be resolved in memory, a Load operation | |
* takes place to load the relevant file where the key should reside back into | |
* memory. The lookup then resolves normally. Note that random access to | |
* different key can't result in a huge I/O overhead, so it's wise to plan your | |
* usage around handling keys in sequentially in batches that match their | |
* location in the key-range files. | |
* | |
* Deletions & updates are simply a lookup followed by an in-memory update. | |
* | |
* Open Issues: | |
* 1. Thread safety (currently it's thread unsafe!) | |
* In go, maps aren't thread-safe. Is that good enough for us? | |
* 2. Managing file descriptors, different writers, locations, etc. | |
* 3. Loading/Splitting might cause unnecessary grow and shrink operations | |
* on the underlying hashtable | |
* | |
*/ | |
package main | |
import "io" | |
import "os" | |
import "fmt" | |
import "sort" | |
import "math" | |
import "io/ioutil" | |
// the hashable key object | |
type Key interface { | |
Hash() int | |
} | |
// any value to save for that key | |
type Value interface{} | |
type KV struct { | |
k int | |
v string | |
} | |
type Map struct { | |
// the in-memory map | |
Data map[int]string | |
// maximum number of items to grow to | |
Max int | |
// the active memory range | |
MemRange *KeyRange | |
// list of all file key-ranges. | |
FileRanges []*KeyRange | |
// dirty flag indicates if the in-memory key-range has been modified and | |
// thus needs to be re-saved before additional loads | |
Dirty bool | |
} | |
type KeyRange struct { | |
Min int // inclusive | |
Max int // exclusive | |
File *os.File | |
} | |
func New(max int) *Map { | |
return &Map{ | |
Max: max, | |
Data: make(map[int]string), | |
MemRange: &KeyRange{Min: math.MinInt64, Max: math.MaxInt64}, | |
FileRanges: make([]*KeyRange, 0), | |
} | |
} | |
/** | |
* Sets a new value to key K. | |
* | |
* When still below the max-threshold, the key is inserted in memory. This is | |
* also the case when the key is within the currently active key-range. In both | |
* these cases, if the new insertion exceeds Max, a Split operation will cut the | |
* map size in half. | |
* | |
* Otherwise, the key-value pairs are appended to the relevant key-range file. | |
*/ | |
func (m *Map) Set(k int, v string) error { | |
if k >= m.MemRange.Min && k < m.MemRange.Max { | |
// it belongs in-memory, just add it. | |
m.Data[k] = v | |
m.Dirty = true | |
// if we exceeded Max, split the map. | |
if len(m.Data) > m.Max { | |
return m.Split() | |
} | |
return nil | |
} else { | |
// find the relevant key-range to place the key | |
kr := m.KeyRangeFor(k) | |
// save it to the relevant file. | |
tofile := make([]byte, 2) | |
tofile[0] = byte(k) // key | |
tofile[1] = byte(len(v)) // value length | |
tofile = append(tofile, []byte(v)...) // value | |
_, err := kr.File.Write(tofile) | |
return err | |
} | |
} | |
func (m *Map) Get(k int) (string, error) { | |
if k >= m.MemRange.Min && k < m.MemRange.Max { | |
// should reside in memory | |
return m.Data[k], nil | |
} else { | |
// should reside in a key-range file | |
kr := m.KeyRangeFor(k) | |
// load the key range | |
err := m.Load(kr) | |
if err != nil { | |
return "", err | |
} | |
// try again | |
return m.Get(k) | |
} | |
} | |
/** | |
* Determines where a key is located, returning the key-range that contains it | |
* and a boolean indicating if it's available in memory (supports fast Get()) | |
* or not (slow Get() due to loading) | |
*/ | |
func (m *Map) Find(k int) (*KeyRange, bool) { | |
if k >= m.MemRange.Min && k < m.MemRange.Max { | |
return m.MemRange, true | |
} else { | |
return m.KeyRangeFor(k), false | |
} | |
} | |
/** | |
* Free-up space by cutting the map in half: keeping approx. Max/2 items in | |
* memory, and saving the rest to a file for later probing. | |
* | |
* The split is performed around a Pivot key. The map is then scanned and split: | |
* keeping all keys below the Pivot in-memory, and dumping the rest to a new | |
* file. The file is therefor guaranteed to contain only values above the pivot, | |
* a fact that's used to optimize the Load operation. | |
*/ | |
func (m *Map) Split() error { | |
oldmax := m.MemRange.Max | |
m.MemRange.Max = pivot(m.Data) | |
// update the maximum value of the active in-memory map | |
// breaks the invariant that all memory data is within the bounds | |
fmt.Println("Splitting by", m.MemRange.Max) | |
// create the new key-range | |
kr := &KeyRange{Min: m.MemRange.Max, Max: oldmax} | |
// restore the invariant by dumping all entries that doesn't match to a file | |
err := m.Dump(kr) | |
if err != nil { | |
return err | |
} | |
// TODO: compare len(m.Data) with the original length to verify an effective | |
// pivot. Otherwise, reselect a more aggressive pivot. | |
// remember the newly created key-range to handle cases where future | |
// insertions and lookups can't be performed in-memory and needs to be | |
// appended to this file. | |
m.FileRanges = append(m.FileRanges, kr) | |
return nil | |
} | |
/** | |
* Load a key-range from file back into memory. | |
*/ | |
func (m *Map) Load(kr *KeyRange) error { | |
fmt.Println("Loading", kr) | |
// before loading - save the current in-memory map to a new key-range | |
// for later probing and insertions. Clear the table while we're at it. | |
if m.Dirty { | |
err := m.Dump(m.MemRange) | |
m.Dirty = false | |
if err != nil { | |
return err | |
} | |
} | |
// remember the key-range to handle cases where future insertions and | |
// lookups can't be performed in-memory and needs to be appended to this | |
// file. | |
m.FileRanges = append(m.FileRanges, m.MemRange) | |
m.MemRange = kr | |
// now, load the requested key-range | |
kr.File.Seek(0, 0) | |
header := make([]byte, 2) | |
for { | |
_, err := kr.File.Read(header) | |
if err == io.EOF { | |
break | |
} | |
k := int(header[0]) | |
l := int(header[1]) | |
body := make([]byte, l) | |
_, err = kr.File.Read(body) | |
if err != nil { | |
return err | |
} | |
// set it using the normal set function to support further splitting | |
// while loading. This can happen when the key-range being appended to | |
// beyond the Max limit. | |
err = m.Set(k, string(body)) | |
if err != nil { | |
return err | |
} | |
} | |
// close the file as we're not expected to use it again? | |
return nil | |
} | |
// helper function for dumping a specific key-range from memory to file | |
func (m *Map) Dump(kr *KeyRange) error { | |
tofile := make([]byte, 0) | |
for k, v := range m.Data { | |
if k >= kr.Min && k < kr.Max { | |
delete(m.Data, k) // free it up. | |
// add it to the file | |
tofile = append(tofile, byte(k)) // key | |
tofile = append(tofile, byte(len(v))) // value length | |
tofile = append(tofile, []byte(v)...) // value | |
} | |
} | |
// save the new key-range to file | |
var err error | |
if kr.File == nil { | |
prefix := fmt.Sprintf("externalmap.%d-%d", kr.Min, kr.Max) | |
kr.File, err = ioutil.TempFile("", prefix) | |
if err != nil { | |
return err | |
} | |
} | |
// write it. | |
_, err = kr.File.Write(tofile) | |
return err | |
} | |
/** | |
* Select a pivot key to split the in-memory map by. The pivot is selected to as | |
* the estimated median key of the map, implemented with Bentley and McIlroy’s | |
* pseudomedian of nine. | |
*/ | |
func pivot(data map[int]string) int { | |
samples := make([]int, 0, 9) | |
i := 0 | |
l := len(data) | |
for k, _ := range data { | |
if i == 0 || i == l/2 || i == l - 1 { | |
samples = append(samples, k) | |
} | |
i += 1 | |
} | |
// take the median of the 3 samples | |
sorted := sort.IntSlice(samples) | |
sort.Sort(sorted) | |
return sorted[1] | |
} | |
func (m *Map) KeyRangeFor(k int) *KeyRange { | |
for _, kr := range m.FileRanges { | |
if k >= kr.Min && k < kr.Max { | |
return kr | |
} | |
} | |
return nil | |
} | |
// -- test & play. | |
func main() { | |
m := New(30) | |
for i := 0 ; i < 100 ; i += 1 { | |
m.Set(i, fmt.Sprintf("i%d", i)) | |
fmt.Println("Set", i) | |
} | |
for i := 0 ; i < 100 ; i += 1 { | |
v, _ := m.Get(i) | |
fmt.Println(i, v) | |
} | |
// m.Set(1, "hello") | |
// m.Set(2, "world") | |
// m.Set(3, "foo") | |
// m.Set(4, "bar") | |
// m.Set(5, "cookie") | |
// fmt.Println("Pre-split=", m.Data) | |
// m.Set(6, "monster") | |
// fmt.Println("Post-split=", m.Data) | |
// m.Set(7, "happy") | |
// fmt.Println("Excessive insert=", m.Data) | |
// v, _ := m.Get(2) | |
// fmt.Println("Get(2)=", v) | |
// fmt.Println("Pre-load=", m.Data) | |
// v, _ = m.Get(6) | |
// fmt.Println("Get(6)=", v) | |
// fmt.Println("Post-load=", m.Data) | |
// v, _ = m.Get(2) | |
// fmt.Println("Get(2)=", v) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment