Skip to content

Instantly share code, notes, and snippets.

@avinoamr
Created June 16, 2016 20:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avinoamr/0961495ed737c36e93e2866fb70854f3 to your computer and use it in GitHub Desktop.
Save avinoamr/0961495ed737c36e93e2866fb70854f3 to your computer and use it in GitHub Desktop.
/**
* External Map is a wrapper around Go's built-in map types to support growing
* the number of keys beyond the available memory by dumping parts of it to
* disk.
*
* It starts out as normal in-memory map, and a predefined threshold for
* its maximum capacity (Max), which is the maximum number of items the map will
* contain. When Max is reached, a Split operation takes place that cuts
* the map in half - keeping ~Max/2 items in-memory and saving the rest
* to disk. As a side-effect, the Split also maintains a small in-memory
* dictionary of all available files and which key-range they contain in order
* to allow lookups.
*
* When a lookup occurs, if it can't be resolved in memory, a Load operation
* takes place to load the relevant file where the key should reside back into
* memory. The lookup then resolves normally. Note that random access to
* different key can't result in a huge I/O overhead, so it's wise to plan your
* usage around handling keys in sequentially in batches that match their
* location in the key-range files.
*
* Deletions & updates are simply a lookup followed by an in-memory update.
*
* Open Issues:
* 1. Thread safety (currently it's thread unsafe!)
* In go, maps aren't thread-safe. Is that good enough for us?
* 2. Managing file descriptors, different writers, locations, etc.
* 3. Loading/Splitting might cause unnecessary grow and shrink operations
* on the underlying hashtable
*
*/
package main
import "io"
import "os"
import "fmt"
import "sort"
import "math"
import "io/ioutil"
// the hashable key object
type Key interface {
Hash() int
}
// any value to save for that key
type Value interface{}
type KV struct {
k int
v string
}
type Map struct {
// the in-memory map
Data map[int]string
// maximum number of items to grow to
Max int
// the active memory range
MemRange *KeyRange
// list of all file key-ranges.
FileRanges []*KeyRange
// dirty flag indicates if the in-memory key-range has been modified and
// thus needs to be re-saved before additional loads
Dirty bool
}
type KeyRange struct {
Min int // inclusive
Max int // exclusive
File *os.File
}
func New(max int) *Map {
return &Map{
Max: max,
Data: make(map[int]string),
MemRange: &KeyRange{Min: math.MinInt64, Max: math.MaxInt64},
FileRanges: make([]*KeyRange, 0),
}
}
/**
* Sets a new value to key K.
*
* When still below the max-threshold, the key is inserted in memory. This is
* also the case when the key is within the currently active key-range. In both
* these cases, if the new insertion exceeds Max, a Split operation will cut the
* map size in half.
*
* Otherwise, the key-value pairs are appended to the relevant key-range file.
*/
func (m *Map) Set(k int, v string) error {
if k >= m.MemRange.Min && k < m.MemRange.Max {
// it belongs in-memory, just add it.
m.Data[k] = v
m.Dirty = true
// if we exceeded Max, split the map.
if len(m.Data) > m.Max {
return m.Split()
}
return nil
} else {
// find the relevant key-range to place the key
kr := m.KeyRangeFor(k)
// save it to the relevant file.
tofile := make([]byte, 2)
tofile[0] = byte(k) // key
tofile[1] = byte(len(v)) // value length
tofile = append(tofile, []byte(v)...) // value
_, err := kr.File.Write(tofile)
return err
}
}
func (m *Map) Get(k int) (string, error) {
if k >= m.MemRange.Min && k < m.MemRange.Max {
// should reside in memory
return m.Data[k], nil
} else {
// should reside in a key-range file
kr := m.KeyRangeFor(k)
// load the key range
err := m.Load(kr)
if err != nil {
return "", err
}
// try again
return m.Get(k)
}
}
/**
* Determines where a key is located, returning the key-range that contains it
* and a boolean indicating if it's available in memory (supports fast Get())
* or not (slow Get() due to loading)
*/
func (m *Map) Find(k int) (*KeyRange, bool) {
if k >= m.MemRange.Min && k < m.MemRange.Max {
return m.MemRange, true
} else {
return m.KeyRangeFor(k), false
}
}
/**
* Free-up space by cutting the map in half: keeping approx. Max/2 items in
* memory, and saving the rest to a file for later probing.
*
* The split is performed around a Pivot key. The map is then scanned and split:
* keeping all keys below the Pivot in-memory, and dumping the rest to a new
* file. The file is therefor guaranteed to contain only values above the pivot,
* a fact that's used to optimize the Load operation.
*/
func (m *Map) Split() error {
oldmax := m.MemRange.Max
m.MemRange.Max = pivot(m.Data)
// update the maximum value of the active in-memory map
// breaks the invariant that all memory data is within the bounds
fmt.Println("Splitting by", m.MemRange.Max)
// create the new key-range
kr := &KeyRange{Min: m.MemRange.Max, Max: oldmax}
// restore the invariant by dumping all entries that doesn't match to a file
err := m.Dump(kr)
if err != nil {
return err
}
// TODO: compare len(m.Data) with the original length to verify an effective
// pivot. Otherwise, reselect a more aggressive pivot.
// remember the newly created key-range to handle cases where future
// insertions and lookups can't be performed in-memory and needs to be
// appended to this file.
m.FileRanges = append(m.FileRanges, kr)
return nil
}
/**
* Load a key-range from file back into memory.
*/
func (m *Map) Load(kr *KeyRange) error {
fmt.Println("Loading", kr)
// before loading - save the current in-memory map to a new key-range
// for later probing and insertions. Clear the table while we're at it.
if m.Dirty {
err := m.Dump(m.MemRange)
m.Dirty = false
if err != nil {
return err
}
}
// remember the key-range to handle cases where future insertions and
// lookups can't be performed in-memory and needs to be appended to this
// file.
m.FileRanges = append(m.FileRanges, m.MemRange)
m.MemRange = kr
// now, load the requested key-range
kr.File.Seek(0, 0)
header := make([]byte, 2)
for {
_, err := kr.File.Read(header)
if err == io.EOF {
break
}
k := int(header[0])
l := int(header[1])
body := make([]byte, l)
_, err = kr.File.Read(body)
if err != nil {
return err
}
// set it using the normal set function to support further splitting
// while loading. This can happen when the key-range being appended to
// beyond the Max limit.
err = m.Set(k, string(body))
if err != nil {
return err
}
}
// close the file as we're not expected to use it again?
return nil
}
// helper function for dumping a specific key-range from memory to file
func (m *Map) Dump(kr *KeyRange) error {
tofile := make([]byte, 0)
for k, v := range m.Data {
if k >= kr.Min && k < kr.Max {
delete(m.Data, k) // free it up.
// add it to the file
tofile = append(tofile, byte(k)) // key
tofile = append(tofile, byte(len(v))) // value length
tofile = append(tofile, []byte(v)...) // value
}
}
// save the new key-range to file
var err error
if kr.File == nil {
prefix := fmt.Sprintf("externalmap.%d-%d", kr.Min, kr.Max)
kr.File, err = ioutil.TempFile("", prefix)
if err != nil {
return err
}
}
// write it.
_, err = kr.File.Write(tofile)
return err
}
/**
* Select a pivot key to split the in-memory map by. The pivot is selected to as
* the estimated median key of the map, implemented with Bentley and McIlroy’s
* pseudomedian of nine.
*/
func pivot(data map[int]string) int {
samples := make([]int, 0, 9)
i := 0
l := len(data)
for k, _ := range data {
if i == 0 || i == l/2 || i == l - 1 {
samples = append(samples, k)
}
i += 1
}
// take the median of the 3 samples
sorted := sort.IntSlice(samples)
sort.Sort(sorted)
return sorted[1]
}
func (m *Map) KeyRangeFor(k int) *KeyRange {
for _, kr := range m.FileRanges {
if k >= kr.Min && k < kr.Max {
return kr
}
}
return nil
}
// -- test & play.
func main() {
m := New(30)
for i := 0 ; i < 100 ; i += 1 {
m.Set(i, fmt.Sprintf("i%d", i))
fmt.Println("Set", i)
}
for i := 0 ; i < 100 ; i += 1 {
v, _ := m.Get(i)
fmt.Println(i, v)
}
// m.Set(1, "hello")
// m.Set(2, "world")
// m.Set(3, "foo")
// m.Set(4, "bar")
// m.Set(5, "cookie")
// fmt.Println("Pre-split=", m.Data)
// m.Set(6, "monster")
// fmt.Println("Post-split=", m.Data)
// m.Set(7, "happy")
// fmt.Println("Excessive insert=", m.Data)
// v, _ := m.Get(2)
// fmt.Println("Get(2)=", v)
// fmt.Println("Pre-load=", m.Data)
// v, _ = m.Get(6)
// fmt.Println("Get(6)=", v)
// fmt.Println("Post-load=", m.Data)
// v, _ = m.Get(2)
// fmt.Println("Get(2)=", v)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment