HackerFoo/lookahead-diff.patch Secret

## lookahead-diff.patch
diff --git a/libs/libvtrcapnproto/connection_map.capnp b/libs/libvtrcapnproto/connection_map.capnp
index a0192a57c..30b986415 100644
--- a/libs/libvtrcapnproto/connection_map.capnp
+++ b/libs/libvtrcapnproto/connection_map.capnp
@@ -13,8 +13,13 @@ struct VprVector2D {
     y @1 :Int64;
 }

+struct VprFloatEntry {
+    value @0 :Float32;
+}
+
 struct VprCostMap {
     costMap @0 :Matrix.Matrix((Matrix.Matrix(VprCostEntry)));
     offset @1 :Matrix.Matrix(VprVector2D);
     segmentMap @2 :List(Int64);
+    penalty @3 :Matrix.Matrix(VprFloatEntry);
 }
diff --git a/libs/libvtrutil/src/vtr_geometry.h b/libs/libvtrutil/src/vtr_geometry.h
index 38563f518..0545581b3 100644
--- a/libs/libvtrutil/src/vtr_geometry.h
+++ b/libs/libvtrutil/src/vtr_geometry.h
@@ -1,10 +1,12 @@
 #ifndef VTR_GEOMETRY_H
 #define VTR_GEOMETRY_H
 #include "vtr_range.h"
+#include "vtr_assert.h"

 #include <vector>
 #include <tuple>
 #include <limits>
+#include <type_traits>

 namespace vtr {

@@ -78,7 +80,13 @@ class Rect {
     Rect();
     Rect(T left_val, T bottom_val, T right_val, T top_val);
     Rect(Point<T> bottom_left_val, Point<T> top_right_val);
-    Rect(Point<T> bottom_left_val, T size);
+
+    //Constructs a rectangle that only contains the given point
+    //  Rect(p1).contains(p2) => p1 == p2
+    //It is only enabled for integral types, because making this work for floating point types would be difficult and brittle.
+    //The following line only enables the constructor if std::is_integral<T>::value == true
+    template<typename U = T, typename std::enable_if<std::is_integral<U>::value>::type...>
+    Rect(Point<U> point);

   public: //Accessors
     //Co-ordinates
@@ -103,16 +111,13 @@ class Rect {
     bool coincident(Point<T> point) const;

     //Returns true if no points are contained in the rectangle
+    //  rect.empty() => not exists p. rect.contains(p)
+    //  This also implies either the width or height is 0.
     bool empty() const;

     friend bool operator== <>(const Rect<T>& lhs, const Rect<T>& rhs);
     friend bool operator!= <>(const Rect<T>& lhs, const Rect<T>& rhs);

-    template<class U>
-    friend Rect<U> operator|(const Rect<U>& lhs, const Rect<U>& rhs);
-    template<class U>
-    friend Rect<U>& operator|=(Rect<U>& lhs, const Rect<U>& rhs);
-
   public: //Mutators
     //Co-ordinates
     void set_xmin(T xmin_val);
@@ -120,11 +125,27 @@ class Rect {
     void set_xmax(T xmax_val);
     void set_ymax(T ymax_val);

+    //Equivalent to `*this = bounding_box(*this, other)`
+    Rect<T>& expand_bounding_box(const Rect<T>& other);
+
   private:
     Point<T> bottom_left_;
     Point<T> top_right_;
 };

+//Return the smallest rectangle containing both given rectangles
+//Note that this isn't a union and the resulting rectangle may include points not in either given rectangle
+template<class T>
+Rect<T> bounding_box(const Rect<T>& lhs, const Rect<T>& rhs);
+
+//Sample on a uniformly spaced grid within a rectangle
+//  sample(vtr::Rect(l, h), 0, 0, M) == l
+//  sample(vtr::Rect(l, h), M, M, M) == h
+//To avoid the edges, use `sample(r, x+1, y+1, N+1) for x, y, in 0..N-1
+//Only defined for integral types
+template<typename T, typename std::enable_if<std::is_integral<T>::value>::type...>
+Point<T> sample(const vtr::Rect<T>& r, T x, T y, T d);
+
 //A 2D line
 template<class T>
 class Line {
diff --git a/libs/libvtrutil/src/vtr_geometry.tpp b/libs/libvtrutil/src/vtr_geometry.tpp
index 00a6decc2..5fd065b31 100644
--- a/libs/libvtrutil/src/vtr_geometry.tpp
+++ b/libs/libvtrutil/src/vtr_geometry.tpp
@@ -80,11 +80,13 @@ Rect<T>::Rect(Point<T> bottom_left_val, Point<T> top_right_val)
     //pass
 }

+//Only defined for integral types
 template<class T>
-Rect<T>::Rect(Point<T> bottom_left_val, T size)
-    : bottom_left_(bottom_left_val)
-    , top_right_(bottom_left_val.x() + size,
-                 bottom_left_val.y() + size) {
+template<typename U, typename std::enable_if<std::is_integral<U>::value>::type...>
+Rect<T>::Rect(Point<U> point)
+    : bottom_left_(point)
+    , top_right_(point.x() + 1,
+                 point.y() + 1) {
     //pass
 }

@@ -165,6 +167,22 @@ bool operator!=(const Rect<T>& lhs, const Rect<T>& rhs) {
     return !(lhs == rhs);
 }

+template<class T>
+Rect<T> bounding_box(const Rect<T>& lhs, const Rect<T>& rhs) {
+    return Rect<T>(std::min(lhs.xmin(), rhs.xmin()),
+                   std::min(lhs.ymin(), rhs.ymin()),
+                   std::max(lhs.xmax(), rhs.xmax()),
+                   std::max(lhs.ymax(), rhs.ymax()));
+}
+
+//Only defined for integral types
+template<typename T, typename std::enable_if<std::is_integral<T>::value>::type...>
+Point<T> sample(const vtr::Rect<T>& r, T x, T y, T d) {
+    VTR_ASSERT(d > 0);
+    return Point<T>((r.xmin() * (d - x) + r.xmax() * x + d / 2) / d,
+                    (r.ymin() * (d - y) + r.ymax() * y + d / 2) / d);
+}
+
 template<class T>
 void Rect<T>::set_xmin(T xmin_val) {
     bottom_left_.set_x(xmin_val);
@@ -186,16 +204,9 @@ void Rect<T>::set_ymax(T ymax_val) {
 }

 template<class T>
-Rect<T> operator|(const Rect<T>& lhs, const Rect<T>& rhs) {
-    return Rect<T>(std::min(lhs.xmin(), rhs.xmin()),
-                   std::min(lhs.ymin(), rhs.ymin()),
-                   std::max(lhs.xmax(), rhs.xmax()),
-                   std::max(lhs.ymax(), rhs.ymax()));
-}
-
-template<class T>
-Rect<T>& operator|=(Rect<T>& lhs, const Rect<T>& rhs) {
-    return lhs = lhs | rhs;
+Rect<T>& Rect<T>::expand_bounding_box(const Rect<T>& other) {
+    *this = bounding_box(*this, other);
+    return *this;
 }

 /*
diff --git a/libs/libvtrutil/test/test_geometry.cpp b/libs/libvtrutil/test/test_geometry.cpp
index d005a4e26..8d3d91f44 100644
--- a/libs/libvtrutil/test/test_geometry.cpp
+++ b/libs/libvtrutil/test/test_geometry.cpp
@@ -61,6 +61,7 @@ TEST_CASE("Rect", "[vtr_geometry/Rect]") {
             REQUIRE(r2.contains({6, 4}));
             REQUIRE_FALSE(r2.contains({100, 4}));
             REQUIRE_FALSE(r2.contains(pi_2));
+            REQUIRE(vtr::Rect<int>(pi_1).contains(pi_1));
         }

         SECTION("strictly_contains_int") {
@@ -78,7 +79,19 @@ TEST_CASE("Rect", "[vtr_geometry/Rect]") {
         }

         SECTION("bounds_int") {
-            REQUIRE(r1 == (r3 | r4));
+            REQUIRE(r1 == bounding_box(r3, r4));
+        }
+
+        SECTION("empty_int") {
+            REQUIRE(vtr::Rect<int>().empty());
+        }
+
+        SECTION("sample_int") {
+            auto r = vtr::Rect<int>(pi_1, pi_2);
+            REQUIRE(sample(r, 0, 0, 17) == pi_1);
+            REQUIRE(sample(r, 17, 17, 17) == pi_2);
+            auto inside = sample(r, 3, 11, 17);
+            REQUIRE(r.contains(inside));
         }
     }

@@ -92,6 +105,7 @@ TEST_CASE("Rect", "[vtr_geometry/Rect]") {
         vtr::Rect<float> r4(pf_1, pf_2);
         vtr::Rect<float> r5(pf_1, pf_3);
         vtr::Rect<float> r6(pf_3, pf_2);
+        // vtr::Rect<float> r7(pf_1); // <-- will fail to compile

         SECTION("equality_float") {
             REQUIRE(r3 == r4);
@@ -140,7 +154,11 @@ TEST_CASE("Rect", "[vtr_geometry/Rect]") {
         }

         SECTION("bounds_float") {
-            REQUIRE(r3 == (r5 | r6));
+            REQUIRE(r3 == bounding_box(r5, r6));
+        }
+
+        SECTION("empty_float") {
+            REQUIRE(vtr::Rect<float>().empty());
         }
     }
 }
diff --git a/vpr/src/route/connection_box_lookahead_map.cpp b/vpr/src/route/connection_box_lookahead_map.cpp
index 481d83aac..a6341de4f 100644
--- a/vpr/src/route/connection_box_lookahead_map.cpp
+++ b/vpr/src/route/connection_box_lookahead_map.cpp
@@ -38,101 +38,85 @@
  *
  * See e_representative_entry_method */
 #define REPRESENTATIVE_ENTRY_METHOD util::SMALLEST
-// #define FILL_LIMIT 30

 #define CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_MAPS

 // Sample based an NxN grid of starting segments, where N = SAMPLE_GRID_SIZE
 static constexpr int SAMPLE_GRID_SIZE = 3;

+// Don't continue storing a path after hitting a lower-or-same cost entry.
+static constexpr bool BREAK_ON_MISS = false;
+
 // Distance penalties filling are calculated based on available samples, but can be adjusted with this factor.
 static constexpr float PENALTY_FACTOR = 1.f;
+static constexpr float PENALTY_MIN = 1e-12f;
+
+static constexpr int MIN_PATH_COUNT = 1000;
+
+// quantiles (like percentiles but 0-1) of segment count to use as a selection criteria
+// choose locations with higher, but not extreme, counts of each segment type
+static constexpr double kSamplingCountLowerQuantile = 0.5;
+static constexpr double kSamplingCountUpperQuantile = 0.7;

 // a sample point for a segment type, contains all segments at the VPR location
 struct SamplePoint {
-    uint64_t order; // used to order sample points
+    // canonical location
     vtr::Point<int> location;
-    std::vector<ssize_t> samples;
-    SamplePoint()
-        : location(0, 0) {}
-};

-// a grid of sample points
-struct SampleGrid {
-    SamplePoint point[SAMPLE_GRID_SIZE][SAMPLE_GRID_SIZE];
+    // nodes to expand
+    std::vector<ssize_t> nodes;
 };

-// implements a simple cache of key(K)/value(V) pairs of N entries
-template<class K, class V, int N>
-class SimpleCache {
-  public:
-    SimpleCache()
-        : pos(0)
-        , hits(0)
-        , misses(0) {}
-
-    // O(N) lookup
-    bool get(K key, V* value) {
-        for (int i = 0; i < N; i++) {
-            auto& k = keys[i];
-            if (k == key) {
-                auto& v = values[i];
-#if defined(CONNECTION_BOX_LOOKAHEAD_PUSH_BACK_HITS)
-                // preserve the found key by pushing it back
-                int last = (pos + N - 1) % N;
-                std::swap(k, keys[last]);
-                std::swap(v, values[last]);
-#endif
-                *value = v;
-                hits++;
-                return true;
-            }
-        }
-        misses++;
-        return false;
-    }
+struct SampleRegion {
+    // all nodes in `points' have this segment type
+    int segment_type;

-    // O(1) insertion (overwriting an older entry)
-    void insert(K key, V val) {
-        keys[pos] = key;
-        values[pos] = val;
-        pos = (pos + 1) % N;
-    }
+    // location on the sample grid
+    vtr::Point<int> grid_location;

-    // ratio of successful lookups
-    float hit_ratio() {
-        return hits ? static_cast<float>(hits) / static_cast<float>(hits + misses) : 0.f;
-    }
+    // locations to try
+    // The computation will keep expanding each of the points
+    // until a number of paths (segment -> connection box) are found.
+    std::vector<SamplePoint> points;

-    // ratio of unsuccessful lookups
-    float miss_ratio() {
-        return misses ? static_cast<float>(misses) / static_cast<float>(hits + misses) : 0.f;
-    }
-
-  private:
-    std::array<K, N> keys; // keep keys together for faster scanning
-    std::array<V, N> values;
-    size_t pos;
-    uint64_t hits;
-    uint64_t misses;
+    // used to sort the regions to improve caching
+    uint64_t order;
 };

 template<typename Entry>
-static void run_dijkstra(int start_node_ind,
-                         RoutingCosts* routing_costs);
-static void find_inodes_for_segment_types(std::vector<SampleGrid>* inodes_for_segment);
+static std::pair<float, int> run_dijkstra(int start_node_ind,
+                                          RoutingCosts* routing_costs);
+
+static std::vector<SampleRegion> find_sample_regions(int num_segments);

 // also known as the L1 norm
 static int manhattan_distance(const vtr::Point<int>& a, const vtr::Point<int>& b) {
     return abs(b.x() - a.x()) + abs(b.y() - a.y());
 }

+template<class T>
+constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+    return std::min(std::max(v, lo), hi);
+}
+
+template<typename T>
+static vtr::Point<T> closest_point_in_rect(const vtr::Rect<T>& r, const vtr::Point<T>& p) {
+    if (r.empty()) {
+        return vtr::Point<T>(0, 0);
+    } else {
+        return vtr::Point<T>(clamp<T>(p.x(), r.xmin(), r.xmax() - 1),
+                             clamp<T>(p.y(), r.ymin(), r.ymax() - 1));
+    }
+}
+
 // resize internal data structures
 void CostMap::set_counts(size_t seg_count, size_t box_count) {
     cost_map_.clear();
     offset_.clear();
+    penalty_.clear();
     cost_map_.resize({seg_count, box_count});
     offset_.resize({seg_count, box_count});
+    penalty_.resize({seg_count, box_count});
     seg_count_ = seg_count;
     box_count_ = box_count;

@@ -153,57 +137,39 @@ int CostMap::node_to_segment(int from_node_ind) const {
     return segment_map_[from_node_ind];
 }

+static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
+    penalty = std::max(penalty, PENALTY_MIN);
+    return util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR,
+                            entry.congestion, entry.fill);
+}
+
 // get a cost entry for a segment type, connection box type, and offset
-util::Cost_Entry CostMap::find_cost(int from_seg_index, ConnectionBoxId box_id, int delta_x, int delta_y, int* out_of_bounds_val) const {
+util::Cost_Entry CostMap::find_cost(int from_seg_index, ConnectionBoxId box_id, int delta_x, int delta_y) const {
     VTR_ASSERT(from_seg_index >= 0 && from_seg_index < (ssize_t)offset_.size());
     const auto& cost_map = cost_map_[from_seg_index][size_t(box_id)];
     if (cost_map.dim_size(0) == 0 || cost_map.dim_size(1) == 0) {
         return util::Cost_Entry();
     }

-    int dx = delta_x - offset_[from_seg_index][size_t(box_id)].first;
-    int dy = delta_y - offset_[from_seg_index][size_t(box_id)].second;
-
-    int out_of_bounds_distance = 0;
-    if (dx < 0) {
-        out_of_bounds_distance += abs(dx);
-        dx = 0;
-    }
-    if (dy < 0) {
-        out_of_bounds_distance += abs(dy);
-        dy = 0;
-    }
-
-    if (dx >= (ssize_t)cost_map.dim_size(0)) {
-        out_of_bounds_distance += abs(int(dx) - int(cost_map.dim_size(0) - 1));
-        dx = cost_map.dim_size(0) - 1;
-    }
-    if (dy >= (ssize_t)cost_map.dim_size(1)) {
-        out_of_bounds_distance += abs(int(dy) - int(cost_map.dim_size(1) - 1));
-        dy = cost_map.dim_size(1) - 1;
-    }
-
-    if (out_of_bounds_val != nullptr) {
-        *out_of_bounds_val = out_of_bounds_distance;
-    }
-
-    return cost_map_[from_seg_index][size_t(box_id)][dx][dy];
-}
-
-static util::Cost_Entry penalize(const util::Cost_Entry& entry, int distance, float penalty) {
-    auto out = util::Cost_Entry(entry.delay + distance * penalty * PENALTY_FACTOR,
-                                entry.congestion);
-
-    out.fill = entry.fill;
-    return out;
+    vtr::Point<int> coord(delta_x - offset_[from_seg_index][size_t(box_id)].first,
+                          delta_y - offset_[from_seg_index][size_t(box_id)].second);
+    vtr::Rect<int> bounds(0, 0, cost_map.dim_size(0), cost_map.dim_size(1));
+    auto closest = closest_point_in_rect(bounds, coord);
+    auto cost = cost_map_[from_seg_index][size_t(box_id)][closest.x()][closest.y()];
+    float penalty = penalty_[from_seg_index][size_t(box_id)];
+    auto distance = manhattan_distance(closest, coord);
+    return penalize(cost, distance, penalty);
 }

 // set the cost map for a segment type and connection box type, filling holes
-void CostMap::set_cost_map(const RoutingCosts& costs) {
+void CostMap::set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs) {
     // calculate the bounding boxes
     vtr::Matrix<vtr::Rect<int>> bounds({seg_count_, box_count_});
-    for (const auto& entry : costs) {
-        bounds[entry.first.seg_index][size_t(entry.first.box_id)] |= vtr::Rect<int>(entry.first.delta, 1);
+    for (const auto& entry : delay_costs) {
+        bounds[entry.first.seg_index][size_t(entry.first.box_id)].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
+    }
+    for (const auto& entry : base_costs) {
+        bounds[entry.first.seg_index][size_t(entry.first.box_id)].expand_bounding_box(vtr::Rect<int>(entry.first.delta));
     }

     // store bounds
@@ -225,18 +191,27 @@ void CostMap::set_cost_map(const RoutingCosts& costs) {
     }

     // store entries into the matrices
-    for (const auto& entry : costs) {
+    for (const auto& entry : delay_costs) {
+        int seg = entry.first.seg_index;
+        int box = size_t(entry.first.box_id);
+        const auto& seg_box_bounds = bounds[seg][box];
+        int x = entry.first.delta.x() - seg_box_bounds.xmin();
+        int y = entry.first.delta.y() - seg_box_bounds.ymin();
+        cost_map_[seg][box][x][y].delay = entry.second;
+    }
+    for (const auto& entry : base_costs) {
         int seg = entry.first.seg_index;
         int box = size_t(entry.first.box_id);
         const auto& seg_box_bounds = bounds[seg][box];
         int x = entry.first.delta.x() - seg_box_bounds.xmin();
         int y = entry.first.delta.y() - seg_box_bounds.ymin();
-        cost_map_[seg][box][x][y] = entry.second;
+        cost_map_[seg][box][x][y].congestion = entry.second;
     }

     // fill the holes
     for (size_t seg = 0; seg < seg_count_; seg++) {
         for (size_t box = 0; box < box_count_; box++) {
+            penalty_[seg][box] = std::numeric_limits<float>::infinity();
             const auto& seg_box_bounds = bounds[seg][box];
             if (seg_box_bounds.empty()) {
                 continue;
@@ -244,7 +219,7 @@ void CostMap::set_cost_map(const RoutingCosts& costs) {
             auto& matrix = cost_map_[seg][box];

             // calculate delay penalty
-            float min_delay = 0.f, max_delay = 0.f;
+            float min_delay = std::numeric_limits<float>::infinity(), max_delay = 0.f;
             vtr::Point<int> min_location(0, 0), max_location(0, 0);
             for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
                 for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
@@ -262,6 +237,7 @@ void CostMap::set_cost_map(const RoutingCosts& costs) {
                 }
             }
             float delay_penalty = (max_delay - min_delay) / static_cast<float>(std::max(1, manhattan_distance(max_location, min_location)));
+            penalty_[seg][box] = delay_penalty;

             // find missing cost entries and fill them in by copying a nearby cost entry
             std::vector<std::tuple<unsigned, unsigned, util::Cost_Entry>> missing;
@@ -284,11 +260,10 @@ void CostMap::set_cost_map(const RoutingCosts& costs) {
                         }
                     }
                 }
-#if !defined(FILL_LIMIT)
                 if (couldnt_fill) {
+                    // give up trying to fill an empty matrix
                     break;
                 }
-#endif
             }

             if (!couldnt_fill) {
@@ -303,13 +278,11 @@ void CostMap::set_cost_map(const RoutingCosts& costs) {
             if (couldnt_fill) {
                 VTR_LOG_WARN("Couldn't fill holes in the cost matrix for %d -> %ld, %d x %d bounding box\n",
                              seg, box, seg_box_bounds.width(), seg_box_bounds.height());
-#if !defined(FILL_LIMIT)
                 for (unsigned y = 0; y < matrix.dim_size(1); y++) {
                     for (unsigned x = 0; x < matrix.dim_size(0); x++) {
                         VTR_ASSERT(!matrix[x][y].valid());
                     }
                 }
-#endif
             }
         }
     }
@@ -325,11 +298,11 @@ void CostMap::print(int iseg) const {
          box_id < device_ctx.connection_boxes.num_connection_box_types();
          box_id++) {
         auto& matrix = cost_map_[iseg][box_id];
-        VTR_LOG("cost for box_id = %lu (%zu, %zu)\n", box_id, matrix.dim_size(0), matrix.dim_size(1));
         if (matrix.dim_size(0) == 0 || matrix.dim_size(1) == 0) {
             VTR_LOG("cost EMPTY for box_id = %lu\n", box_id);
             continue;
         }
+        VTR_LOG("cost for box_id = %lu (%zu, %zu)\n", box_id, matrix.dim_size(0), matrix.dim_size(1));
         double sum = 0.0;
         for (unsigned iy = 0; iy < matrix.dim_size(1); iy++) {
             for (unsigned ix = 0; ix < matrix.dim_size(0); ix++) {
@@ -387,46 +360,40 @@ std::pair<util::Cost_Entry, int> CostMap::get_nearby_cost_entry(const vtr::NdMat
                                                                 int cy,
                                                                 const vtr::Rect<int>& bounds) {
     // spiral around (cx, cy) looking for a nearby entry
-    int n = 1;
-    bool in_bounds;
-    util::Cost_Entry entry;
-    entry.fill = true;
+    bool in_bounds = bounds.contains(vtr::Point<int>(cx, cy));
+    if (!in_bounds) {
+        return std::make_pair(util::Cost_Entry(), 0);
+    }
+    int n = 0;
+    util::Cost_Entry fill(matrix[cx][cy]);
+    fill.fill = true;

-    do {
+    while (in_bounds && !fill.valid()) {
+        n++;
         in_bounds = false;
+        util::Cost_Entry min_entry;
         for (int ox = -n; ox <= n; ox++) {
             int x = cx + ox;
             int oy = n - abs(ox);
             int yp = cy + oy;
             int yn = cy - oy;
             if (bounds.contains(vtr::Point<int>(x, yp))) {
-                assign_min_entry(&entry, matrix[x][yp]);
+                assign_min_entry(&min_entry, matrix[x][yp]);
                 in_bounds = true;
             }
             if (bounds.contains(vtr::Point<int>(x, yn))) {
-                assign_min_entry(&entry, matrix[x][yn]);
+                assign_min_entry(&min_entry, matrix[x][yn]);
                 in_bounds = true;
             }
         }
-        if (entry.valid()) return std::make_pair(entry, n);
-        n++;
-#if defined(FILL_LIMIT)
-        if (n > FILL_LIMIT) {
-            break;
+        if (!std::isfinite(fill.delay)) {
+            fill.delay = min_entry.delay;
         }
-#endif
-    } while (in_bounds);
-    return std::make_pair(util::Cost_Entry(), n);
-}
-
-// sample on a uniformly spaced grid within a rectangle
-// sample(vtr::Rect(N), 0, 0, M) is vtr::Point(0, 0)
-// sample(vtr::Rect(N), M, M, M) is vtr::Point(N, N)
-// To avoid the edges, use `sample(r, x+1, y+1, N+1) for x, y, in 0..N-1
-template<typename T>
-vtr::Point<T> sample(const vtr::Rect<T>& r, T x, T y, T d) {
-    return vtr::Point<T>((r.xmin() * (d - x) + r.xmax() * x + d / 2) / d,
-                         (r.ymin() * (d - y) + r.ymax() * y + d / 2) / d);
+        if (!std::isfinite(fill.congestion)) {
+            fill.congestion = min_entry.congestion;
+        }
+    }
+    return std::make_pair(fill, n);
 }

 // derive a cost from the map between two nodes
@@ -479,7 +446,7 @@ float ConnectionBoxMapLookahead::get_map_cost(int from_node_ind,
     ssize_t dy = ssize_t(from_canonical_loc->second) - ssize_t(box_location.second);

     int from_seg_index = cost_map_.node_to_segment(from_node_ind);
-    util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, box_id, dx, dy, nullptr);
+    util::Cost_Entry cost_entry = cost_map_.find_cost(from_seg_index, box_id, dx, dy);

     if (!cost_entry.valid()) {
         // there is no route
@@ -521,51 +488,41 @@ float ConnectionBoxMapLookahead::get_map_cost(int from_node_ind,
 }

 // add a best cost routing path from start_node_ind to node_ind to routing costs
-static void add_paths(int start_node_ind,
-                      int node_ind,
+template<typename Entry>
+static bool add_paths(int start_node_ind,
+                      Entry current,
                       std::unordered_map<int, util::Search_Path>* paths,
                       RoutingCosts* routing_costs) {
     auto& device_ctx = g_vpr_ctx.device();
     ConnectionBoxId box_id;
     std::pair<size_t, size_t> box_location;
     float site_pin_delay;
+    int node_ind = current.rr_node_ind;
     bool found = device_ctx.connection_boxes.find_connection_box(
         node_ind, &box_id, &box_location, &site_pin_delay);
+    bool new_sample_found = false;
     if (!found) {
         VPR_THROW(VPR_ERROR_ROUTE, "No connection box for IPIN %d", node_ind);
     }

     // reconstruct the path
     std::vector<int> path;
-    for (int i = node_ind; i != start_node_ind; path.push_back(i = (*paths)[i].parent))
-        ;
-    util::PQ_Entry parent_entry(start_node_ind, UNDEFINED, 0, 0, 0, true, /*Tsw_adjust=*/0.f);
-
-    // recalculate the path with congestion
-    util::PQ_Entry current_full = parent_entry;
-    int parent = start_node_ind;
-    for (auto it = path.rbegin(); it != path.rend(); it++) {
-        auto& parent_node = device_ctx.rr_nodes[parent];
-        float Tsw_adjust = 0.f;
-
-        // Remove site pin delay when taking edge from last channel to IPIN.
-        if (*it == node_ind) {
-            Tsw_adjust = -site_pin_delay;
-        }
-        current_full = util::PQ_Entry(*it, parent_node.edge_switch((*paths)[*it].edge), current_full.delay,
-                                      current_full.R_upstream, current_full.congestion_upstream, false, Tsw_adjust);
-        parent = *it;
+    for (int i = (*paths)[node_ind].parent; i != start_node_ind; i = (*paths)[i].parent) {
+        path.push_back(i);
     }
+    path.push_back(start_node_ind);
+
+    current.adjust_Tsw(-site_pin_delay);

     // add each node along the path subtracting the incremental costs from the current costs
-    parent = start_node_ind;
+    Entry start_to_here(start_node_ind, UNDEFINED, nullptr);
+    int parent = start_node_ind;
     for (auto it = path.rbegin(); it != path.rend(); it++) {
-        auto& parent_node = device_ctx.rr_nodes[parent];
-        int seg_index = device_ctx.rr_indexed_data[parent_node.cost_index()].seg_index;
-        const std::pair<size_t, size_t>* from_canonical_loc = device_ctx.connection_boxes.find_canonical_loc(parent);
+        auto& here = device_ctx.rr_nodes[*it];
+        int seg_index = device_ctx.rr_indexed_data[here.cost_index()].seg_index;
+        const std::pair<size_t, size_t>* from_canonical_loc = device_ctx.connection_boxes.find_canonical_loc(*it);
         if (from_canonical_loc == nullptr) {
-            VPR_THROW(VPR_ERROR_ROUTE, "No canonical location of node %d",
-                      parent);
+            VPR_THROW(VPR_ERROR_ROUTE, "No canonical location of node %d", *it);
         }

         vtr::Point<int> delta(ssize_t(from_canonical_loc->first) - ssize_t(box_location.first),
@@ -574,36 +531,52 @@ static void add_paths(int start_node_ind,
             seg_index,
             box_id,
             delta};
-        CompressedRoutingCostKey compressed_key(key);

-        float new_delay = current_full.delay - parent_entry.delay;
-        float new_congestion = current_full.congestion_upstream - parent_entry.congestion_upstream;
+        if (*it != start_node_ind) {
+            auto& parent_node = device_ctx.rr_nodes[parent];
+            start_to_here = Entry(*it, parent_node.edge_switch((*paths)[*it].edge), &start_to_here);
+            parent = *it;
+        }

-        VTR_ASSERT(new_delay >= 0.f);
-        VTR_ASSERT(new_congestion >= 0.f);
+        float cost = current.cost() - start_to_here.cost();
+        if (cost < 0.f && cost > -10e-15 /* 10 femtosecond */) {
+            cost = 0.f;
+        }

-        auto val = util::Cost_Entry(new_delay, new_congestion);
+        VTR_ASSERT(cost >= 0.f);

         // NOTE: implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
-        auto result = routing_costs->insert(std::make_pair(key, val));
+        auto result = routing_costs->insert(std::make_pair(key, cost));
         if (!result.second) {
-            assign_min_entry(&result.first->second, val);
+            if (cost < result.first->second) {
+                result.first->second = cost;
+                new_sample_found = true;
+            } else if (BREAK_ON_MISS) {
+                break;
+            }
+        } else {
+            new_sample_found = true;
         }
-
-        // update parent data
-        parent_entry = util::PQ_Entry(*it, parent_node.edge_switch((*paths)[*it].edge), parent_entry.delay,
-                                      parent_entry.R_upstream, parent_entry.congestion_upstream, false, /*Tsw_adjust=*/0.f);
-        parent = *it;
     }
+    return new_sample_found;
 }

-/* runs Dijkstra's algorithm from specified node until all nodes have been
+/* Runs Dijkstra's algorithm from specified node until all nodes have been
  * visited. Each time a pin is visited, the delay/congestion information
- * to that pin is stored to an entry in the routing_cost_map */
+ * to that pin is stored to an entry in the routing_cost_map.
+ *
+ * Returns the maximum (last) minimum cost path stored, and
+ * the number of paths from start_node_ind stored. */
 template<typename Entry>
-static void run_dijkstra(int start_node_ind,
-                         RoutingCosts* routing_costs) {
+static std::pair<float, int> run_dijkstra(int start_node_ind,
+                                          RoutingCosts* routing_costs) {
     auto& device_ctx = g_vpr_ctx.device();
+    int path_count = 0;
+    const std::pair<size_t, size_t>* start_canonical_loc = device_ctx.connection_boxes.find_canonical_loc(start_node_ind);
+    if (start_canonical_loc == nullptr) {
+        VPR_THROW(VPR_ERROR_ROUTE, "No canonical location of node %d",
+                  start_node_ind);
+    }

     /* a list of boolean flags (one for each rr node) to figure out if a
      * certain node has already been expanded */
@@ -617,7 +590,8 @@ static void run_dijkstra(int start_node_ind,
     std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> pq;

     /* first entry has no upstream delay or congestion */
-    Entry first_entry(start_node_ind, UNDEFINED, 0, true);
+    Entry first_entry(start_node_ind, UNDEFINED, nullptr);
+    float max_cost = first_entry.cost();

     pq.push(first_entry);

@@ -635,13 +609,18 @@ static void run_dijkstra(int start_node_ind,

         /* if this node is an ipin record its congestion/delay in the routing_cost_map */
         if (device_ctx.rr_nodes[node_ind].type() == IPIN) {
-            add_paths(start_node_ind, node_ind, &paths, routing_costs);
+            // the last cost should be the highest
+            max_cost = current.cost();
+
+            path_count++;
+            add_paths<Entry>(start_node_ind, current, &paths, routing_costs);
         } else {
-            expand_dijkstra_neighbours(device_ctx.rr_nodes,
-                                       current, paths, node_expanded, pq);
+            util::expand_dijkstra_neighbours(device_ctx.rr_nodes,
+                                             current, paths, node_expanded, pq);
             node_expanded[node_ind] = true;
         }
     }
+    return std::make_pair(max_cost, path_count);
 }

 // compute the cost maps for lookahead
@@ -652,30 +631,7 @@ void ConnectionBoxMapLookahead::compute(const std::vector<t_segment_inf>& segmen
     alloc_and_load_rr_node_route_structs();

     size_t num_segments = segment_inf.size();
-    std::vector<SamplePoint> sample_points;
-    {
-        std::vector<SampleGrid> inodes_for_segment(num_segments);
-        find_inodes_for_segment_types(&inodes_for_segment);
-
-        // collapse into a vector
-        for (auto& grid : inodes_for_segment) {
-            for (int y = 0; y < SAMPLE_GRID_SIZE; y++) {
-                for (int x = 0; x < SAMPLE_GRID_SIZE; x++) {
-                    auto& point = grid.point[y][x];
-                    if (!point.samples.empty()) {
-                        point.order = point.samples[0];
-                        sample_points.push_back(point);
-                    }
-                }
-            }
-        }
-    }
-
-    // sort by VPR coordinate
-    std::sort(sample_points.begin(), sample_points.end(),
-              [](const SamplePoint& a, const SamplePoint& b) {
-                  return a.order < b.order;
-              });
+    std::vector<SampleRegion> sample_regions = find_sample_regions(num_segments);

     /* free previous delay map and allocate new one */
     auto& device_ctx = g_vpr_ctx.device();
@@ -683,42 +639,87 @@ void ConnectionBoxMapLookahead::compute(const std::vector<t_segment_inf>& segmen
                          device_ctx.connection_boxes.num_connection_box_types());

     VTR_ASSERT(REPRESENTATIVE_ENTRY_METHOD == util::SMALLEST);
-    RoutingCosts all_costs;
+    RoutingCosts all_delay_costs;
+    RoutingCosts all_base_costs;

     /* run Dijkstra's algorithm for each segment type & channel type combination */
 #if defined(VPR_USE_TBB)
     tbb::mutex all_costs_mutex;
-    tbb::parallel_for_each(sample_points, [&](const SamplePoint& point) {
+    tbb::parallel_for_each(sample_regions, [&](const SampleRegion& region) {
 #else
-    for (const auto& point : sample_points) {
+    for (const auto& region : sample_regions) {
 #endif
         // holds the cost entries for a run
-        RoutingCosts costs;
+        RoutingCosts delay_costs;
+        RoutingCosts base_costs;
+        int total_path_count = 0;
+
+        for (auto& point : region.points) {
+            // statistics
+            vtr::Timer run_timer;
+            float max_delay_cost = 0.f;
+            float max_base_cost = 0.f;
+            int path_count = 0;
+            for (auto node_ind : point.nodes) {
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Delay>(node_ind, &delay_costs);
+                    max_delay_cost = std::max(max_delay_cost, result.first);
+                    path_count += result.second;
+                }
+                {
+                    auto result = run_dijkstra<util::PQ_Entry_Base_Cost>(node_ind, &base_costs);
+                    max_base_cost = std::max(max_base_cost, result.first);
+                    path_count += result.second;
+                }
+            }

-        for (auto node_ind : point.samples) {
-            run_dijkstra<util::PQ_Entry_Delay>(node_ind, &costs);
-            run_dijkstra<util::PQ_Entry_Base_Cost>(node_ind, &costs);
+            if (path_count > 0) {
+                VTR_LOG_WARN("Expanded %d paths of segment type %s(%d) starting at (%d, %d) from %d segments, max_cost %e %e (%g paths/sec)\n",
+                             path_count, segment_inf[region.segment_type].name.c_str(), region.segment_type,
+                             point.location.x(), point.location.y(),
+                             (int)point.nodes.size(),
+                             max_delay_cost, max_base_cost,
+                             path_count / run_timer.elapsed_sec());
+            }
+
+            /*
+             * if (path_count == 0) {
+             * for (auto node_ind : point.nodes) {
+             * VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str());
+             * }
+             * }
+             */
+
+            total_path_count += path_count;
+            if (total_path_count > MIN_PATH_COUNT) {
+                break;
+            }
         }

 #if defined(VPR_USE_TBB)
         all_costs_mutex.lock();
 #endif

-        for (auto node_ind : point.samples) {
-            VTR_LOG("Expanded node %s\n", describe_rr_node(node_ind).c_str());
+        if (total_path_count == 0) {
+            VTR_LOG("No paths found for sample region %s(%d, %d)\n",
+                    segment_inf[region.segment_type].name.c_str(), region.grid_location.x(), region.grid_location.y());
         }

-        VTR_LOG("Expanded sample point (%d, %d)\n",
-                point.location.x(), point.location.y());
-
         // combine the cost map from this run with the final cost maps for each segment
-        for (const auto& cost : costs) {
-            const auto& key = cost.first;
+        for (const auto& cost : delay_costs) {
             const auto& val = cost.second;
-            auto result = all_costs.insert(std::make_pair(key, val));
+            auto result = all_delay_costs.insert(std::make_pair(cost.first, val));
             if (!result.second) {
                 // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
-                assign_min_entry(&result.first->second, val);
+                result.first->second = std::min(result.first->second, val);
+            }
+        }
+        for (const auto& cost : base_costs) {
+            const auto& val = cost.second;
+            auto result = all_base_costs.insert(std::make_pair(cost.first, val));
+            if (!result.second) {
+                // implements REPRESENTATIVE_ENTRY_METHOD == SMALLEST
+                result.first->second = std::min(result.first->second, val);
             }
         }

@@ -732,7 +733,7 @@ void ConnectionBoxMapLookahead::compute(const std::vector<t_segment_inf>& segmen
     VTR_LOG("Combining results\n");
     /* boil down the cost list in routing_cost_map at each coordinate to a
      * representative cost entry and store it in the lookahead cost map */
-    cost_map_.set_cost_map(all_costs);
+    cost_map_.set_cost_map(all_delay_costs, all_base_costs);

 // diagnostics
 #if defined(CONNECTION_BOX_LOOKAHEAD_MAP_PRINT_COST_ENTRIES)
@@ -785,91 +786,208 @@ float ConnectionBoxMapLookahead::get_expected_cost(
 }

 // the smallest bounding box containing a node
-static vtr::Rect<int> bounding_box_for_node(const t_rr_node& node) {
-    return vtr::Rect<int>(node.xlow(), node.ylow(),
-                          node.xhigh() + 1, node.yhigh() + 1);
+static vtr::Rect<int> bounding_box_for_node(const ConnectionBoxes& connection_boxes, int node_ind) {
+    const std::pair<size_t, size_t>* loc = connection_boxes.find_canonical_loc(node_ind);
+    if (loc == nullptr) {
+        return vtr::Rect<int>();
+    } else {
+        return vtr::Rect<int>(vtr::Point<int>(loc->first, loc->second));
+    }
+}
+
+static vtr::Rect<int> sample_window(const vtr::Rect<int>& bounding_box, int sx, int sy, int n) {
+    return vtr::Rect<int>(sample(bounding_box, sx, sy, n),
+                          sample(bounding_box, sx + 1, sy + 1, n));
+}
+
+static std::vector<SamplePoint> choose_points(const vtr::Matrix<int>& counts,
+                                              const vtr::Rect<int>& window,
+                                              int min_count,
+                                              int max_count) {
+    VTR_ASSERT(min_count <= max_count);
+    std::vector<SamplePoint> points;
+    for (int y = window.ymin(); y < window.ymax(); y++) {
+        for (int x = window.xmin(); x < window.xmax(); x++) {
+            if (counts[x][y] >= min_count && counts[x][y] <= max_count) {
+                points.push_back(SamplePoint{/* .location = */ vtr::Point<int>(x, y),
+                                             /* .nodes = */ {}});
+            }
+        }
+    }
+
+    vtr::Point<int> center = sample(window, 1, 1, 2);
+
+    // sort by distance from center
+    std::sort(points.begin(), points.end(),
+              [&](const SamplePoint& a, const SamplePoint& b) {
+                  return manhattan_distance(a.location, center) < manhattan_distance(b.location, center);
+              });
+
+    return points;
+}
+
+// histogram is a map from segment count to number of locations having that count
+static int quantile(const std::map<int, int>& histogram, float ratio) {
+    if (histogram.empty()) {
+        return 0;
+    }
+    int sum = 0;
+    for (const auto& entry : histogram) {
+        sum += entry.second;
+    }
+    int limit = std::ceil(sum * ratio);
+    for (const auto& entry : histogram) {
+        limit -= entry.second;
+        if (limit <= 0) {
+            return entry.first;
+        }
+    }
+    return 0;
+}
+
+// select a good number of segments to find
+static std::map<int, int> count_histogram(const vtr::Rect<int>& box, const vtr::Matrix<int>& counts) {
+    std::map<int, int> histogram;
+    for (int y = box.ymin(); y < box.ymax(); y++) {
+        for (int x = box.xmin(); x < box.xmax(); x++) {
+            int count = counts[x][y];
+            if (count > 0) {
+                ++histogram[count];
+            }
+        }
+    }
+    return histogram;
 }

-// the center point for a node
-// it is unknown where the the node starts, so use the average
-static vtr::Point<int> point_for_node(const t_rr_node& node) {
-    int x = (node.xhigh() + node.xlow()) / 2;
-    int y = (node.yhigh() + node.ylow()) / 2;
-    return vtr::Point<int>(x, y);
+// Used to calculate each region's `order.'
+// A space-filling curve will order the regions so that
+// nearby points stay close in order. A Hilbert curve might
+// be better, but a Morton (Z)-order curve is easy to compute,
+// because it's just interleaving binary bits, so this
+// function interleaves with 0's so that the X and Y
+// dimensions can then be OR'ed together.
+static uint64_t interleave(uint32_t x) {
+    uint64_t i = x;
+    i = (i ^ (i << 16)) & 0x0000ffff0000ffff;
+    i = (i ^ (i << 8)) & 0x00ff00ff00ff00ff;
+    i = (i ^ (i << 4)) & 0x0f0f0f0f0f0f0f0f;
+    i = (i ^ (i << 2)) & 0x3333333333333333;
+    i = (i ^ (i << 1)) & 0x5555555555555555;
+    return i;
 }

 // for each segment type, find the nearest nodes to an equally spaced grid of points
 // within the bounding box for that segment type
-static void find_inodes_for_segment_types(std::vector<SampleGrid>* inodes_for_segment) {
+static std::vector<SampleRegion> find_sample_regions(int num_segments) {
+    vtr::ScopedStartFinishTimer timer("finding sample regions");
+    std::vector<SampleRegion> sample_regions;
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_nodes = device_ctx.rr_nodes;
-    const int num_segments = inodes_for_segment->size();
+    std::vector<vtr::Matrix<int>> segment_counts(num_segments);

     // compute bounding boxes for each segment type
     std::vector<vtr::Rect<int>> bounding_box_for_segment(num_segments, vtr::Rect<int>());
     for (size_t i = 0; i < rr_nodes.size(); i++) {
         auto& node = rr_nodes[i];
         if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
         int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;

         VTR_ASSERT(seg_index != OPEN);
         VTR_ASSERT(seg_index < num_segments);

-        bounding_box_for_segment[seg_index] |= bounding_box_for_node(node);
+        bounding_box_for_segment[seg_index].expand_bounding_box(bounding_box_for_node(device_ctx.connection_boxes, i));
     }

-    // select an inode near the center of the bounding box for each segment type
-    inodes_for_segment->clear();
-    inodes_for_segment->resize(num_segments);
-    for (auto& grid : *inodes_for_segment) {
-        for (int y = 0; y < SAMPLE_GRID_SIZE; y++) {
-            for (int x = 0; x < SAMPLE_GRID_SIZE; x++) {
-                grid.point[y][x].samples = std::vector<ssize_t>();
-            }
-        }
+    // initialize counts
+    for (int seg = 0; seg < num_segments; seg++) {
+        const auto& box = bounding_box_for_segment[seg];
+        segment_counts[seg] = vtr::Matrix<int>({size_t(box.width()), size_t(box.height())}, 0);
     }

+    // count sample points
     for (size_t i = 0; i < rr_nodes.size(); i++) {
         auto& node = rr_nodes[i];
-        vtr::Rect<int> node_bounds = bounding_box_for_node(node);
         if (node.type() != CHANX && node.type() != CHANY) continue;
-        if (node.capacity() == 0 || device_ctx.connection_boxes.find_canonical_loc(i) == nullptr) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        const std::pair<size_t, size_t>* loc = device_ctx.connection_boxes.find_canonical_loc(i);
+        if (loc == nullptr) continue;

         int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+        segment_counts[seg_index][loc->first][loc->second] += 1;

         VTR_ASSERT(seg_index != OPEN);
         VTR_ASSERT(seg_index < num_segments);
+    }

-        auto& grid = (*inodes_for_segment)[seg_index];
-        for (int sy = 0; sy < SAMPLE_GRID_SIZE; sy++) {
-            for (int sx = 0; sx < SAMPLE_GRID_SIZE; sx++) {
-                auto& point = grid.point[sy][sx];
-                if (point.samples.empty()) {
-                    point.samples.push_back(i);
-                    point.location = vtr::Point<int>(node.xlow(), node.ylow());
-                    goto next_rr_node;
+    // select sample points
+    for (int i = 0; i < num_segments; i++) {
+        const auto& counts = segment_counts[i];
+        const auto& bounding_box = bounding_box_for_segment[i];
+        if (bounding_box.empty()) continue;
+        for (int y = 0; y < SAMPLE_GRID_SIZE; y++) {
+            for (int x = 0; x < SAMPLE_GRID_SIZE; x++) {
+                vtr::Rect<int> window = sample_window(bounding_box, x, y, SAMPLE_GRID_SIZE);
+                auto histogram = count_histogram(window, segment_counts[i]);
+                SampleRegion region = {
+                    /* .segment_type = */ i,
+                    /* .grid_location = */ vtr::Point<int>(x, y),
+                    /* .points = */ choose_points(counts, window, quantile(histogram, kSamplingCountLowerQuantile), quantile(histogram, kSamplingCountUpperQuantile)),
+                    /* .order = */ 0};
+                if (!region.points.empty()) {
+                    /* In order to improve caching, the list of sample points are
+                     * sorted to keep points that are nearby on the Euclidean plane also
+                     * nearby in the vector of sample points.
+                     *
+                     * This means subsequent expansions on the same thread are likely
+                     * to cover a similar set of nodes, so they are more likely to be
+                     * cached. This improves performance by about 7%, which isn't a lot,
+                     * but not a bad improvement for a few lines of code. */
+                    vtr::Point<int> location = region.points[0].location;
+
+                    // interleave bits of X and Y for a Z-curve ordering.
+                    region.order = interleave(location.x()) | (interleave(location.y()) << 1);
+
+                    sample_regions.push_back(region);
                 }
+            }
+        }
+    }

-                if (node_bounds.contains(point.location)) {
-                    point.samples.push_back(i);
-                    goto next_rr_node;
-                }
+    // sort regions
+    std::sort(sample_regions.begin(), sample_regions.end(),
+              [](const SampleRegion& a, const SampleRegion& b) {
+                  return a.order < b.order;
+              });

-                vtr::Point<int> target = sample(bounding_box_for_segment[seg_index], sx + 1, sy + 1, SAMPLE_GRID_SIZE + 1);
-                int distance_new = manhattan_distance(point_for_node(node), target);
-                int distance_stored = manhattan_distance(point.location, target);
-                if (distance_new < distance_stored) {
-                    point.samples.clear();
-                    point.samples.push_back(i);
-                    goto next_rr_node;
-                }
-            }
+    // build an index of sample points on segment type and location
+    std::map<std::tuple<int, int, int>, SamplePoint*> sample_point_index;
+    for (auto& region : sample_regions) {
+        for (auto& point : region.points) {
+            sample_point_index[std::make_tuple(region.segment_type, point.location.x(), point.location.y())] = &point;
         }
+    }
+
+    // collect the node indices for each segment type at the selected sample points
+    for (size_t i = 0; i < rr_nodes.size(); i++) {
+        auto& node = rr_nodes[i];
+        if (node.type() != CHANX && node.type() != CHANY) continue;
+        if (node.capacity() == 0 || node.num_edges() == 0) continue;
+        const std::pair<size_t, size_t>* loc = device_ctx.connection_boxes.find_canonical_loc(i);
+        if (loc == nullptr) continue;
+
+        int seg_index = device_ctx.rr_indexed_data[node.cost_index()].seg_index;
+
+        VTR_ASSERT(seg_index != OPEN);
+        VTR_ASSERT(seg_index < num_segments);

-    // to break out from the inner loop
-    next_rr_node:
-        continue;
+        auto point = sample_point_index.find(std::make_tuple(seg_index, loc->first, loc->second));
+        if (point != sample_point_index.end()) {
+            point->second->nodes.push_back(i);
+        }
     }
+
+    return sample_regions;
 }

 #ifndef VTR_ENABLE_CAPNPROTO
@@ -893,6 +1011,7 @@ void ConnectionBoxMapLookahead::write(const std::string& file) const {
 static void ToCostEntry(util::Cost_Entry* out, const VprCostEntry::Reader& in) {
     out->delay = in.getDelay();
     out->congestion = in.getCongestion();
+    out->fill = in.getFill();
 }

 static void FromCostEntry(VprCostEntry::Builder* out, const util::Cost_Entry& in) {
@@ -922,6 +1041,16 @@ static void FromMatrixCostEntry(
         out, in, FromCostEntry);
 }

+static void ToFloat(float* out, const VprFloatEntry::Reader& in) {
+    // Getting a scalar field is always "get<field name>()".
+    *out = in.getValue();
+}
+
+static void FromFloat(VprFloatEntry::Builder* out, const float& in) {
+    // Setting a scalar field is always "set<field name>(value)".
+    out->setValue(in);
+}
+
 void CostMap::read(const std::string& file) {
     MmapFile f(file);

@@ -952,6 +1081,12 @@ void CostMap::read(const std::string& file) {
         ToNdMatrix<2, Matrix<VprCostEntry>, vtr::NdMatrix<util::Cost_Entry, 2>>(
             &cost_map_, cost_maps, ToMatrixCostEntry);
     }
+
+    {
+        const auto& penalty = cost_map.getPenalty();
+        ToNdMatrix<2, VprFloatEntry, float>(
+            &penalty_, penalty, ToFloat);
+    }
 }

 void CostMap::write(const std::string& file) const {
@@ -978,6 +1113,12 @@ void CostMap::write(const std::string& file) const {
             &cost_maps, cost_map_, FromMatrixCostEntry);
     }

+    {
+        auto penalty = cost_map.initPenalty();
+        FromNdMatrix<2, VprFloatEntry, float>(
+            &penalty, penalty_, FromFloat);
+    }
+
     writeMessageToFile(file, &builder);
 }
 #endif
diff --git a/vpr/src/route/connection_box_lookahead_map.h b/vpr/src/route/connection_box_lookahead_map.h
index 960ef84f2..70423409f 100644
--- a/vpr/src/route/connection_box_lookahead_map.h
+++ b/vpr/src/route/connection_box_lookahead_map.h
@@ -32,54 +32,27 @@ struct RoutingCostKey {
     }
 };

-// compressed version of RoutingCostKey
-// TODO add bounds checks
-struct CompressedRoutingCostKey {
-    uint32_t data;
-
-    CompressedRoutingCostKey() {
-        data = -1;
-    }
-    CompressedRoutingCostKey(const RoutingCostKey& key) {
-        data = key.seg_index & 0xff;
-        data <<= 8;
-        data |= size_t(key.box_id) & 0xff;
-        data <<= 8;
-        data |= key.delta.x() & 0xff;
-        data <<= 8;
-        data |= key.delta.y() & 0xff;
-    }
-
-    bool operator==(CompressedRoutingCostKey other) const {
-        return data == other.data;
-    }
-};
-
 // hash implementation for RoutingCostKey
 struct HashRoutingCostKey {
     std::size_t operator()(RoutingCostKey const& key) const noexcept {
-        uint64_t data;
-        data = key.seg_index & 0xffff;
-        data <<= 16;
-        data |= size_t(key.box_id) & 0xffff;
-        data <<= 16;
-        data |= key.delta.x() & 0xffff;
-        data <<= 16;
-        data |= key.delta.y() & 0xffff;
-        return std::hash<uint64_t>{}(data);
+        std::size_t hash = std::hash<int>{}(key.seg_index);
+        vtr::hash_combine(hash, key.box_id);
+        vtr::hash_combine(hash, key.delta.x());
+        vtr::hash_combine(hash, key.delta.y());
+        return hash;
     }
 };

 // Map used to store intermediate routing costs
-typedef std::unordered_map<RoutingCostKey, util::Cost_Entry, HashRoutingCostKey> RoutingCosts;
+typedef std::unordered_map<RoutingCostKey, float, HashRoutingCostKey> RoutingCosts;

 // Dense cost maps per source segment and destination connection box types
 class CostMap {
   public:
     void set_counts(size_t seg_count, size_t box_count);
     int node_to_segment(int from_node_ind) const;
-    util::Cost_Entry find_cost(int from_seg_index, ConnectionBoxId box_id, int delta_x, int delta_y, int* out_of_bounds_val) const;
-    void set_cost_map(const RoutingCosts& costs);
+    util::Cost_Entry find_cost(int from_seg_index, ConnectionBoxId box_id, int delta_x, int delta_y) const;
+    void set_cost_map(const RoutingCosts& delay_costs, const RoutingCosts& base_costs);
     std::pair<util::Cost_Entry, int> get_nearby_cost_entry(const vtr::NdMatrix<util::Cost_Entry, 2>& matrix, int cx, int cy, const vtr::Rect<int>& bounds);
     void read(const std::string& file);
     void write(const std::string& file) const;
@@ -87,8 +60,9 @@ class CostMap {
     std::vector<std::pair<int, int>> list_empty() const;

   private:
-    vtr::NdMatrix<vtr::NdMatrix<util::Cost_Entry, 2>, 2> cost_map_;
-    vtr::NdMatrix<std::pair<int, int>, 2> offset_;
+    vtr::Matrix<vtr::Matrix<util::Cost_Entry>> cost_map_;
+    vtr::Matrix<std::pair<int, int>> offset_;
+    vtr::Matrix<float> penalty_;
     std::vector<int> segment_map_;
     size_t seg_count_;
     size_t box_count_;
diff --git a/vpr/src/route/router_lookahead_map_utils.cpp b/vpr/src/route/router_lookahead_map_utils.cpp
index 6226662e7..081d62760 100644
--- a/vpr/src/route/router_lookahead_map_utils.cpp
+++ b/vpr/src/route/router_lookahead_map_utils.cpp
@@ -5,7 +5,9 @@
 #include "vtr_math.h"
 #include "route_common.h"

-util::PQ_Entry::PQ_Entry(
+namespace util {
+
+PQ_Entry::PQ_Entry(
     int set_rr_node_ind,
     int switch_ind,
     float parent_delay,
@@ -53,13 +55,11 @@ util::PQ_Entry::PQ_Entry(
 util::PQ_Entry_Delay::PQ_Entry_Delay(
     int set_rr_node_ind,
     int switch_ind,
-    float parent_delay,
-    bool starting_node) {
+    const util::PQ_Entry_Delay* parent) {
     this->rr_node_ind = set_rr_node_ind;

-    auto& device_ctx = g_vpr_ctx.device();
-    this->delay_cost = parent_delay;
-    if (!starting_node) {
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
         float Tsw = device_ctx.rr_switch_inf[switch_ind].Tdel;
         float Rsw = device_ctx.rr_switch_inf[switch_ind].R;
         float Cnode = device_ctx.rr_nodes[set_rr_node_ind].C();
@@ -73,23 +73,27 @@ util::PQ_Entry_Delay::PQ_Entry_Delay(
         }

         VTR_ASSERT(T_linear >= 0.);
-        this->delay_cost += T_linear;
+        this->delay_cost = parent->delay_cost + T_linear;
+    } else {
+        this->delay_cost = 0.f;
     }
 }

 util::PQ_Entry_Base_Cost::PQ_Entry_Base_Cost(
     int set_rr_node_ind,
     int switch_ind,
-    float upstream_base_costs,
-    bool starting_node) {
+    const util::PQ_Entry_Base_Cost* parent) {
     this->rr_node_ind = set_rr_node_ind;

-    auto& device_ctx = g_vpr_ctx.device();
-    this->base_cost = upstream_base_costs;
-    if (!starting_node) {
+    if (parent != nullptr) {
+        auto& device_ctx = g_vpr_ctx.device();
         if (device_ctx.rr_switch_inf[switch_ind].configurable()) {
-            this->base_cost += get_rr_cong_cost(set_rr_node_ind);
+            this->base_cost = parent->base_cost + get_rr_cong_cost(set_rr_node_ind);
+        } else {
+            this->base_cost = parent->base_cost;
         }
+    } else {
+        this->base_cost = 0.f;
     }
 }

@@ -188,3 +192,59 @@ util::Cost_Entry util::Expansion_Cost_Entry::get_median_entry() const {

     return representative_entry;
 }
+
+template<typename Entry>
+void expand_dijkstra_neighbours(const std::vector<t_rr_node>& rr_nodes,
+                                const Entry& parent_entry,
+                                std::unordered_map<int, util::Search_Path>& paths,
+                                std::vector<bool>& node_expanded,
+                                std::priority_queue<Entry,
+                                                    std::vector<Entry>,
+                                                    std::greater<Entry>>& pq) {
+    int parent_ind = parent_entry.rr_node_ind;
+
+    auto& parent_node = rr_nodes[parent_ind];
+
+    for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) {
+        int child_node_ind = parent_node.edge_sink_node(iedge);
+        int switch_ind = parent_node.edge_switch(iedge);
+
+        /* skip this child if it has already been expanded from */
+        if (node_expanded[child_node_ind]) {
+            continue;
+        }
+
+        Entry child_entry(child_node_ind, switch_ind, &parent_entry);
+        VTR_ASSERT(child_entry.cost() >= 0);
+        pq.push(child_entry);
+
+        /* Create (if it doesn't exist) or update (if the new cost is lower)
+         * to specified node */
+        Search_Path path_entry = {child_entry.cost(), parent_ind, iedge};
+        auto result = paths.insert(std::make_pair(
+            child_node_ind,
+            path_entry));
+        if (!result.second) {
+            if (child_entry.cost() < result.first->second.cost) {
+                result.first->second = path_entry;
+            }
+        }
+    }
+}
+
+template void expand_dijkstra_neighbours(const std::vector<t_rr_node>& rr_nodes,
+                                         const PQ_Entry_Delay& parent_entry,
+                                         std::unordered_map<int, Search_Path>& paths,
+                                         std::vector<bool>& node_expanded,
+                                         std::priority_queue<PQ_Entry_Delay,
+                                                             std::vector<PQ_Entry_Delay>,
+                                                             std::greater<PQ_Entry_Delay>>& pq);
+template void expand_dijkstra_neighbours(const std::vector<t_rr_node>& rr_nodes,
+                                         const PQ_Entry_Base_Cost& parent_entry,
+                                         std::unordered_map<int, Search_Path>& paths,
+                                         std::vector<bool>& node_expanded,
+                                         std::priority_queue<PQ_Entry_Base_Cost,
+                                                             std::vector<PQ_Entry_Base_Cost>,
+                                                             std::greater<PQ_Entry_Base_Cost>>& pq);
+
+} // namespace util
diff --git a/vpr/src/route/router_lookahead_map_utils.h b/vpr/src/route/router_lookahead_map_utils.h
index 0b39c434f..36e6ea510 100644
--- a/vpr/src/route/router_lookahead_map_utils.h
+++ b/vpr/src/route/router_lookahead_map_utils.h
@@ -49,12 +49,14 @@ class Cost_Entry {
         congestion = std::numeric_limits<float>::infinity();
         fill = false;
     }
-    Cost_Entry(float set_delay, float set_congestion) {
-        delay = set_delay;
-        congestion = set_congestion;
-        fill = false;
-    }
-
+    Cost_Entry(float set_delay, float set_congestion)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(false) {}
+    Cost_Entry(float set_delay, float set_congestion, bool set_fill)
+        : delay(set_delay)
+        , congestion(set_congestion)
+        , fill(set_fill) {}
     bool valid() const {
         return std::isfinite(delay) && std::isfinite(congestion);
     }
@@ -141,34 +143,43 @@ class PQ_Entry {
     }
 };

-// A version of PQ_Entry that only calculates and stores the delay (cost.)
+// A version of PQ_Entry that only calculates and stores the delay.
 class PQ_Entry_Delay {
   public:
     int rr_node_ind;  //index in device_ctx.rr_nodes that this entry represents
     float delay_cost; //the cost of the path to get to this node

-    PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, float parent_cost, bool starting_node);
+    PQ_Entry_Delay(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Delay* parent);

     float cost() const {
         return delay_cost;
     }

+    void adjust_Tsw(float amount) {
+        delay_cost += amount;
+    }
+
     bool operator>(const PQ_Entry_Delay& obj) const {
         return (this->delay_cost > obj.delay_cost);
     }
 };

+// A version of PQ_Entry that only calculates and stores the base cost.
 class PQ_Entry_Base_Cost {
   public:
     int rr_node_ind; //index in device_ctx.rr_nodes that this entry represents
     float base_cost;

-    PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, float parent_cost, bool starting_node);
+    PQ_Entry_Base_Cost(int set_rr_node_ind, int /*switch_ind*/, const PQ_Entry_Base_Cost* parent);

     float cost() const {
         return base_cost;
     }

+    void adjust_Tsw(float /* amount */) {
+        // do nothing
+    }
+
     bool operator>(const PQ_Entry_Base_Cost& obj) const {
         return (this->base_cost > obj.base_cost);
     }
@@ -180,46 +191,16 @@ struct Search_Path {
     int edge;
 };

-} // namespace util
-
 /* iterates over the children of the specified node and selectively pushes them onto the priority queue */
 template<typename Entry>
 void expand_dijkstra_neighbours(const std::vector<t_rr_node>& rr_nodes,
                                 const Entry& parent_entry,
-                                std::unordered_map<int, util::Search_Path>& paths,
+                                std::unordered_map<int, Search_Path>& paths,
                                 std::vector<bool>& node_expanded,
                                 std::priority_queue<Entry,
                                                     std::vector<Entry>,
-                                                    std::greater<Entry>>& pq) {
-    int parent_ind = parent_entry.rr_node_ind;
+                                                    std::greater<Entry>>& pq);

-    auto& parent_node = rr_nodes[parent_ind];
-
-    for (int iedge = 0; iedge < parent_node.num_edges(); iedge++) {
-        int child_node_ind = parent_node.edge_sink_node(iedge);
-        int switch_ind = parent_node.edge_switch(iedge);
-
-        /* skip this child if it has already been expanded from */
-        if (node_expanded[child_node_ind]) {
-            continue;
-        }
-
-        Entry child_entry(child_node_ind, switch_ind, parent_entry.cost(), false);
-        VTR_ASSERT(child_entry.cost() >= 0);
-        pq.push(child_entry);
-
-        /* Create (if it doesn't exist) or update (if the new cost is lower)
-         * to specified node */
-        util::Search_Path path_entry = {child_entry.cost(), parent_ind, iedge};
-        auto result = paths.insert(std::make_pair(
-            child_node_ind,
-            path_entry));
-        if (!result.second) {
-            if (child_entry.cost() < result.first->second.cost) {
-                result.first->second = path_entry;
-            }
-        }
-    }
-}
+} // namespace util

 #endif