Improve handling of edge cases and move first_level_in_fun logic to m…

…ake_segmentation
gvinciguerra · Apr 23, 2024 · c423a88 · c423a88
1 parent 9283a70
commit c423a88
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 88 deletions.
diff --git a/include/pgm/pgm_index.hpp b/include/pgm/pgm_index.hpp
@@ -17,7 +17,6 @@
 
 #include "piecewise_linear_model.hpp"
 #include <algorithm>
-#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -33,26 +32,6 @@ namespace pgm {
 #define PGM_SUB_EPS(x, epsilon) ((x) <= (epsilon) ? 0 : ((x) - (epsilon)))
 #define PGM_ADD_EPS(x, epsilon, size) ((x) + (epsilon) + 2 >= (size) ? (size) : (x) + (epsilon) + 2)
 
-namespace internal {
-    template<typename K, typename RandomIt>
-    auto first_level_in_fun(RandomIt first, size_t n) {
-        return [=](size_t i) {
-            auto x = first[i];
-            // Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
-            // to x=first[i] such that x+1!=first[i+1], we map the values x+1,...,first[i+1]-1 to their correct rank i.
-            // For floating-point keys, the value x+1 above is replaced by the next representable value following x.
-            if constexpr (std::is_floating_point_v<K>) {
-                K next;
-                constexpr auto infinity = std::numeric_limits<K>::infinity();
-                auto flag = i - 1u < n - 2u && x == first[i - 1] && (next = std::nextafter(x, infinity)) < first[i + 1];
-                return std::pair<K, size_t>(flag ? next : x, i);
-            }
-            auto flag = i - 1u < n - 2u && x == first[i - 1] && x + 1 < first[i + 1];
-            return std::pair<K, size_t>(x + flag, i);
-        };
-    }
-}
-
 /**
  * A struct that stores the result of a query to a @ref PGMIndex, that is, a range [@ref lo, @ref hi)
  * centered around an approximate position @ref pos of the sought key.
@@ -101,7 +80,9 @@ class PGMIndex {
     std::vector<Segment> segments;      ///< The segments composing the index.
     std::vector<size_t> levels_offsets; ///< The starting position of each level in segments[], in reverse order.
 
-    static constexpr K sentinel = std::numeric_limits<K>::max(); ///< Sentinel value to avoid bounds checking.
+    /// Sentinel value to avoid bounds checking.
+    static constexpr K sentinel = std::numeric_limits<K>::has_infinity ? std::numeric_limits<K>::infinity()
+                                                                       : std::numeric_limits<K>::max();
 
     template<typename RandomIt>
     static void build(RandomIt first, RandomIt last,
@@ -118,30 +99,30 @@ class PGMIndex {
         if (*std::prev(last) == sentinel)
             throw std::invalid_argument("The value " + std::to_string(sentinel) + " is reserved as a sentinel.");
 
-        auto last_n = n;
-        auto build_level = [&](auto epsilon, auto in_fun, auto out_fun) {
+        auto build_level = [&](auto epsilon, auto in_fun, auto out_fun, size_t last_n) {
             auto n_segments = internal::make_segmentation_par(last_n, epsilon, in_fun, out_fun);
-            if (last_n > 1 && segments.back().slope == 0) {
-                // Here we need to ensure that keys > *(last-1) are approximated to a position == prev_level_size
-                segments.emplace_back(*std::prev(last) + 1, 0, last_n);
-                ++n_segments;
+            if (segments.back() == sentinel)
+                --n_segments;
+            else {
+                if (segments.back()(sentinel - 1) < last_n)
+                    segments.emplace_back(*std::prev(last) + 1, 0, last_n); // Ensure keys > last are mapped to last_n
+                segments.emplace_back(sentinel, 0, last_n);
             }
-            segments.emplace_back(last_n); // Add the sentinel segment
             return n_segments;
         };
 
         // Build first level
-        auto in_fun = internal::first_level_in_fun<K, decltype(first)>(first, n);
+        auto in_fun = [&](auto i) { return K(first[i]); };
         auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
-        last_n = build_level(epsilon, in_fun, out_fun);
-        levels_offsets.push_back(levels_offsets.back() + last_n + 1);
+        auto last_n = build_level(epsilon, in_fun, out_fun, n);
+        levels_offsets.push_back(segments.size());
 
         // Build upper levels
         while (epsilon_recursive && last_n > 1) {
             auto offset = levels_offsets[levels_offsets.size() - 2];
-            auto in_fun_rec = [&](auto i) { return std::pair<K, size_t>(segments[offset + i].key, i); };
-            last_n = build_level(epsilon_recursive, in_fun_rec, out_fun);
-            levels_offsets.push_back(levels_offsets.back() + last_n + 1);
+            auto in_fun_rec = [&](auto i) { return segments[offset + i].key; };
+            last_n = build_level(epsilon_recursive, in_fun_rec, out_fun, last_n);
+            levels_offsets.push_back(segments.size());
         }
     }
 
@@ -248,8 +229,6 @@ struct PGMIndex<K, Epsilon, EpsilonRecursive, Floating>::Segment {
 
     Segment(K key, Floating slope, uint32_t intercept) : key(key), slope(slope), intercept(intercept) {};
 
-    explicit Segment(size_t n) : key(sentinel), slope(), intercept(n) {};
-
     explicit Segment(const typename internal::OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment &cs)
         : key(cs.get_first_x()) {
         auto[cs_slope, cs_intercept] = cs.get_floating_point_segment(key);

diff --git a/include/pgm/pgm_index_variants.hpp b/include/pgm/pgm_index_variants.hpp
@@ -79,8 +79,9 @@ class CompressedPGMIndex {
     std::vector<Floating> slopes_table;   ///< The vector containing the slopes used by the segments in the index.
     std::vector<CompressedLevel> levels;  ///< The levels composing the compressed index.
 
-    static constexpr K sentinel = std::numeric_limits<K>::max(); ///< Sentinel value to avoid bounds checking.
-
+    /// Sentinel value to avoid bounds checking.
+    static constexpr K sentinel = std::numeric_limits<K>::has_infinity ? std::numeric_limits<K>::infinity()
+                                                                       : std::numeric_limits<K>::max();
     using floating_pair = std::pair<Floating, Floating>;
     using canonical_segment = typename internal::OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment;
 
@@ -116,15 +117,15 @@ class CompressedPGMIndex {
             throw std::invalid_argument("The value " + std::to_string(sentinel) + " is reserved as a sentinel.");
 
         // Build first level
-        auto in_fun = internal::first_level_in_fun<K, decltype(first)>(first, n);
+        auto in_fun = [&](auto i) { return first[i]; };
         auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
         auto last_n = internal::make_segmentation_par(n, Epsilon, in_fun, out_fun);
         levels_offsets.push_back(levels_offsets.back() + last_n);
 
         // Build upper levels
         while (EpsilonRecursive && last_n > 1) {
             auto offset = levels_offsets[levels_offsets.size() - 2];
-            auto in_fun_rec = [&](auto i) { return std::pair<K, size_t>(segments[offset + i].get_first_x(), i); };
+            auto in_fun_rec = [&](auto i) { return segments[offset + i].get_first_x(); };
             last_n = internal::make_segmentation(last_n, EpsilonRecursive, in_fun_rec, out_fun);
             levels_offsets.push_back(levels_offsets.back() + last_n);
         }

diff --git a/include/pgm/piecewise_linear_model.hpp b/include/pgm/piecewise_linear_model.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -272,34 +273,51 @@ class OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment {
 };
 
 template<typename Fin, typename Fout>
-size_t make_segmentation(size_t n, size_t epsilon, Fin in, Fout out) {
-    if (n == 0)
-        return 0;
-
-    using X = typename std::invoke_result_t<Fin, size_t>::first_type;
-    using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
+size_t make_segmentation(size_t n, size_t start, size_t end, size_t epsilon, Fin in, Fout out) {
+    using K = typename std::invoke_result_t<Fin, size_t>;
     size_t c = 0;
-    auto p = in(0);
-
-    OptimalPiecewiseLinearModel<X, Y> opt(epsilon);
-    opt.add_point(p.first, p.second);
-
-    for (size_t i = 1; i < n; ++i) {
-        auto next_p = in(i);
-        if (next_p.first == p.first)
-            continue;
-        p = next_p;
-        if (!opt.add_point(p.first, p.second)) {
+    OptimalPiecewiseLinearModel<K, size_t> opt(epsilon);
+    auto add_point = [&](K x, size_t y) {
+        if (!opt.add_point(x, y)) {
             out(opt.get_segment());
-            opt.add_point(p.first, p.second);
+            opt.add_point(x, y);
             ++c;
         }
+    };
+
+    add_point(in(start), start);
+    for (size_t i = start + 1; i < end - 1; ++i) {
+        if (in(i) == in(i - 1)) {
+            // Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
+            // to x=in(i) such that x+1!=in(i+1), we map the values x+1,...,in(i+1)-1 to their correct rank i.
+            // For floating-point keys, the value x+1 above is replaced by the next representable value following x.
+            if constexpr (std::is_floating_point_v<K>) {
+                K next;
+                if ((next = std::nextafter(in(i), std::numeric_limits<K>::infinity())) < in(i + 1))
+                    add_point(next, i);
+            } else {
+                if (in(i) + 1 < in(i + 1))
+                    add_point(in(i) + 1, i);
+            }
+        } else {
+            add_point(in(i), i);
+        }
     }
+    if (in(end - 1) != in(end - 2))
+        add_point(in(end - 1), end - 1);
+
+    if (end == n)
+        add_point(in(n - 1) + 1, n); // Ensure values greater than the last one are mapped to n
 
     out(opt.get_segment());
     return ++c;
 }
 
+template<typename Fin, typename Fout>
+size_t make_segmentation(size_t n, size_t epsilon, Fin in, Fout out) {
+    return make_segmentation(n, 0, n, epsilon, in, out);
+}
+
 template<typename Fin, typename Fout>
 size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
     auto parallelism = std::min(std::min(omp_get_num_procs(), omp_get_max_threads()), 20);
@@ -309,9 +327,8 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
     if (parallelism == 1 || n < 1ull << 15)
         return make_segmentation(n, epsilon, in, out);
 
-    using X = typename std::invoke_result_t<Fin, size_t>::first_type;
-    using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
-    using canonical_segment = typename OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment;
+    using K = typename std::invoke_result_t<Fin, size_t>;
+    using canonical_segment = typename OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment;
     std::vector<std::vector<canonical_segment>> results(parallelism);
 
     #pragma omp parallel for reduction(+:c) num_threads(parallelism)
@@ -320,16 +337,16 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
         auto last = i == parallelism - 1 ? n : first + chunk_size;
         if (first > 0) {
             for (; first < last; ++first)
-                if (in(first).first != in(first - 1).first)
+                if (in(first) != in(first - 1))
                     break;
             if (first == last)
                 continue;
         }
 
-        auto in_fun = [in, first](auto j) { return in(first + j); };
+        auto in_fun = [in](auto j) { return in(j); };
         auto out_fun = [&results, i](const auto &cs) { results[i].emplace_back(cs); };
         results[i].reserve(chunk_size / (epsilon > 0 ? epsilon * epsilon : 16));
-        c += make_segmentation(last - first, epsilon, in_fun, out_fun);
+        c += make_segmentation(n, first, last, epsilon, in_fun, out_fun);
     }
 
     for (auto &v : results)
@@ -339,21 +356,4 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
     return c;
 }
 
-template<typename RandomIt>
-auto make_segmentation(RandomIt first, RandomIt last, size_t epsilon) {
-    using key_type = typename RandomIt::value_type;
-    using canonical_segment = typename OptimalPiecewiseLinearModel<key_type, size_t>::CanonicalSegment;
-    using pair_type = typename std::pair<key_type, size_t>;
-
-    size_t n = std::distance(first, last);
-    std::vector<canonical_segment> out;
-    out.reserve(epsilon > 0 ? n / (epsilon * epsilon) : n / 16);
-
-    auto in_fun = [first](auto i) { return pair_type(first[i], i); };
-    auto out_fun = [&out](const auto &cs) { out.push_back(cs); };
-    make_segmentation(n, epsilon, in_fun, out_fun);
-
-    return out;
-}
-
 }
diff --git a/test/tests.cpp b/test/tests.cpp
@@ -48,14 +48,14 @@ void test_index(const Index &index, const Data &data) {
         REQUIRE(*k == q);
     }
 
-    // Test elements outside range
-    auto q = data.back() + 42;
+    // Test elements at extremes
+    auto q = std::numeric_limits<typename Data::value_type>::max() - 1;
     auto range = index.search(q);
     auto lo = data.begin() + range.lo;
     auto hi = data.begin() + range.hi;
-    REQUIRE(std::lower_bound(lo, hi, q) == data.end());
+    REQUIRE(std::lower_bound(lo, hi, q) == std::lower_bound(data.begin(), data.end(), q));
 
-    q = std::numeric_limits<typename Data::value_type>::min();
+    q = std::numeric_limits<typename Data::value_type>::lowest();
     range = index.search(q);
     lo = data.begin() + range.lo;
     hi = data.begin() + range.hi;
@@ -65,7 +65,11 @@ void test_index(const Index &index, const Data &data) {
 TEMPLATE_TEST_CASE("Segmentation algorithm", "", float, double, uint32_t, uint64_t) {
     auto epsilon = GENERATE(32, 64, 128);
     auto data = generate_data<TestType>(1000000);
-    auto segments = pgm::internal::make_segmentation(data.begin(), data.end(), epsilon);
+    using canonical_segment = typename pgm::internal::OptimalPiecewiseLinearModel<TestType, size_t>::CanonicalSegment;
+    std::vector<canonical_segment> segments;
+    auto in_fun = [&](auto i) { return data[i]; };
+    auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
+    pgm::internal::make_segmentation(data.size(), epsilon, in_fun, out_fun);
     auto it = segments.begin();
     auto [slope, intercept] = it->get_floating_point_segment(it->get_first_x());
 
@@ -89,7 +93,9 @@ TEMPLATE_TEST_CASE_SIG("PGM-index", "",
                        (uint64_t, 8, 4), (uint64_t, 32, 4), (uint64_t, 128, 4),
                        (int32_t, 8, 0), (int32_t, 32, 0), (int32_t, 128, 0),
                        (int64_t, 8, 4), (int64_t, 32, 4), (int64_t, 128, 4),
-                       (int64_t, 1024, 1024), (uint64_t, 1024, 1024)) {
+                       (int64_t, 1024, 1024), (uint64_t, 1024, 1024),
+                       (int8_t, 16, 4), (uint8_t, 16, 4),  (int16_t, 16, 4), (uint16_t, 16, 4),
+                       (float, 16, 4), (double, 16, 4)) {
     auto data = generate_data<T>(2000000);
     pgm::PGMIndex<T, E1, E2> index(data.begin(), data.end());
     test_index(index, data);

diff --git a/test/utils.hpp b/test/utils.hpp
@@ -37,6 +37,11 @@ std::vector<T> generate_data(size_t n) {
         RandomFunction exponential = std::bind(std::exponential_distribution<T>(1.2), engine);
         auto rand = GENERATE_COPY(as<RandomFunction>{}, lognormal, exponential);
         std::generate(data.begin(), data.end(), rand);
+    } else if constexpr (sizeof(T) <= 2) {
+        auto value_min = std::numeric_limits<T>::min();
+        auto value_max = std::numeric_limits<T>::max() - 1;
+        RandomFunction uniform = std::bind(std::uniform_int_distribution<T>(value_min, value_max), engine);
+        std::generate(data.begin(), data.end(), rand);
     } else {
         T min = 0;
         if constexpr (std::is_signed_v<T>)
@@ -52,6 +57,8 @@ std::vector<T> generate_data(size_t n) {
     }
 
     std::sort(data.begin(), data.end());
+    while (data.back() == std::numeric_limits<T>::max())
+        data.pop_back();
     return data;
 }