Skip to content

Commit

Permalink
Improve handling of edge cases and move first_level_in_fun logic to m…
Browse files Browse the repository at this point in the history
…ake_segmentation
  • Loading branch information
gvinciguerra committed Apr 23, 2024
1 parent 9283a70 commit c423a88
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 88 deletions.
53 changes: 16 additions & 37 deletions include/pgm/pgm_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#include "piecewise_linear_model.hpp"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
Expand All @@ -33,26 +32,6 @@ namespace pgm {
#define PGM_SUB_EPS(x, epsilon) ((x) <= (epsilon) ? 0 : ((x) - (epsilon)))
#define PGM_ADD_EPS(x, epsilon, size) ((x) + (epsilon) + 2 >= (size) ? (size) : (x) + (epsilon) + 2)

namespace internal {
template<typename K, typename RandomIt>
auto first_level_in_fun(RandomIt first, size_t n) {
return [=](size_t i) {
auto x = first[i];
// Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
// to x=first[i] such that x+1!=first[i+1], we map the values x+1,...,first[i+1]-1 to their correct rank i.
// For floating-point keys, the value x+1 above is replaced by the next representable value following x.
if constexpr (std::is_floating_point_v<K>) {
K next;
constexpr auto infinity = std::numeric_limits<K>::infinity();
auto flag = i - 1u < n - 2u && x == first[i - 1] && (next = std::nextafter(x, infinity)) < first[i + 1];
return std::pair<K, size_t>(flag ? next : x, i);
}
auto flag = i - 1u < n - 2u && x == first[i - 1] && x + 1 < first[i + 1];
return std::pair<K, size_t>(x + flag, i);
};
}
}

/**
* A struct that stores the result of a query to a @ref PGMIndex, that is, a range [@ref lo, @ref hi)
* centered around an approximate position @ref pos of the sought key.
Expand Down Expand Up @@ -101,7 +80,9 @@ class PGMIndex {
std::vector<Segment> segments; ///< The segments composing the index.
std::vector<size_t> levels_offsets; ///< The starting position of each level in segments[], in reverse order.

static constexpr K sentinel = std::numeric_limits<K>::max(); ///< Sentinel value to avoid bounds checking.
/// Sentinel value to avoid bounds checking.
static constexpr K sentinel = std::numeric_limits<K>::has_infinity ? std::numeric_limits<K>::infinity()
: std::numeric_limits<K>::max();

template<typename RandomIt>
static void build(RandomIt first, RandomIt last,
Expand All @@ -118,30 +99,30 @@ class PGMIndex {
if (*std::prev(last) == sentinel)
throw std::invalid_argument("The value " + std::to_string(sentinel) + " is reserved as a sentinel.");

auto last_n = n;
auto build_level = [&](auto epsilon, auto in_fun, auto out_fun) {
auto build_level = [&](auto epsilon, auto in_fun, auto out_fun, size_t last_n) {
auto n_segments = internal::make_segmentation_par(last_n, epsilon, in_fun, out_fun);
if (last_n > 1 && segments.back().slope == 0) {
// Here we need to ensure that keys > *(last-1) are approximated to a position == prev_level_size
segments.emplace_back(*std::prev(last) + 1, 0, last_n);
++n_segments;
if (segments.back() == sentinel)
--n_segments;
else {
if (segments.back()(sentinel - 1) < last_n)
segments.emplace_back(*std::prev(last) + 1, 0, last_n); // Ensure keys > last are mapped to last_n
segments.emplace_back(sentinel, 0, last_n);
}
segments.emplace_back(last_n); // Add the sentinel segment
return n_segments;
};

// Build first level
auto in_fun = internal::first_level_in_fun<K, decltype(first)>(first, n);
auto in_fun = [&](auto i) { return K(first[i]); };
auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
last_n = build_level(epsilon, in_fun, out_fun);
levels_offsets.push_back(levels_offsets.back() + last_n + 1);
auto last_n = build_level(epsilon, in_fun, out_fun, n);
levels_offsets.push_back(segments.size());

// Build upper levels
while (epsilon_recursive && last_n > 1) {
auto offset = levels_offsets[levels_offsets.size() - 2];
auto in_fun_rec = [&](auto i) { return std::pair<K, size_t>(segments[offset + i].key, i); };
last_n = build_level(epsilon_recursive, in_fun_rec, out_fun);
levels_offsets.push_back(levels_offsets.back() + last_n + 1);
auto in_fun_rec = [&](auto i) { return segments[offset + i].key; };
last_n = build_level(epsilon_recursive, in_fun_rec, out_fun, last_n);
levels_offsets.push_back(segments.size());
}
}

Expand Down Expand Up @@ -248,8 +229,6 @@ struct PGMIndex<K, Epsilon, EpsilonRecursive, Floating>::Segment {

Segment(K key, Floating slope, uint32_t intercept) : key(key), slope(slope), intercept(intercept) {};

explicit Segment(size_t n) : key(sentinel), slope(), intercept(n) {};

explicit Segment(const typename internal::OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment &cs)
: key(cs.get_first_x()) {
auto[cs_slope, cs_intercept] = cs.get_floating_point_segment(key);
Expand Down
9 changes: 5 additions & 4 deletions include/pgm/pgm_index_variants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ class CompressedPGMIndex {
std::vector<Floating> slopes_table; ///< The vector containing the slopes used by the segments in the index.
std::vector<CompressedLevel> levels; ///< The levels composing the compressed index.

static constexpr K sentinel = std::numeric_limits<K>::max(); ///< Sentinel value to avoid bounds checking.

/// Sentinel value to avoid bounds checking.
static constexpr K sentinel = std::numeric_limits<K>::has_infinity ? std::numeric_limits<K>::infinity()
: std::numeric_limits<K>::max();
using floating_pair = std::pair<Floating, Floating>;
using canonical_segment = typename internal::OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment;

Expand Down Expand Up @@ -116,15 +117,15 @@ class CompressedPGMIndex {
throw std::invalid_argument("The value " + std::to_string(sentinel) + " is reserved as a sentinel.");

// Build first level
auto in_fun = internal::first_level_in_fun<K, decltype(first)>(first, n);
auto in_fun = [&](auto i) { return first[i]; };
auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
auto last_n = internal::make_segmentation_par(n, Epsilon, in_fun, out_fun);
levels_offsets.push_back(levels_offsets.back() + last_n);

// Build upper levels
while (EpsilonRecursive && last_n > 1) {
auto offset = levels_offsets[levels_offsets.size() - 2];
auto in_fun_rec = [&](auto i) { return std::pair<K, size_t>(segments[offset + i].get_first_x(), i); };
auto in_fun_rec = [&](auto i) { return segments[offset + i].get_first_x(); };
last_n = internal::make_segmentation(last_n, EpsilonRecursive, in_fun_rec, out_fun);
levels_offsets.push_back(levels_offsets.back() + last_n);
}
Expand Down
82 changes: 41 additions & 41 deletions include/pgm/piecewise_linear_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
Expand Down Expand Up @@ -272,34 +273,51 @@ class OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment {
};

template<typename Fin, typename Fout>
size_t make_segmentation(size_t n, size_t epsilon, Fin in, Fout out) {
if (n == 0)
return 0;

using X = typename std::invoke_result_t<Fin, size_t>::first_type;
using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
size_t make_segmentation(size_t n, size_t start, size_t end, size_t epsilon, Fin in, Fout out) {
using K = typename std::invoke_result_t<Fin, size_t>;
size_t c = 0;
auto p = in(0);

OptimalPiecewiseLinearModel<X, Y> opt(epsilon);
opt.add_point(p.first, p.second);

for (size_t i = 1; i < n; ++i) {
auto next_p = in(i);
if (next_p.first == p.first)
continue;
p = next_p;
if (!opt.add_point(p.first, p.second)) {
OptimalPiecewiseLinearModel<K, size_t> opt(epsilon);
auto add_point = [&](K x, size_t y) {
if (!opt.add_point(x, y)) {
out(opt.get_segment());
opt.add_point(p.first, p.second);
opt.add_point(x, y);
++c;
}
};

add_point(in(start), start);
for (size_t i = start + 1; i < end - 1; ++i) {
if (in(i) == in(i - 1)) {
// Here there is an adjustment for inputs with duplicate keys: at the end of a run of duplicate keys equal
// to x=in(i) such that x+1!=in(i+1), we map the values x+1,...,in(i+1)-1 to their correct rank i.
// For floating-point keys, the value x+1 above is replaced by the next representable value following x.
if constexpr (std::is_floating_point_v<K>) {
K next;
if ((next = std::nextafter(in(i), std::numeric_limits<K>::infinity())) < in(i + 1))
add_point(next, i);
} else {
if (in(i) + 1 < in(i + 1))
add_point(in(i) + 1, i);
}
} else {
add_point(in(i), i);
}
}
if (in(end - 1) != in(end - 2))
add_point(in(end - 1), end - 1);

if (end == n)
add_point(in(n - 1) + 1, n); // Ensure values greater than the last one are mapped to n

out(opt.get_segment());
return ++c;
}

template<typename Fin, typename Fout>
size_t make_segmentation(size_t n, size_t epsilon, Fin in, Fout out) {
return make_segmentation(n, 0, n, epsilon, in, out);
}

template<typename Fin, typename Fout>
size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
auto parallelism = std::min(std::min(omp_get_num_procs(), omp_get_max_threads()), 20);
Expand All @@ -309,9 +327,8 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
if (parallelism == 1 || n < 1ull << 15)
return make_segmentation(n, epsilon, in, out);

using X = typename std::invoke_result_t<Fin, size_t>::first_type;
using Y = typename std::invoke_result_t<Fin, size_t>::second_type;
using canonical_segment = typename OptimalPiecewiseLinearModel<X, Y>::CanonicalSegment;
using K = typename std::invoke_result_t<Fin, size_t>;
using canonical_segment = typename OptimalPiecewiseLinearModel<K, size_t>::CanonicalSegment;
std::vector<std::vector<canonical_segment>> results(parallelism);

#pragma omp parallel for reduction(+:c) num_threads(parallelism)
Expand All @@ -320,16 +337,16 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
auto last = i == parallelism - 1 ? n : first + chunk_size;
if (first > 0) {
for (; first < last; ++first)
if (in(first).first != in(first - 1).first)
if (in(first) != in(first - 1))
break;
if (first == last)
continue;
}

auto in_fun = [in, first](auto j) { return in(first + j); };
auto in_fun = [in](auto j) { return in(j); };
auto out_fun = [&results, i](const auto &cs) { results[i].emplace_back(cs); };
results[i].reserve(chunk_size / (epsilon > 0 ? epsilon * epsilon : 16));
c += make_segmentation(last - first, epsilon, in_fun, out_fun);
c += make_segmentation(n, first, last, epsilon, in_fun, out_fun);
}

for (auto &v : results)
Expand All @@ -339,21 +356,4 @@ size_t make_segmentation_par(size_t n, size_t epsilon, Fin in, Fout out) {
return c;
}

template<typename RandomIt>
auto make_segmentation(RandomIt first, RandomIt last, size_t epsilon) {
using key_type = typename RandomIt::value_type;
using canonical_segment = typename OptimalPiecewiseLinearModel<key_type, size_t>::CanonicalSegment;
using pair_type = typename std::pair<key_type, size_t>;

size_t n = std::distance(first, last);
std::vector<canonical_segment> out;
out.reserve(epsilon > 0 ? n / (epsilon * epsilon) : n / 16);

auto in_fun = [first](auto i) { return pair_type(first[i], i); };
auto out_fun = [&out](const auto &cs) { out.push_back(cs); };
make_segmentation(n, epsilon, in_fun, out_fun);

return out;
}

}
18 changes: 12 additions & 6 deletions test/tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ void test_index(const Index &index, const Data &data) {
REQUIRE(*k == q);
}

// Test elements outside range
auto q = data.back() + 42;
// Test elements at extremes
auto q = std::numeric_limits<typename Data::value_type>::max() - 1;
auto range = index.search(q);
auto lo = data.begin() + range.lo;
auto hi = data.begin() + range.hi;
REQUIRE(std::lower_bound(lo, hi, q) == data.end());
REQUIRE(std::lower_bound(lo, hi, q) == std::lower_bound(data.begin(), data.end(), q));

q = std::numeric_limits<typename Data::value_type>::min();
q = std::numeric_limits<typename Data::value_type>::lowest();
range = index.search(q);
lo = data.begin() + range.lo;
hi = data.begin() + range.hi;
Expand All @@ -65,7 +65,11 @@ void test_index(const Index &index, const Data &data) {
TEMPLATE_TEST_CASE("Segmentation algorithm", "", float, double, uint32_t, uint64_t) {
auto epsilon = GENERATE(32, 64, 128);
auto data = generate_data<TestType>(1000000);
auto segments = pgm::internal::make_segmentation(data.begin(), data.end(), epsilon);
using canonical_segment = typename pgm::internal::OptimalPiecewiseLinearModel<TestType, size_t>::CanonicalSegment;
std::vector<canonical_segment> segments;
auto in_fun = [&](auto i) { return data[i]; };
auto out_fun = [&](auto cs) { segments.emplace_back(cs); };
pgm::internal::make_segmentation(data.size(), epsilon, in_fun, out_fun);
auto it = segments.begin();
auto [slope, intercept] = it->get_floating_point_segment(it->get_first_x());

Expand All @@ -89,7 +93,9 @@ TEMPLATE_TEST_CASE_SIG("PGM-index", "",
(uint64_t, 8, 4), (uint64_t, 32, 4), (uint64_t, 128, 4),
(int32_t, 8, 0), (int32_t, 32, 0), (int32_t, 128, 0),
(int64_t, 8, 4), (int64_t, 32, 4), (int64_t, 128, 4),
(int64_t, 1024, 1024), (uint64_t, 1024, 1024)) {
(int64_t, 1024, 1024), (uint64_t, 1024, 1024),
(int8_t, 16, 4), (uint8_t, 16, 4), (int16_t, 16, 4), (uint16_t, 16, 4),
(float, 16, 4), (double, 16, 4)) {
auto data = generate_data<T>(2000000);
pgm::PGMIndex<T, E1, E2> index(data.begin(), data.end());
test_index(index, data);
Expand Down
7 changes: 7 additions & 0 deletions test/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ std::vector<T> generate_data(size_t n) {
RandomFunction exponential = std::bind(std::exponential_distribution<T>(1.2), engine);
auto rand = GENERATE_COPY(as<RandomFunction>{}, lognormal, exponential);
std::generate(data.begin(), data.end(), rand);
} else if constexpr (sizeof(T) <= 2) {
auto value_min = std::numeric_limits<T>::min();
auto value_max = std::numeric_limits<T>::max() - 1;
RandomFunction uniform = std::bind(std::uniform_int_distribution<T>(value_min, value_max), engine);
std::generate(data.begin(), data.end(), rand);
} else {
T min = 0;
if constexpr (std::is_signed_v<T>)
Expand All @@ -52,6 +57,8 @@ std::vector<T> generate_data(size_t n) {
}

std::sort(data.begin(), data.end());
while (data.back() == std::numeric_limits<T>::max())
data.pop_back();
return data;
}

Expand Down

0 comments on commit c423a88

Please sign in to comment.