From 96fb7ad733feb0d9090ab82c88233cf47e98ecf5 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:03:35 -0400 Subject: [PATCH 01/11] core-clp: Rewrite wildcard matching method and add systematic unit tests (fixes #427). --- .../src/clp/string_utils/string_utils.cpp | 264 ++++--- components/core/tests/test-string_utils.cpp | 693 +++++++----------- 2 files changed, 403 insertions(+), 554 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index c68865bf9..3653deb9c 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -1,72 +1,73 @@ #include "string_utils/string_utils.hpp" #include -#include +#include #include using std::string; using std::string_view; +namespace clp::string_utils { namespace { /** - * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the pointer in - * tame to the next character which matches wild. This method should be inlined - * for performance. - * @param tame_current - * @param tame_bookmark - * @param tame_end - * @param wild_current - * @param wild_bookmark - * @return true on success, false if wild cannot match tame + * Helper for `wildcard_match_unsafe_case_sensitive` to advance `tame`'s iterator to the next + * character that matches the current character in `wild`, and to bookmark this character. If the + * current character in `wild` is escaped, `wild`'s iterator will also be advanced. + * + * NOTE: + * - This method expects that `tame_it` < `tame_end_it` + * - This method should be inlined for performance. + * + * @param tame_end_it + * @param tame_it Returns `tame`'s updated iterator. + * @param tame_bookmark_it Returns `tame`'s updated bookmark. + * @param wild_it Returns `wild`'s updated iterator. + * @return true on success, false if `tame` cannot match `wild`. */ inline bool advance_tame_to_next_match( - char const*& tame_current, - char const*& tame_bookmark, - char const* tame_end, - char const*& wild_current + string_view::const_iterator tame_end_it, + string_view::const_iterator& tame_it, + string_view::const_iterator& tame_bookmark_it, + string_view::const_iterator& wild_it ); inline bool advance_tame_to_next_match( - char const*& tame_current, - char const*& tame_bookmark, - char const* tame_end, - char const*& wild_current + string_view::const_iterator tame_end_it, + string_view::const_iterator& tame_it, + string_view::const_iterator& tame_bookmark_it, + string_view::const_iterator& wild_it ) { - auto w = *wild_current; + auto w = *wild_it; if ('?' != w) { // No need to check for '*' since the caller ensures wild doesn't // contain consecutive '*' // Handle escaped characters if ('\\' == w) { - ++wild_current; - // This is safe without a bounds check since this the caller ensures - // there are no dangling escape characters - w = *wild_current; + // Safe without a bounds check + ++wild_it; + w = *wild_it; } - // Advance tame_current until it matches wild_current + // Advance `tame_it` until it matches `w` while (true) { - if (tame_end == tame_current) { - // Wild group is longer than last group in tame, so can't match - // e.g. "*abc" doesn't match "zab" - return false; - } - auto t = *tame_current; + auto t = *tame_it; if (t == w) { break; } - ++tame_current; + ++tame_it; + if (tame_end_it == tame_it) { + return false; + } } } - tame_bookmark = tame_current; + tame_bookmark_it = tame_it; return true; } } // namespace -namespace clp::string_utils { size_t find_first_of( string const& haystack, char const* needles, @@ -182,114 +183,157 @@ bool wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensiti /** * The algorithm basically works as follows: - * Given a wild string "*abc*def*ghi*", it can be broken into groups of - * characters delimited by one or more '*' characters. The goal of the algorithm - * is then to determine whether the tame string contains each of those groups in - * the same order. + * Given a wildcard string (a.k.a. "wild") like "abc*def*ghi*", it can be broken into groups of + * characters delimited by one or more '*' characters. The goal of the algorithm is then to + * determine whether the "tame" string contains each of those groups in the same order. + * + * Matching a group in `wild` against `tame` requires iteratively matching each character in `tame` + * against each character in the group, with the exception of the '?' wildcard and escaped + * characters ('*', '?', or '\'). When a mismatch occurs, there are two possibilities: + * + * 1. The mismatch occurs before the first '*' in `wild`, meaning that the entire wildcard match + * fails. + * 2. The mismatch occurs after a '*' in `wild`. This case requires additional handling explained + * below. + * + * Consider `tame` = "ccd", `wild` = "*cd". When we start matching `tame` against the first group + * in `wild`, the first 'c' will match, but the second 'c' won't match 'd'. In this case, we should + * restart the matching process from the second 'c'. + * + * To generalize this, we need to maintain bookmarks for both `tame` and `wild`. Whenever we have a + * mismatch, we should reset `wild` to its bookmark and `tame` to its bookmark + 1, and then try + * the match again. If we get to the end of `tame` without getting to the end of the group in + * `wild`, the entire wildcard match fails. * - * Thus, the algorithm: - * 1. searches for the start of one of these groups in wild, - * 2. searches for a group in tame starting with the same character, and then - * 3. checks if the two match. If not, the search repeats with the next group in - * tame. + * NOTE: + * - Since the caller guarantees that there are no consecutive '*', we don't need to handle the + * case where a group in `wild` is empty. + * - Since the caller guarantees that there every '\' is followed by a character, we can advance + * passed '\' without doing a subsequent bounds check. + * - The second part of this method could be rewritten in the following form: + * + * ``` + * while(true) { + * if (false == advance_tame_to_next_match(...)) return false; + * + * while (true) { + * // Advance iterators + * // If we reached the end of `tame` before the end of `wild`, break + * // If we see a '*' in `wild`, break + * // If we see a mismatch, break + * } + * } + * ``` + * + * However, this form is ~2% slower. */ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { - auto const tame_length = tame.length(); - auto const wild_length = wild.length(); - char const* tame_current = tame.data(); - char const* wild_current = wild.data(); - char const* tame_bookmark = nullptr; - char const* wild_bookmark = nullptr; - char const* tame_end = tame_current + tame_length; - char const* wild_end = wild_current + wild_length; + auto tame_it = tame.cbegin(); + auto wild_it = wild.cbegin(); + auto const tame_end_it = tame.cend(); + auto const wild_end_it = wild.cend(); + string_view::const_iterator tame_bookmark_it{}; + string_view::const_iterator wild_bookmark_it{}; - // Handle wild or tame being empty - if (0 == wild_length) { - return 0 == tame_length; - } else { - if (0 == tame_length) { - return "*" == wild; + // Handle `tame` or `wild` being empty + if (wild.empty()) { + return tame.empty(); + } + if (tame.empty()) { + return "*" == wild; + } + + // Match `tame` against `wild` against until we reach the first '*' in `wild` or they no longer + // match + while (true) { + auto w = *wild_it; + if ('*' == w) { + break; + } + if ('?' != w) { + // Handle escaped characters + if ('\\' == w) { + // Safe without a bounds check + ++wild_it; + w = *wild_it; + } + + // Handle a mismatch + if (w != *tame_it) { + return false; + } + } + + ++tame_it; + ++wild_it; + + // Handle boundary conditions + if (tame_end_it == tame_it) { + return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); + } else if (wild_end_it == wild_it) { + return tame_end_it == tame_it; } } - char w; - char t; - bool is_escaped = false; + // Find a match in `tame` for every group of characters between '*' in `wild` while (true) { - w = *wild_current; + auto w = *wild_it; if ('*' == w) { - ++wild_current; - if (wild_end == wild_current) { - // Trailing '*' means everything remaining in tame will match + ++wild_it; + if (wild_end_it == wild_it) { + // `wild` ending with '*' means that it'll match the rest of `tame` return true; } - // Set wild and tame bookmarks - wild_bookmark = wild_current; + // Set `tame` and `wild` bookmarks + wild_bookmark_it = wild_it; if (false - == advance_tame_to_next_match(tame_current, tame_bookmark, tame_end, wild_current)) + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) { return false; } - } else { + } else if ('?' != w) { // Handle escaped characters if ('\\' == w) { - is_escaped = true; - ++wild_current; - // This is safe without a bounds check since this the caller - // ensures there are no dangling escape characters - w = *wild_current; + // Safe without a bounds check + ++wild_it; + w = *wild_it; } // Handle a mismatch - t = *tame_current; - if (!((false == is_escaped && '?' == w) || t == w)) { - if (nullptr == wild_bookmark) { - // No bookmark to return to + if (w != *tame_it) { + // Reset to bookmarks + tame_it = tame_bookmark_it + 1; + if (tame_end_it == tame_it) { return false; } - - wild_current = wild_bookmark; - tame_current = tame_bookmark + 1; + wild_it = wild_bookmark_it; if (false - == advance_tame_to_next_match( - tame_current, - tame_bookmark, - tame_end, - wild_current - )) + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) { return false; } } } - ++tame_current; - ++wild_current; + ++tame_it; + ++wild_it; - // Handle reaching the end of tame or wild - if (tame_end == tame_current) { - return (wild_end == wild_current - || ('*' == *wild_current && (wild_current + 1) == wild_end)); - } else { - if (wild_end == wild_current) { - if (nullptr == wild_bookmark) { - // No bookmark to return to - return false; - } else { - wild_current = wild_bookmark; - tame_current = tame_bookmark + 1; - if (false - == advance_tame_to_next_match( - tame_current, - tame_bookmark, - tame_end, - wild_current - )) - { - return false; - } - } + // Handle boundary conditions + if (tame_end_it == tame_it) { + return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); + } else if (wild_end_it == wild_it) { + if (tame_end_it == tame_it) { + return true; + } + + // Reset to bookmarks + tame_it = tame_bookmark_it + 1; + wild_it = wild_bookmark_it; + if (false + == advance_tame_to_next_match(tame_end_it, tame_it, tame_bookmark_it, wild_it)) + { + return false; } } } diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 747ebe000..965a8bada 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -1,10 +1,14 @@ +#include #include +#include +#include +#include -#include -#include #include #include +#include "FileReader.hpp" + using clp::string_utils::clean_up_wildcard_search_string; using clp::string_utils::convert_string_to_int; using clp::string_utils::wildcard_match_unsafe; @@ -13,9 +17,156 @@ using std::chrono::duration; using std::chrono::high_resolution_clock; using std::cout; using std::endl; +using std::span; using std::string; +using std::string_view; using std::vector; +namespace { +/** + * All possible alphabets that could appear in a wildcard string. Note that the alphabets are + * conceptual (e.g. EscapedAsterisk) rather than concrete (e.g. "\\*"). + */ +enum class WildcardStringAlphabet { + Empty = 0, + AnyChar, + Asterisk, + QuestionMark, + EscapedAsterisk, + EscapedQuestionMark, + EscapedBackslash, +}; + +/** + * Recursively generates strings that will match the given wildcard string and tests that they + * match. + * @param chosen_alphabets + * @param wild + * @param tame Returns the string generated so far. + */ +void generate_and_test_tame_str( + span chosen_alphabets, + string_view wild, + string& tame +); + +/** + * Recursively generates and tests a wildcard string using the given template. Testing requires + * generating one or more matching strings. + * @param template_wildcard_str + * @param chosen_alphabets Returns the alphabets chosen so far. + * @param wild Returns the string generated so far. + */ +void generate_and_test_wildcard_str( + span> template_wildcard_str, + vector& chosen_alphabets, + string& wild +); + +void generate_and_test_tame_str( + span chosen_alphabets, + string_view wild, + string& tame +) { + // Base case + if (chosen_alphabets.empty()) { + INFO("tame: \"" << tame << "\", wild: \"" << wild << "\""); + REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); + return; + } + + size_t const tame_size_before_modification = tame.size(); + auto alphabet = chosen_alphabets.front(); + auto next_chosen_alphabets = chosen_alphabets.subspan(1); + switch (alphabet) { + case WildcardStringAlphabet::Empty: + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + return; + case WildcardStringAlphabet::AnyChar: + tame += 'a'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::Asterisk: + // Generate "", "a", and "ab" + for (size_t i = 0; i < 3; ++i) { + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + + tame += static_cast('a' + i); + } + break; + case WildcardStringAlphabet::QuestionMark: + tame += 'a'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedAsterisk: + tame += '*'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedQuestionMark: + tame += '?'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + case WildcardStringAlphabet::EscapedBackslash: + tame += '\\'; + generate_and_test_tame_str(next_chosen_alphabets, wild, tame); + break; + default: + REQUIRE(false); + } + + tame.resize(tame_size_before_modification); +} + +void generate_and_test_wildcard_str( + span> template_wildcard_str, + vector& chosen_alphabets, + string& wild +) { + // Base case + if (template_wildcard_str.empty()) { + string tame; + generate_and_test_tame_str(chosen_alphabets, wild, tame); + return; + } + + size_t const wild_size_before_modification = wild.size(); + + auto const& test_alphabet = *template_wildcard_str.begin(); + for (auto alphabet : test_alphabet) { + switch (alphabet) { + case WildcardStringAlphabet::Empty: + break; + case WildcardStringAlphabet::AnyChar: + wild += 'a'; + break; + case WildcardStringAlphabet::Asterisk: + wild += '*'; + break; + case WildcardStringAlphabet::QuestionMark: + wild += '?'; + break; + case WildcardStringAlphabet::EscapedAsterisk: + wild += "\\*"; + break; + case WildcardStringAlphabet::EscapedQuestionMark: + wild += "\\?"; + break; + case WildcardStringAlphabet::EscapedBackslash: + wild += "\\\\"; + break; + default: + REQUIRE(false); + } + + chosen_alphabets.push_back(alphabet); + generate_and_test_wildcard_str(template_wildcard_str.subspan(1), chosen_alphabets, wild); + chosen_alphabets.pop_back(); + + wild.resize(wild_size_before_modification); + } +} +} // namespace + TEST_CASE("to_lower", "[to_lower]") { string str = "test123TEST"; clp::string_utils::to_lower(str); @@ -54,464 +205,118 @@ TEST_CASE("clean_up_wildcard_search_string", "[clean_up_wildcard_search_string]" REQUIRE(clean_up_wildcard_search_string(str) == "abc"); } -SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard]") { - std::string tameString, wildString; - - WHEN("Match is expected if wild card character is \"*\"") { - GIVEN("Single wild with no suffix char") { - tameString = "abcd", wildString = "a*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild with no prefix char") { - tameString = "abcd", wildString = "*d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has 1st char as literal") { - tameString = "abcd", wildString = "*a*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has middle char as literal") { - tameString = "abcd", wildString = "*b*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild on both side & has last char as literal") { - tameString = "abcd", wildString = "*d*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild only") { - tameString = "abcd", wildString = "*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } +TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { + string tame1{"a?c"}; + string wild1{"*a\\??*"}; + REQUIRE(wildcard_match_unsafe_case_sensitive(tame1, wild1)); + + // We want to test all varieties of wildcard strings and strings that can be matched by them. + // We do this by using a kind of template wildcard string---where each character has a set of + // possibilities---to generate this variety. For each wildcard string, we also generate one or + // more strings that can be matched by the wildcard string. + + // The template below is meant to test 1-3 groups of WildcardStringAlphabets separated by '*'. + // The groups allow contiguous repeats of all possible alphabets except '*' since + // `wildcard_match_unsafe_case_sensitive` doesn't accept such wildcard strings. Each alphabet in + // the template may be empty except at least one in each group (so we don't unintentionally + // create two contiguous '*'). Overall, this should cover all matching cases. + for (size_t i = 0; i < 3; ++i) { + vector> template_wildcard_str{{ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::Asterisk, + }}; + for (size_t j = 0; j <= i; ++j) { + template_wildcard_str.push_back({ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }); + template_wildcard_str.push_back({ + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }); + template_wildcard_str.push_back({ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }); + } + + vector chosen_alphabets; + string wild; + generate_and_test_wildcard_str(template_wildcard_str, chosen_alphabets, wild); } - WHEN("Match is expected if Wild card character is \"?\"") { - GIVEN("Single wild in the middle") { - tameString = "abcd", wildString = "a?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild in the beginning") { - tameString = "abcd", wildString = "?bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild at the end") { - tameString = "abcd", wildString = "abc?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the middle") { - tameString = "abcd", wildString = "a??d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the beginning") { - tameString = "abcd", wildString = "??cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild in the end") { - tameString = "abcd", wildString = "ab??"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Single wild in the beginning and end") { - tameString = "abcd", wildString = "?bc?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Multiple wild anywhere") { - tameString = "abcdef", wildString = "a?c?ef"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("All wild") { - tameString = "abcd", wildString = "????"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + // We test non-matching cases using a tame string that matches a diverse wildcard string as + // follows. We test that every substring (anchored at index 0) of tame doesn't match the + // complete wildcard string. + string tame = "abcdef?*?ghixyz"; + string wild = R"(*a?c*\?\*\?*x?z*)"; + // Sanity-check that they match. + REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); + for (size_t i = 1; i < tame.size(); ++i) { + REQUIRE(false == wildcard_match_unsafe_case_sensitive(string_view{tame.data(), i}, wild)); } +} - WHEN("Match is expected if wild card character has both \"*\" and \"?\"") { - GIVEN("Wild begins with \"*?\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "*?bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"?*\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "?*bcd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"*?\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "*?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with \"?*\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "*?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"*?\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "abc*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"?*\" pattern with 0 matched char for \"*\"") { - tameString = "abcd", wildString = "abc*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"*?\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "ab*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild ends with \"?*\" pattern with 1 matched char for \"*\"") { - tameString = "abcd", wildString = "ab?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with exactly \"*?\" pattern") { - tameString = "abcd", wildString = "*?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Wild begins with exactly \"?*\" pattern") { - tameString = "abcd", wildString = "?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - } - - WHEN("Match unexpected containing wild card character(s)") { - GIVEN("Missing literal character w/ \"*\"") { - tameString = "abcd", wildString = "ac*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("More literals in wild than tame w/ \"*\" in the middle") { - tameString = "abcd", wildString = "abc*de"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with \"*\" in the middle") { - tameString = "abcd", wildString = "b**d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the end with \"*\" in the middle") { - tameString = "abcd", wildString = "a**c"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with both \"*\" and \"?\" in the middle" - ) { - tameString = "abcd", wildString = "b*?d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the beginning with \"?\" at the beginning") { - tameString = "abcd", wildString = "?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - - GIVEN("MISSING matching literals in the end with both \"?\" at the end") { - tameString = "abcd", wildString = "ab?"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == false); - } - } - - WHEN("Match is expected when escape character(s) are used") { - GIVEN("Escaping \"*\"") { - tameString = "a*cd", wildString = "a\\*cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"?\"") { - tameString = "a?cd", wildString = "a\\?cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"*\" and \"?\"") { - tameString = "a?c*e", wildString = "a\\?c\\*e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"\\\"") { - tameString = "a\\cd", wildString = "a\\\\cd"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"?\" when fast forwarding") { - tameString = "abc?e", wildString = "a*\\?e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"*\" when fast forwarding") { - tameString = "abc*e", wildString = "a*\\*e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - - GIVEN("Escaping \"\\\" when fast forwarding") { - tameString = "abc\\e", wildString = "a*\\\\e"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } +TEST_CASE("wildcard_match_unsafe", "[wildcard]") { + string tame{"0!2#4%6&8(aBcDeFgHiJkLmNoPqRsTuVwXyZ"}; + string wild; - GIVEN("Escaping \"?\" when rewinding") { - tameString = "\\ab\\ab\\c?ef", wildString = "*ab\\\\c\\?*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = "0!2#4%6&8(AbCdEfGhIjKlMnOpQrStUvWxYz"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); - GIVEN("Escaping \"*\" when rewinding") { - tameString = "\\ab\\ab\\c*ef", wildString = "*ab\\\\c\\**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = "0?2?4?6?8?A?C?E?G?I?K?M?O?Q?S?U?W?Y?"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); - GIVEN("Escaping \"\\\" when rewinding") { - tameString = "\\ab\\ab\\c\\ef", wildString = "*ab\\\\c\\\\*"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } + wild = "?!?#?%?&?(?b?d?f?h?j?l?n?p?r?t?v?x?z"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); - GIVEN("Silently ignore unsupported escape sequence \\a") { - tameString = "ab?d", wildString = "\\ab?d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - } + wild = "*?b?d?f?h?j?l?n?p?r?t?v?x?z*"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); - WHEN("Case wild card match is case insensitive") { - // This test is meant to supplement the case sensitive wild card implementation - // The case insensitive implementation is exactly the same as case sensitive except it - // automatically adjust the inputs to lower case when needed before doing comparison. It is - // rarely used due to lower performance - bool isCaseSensitive = false; - GIVEN("All lower case tame and all upper case wild") { - tameString = "abcde", wildString = "A?C*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); - } + wild = "*?A?C?E?G?I?K?M?O?Q?S?U?W?Y?*"; + REQUIRE(wildcard_match_unsafe(tame, wild, false)); + REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); +} - GIVEN("All lower case tame and mixed lower and upper case wild") { - tameString = "abcde", wildString = "A?c*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); +SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performance]") { + auto const tests_dir = std::filesystem::path{__FILE__}.parent_path(); + auto log_file_path = tests_dir / "test_network_reader_src" / "random.log"; - tameString = "abcde", wildString = "A?c*"; - REQUIRE(wildcard_match_unsafe(tameString, wildString, isCaseSensitive) == true); - } + clp::FileReader file_reader; + file_reader.open(log_file_path.string()); + string line; + vector lines; + while (file_reader.read_to_delimiter('\n', false, false, line)) { + lines.push_back(line); } + file_reader.close(); - WHEN("Tested with a bunch of additional test cases found online") { - bool allPassed = true; - - GIVEN("Case with repeating character sequences") { - allPassed &= wildcard_match_unsafe_case_sensitive("abcccd", "*ccd"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississipissippi", "*issip*ss*"); - allPassed - &= !wildcard_match_unsafe_case_sensitive("xxxx*zzzzzzzzy*f", "xxxx*zzy*fffff"); - allPassed &= wildcard_match_unsafe_case_sensitive("xxxx*zzzzzzzzy*f", "xxx*zzy*f"); - allPassed &= !wildcard_match_unsafe_case_sensitive("xxxxzzzzzzzzyf", "xxxx*zzy*fffff"); - allPassed &= wildcard_match_unsafe_case_sensitive("xxxxzzzzzzzzyf", "xxxx*zzy*f"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyzyxyz", "xy*z*xyz"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississippi", "*sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyxyz", "xy*xyz"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississippi", "mi*sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("ababac", "*abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("ababac", "*abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("aaazz", "a*zz*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a12b12", "*12*23"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a12b12", "a12b"); - allPassed &= wildcard_match_unsafe_case_sensitive("a12b12", "*12*12*"); - REQUIRE(allPassed == true); - } - - GIVEN("Additional cases where the '*' char appears in the tame string") { - allPassed &= wildcard_match_unsafe_case_sensitive("*", "*"); - allPassed &= wildcard_match_unsafe_case_sensitive("a*abab", "a*b"); - allPassed &= wildcard_match_unsafe_case_sensitive("a*r", "a*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("a*ar", "a*aar"); - REQUIRE(allPassed == true); - } - - GIVEN("More double wildcard scenarios") { - allPassed &= wildcard_match_unsafe_case_sensitive("XYXYXYZYXYz", "XY*Z*XYz"); - allPassed &= wildcard_match_unsafe_case_sensitive("missisSIPpi", "*SIP*"); - allPassed &= wildcard_match_unsafe_case_sensitive("mississipPI", "*issip*PI"); - allPassed &= wildcard_match_unsafe_case_sensitive("xyxyxyxyz", "xy*xyz"); - allPassed &= wildcard_match_unsafe_case_sensitive("miSsissippi", "mi*sip*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("miSsissippi", "mi*Sip*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abAbac", "*Abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abAbac", "*Abac*"); - allPassed &= wildcard_match_unsafe_case_sensitive("aAazz", "a*zz*"); - allPassed &= !wildcard_match_unsafe_case_sensitive("A12b12", "*12*23"); - allPassed &= wildcard_match_unsafe_case_sensitive("a12B12", "*12*12*"); - allPassed &= wildcard_match_unsafe_case_sensitive("oWn", "*oWn*"); - REQUIRE(allPassed == true); - } - - GIVEN("Completely tame (no wildcards) cases") { - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bLah"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLah", "bLaH"); - REQUIRE(allPassed == true); - } - - GIVEN("Simple mixed wildcard tests suggested by IBMer Marlin Deckert") { - allPassed &= wildcard_match_unsafe_case_sensitive("a", "*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("abc", "*?"); - REQUIRE(allPassed == true); - } - - GIVEN("More mixed wildcard tests including coverage for false positives") { - allPassed &= !wildcard_match_unsafe_case_sensitive("a", "??"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "?*?"); - allPassed &= wildcard_match_unsafe_case_sensitive("ab", "*?*?*"); - allPassed &= wildcard_match_unsafe_case_sensitive("abcd", "?b*??"); - allPassed &= !wildcard_match_unsafe_case_sensitive("abcd", "?a*??"); - allPassed &= wildcard_match_unsafe_case_sensitive("abcde", "?*b*?*d*?"); - REQUIRE(allPassed == true); - } - - GIVEN("Single-character-match cases") { - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bL?h"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLaaa", "bLa?"); - allPassed &= wildcard_match_unsafe_case_sensitive("bLah", "bLa?"); - allPassed &= !wildcard_match_unsafe_case_sensitive("bLaH", "?Lah"); - allPassed &= wildcard_match_unsafe_case_sensitive("bLaH", "?LaH"); - REQUIRE(allPassed == true); - } - - GIVEN("Many-wildcard scenarios") { - allPassed &= wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - "aaaaaaaaaaaab", - "a*a*a*a*a*a*aa*aaa*a*a*b" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*a*aa*aaa*fa*ga*b*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*a*x*aaa*fa*ga*b*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*aaaa*fa*ga*gggg*b*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abababababababababababababababababababaacacacacacacacadaeafagahaiajakalaaaaaaa" - "aaaaaaaaaaffafagaagggagaaaaaaaab", - "*a*b*ba*ca*aaaa*fa*ga*ggg*b*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive("aaabbaabbaab", "*aabbaa*a*"); - allPassed &= wildcard_match_unsafe_case_sensitive( - "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*", - "a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaaa", - "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "aaaaaaaaaaaaaaaa", - "*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*" - "abcdefghijkl*abcdefghijklm*abcdefghijklmn", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcde*abcdef*abcdefg*abcdefgh*abcdefghi*abcdefghij*abcdefghijk*" - "abcdefghijkl*abcdefghijklm*abcdefghijklmn", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*" - ); - allPassed &= !wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcd*abc*abcd", - "abc*abc*abc*abc*abc" - ); - allPassed &= wildcard_match_unsafe_case_sensitive( - "abc*abcd*abcd*abc*abcd*abcd*abc*abcd*abc*abc*abcd", - "abc*abc*abc*abc*abc*abc*abc*abc*abc*abc*abcd" - ); - REQUIRE(allPassed == true); - } - - GIVEN("A case-insensitive algorithm test") { - bool isCaseSensitive = false; - allPassed &= wildcard_match_unsafe("mississippi", "*issip*PI", isCaseSensitive); - REQUIRE(allPassed == true); + auto begin_timestamp = high_resolution_clock::now(); + for (size_t i = 0; i < 10'000; ++i) { + for (auto const& tame : lines) { + wildcard_match_unsafe_case_sensitive(tame, "*to*blk_1073742594_1770*"); } } -} + auto end_timestamp = high_resolution_clock::now(); -SCENARIO("Test wild card performance", "[wildcard performance]") { - // This test is to ensure there is no performance regression - // We use our current implementation vs the next best implementation as a reference - // If performance becomes slower than our next best implementation, then it is considered a fail - - high_resolution_clock::time_point t1, t2; - string tameStr, wildStr; - - int const nReps = 1'000'000; - int testReps; - bool allPassed_currentImplementation = true; - bool allPassed_nextBestImplementation = true; - - /*********************************************************************************************** - * Inputs Begin - **********************************************************************************************/ - vector tameVec, wildVec; - - // Typical apache log - tameVec.push_back("64.242.88.10 - - [07/Mar/2004:16:06:51 -0800] \"GET " - "/twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1" - ".3&rev2=1.2 HTTP/1.1\" 200 4523"); - wildVec.push_back("*64.242.88.10*Mar/2004*GET*200*"); - - /*********************************************************************************************** - * Inputs End - **********************************************************************************************/ - - // Profile current implementation - testReps = nReps; - t1 = high_resolution_clock::now(); - while (testReps--) { - BOOST_FOREACH (boost::tie(tameStr, wildStr), boost::combine(tameVec, wildVec)) { - allPassed_currentImplementation - &= wildcard_match_unsafe_case_sensitive(tameStr, wildStr); - } - } - t2 = high_resolution_clock::now(); - duration timeSpan_currentImplementation = t2 - t1; - - // Profile next best implementation - testReps = nReps; - t1 = high_resolution_clock::now(); - while (testReps--) { - // Replace this part with slow implementation - BOOST_FOREACH (boost::tie(tameStr, wildStr), boost::combine(tameVec, wildVec)) { - allPassed_currentImplementation - &= wildcard_match_unsafe_case_sensitive(tameStr, wildStr); - } - } - t2 = high_resolution_clock::now(); - duration timeSpan_nextBestImplementation = t2 - t1; - REQUIRE(allPassed_currentImplementation == true); - - if (allPassed_currentImplementation) { - cout << "Passed performance test in " << (timeSpan_currentImplementation.count() * 1000) - << " milliseconds." << endl; - } else { - cout << "Failed performance test in " << (timeSpan_currentImplementation.count() * 1000) - << " milliseconds." << endl; - } + cout << std::chrono::duration_cast(end_timestamp - begin_timestamp) + .count() + << " milliseconds." << endl; } TEST_CASE("convert_string_to_int", "[convert_string_to_int]") { From efc9ffcea65110e7d0afcbf5328e419f2b0c3ef6 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:18:55 -0400 Subject: [PATCH 02/11] Minor fixes. --- components/core/src/clp/string_utils/string_utils.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index 3653deb9c..d5fc6c005 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -39,8 +39,7 @@ inline bool advance_tame_to_next_match( ) { auto w = *wild_it; if ('?' != w) { - // No need to check for '*' since the caller ensures wild doesn't - // contain consecutive '*' + // No need to check for '*' since the caller ensures `wild` doesn't contain consecutive '*' // Handle escaped characters if ('\\' == w) { @@ -208,8 +207,8 @@ bool wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensiti * NOTE: * - Since the caller guarantees that there are no consecutive '*', we don't need to handle the * case where a group in `wild` is empty. - * - Since the caller guarantees that there every '\' is followed by a character, we can advance - * passed '\' without doing a subsequent bounds check. + * - Since the caller guarantees that every '\' is followed by a character, we can advance passed + * '\' without doing a subsequent bounds check. * - The second part of this method could be rewritten in the following form: * * ``` From 3200d7d48f86a9358117427a2bb9e6f4188cc4ee Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:23:12 -0400 Subject: [PATCH 03/11] Replace test log. --- components/core/tests/test-string_utils.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 965a8bada..b15f6e8ec 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -8,6 +8,7 @@ #include #include "FileReader.hpp" +#include "spdlog_with_specializations.hpp" using clp::string_utils::clean_up_wildcard_search_string; using clp::string_utils::convert_string_to_int; @@ -314,9 +315,11 @@ SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performa } auto end_timestamp = high_resolution_clock::now(); - cout << std::chrono::duration_cast(end_timestamp - begin_timestamp) + SPDLOG_INFO( + "wildcard_match_unsafe_case_sensitive performance test took {} milliseconds.", + std::chrono::duration_cast(end_timestamp - begin_timestamp) .count() - << " milliseconds." << endl; + ); } TEST_CASE("convert_string_to_int", "[convert_string_to_int]") { From b3d384e8ca8da0db6b9b00907e9ec8a331e854ba Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Fri, 7 Jun 2024 04:25:27 -0400 Subject: [PATCH 04/11] Update components/core/src/clp/string_utils/string_utils.cpp Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- components/core/src/clp/string_utils/string_utils.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index d5fc6c005..c1eb8c86b 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -50,8 +50,7 @@ inline bool advance_tame_to_next_match( // Advance `tame_it` until it matches `w` while (true) { - auto t = *tame_it; - if (t == w) { + if (*tame_it == w) { break; } ++tame_it; From 5a0ceae09686f692b61a706448c2f8f78be41989 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 03:38:08 -0400 Subject: [PATCH 05/11] tests: Add missing end-wildcard to test case template; Reduce test case complexity to reduce runtime. --- components/core/tests/test-string_utils.cpp | 71 ++++++++++++--------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index b15f6e8ec..1b05c116e 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -216,40 +216,51 @@ TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { // possibilities---to generate this variety. For each wildcard string, we also generate one or // more strings that can be matched by the wildcard string. - // The template below is meant to test 1-3 groups of WildcardStringAlphabets separated by '*'. + // The template below is meant to test 1-2 groups of WildcardStringAlphabets separated by '*'. // The groups allow contiguous repeats of all possible alphabets except '*' since // `wildcard_match_unsafe_case_sensitive` doesn't accept such wildcard strings. Each alphabet in // the template may be empty except at least one in each group (so we don't unintentionally // create two contiguous '*'). Overall, this should cover all matching cases. - for (size_t i = 0; i < 3; ++i) { - vector> template_wildcard_str{{ - WildcardStringAlphabet::Empty, - WildcardStringAlphabet::Asterisk, - }}; - for (size_t j = 0; j <= i; ++j) { - template_wildcard_str.push_back({ - WildcardStringAlphabet::Empty, - WildcardStringAlphabet::QuestionMark, - WildcardStringAlphabet::EscapedAsterisk, - WildcardStringAlphabet::EscapedQuestionMark, - WildcardStringAlphabet::EscapedBackslash, - WildcardStringAlphabet::AnyChar, - }); - template_wildcard_str.push_back({ - WildcardStringAlphabet::QuestionMark, - WildcardStringAlphabet::EscapedAsterisk, - WildcardStringAlphabet::EscapedQuestionMark, - WildcardStringAlphabet::EscapedBackslash, - WildcardStringAlphabet::AnyChar, - }); - template_wildcard_str.push_back({ - WildcardStringAlphabet::Empty, - WildcardStringAlphabet::QuestionMark, - WildcardStringAlphabet::EscapedAsterisk, - WildcardStringAlphabet::EscapedQuestionMark, - WildcardStringAlphabet::EscapedBackslash, - WildcardStringAlphabet::AnyChar, - }); + vector const nullable_asterisk_template{ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::Asterisk, + }; + vector const nullable_non_asterisk_template{ + WildcardStringAlphabet::Empty, + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }; + vector const non_asterisk_template{ + WildcardStringAlphabet::QuestionMark, + WildcardStringAlphabet::EscapedAsterisk, + WildcardStringAlphabet::EscapedQuestionMark, + WildcardStringAlphabet::EscapedBackslash, + WildcardStringAlphabet::AnyChar, + }; + vector> template_wildcard_str; + for (size_t i = 0; i < 2; ++i) { + if (0 == i) { + template_wildcard_str.emplace_back(nullable_asterisk_template); + template_wildcard_str.emplace_back(nullable_non_asterisk_template); + template_wildcard_str.emplace_back(non_asterisk_template); + template_wildcard_str.emplace_back(nullable_non_asterisk_template); + template_wildcard_str.push_back(nullable_asterisk_template); + } else { + // Insert "*" before the last asterisk + // NOTE: We insert in reverse since we're using the same iterator for all inserts + auto insert_pos_it = template_wildcard_str.end() - 1; + template_wildcard_str.insert(insert_pos_it, nullable_non_asterisk_template); + template_wildcard_str.insert(insert_pos_it, non_asterisk_template); + template_wildcard_str.insert(insert_pos_it, nullable_non_asterisk_template); + template_wildcard_str.insert( + insert_pos_it, + { + WildcardStringAlphabet::Asterisk, + } + ); } vector chosen_alphabets; From 7168c6e2f85d3ed5f7dd016ca45b1ca9369dc4c1 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 03:59:38 -0400 Subject: [PATCH 06/11] Clean-up test-string_utils.cpp. --- components/core/tests/test-string_utils.cpp | 46 ++++++++++----------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 1b05c116e..d707f6d67 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -1,5 +1,7 @@ +#include +#include +#include #include -#include #include #include #include @@ -14,10 +16,7 @@ using clp::string_utils::clean_up_wildcard_search_string; using clp::string_utils::convert_string_to_int; using clp::string_utils::wildcard_match_unsafe; using clp::string_utils::wildcard_match_unsafe_case_sensitive; -using std::chrono::duration; using std::chrono::high_resolution_clock; -using std::cout; -using std::endl; using std::span; using std::string; using std::string_view; @@ -28,7 +27,7 @@ namespace { * All possible alphabets that could appear in a wildcard string. Note that the alphabets are * conceptual (e.g. EscapedAsterisk) rather than concrete (e.g. "\\*"). */ -enum class WildcardStringAlphabet { +enum class WildcardStringAlphabet : uint8_t { Empty = 0, AnyChar, Asterisk, @@ -64,6 +63,7 @@ void generate_and_test_wildcard_str( string& wild ); +// NOLINTNEXTLINE(misc-no-recursion) void generate_and_test_tame_str( span chosen_alphabets, string_view wild, @@ -76,9 +76,9 @@ void generate_and_test_tame_str( return; } - size_t const tame_size_before_modification = tame.size(); + auto const tame_size_before_modification = tame.size(); auto alphabet = chosen_alphabets.front(); - auto next_chosen_alphabets = chosen_alphabets.subspan(1); + auto const next_chosen_alphabets = chosen_alphabets.subspan(1); switch (alphabet) { case WildcardStringAlphabet::Empty: generate_and_test_tame_str(next_chosen_alphabets, wild, tame); @@ -118,6 +118,7 @@ void generate_and_test_tame_str( tame.resize(tame_size_before_modification); } +// NOLINTNEXTLINE(misc-no-recursion) void generate_and_test_wildcard_str( span> template_wildcard_str, vector& chosen_alphabets, @@ -130,9 +131,9 @@ void generate_and_test_wildcard_str( return; } - size_t const wild_size_before_modification = wild.size(); + auto const wild_size_before_modification = wild.size(); - auto const& test_alphabet = *template_wildcard_str.begin(); + auto const& test_alphabet = template_wildcard_str.front(); for (auto alphabet : test_alphabet) { switch (alphabet) { case WildcardStringAlphabet::Empty: @@ -207,10 +208,6 @@ TEST_CASE("clean_up_wildcard_search_string", "[clean_up_wildcard_search_string]" } TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { - string tame1{"a?c"}; - string wild1{"*a\\??*"}; - REQUIRE(wildcard_match_unsafe_case_sensitive(tame1, wild1)); - // We want to test all varieties of wildcard strings and strings that can be matched by them. // We do this by using a kind of template wildcard string---where each character has a set of // possibilities---to generate this variety. For each wildcard string, we also generate one or @@ -271,38 +268,41 @@ TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { // We test non-matching cases using a tame string that matches a diverse wildcard string as // follows. We test that every substring (anchored at index 0) of tame doesn't match the // complete wildcard string. - string tame = "abcdef?*?ghixyz"; - string wild = R"(*a?c*\?\*\?*x?z*)"; + constexpr string_view tame{"abcdef?*?ghixyz"}; + constexpr string_view wild{R"(*a?c*\?\*\?*x?z*)"}; // Sanity-check that they match. REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); - for (size_t i = 1; i < tame.size(); ++i) { - REQUIRE(false == wildcard_match_unsafe_case_sensitive(string_view{tame.data(), i}, wild)); + auto const tame_begin_it = tame.cbegin(); + for (auto it = tame.cend() - 1; tame_begin_it != it; --it) { + REQUIRE(( + false == wildcard_match_unsafe_case_sensitive(string_view{tame_begin_it, it}, wild) + )); } } TEST_CASE("wildcard_match_unsafe", "[wildcard]") { - string tame{"0!2#4%6&8(aBcDeFgHiJkLmNoPqRsTuVwXyZ"}; + constexpr string_view tame{"0!2#4%6&8(aBcDeFgHiJkLmNoPqRsTuVwXyZ"}; string wild; wild = "0!2#4%6&8(AbCdEfGhIjKlMnOpQrStUvWxYz"; REQUIRE(wildcard_match_unsafe(tame, wild, false)); - REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); wild = "0?2?4?6?8?A?C?E?G?I?K?M?O?Q?S?U?W?Y?"; REQUIRE(wildcard_match_unsafe(tame, wild, false)); - REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); wild = "?!?#?%?&?(?b?d?f?h?j?l?n?p?r?t?v?x?z"; REQUIRE(wildcard_match_unsafe(tame, wild, false)); - REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); wild = "*?b?d?f?h?j?l?n?p?r?t?v?x?z*"; REQUIRE(wildcard_match_unsafe(tame, wild, false)); - REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); wild = "*?A?C?E?G?I?K?M?O?Q?S?U?W?Y?*"; REQUIRE(wildcard_match_unsafe(tame, wild, false)); - REQUIRE(false == wildcard_match_unsafe(tame, wild, true)); + REQUIRE((false == wildcard_match_unsafe(tame, wild, true))); } SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performance]") { From 15090620eb32ab1ccab991da87f7b80b2f959a76 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 04:02:35 -0400 Subject: [PATCH 07/11] Further clean-up test-string_utils.cpp. --- components/core/tests/test-string_utils.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index d707f6d67..dcd1986d0 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -307,7 +307,7 @@ TEST_CASE("wildcard_match_unsafe", "[wildcard]") { SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performance]") { auto const tests_dir = std::filesystem::path{__FILE__}.parent_path(); - auto log_file_path = tests_dir / "test_network_reader_src" / "random.log"; + auto const log_file_path = tests_dir / "test_network_reader_src" / "random.log"; clp::FileReader file_reader; file_reader.open(log_file_path.string()); @@ -318,13 +318,13 @@ SCENARIO("wildcard_match_unsafe_case_sensitive performance", "[wildcard performa } file_reader.close(); - auto begin_timestamp = high_resolution_clock::now(); + auto const begin_timestamp = high_resolution_clock::now(); for (size_t i = 0; i < 10'000; ++i) { for (auto const& tame : lines) { wildcard_match_unsafe_case_sensitive(tame, "*to*blk_1073742594_1770*"); } } - auto end_timestamp = high_resolution_clock::now(); + auto const end_timestamp = high_resolution_clock::now(); SPDLOG_INFO( "wildcard_match_unsafe_case_sensitive performance test took {} milliseconds.", From 8dd8dfe73a70a1e554c6257a70a41a53058baa59 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 05:19:03 -0400 Subject: [PATCH 08/11] Print output on failed non-match case. --- components/core/tests/test-string_utils.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index dcd1986d0..6f8f74024 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -274,9 +274,9 @@ TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); auto const tame_begin_it = tame.cbegin(); for (auto it = tame.cend() - 1; tame_begin_it != it; --it) { - REQUIRE(( - false == wildcard_match_unsafe_case_sensitive(string_view{tame_begin_it, it}, wild) - )); + auto const tame_substr = string_view{tame_begin_it, it}; + INFO("tame: \"" << tame_substr << "\", wild: \"" << wild << "\""); + REQUIRE((false == wildcard_match_unsafe_case_sensitive(tame_substr, wild))); } } From bc030a3b83474095ec45379d4b346354fad0916f Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 05:20:18 -0400 Subject: [PATCH 09/11] Address most code review concerns on string_utils.cpp. --- .../src/clp/string_utils/string_utils.cpp | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index c1eb8c86b..239058e31 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -22,7 +22,7 @@ namespace { * @param tame_it Returns `tame`'s updated iterator. * @param tame_bookmark_it Returns `tame`'s updated bookmark. * @param wild_it Returns `wild`'s updated iterator. - * @return true on success, false if `tame` cannot match `wild`. + * @return Whether `tame` might be able to match `wild`. */ inline bool advance_tame_to_next_match( string_view::const_iterator tame_end_it, @@ -31,6 +31,16 @@ inline bool advance_tame_to_next_match( string_view::const_iterator& wild_it ); +/** + * Helper for `wildcard_match_unsafe_case_sensitive` to determine if the given iterator points to + * the end of `wild`, or the second-last character of `wild` if`wild` ends with a '*'. + * @param wild_it + * @param wild_end_it + * @return Whether the match has reached the end of `tame` and `wild`. + */ +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it +); + inline bool advance_tame_to_next_match( string_view::const_iterator tame_end_it, string_view::const_iterator& tame_it, @@ -64,6 +74,11 @@ inline bool advance_tame_to_next_match( return true; } + +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it +) { + return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); +} } // namespace size_t find_first_of( @@ -204,6 +219,8 @@ bool wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensiti * `wild`, the entire wildcard match fails. * * NOTE: + * - This method is on the critical path for searches in clg/clp-s/glt, so any modifications must be + * benchmarked to ensure performance is not significantly affected. * - Since the caller guarantees that there are no consecutive '*', we don't need to handle the * case where a group in `wild` is empty. * - Since the caller guarantees that every '\' is followed by a character, we can advance passed @@ -226,13 +243,6 @@ bool wildcard_match_unsafe(string_view tame, string_view wild, bool case_sensiti * However, this form is ~2% slower. */ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { - auto tame_it = tame.cbegin(); - auto wild_it = wild.cbegin(); - auto const tame_end_it = tame.cend(); - auto const wild_end_it = wild.cend(); - string_view::const_iterator tame_bookmark_it{}; - string_view::const_iterator wild_bookmark_it{}; - // Handle `tame` or `wild` being empty if (wild.empty()) { return tame.empty(); @@ -241,6 +251,13 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { return "*" == wild; } + auto tame_it = tame.cbegin(); + auto wild_it = wild.cbegin(); + auto const tame_end_it = tame.cend(); + auto const wild_end_it = wild.cend(); + string_view::const_iterator tame_bookmark_it{}; + string_view::const_iterator wild_bookmark_it{}; + // Match `tame` against `wild` against until we reach the first '*' in `wild` or they no longer // match while (true) { @@ -266,10 +283,12 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { ++wild_it; // Handle boundary conditions + // NOTE: The bodies of these if-blocks depend on the order of these conditions. if (tame_end_it == tame_it) { - return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); - } else if (wild_end_it == wild_it) { - return tame_end_it == tame_it; + return is_end_of_wild(wild_it, wild_end_it); + } + if (wild_end_it == wild_it) { + return false; } } @@ -318,13 +337,11 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { ++wild_it; // Handle boundary conditions + // NOTE: The bodies of these if-blocks depend on the order of these conditions. if (tame_end_it == tame_it) { - return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); - } else if (wild_end_it == wild_it) { - if (tame_end_it == tame_it) { - return true; - } - + return is_end_of_wild(wild_it, wild_end_it); + } + if (wild_end_it == wild_it) { // Reset to bookmarks tame_it = tame_bookmark_it + 1; wild_it = wild_bookmark_it; From 901549b6f697fef69fc9a07f7e2bc079969c4dff Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 05:32:38 -0400 Subject: [PATCH 10/11] Fix formatting. --- components/core/src/clp/string_utils/string_utils.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index 239058e31..ff25da6de 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -38,8 +38,7 @@ inline bool advance_tame_to_next_match( * @param wild_end_it * @return Whether the match has reached the end of `tame` and `wild`. */ -bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it -); +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it); inline bool advance_tame_to_next_match( string_view::const_iterator tame_end_it, @@ -75,8 +74,7 @@ inline bool advance_tame_to_next_match( return true; } -bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it -) { +bool is_end_of_wild(string_view::const_iterator wild_it, string_view::const_iterator wild_end_it) { return (wild_end_it == wild_it) || (wild_end_it == wild_it + 1 && '*' == *wild_it); } } // namespace From 9258dd4d41e18675990fca34dec5955acd89fb5e Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 10 Jun 2024 10:20:20 -0400 Subject: [PATCH 11/11] Try matching iterator constness to fix macOS build. --- components/core/tests/test-string_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 6f8f74024..091e385b1 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -272,7 +272,7 @@ TEST_CASE("wildcard_match_unsafe_case_sensitive", "[wildcard]") { constexpr string_view wild{R"(*a?c*\?\*\?*x?z*)"}; // Sanity-check that they match. REQUIRE(wildcard_match_unsafe_case_sensitive(tame, wild)); - auto const tame_begin_it = tame.cbegin(); + auto tame_begin_it = tame.cbegin(); for (auto it = tame.cend() - 1; tame_begin_it != it; --it) { auto const tame_substr = string_view{tame_begin_it, it}; INFO("tame: \"" << tame_substr << "\", wild: \"" << wild << "\"");