[core] Replace SmallMap

Signed-off-by: x7z4w <x7z4w@noreply.codeberg.org>
This commit is contained in:
x7z4w
2025-11-24 17:34:56 +00:00
committed by Konstantin Pastbin
parent 969e1ef2da
commit 882dccb87d
25 changed files with 162 additions and 742 deletions

View File

@@ -65,8 +65,6 @@ set(SRC
set_operations.hpp
shared_buffer_manager.cpp
shared_buffer_manager.hpp
small_map.hpp
small_set.hpp
src_point.cpp
src_point.hpp
stats.hpp

View File

@@ -33,7 +33,6 @@ set(SRC
regexp_test.cpp
rolling_hash_test.cpp
scope_guard_test.cpp
small_set_test.cpp
stl_helpers_tests.cpp
string_utils_test.cpp
suffix_array_tests.cpp

View File

@@ -1,270 +0,0 @@
#include "testing/testing.hpp"
#include "base/small_map.hpp"
#include "base/small_set.hpp"
#include "base/timer.hpp"
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>
#include "3party/ankerl/unordered_dense.h"
namespace small_set_test
{
using namespace base;
// Smoke test for SmallSet<300>: checks Size()/Contains() after inserts (including a duplicate
// insert), a removal, and the order in which iteration yields the stored values.
UNIT_TEST(SmallSet_Smoke)
{
  SmallSet<300> values;

  // A freshly constructed set is empty.
  TEST_EQUAL(values.Size(), 0, ());
  for (uint64_t candidate = 0; candidate < 300; ++candidate)
    TEST(!values.Contains(candidate), ());

  values.Insert(0);
  TEST_EQUAL(values.Size(), 1, ());
  TEST(values.Contains(0), ());

  // Inserting the same value again must not change the size.
  values.Insert(0);
  TEST_EQUAL(values.Size(), 1, ());
  TEST(values.Contains(0), ());

  values.Insert(5);
  TEST_EQUAL(values.Size(), 2, ());
  TEST(values.Contains(0), ());
  TEST(values.Contains(5), ());

  values.Insert(64);
  TEST_EQUAL(values.Size(), 3, ());
  TEST(values.Contains(0), ());
  TEST(values.Contains(5), ());
  TEST(values.Contains(64), ());

  // Manual iteration yields exactly the inserted values, smallest first.
  {
    auto it = values.begin();
    auto last = values.end();
    for (uint64_t expected : {0, 5, 64})
    {
      TEST(it != last, ());
      TEST_EQUAL(*it, expected, ());
      ++it;
    }
    TEST(it == last, ());
  }

  values.Remove(5);
  TEST_EQUAL(values.Size(), 2, ());
  TEST(values.Contains(0), ());
  TEST(!values.Contains(5), ());
  TEST(values.Contains(64), ());

  // Values at the very top of the allowed range.
  values.Insert(297);
  values.Insert(298);
  values.Insert(299);
  TEST_EQUAL(values.Size(), 5, ());

  {
    std::vector<uint64_t> const collected(values.begin(), values.end());
    std::vector<uint64_t> const wanted = {0, 64, 297, 298, 299};
    TEST_EQUAL(collected, wanted, ());
  }

  TEST_EQUAL(values.Size(), std::distance(values.begin(), values.end()), ());
}
/// Tells whether |l| is less than |r|, or exceeds it by a relative amount
/// (measured as a fraction of |l|) that is below |relativeTolerance|.
///
/// @param l                 Value under test (e.g. an elapsed time).
/// @param r                 Reference value to compare against.
/// @param relativeTolerance Upper bound (exclusive) for (l - r) / l when l >= r.
/// @return true if l < r or the relative excess of l over r is below the tolerance.
bool BenchmarkTimeLessOrNear(uint64_t l, uint64_t r, double relativeTolerance)
{
  if (l < r)
    return true;

  // From here on l >= r, so the unsigned subtraction below cannot wrap around.
  // l == 0 implies r == 0: the values are identical, so the relative excess is zero.
  // Computing it as 0 / 0.0 would give NaN, and every comparison with NaN is false.
  double const relativeExcess = (l == 0) ? 0.0 : static_cast<double>(l - r) / static_cast<double>(l);
  return relativeExcess < relativeTolerance;
}
#ifndef DEBUG
/// Generates |count| pseudo-random values uniformly distributed over the closed range [min, max].
///
/// The engine is seeded from std::random_device, so the sequence differs between runs.
///
/// @param min   Smallest value that may be produced.
/// @param max   Largest value that may be produced; must not be less than |min|
///              (a precondition of std::uniform_int_distribution).
/// @param count Number of values to generate; defaults to the 10000000 used by the benchmarks.
/// @return Vector holding exactly |count| values.
std::vector<uint32_t> GenerateIndices(uint32_t min, uint32_t max, size_t count = 10000000)
{
  std::vector<uint32_t> res;
  // A single allocation up front instead of repeated regrowth while filling.
  res.reserve(count);

  // Produce uint32_t directly so that no implicit narrowing happens on push_back().
  std::uniform_int_distribution<uint32_t> randDist(min, max);
  std::random_device randDevice;
  std::mt19937 randEngine(randDevice());
  for (size_t i = 0; i < count; ++i)
    res.push_back(randDist(randEngine));
  return res;
}
// Benchmark: key lookup in ankerl::unordered_dense::map versus base::SmallMap on a small
// integer-keyed dataset, driven by the random keys returned by GenerateIndices(1, 1054).
// The test fails when the two containers disagree on the number of hits, or when
// BenchmarkTimeLessOrNear(t2, t1, 0.3) rejects the SmallMap time |t2| against the hash map time |t1|.
// The test sits inside an #ifndef DEBUG section, so it is not compiled when DEBUG is defined.
UNIT_TEST(SmallMap_Benchmark1)
{
// 1. Init maps.
// Dataset is similar to routing::VehicleModel.
ankerl::unordered_dense::map<uint32_t, bool> uMap = {
{1, true}, {2, false}, {4, false}, {6, true}, {7, true}, {8, true}, {12, false},
{15, false}, {26, true}, {30, false}, {36, false}, {43, false}, {54, false}, {57, true},
{58, true}, {65, true}, {69, true}, {90, true}, {95, false}, {119, false}, {167, true},
{176, false}, {259, true}, {272, false}, {994, true}, {1054, false}};
// The SmallMap under test is built from the very same key/value pairs.
base::SmallMap<uint32_t, bool> sMap(uMap.begin(), uMap.end());
// 2. Generate indices.
// Keys are drawn from [1, 1054], so most lookups miss: only 26 keys are present.
std::vector<uint32_t> indices = GenerateIndices(1, 1054);
// Elapsed milliseconds for the hash map (t1) and for SmallMap (t2).
uint64_t t1, t2;
// Number of successful lookups per container; must be equal at the end.
uint32_t sum1 = 0, sum2 = 0;
// 3. Run unordered_map (here: ankerl::unordered_dense::map).
{
base::HighResTimer timer;
for (auto i : indices)
sum1 += (uMap.find(i) != uMap.end() ? 1 : 0);
t1 = timer.ElapsedMilliseconds();
}
// 4. Run SmallMap.
{
base::HighResTimer timer;
for (auto i : indices)
sum2 += (sMap.Find(i) ? 1 : 0);
t2 = timer.ElapsedMilliseconds();
}
// Both containers must report the same number of hits.
TEST_EQUAL(sum1, sum2, ());
// At this moment, we have rare t2 > t1 on Linux CI.
// Hence the check is tolerant: t2 may exceed t1 as long as (t2 - t1) / t2 stays below 0.3.
TEST(BenchmarkTimeLessOrNear(t2, t1, 0.3), (t2, t1));
LOG(LINFO, ("unordered_map time =", t1, "SmallMap time =", t2));
}
// Benchmark: string-key lookup in ankerl::unordered_dense::map versus base::SmallMap.
// Every key of the dataset is queried together with three derived keys ("_Foo", "_Bar", "_Bazz"
// suffixes) that are not inserted into the maps.
// The test fails when the accumulated sums differ, or when the hash map time |t1| is not strictly
// less than the SmallMap time |t2|.
// The test sits inside an #ifndef DEBUG section, so it is not compiled when DEBUG is defined.
UNIT_TEST(SmallMap_Benchmark2)
{
using namespace std;
// Running counter that gives each entry a distinct payload value.
uint32_t i = 0;
// 1. Init maps.
// Dataset is similar to routing::VehicleModelFactory.
ankerl::unordered_dense::map<string, shared_ptr<int>> uMap = {
{"", make_shared<int>(i++)},
{"Australia", make_shared<int>(i++)},
{"Austria", make_shared<int>(i++)},
{"Belarus", make_shared<int>(i++)},
{"Belgium", make_shared<int>(i++)},
{"Brazil", make_shared<int>(i++)},
{"Denmark", make_shared<int>(i++)},
{"France", make_shared<int>(i++)},
{"Finland", make_shared<int>(i++)},
{"Germany", make_shared<int>(i++)},
{"Hungary", make_shared<int>(i++)},
{"Iceland", make_shared<int>(i++)},
{"Netherlands", make_shared<int>(i++)},
{"Norway", make_shared<int>(i++)},
{"Oman", make_shared<int>(i++)},
{"Poland", make_shared<int>(i++)},
{"Romania", make_shared<int>(i++)},
{"Russian Federation", make_shared<int>(i++)},
{"Slovakia", make_shared<int>(i++)},
{"Spain", make_shared<int>(i++)},
{"Switzerland", make_shared<int>(i++)},
{"Turkey", make_shared<int>(i++)},
{"Ukraine", make_shared<int>(i++)},
{"United Kingdom", make_shared<int>(i++)},
{"United States of America", make_shared<int>(i++)},
};
// The SmallMap is built from the same pairs, so both maps point at the same int objects.
base::SmallMap<std::string, std::shared_ptr<int>> sMap(uMap.begin(), uMap.end());
// 2. Generate indices.
// |keys| holds each dataset key followed by three suffixed variants of it.
std::vector<std::string> keys;
for (auto const & e : uMap)
{
keys.push_back(e.first);
keys.push_back(e.first + "_Foo");
keys.push_back(e.first + "_Bar");
keys.push_back(e.first + "_Bazz");
}
// Random positions into |keys|; both bounds are inclusive.
std::vector<uint32_t> indices = GenerateIndices(0, keys.size() - 1);
// Elapsed milliseconds for the hash map (t1) and for SmallMap (t2).
uint64_t t1, t2;
// Sums of the payloads of all found entries; must be equal at the end.
uint32_t sum1 = 0, sum2 = 0;
// 3. Run unordered_map (here: ankerl::unordered_dense::map).
{
base::HighResTimer timer;
// Note: this loop variable shadows the counter |i| declared at the top of the test.
for (auto i : indices)
{
auto const it = uMap.find(keys[i]);
if (it != uMap.end())
sum1 += *it->second;
}
t1 = timer.ElapsedMilliseconds();
}
// 4. Run SmallMap.
{
base::HighResTimer timer;
for (auto i : indices)
{
// Find() returns a pointer to the stored shared_ptr, or nullptr when the key is absent.
auto const * p = sMap.Find(keys[i]);
if (p)
sum2 += **p;
}
t2 = timer.ElapsedMilliseconds();
}
// Both containers must have found the same entries.
TEST_EQUAL(sum1, sum2, ());
// std::hash(std::string) is better than std::less(std::string)
// so the hash map is expected to be strictly faster here.
TEST_LESS(t1, t2, ());
LOG(LINFO, ("unordered_map time =", t1, "SmallMap time =", t2));
}
// The small 4-element sample doesn't work with newer toolchains (gcc11+, clang14+).
/*
UNIT_TEST(SmallMap_Benchmark3)
{
// Dataset is similar to routing::VehicleModel.m_surfaceFactors.
ankerl::unordered_dense::map<int, int> uMap = {
{1, 0}, {10, 1}, {100, 2}, {1000, 3},
};
base::SmallMap<int, int> sMap(uMap.begin(), uMap.end());
base::SmallMapBase<int, int> sbMap(uMap.begin(), uMap.end());
std::vector<uint32_t> indices = GenerateIndices(0, 3);
// Missing key queries are even worse for the std map.
std::vector<int> keys;
for (auto const & e : uMap)
keys.push_back(e.first);
uint64_t t1, t2, t3;
uint32_t sum1 = 0, sum2 = 0, sum3 = 0;
// 3. Run unordered_map.
{
base::HighResTimer timer;
for (auto i : indices)
sum1 += uMap.find(keys[i])->second;
t1 = timer.ElapsedMilliseconds();
}
// 4. Run SmallMap.
{
base::HighResTimer timer;
for (auto i : indices)
sum2 += *sMap.Find(keys[i]);
t2 = timer.ElapsedMilliseconds();
}
// 5. Run SmallMapBase.
{
base::HighResTimer timer;
for (auto i : indices)
sum3 += *sbMap.Find(keys[i]);
t3 = timer.ElapsedMilliseconds();
}
TEST_EQUAL(sum1, sum2, ());
TEST_EQUAL(sum1, sum3, ());
TEST_LESS(t2, t1, ());
TEST(BenchmarkTimeLessOrNear(t3, t2, 0.05), (t3, t2));
LOG(LINFO, ("unordered_map time =", t1, "SmallMap time =", t2, "SmallMapBase time =", t3));
}
*/
#endif
} // namespace small_set_test

View File

@@ -1,100 +0,0 @@
#pragma once
#include "assert.hpp"
#include <vector>
namespace base
{
/// Minimal key/value container that keeps its entries in a plain vector, in insertion order,
/// and looks keys up with a linear scan.
///
/// Consider using as a replacement of unordered_map (map) when:
/// - very small amount of elements (<8)
template <class Key, class Value>
class SmallMapBase
{
public:
  using ValueType = std::pair<Key, Value>;

  SmallMapBase() = default;

  /// Copies the entries of |init| in the given order.
  SmallMapBase(std::initializer_list<ValueType> init) : m_map(init) {}

  /// Copies the entries of the range [beg, end) in the given order.
  template <class Iter>
  SmallMapBase(Iter beg, Iter end) : m_map(beg, end)
  {}

  /// Two maps are equal when they hold equal entries in the same order.
  bool operator==(SmallMapBase const & rhs) const { return m_map == rhs.m_map; }

  void Reserve(size_t count) { m_map.reserve(count); }

  /// Appends a new entry; no check for an already existing key is made.
  void Insert(Key k, Value v) { m_map.emplace_back(std::move(k), std::move(v)); }

  /// @return Pointer to the value of the first entry whose key equals |k|, nullptr if there is none.
  Value const * Find(Key const & k) const
  {
    for (auto const & [key, value] : m_map)
    {
      if (key == k)
        return &value;
    }
    return nullptr;
  }

  size_t size() const { return m_map.size(); }

  typename std::vector<ValueType>::const_iterator begin() const { return m_map.begin(); }
  typename std::vector<ValueType>::const_iterator end() const { return m_map.end(); }

protected:
  /// @todo buffer_vector is not suitable now, because Key/Value is not default constructible.
  std::vector<ValueType> m_map;
};
/// SmallMapBase whose entries are kept sorted by key, so that lookups use binary search.
/// The constructors sort the entries; entries appended later through the inherited Insert()
/// are not sorted until FinishBuilding() is called again.
///
/// Consider using as a replacement of unordered_map (map) when:
/// - initialize and don't modify
/// - relatively small amount of elements (8-128)
template <class Key, class Value>
class SmallMap : public SmallMapBase<Key, Value>
{
  using BaseT = SmallMapBase<Key, Value>;

public:
  using ValueType = typename BaseT::ValueType;

  SmallMap() = default;

  SmallMap(std::initializer_list<ValueType> init) : BaseT(init) { FinishBuilding(); }

  template <class Iter>
  SmallMap(Iter beg, Iter end) : BaseT(beg, end)
  {
    FinishBuilding();
  }

  /// Sorts the stored entries by key; required before Find(), Replace() or Get() can be used.
  void FinishBuilding()
  {
    auto const byKey = [](ValueType const & lhs, ValueType const & rhs) { return lhs.first < rhs.first; };
    std::sort(this->m_map.begin(), this->m_map.end(), byKey);
  }

  /// @return Pointer to the value stored under |k|, nullptr if the key is absent.
  Value const * Find(Key const & k) const
  {
    auto const keyLess = [](ValueType const & entry, Key const & key) { return entry.first < key; };
    auto const last = this->m_map.cend();
    auto const pos = std::lower_bound(this->m_map.cbegin(), last, k, keyLess);

    bool const found = pos != last && pos->first == k;
    return found ? &(pos->second) : nullptr;
  }

  /// Overwrites the value stored under |k|; the key is asserted to be present.
  void Replace(Key const & k, Value v)
  {
    auto const keyLess = [](ValueType const & entry, Key const & key) { return entry.first < key; };
    auto const last = this->m_map.end();
    auto const pos = std::lower_bound(this->m_map.begin(), last, k, keyLess);

    ASSERT(pos != last && pos->first == k, ());
    pos->second = std::move(v);
  }

  /// @return Reference to the value stored under |k|; the key is asserted to be present.
  Value const & Get(Key const & k) const
  {
    Value const * const found = Find(k);
    ASSERT(found, ());
    return *found;
  }
};
} // namespace base

View File

@@ -1,220 +0,0 @@
#pragma once
#include "base/assert.hpp"
#include "base/bits.hpp"
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
namespace base
{
// A set of nonnegative integers less than |UpperBound|, stored as a bitmap of 64-bit blocks.
//
// Requires UpperBound + O(1) bits of memory. All operations except
// Clear() and iteration are O(1). Clear() and iteration require
// O(UpperBound) steps.
//
// *NOTE* This class *IS NOT* thread safe.
template <uint64_t UpperBound>
class SmallSet
{
public:
  // Number of 64-bit blocks needed to hold |UpperBound| bits.
  static uint64_t constexpr kNumBlocks = (UpperBound + 63) / 64;
  static_assert(kNumBlocks > 0);

  // Read-only forward iterator that yields the stored values in increasing order.
  //
  // It keeps a private copy of the block it is currently walking and clears bits in that copy as
  // it advances, so the set itself is never modified by iteration.
  class Iterator
  {
  public:
    // NOTE(review): the standard iterator requirements ask for a signed difference_type.
    // It is kept unsigned here because existing callers may compare std::distance() with Size().
    using difference_type = uint64_t;
    using value_type = uint64_t;
    using pointer = void;
    using reference = uint64_t;
    using iterator_category = std::forward_iterator_tag;

    // Positions the iterator at the first value stored in block |current_block_index| or later.
    // |current_block_index| == kNumBlocks denotes the end iterator.
    Iterator(uint64_t const * blocks, uint64_t current_block_index)
      : m_blocks(blocks)
      , m_current_block_index(current_block_index)
      , m_current_block(0)
    {
      ASSERT_LESS_OR_EQUAL(current_block_index, kNumBlocks, ());
      if (current_block_index < kNumBlocks)
        m_current_block = m_blocks[current_block_index];
      SkipZeroes();
    }

    bool operator==(Iterator const & rhs) const
    {
      return m_blocks == rhs.m_blocks && m_current_block_index == rhs.m_current_block_index &&
             m_current_block == rhs.m_current_block;
    }

    bool operator!=(Iterator const & rhs) const { return !(*this == rhs); }

    // Returns the value the iterator points at; must not be called on the end iterator.
    uint64_t operator*() const
    {
      ASSERT_NOT_EQUAL(m_current_block, 0, ());
      // x & -x isolates the lowest set bit of the remaining block.
      auto const bit = m_current_block & -m_current_block;
      // bits::FloorLog() is expected to map that single-bit value to its bit position.
      return bits::FloorLog(bit) + m_current_block_index * 64;
    }

    // Pre-increment. Returns a mutable reference, as the iterator requirements demand.
    Iterator & operator++()
    {
      ASSERT(m_current_block_index < kNumBlocks, ());
      ASSERT_NOT_EQUAL(m_current_block, 0, ());
      // x & (x - 1) clears the lowest set bit, i.e. drops the value just visited.
      m_current_block = m_current_block & (m_current_block - 1);
      SkipZeroes();
      return *this;
    }

    // Post-increment, required for a forward iterator (e.g. for `*it++`).
    Iterator operator++(int)
    {
      Iterator const previous = *this;
      ++*this;
      return previous;
    }

  private:
    // If the current block has no bits left, moves on to the next non-empty block,
    // or to the end position when there is none.
    void SkipZeroes()
    {
      ASSERT_LESS_OR_EQUAL(m_current_block_index, kNumBlocks, ());

      if (m_current_block != 0 || m_current_block_index == kNumBlocks)
        return;

      do
        ++m_current_block_index;
      while (m_current_block_index < kNumBlocks && m_blocks[m_current_block_index] == 0);

      if (m_current_block_index < kNumBlocks)
        m_current_block = m_blocks[m_current_block_index];
      else
        m_current_block = 0;
    }

    uint64_t const * m_blocks;
    uint64_t m_current_block_index;
    // Bits of the current block that have not been visited yet.
    uint64_t m_current_block;
  };

  // Adds |value| to the set; |value| is asserted to be less than |UpperBound|.
  // This invalidates all iterators except end().
  void Insert(uint64_t value)
  {
    ASSERT_LESS(value, UpperBound, ());
    uint64_t & block = m_blocks[BlockIndex(value)];
    uint64_t const bit = BitMask(value);
    // The size grows only when the bit was not set before.
    m_size += (block & bit) == 0;
    block |= bit;
  }

  // Removes |value| from the set; |value| is asserted to be less than |UpperBound|.
  // This invalidates all iterators except end().
  void Remove(uint64_t value)
  {
    ASSERT_LESS(value, UpperBound, ());
    uint64_t & block = m_blocks[BlockIndex(value)];
    uint64_t const bit = BitMask(value);
    // The size shrinks only when the bit was actually set.
    m_size -= (block & bit) != 0;
    block &= ~bit;
  }

  // Tells whether |value| is in the set; |value| is asserted to be less than |UpperBound|.
  bool Contains(uint64_t value) const
  {
    ASSERT_LESS(value, UpperBound, ());
    return (m_blocks[BlockIndex(value)] & BitMask(value)) != 0;
  }

  uint64_t Size() const { return m_size; }

  // This invalidates all iterators except end().
  void Clear()
  {
    // A plain loop keeps this header independent of <algorithm>.
    for (uint64_t & block : m_blocks)
      block = 0;
    m_size = 0;
  }

  Iterator begin() const { return Iterator(m_blocks, 0); }
  Iterator cbegin() const { return Iterator(m_blocks, 0); }

  Iterator end() const { return Iterator(m_blocks, kNumBlocks); }
  Iterator cend() const { return Iterator(m_blocks, kNumBlocks); }

private:
  static uint64_t constexpr kOne = 1;

  // Index of the 64-bit block that holds the bit for |value|.
  static constexpr uint64_t BlockIndex(uint64_t value) { return value / 64; }
  // Mask with only the bit for |value| set, relative to its block.
  static constexpr uint64_t BitMask(uint64_t value) { return kOne << (value % 64); }

  uint64_t m_blocks[kNumBlocks] = {};
  uint64_t m_size = 0;
};
// Out-of-class definitions of the static constexpr data members of SmallSet.
// NOTE(review): since C++17 static constexpr data members are implicitly inline, which makes
// these redeclarations redundant (and deprecated) — confirm the minimum supported language
// standard before removing them.
// static
template <uint64_t UpperBound>
uint64_t constexpr SmallSet<UpperBound>::kNumBlocks;
// static
template <uint64_t UpperBound>
uint64_t constexpr SmallSet<UpperBound>::kOne;
// Renders |set| as "SmallSet<UpperBound> [size: v1 v2 ... ]"; every value is followed by a space.
template <uint64_t UpperBound>
std::string DebugPrint(SmallSet<UpperBound> const & set)
{
  std::ostringstream out;
  out << "SmallSet<" << UpperBound << "> [" << set.Size() << ": ";

  auto const last = set.end();
  for (auto it = set.begin(); it != last; ++it)
    out << *it << " ";

  out << "]";
  return out.str();
}
// Wrapper around SmallSet<> that validates the argument of Insert(), Remove() and Contains():
// a value that is not less than |UpperBound| is ignored by Insert() and Remove() and is reported
// as absent by Contains(). Everything else is forwarded to the wrapped set.
template <uint64_t UpperBound>
class SafeSmallSet
{
public:
  using Set = SmallSet<UpperBound>;
  using Iterator = typename Set::Iterator;

  void Insert(uint64_t value)
  {
    if (!IsValid(value))
      return;
    m_set.Insert(value);
  }

  void Remove(uint64_t value)
  {
    if (!IsValid(value))
      return;
    m_set.Remove(value);
  }

  bool Contains(uint64_t value) const
  {
    if (!IsValid(value))
      return false;
    return m_set.Contains(value);
  }

  uint64_t Size() const { return m_set.Size(); }

  void Clear() { m_set.Clear(); }

  Iterator begin() const { return m_set.begin(); }
  Iterator cbegin() const { return m_set.cbegin(); }

  Iterator end() const { return m_set.end(); }
  Iterator cend() const { return m_set.cend(); }

private:
  // Only values below |UpperBound| may be passed on to the wrapped set.
  bool IsValid(uint64_t value) const { return value < UpperBound; }

  Set m_set;
};
// Renders |set| as "SafeSmallSet<UpperBound> [size: v1 v2 ... ]"; every value is followed by a space.
template <uint64_t UpperBound>
std::string DebugPrint(SafeSmallSet<UpperBound> const & set)
{
  std::ostringstream out;
  out << "SafeSmallSet<" << UpperBound << "> [" << set.Size() << ": ";

  auto const last = set.end();
  for (auto it = set.begin(); it != last; ++it)
    out << *it << " ";

  out << "]";
  return out.str();
}
} // namespace base