VaneDB 0.1.0
Embeddable vector database for edge AI
Loading...
Searching...
No Matches
mmap_vector_store.h
Go to the documentation of this file.
1// VaneDB - Copyright (c) 2025 Anton Tsvetkov - MIT License
2#pragma once
3
4#if defined(_WIN32) || defined(_WIN64)
5#define VANEDB_WINDOWS 1
6#ifndef NOMINMAX
7#define NOMINMAX // Prevent Windows.h from defining min/max macros
8#endif
9#include <windows.h>
10#else
11#define VANEDB_POSIX 1
12#include <fcntl.h>
13#include <sys/mman.h>
14#include <sys/stat.h>
15#include <unistd.h>
16#endif
17
18#include "detail/file_utils.h"
19#include "distance_strategy.h"
20#include "vector_store.h"
21#include <algorithm>
22#include <cstddef>
23#include <cstdint>
24#include <cstdio>
25#include <cstring>
26#include <fstream>
27#include <limits>
28#include <stdexcept>
29#include <string>
30#include <unordered_map>
31#include <unordered_set>
32#include <vector>
33
34namespace vanedb {
35
37public:
// Signature expected in the first 4 bytes of a store file; the constructor
// rejects files whose first word differs ("Invalid magic").
38 static constexpr uint32_t MAGIC = 0x42445651;
// Format version expected in header bytes 4..7; any other value is rejected
// ("Unsupported version").
39 static constexpr uint32_t VERSION = 1;
// Size in bytes of the fixed header that precedes the id array:
// 4 (magic) + 4 (version) + 8 (dimension) + 8 (count) + 4 (metric) + 4 (reserved).
40 static constexpr size_t HEADER_SIZE = 32;
41
42 explicit MMapVectorStore(const std::string& filename) {
43#ifdef VANEDB_WINDOWS
44 file_handle_ = CreateFileA(filename.c_str(), GENERIC_READ, FILE_SHARE_READ,
45 nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
46 if (file_handle_ == INVALID_HANDLE_VALUE) throw std::runtime_error("Cannot open: " + filename);
47 LARGE_INTEGER sz;
48 if (!GetFileSizeEx(file_handle_, &sz)) { CloseHandle(file_handle_); file_handle_ = INVALID_HANDLE_VALUE;
49 throw std::runtime_error("Cannot get file size"); }
50 file_size_ = static_cast<size_t>(sz.QuadPart);
51 if (file_size_ < HEADER_SIZE) { CloseHandle(file_handle_); file_handle_ = INVALID_HANDLE_VALUE;
52 throw std::runtime_error("File too small"); }
53 mapping_handle_ = CreateFileMappingA(file_handle_, nullptr, PAGE_READONLY, 0, 0, nullptr);
54 if (!mapping_handle_) { CloseHandle(file_handle_); file_handle_ = INVALID_HANDLE_VALUE;
55 throw std::runtime_error("Cannot create mapping"); }
56 mapped_ = MapViewOfFile(mapping_handle_, FILE_MAP_READ, 0, 0, 0);
57 if (!mapped_) { CloseHandle(mapping_handle_); CloseHandle(file_handle_);
58 mapping_handle_ = nullptr; file_handle_ = INVALID_HANDLE_VALUE;
59 throw std::runtime_error("Cannot map file"); }
60#else
61 fd_ = open(filename.c_str(), O_RDONLY);
62 if (fd_ < 0) throw std::runtime_error("Cannot open: " + filename);
63 struct stat sb;
64 if (fstat(fd_, &sb) < 0) { close(fd_); fd_ = -1; throw std::runtime_error("Cannot stat file"); }
65 file_size_ = static_cast<size_t>(sb.st_size);
66 if (file_size_ < HEADER_SIZE) { close(fd_); fd_ = -1; throw std::runtime_error("File too small"); }
67 mapped_ = mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fd_, 0);
68 if (mapped_ == MAP_FAILED) { close(fd_); fd_ = -1; mapped_ = nullptr;
69 throw std::runtime_error("Cannot mmap file"); }
70#endif
71 const uint8_t* p = static_cast<const uint8_t*>(mapped_);
72 uint32_t magic; std::memcpy(&magic, p, 4);
73 if (magic != MAGIC) { cleanup(); throw std::runtime_error("Invalid magic"); }
74 p += 4;
75 uint32_t ver; std::memcpy(&ver, p, 4);
76 if (ver != VERSION) { cleanup(); throw std::runtime_error("Unsupported version"); }
77 p += 4;
78 std::memcpy(&dim_, p, 8); p += 8;
79 std::memcpy(&num_vectors_, p, 8); p += 8;
80 uint32_t met; std::memcpy(&met, p, 4);
81 if (met > 2) { cleanup(); throw std::runtime_error("Invalid metric"); }
82 metric_ = static_cast<DistanceMetric>(met);
83
84 // Check for overflow in size calculations step by step
85 if (num_vectors_ > SIZE_MAX / sizeof(uint64_t)) {
86 cleanup(); throw std::runtime_error("File corrupted: size overflow");
87 }
88 if (dim_ == 0 && num_vectors_ > 0) {
89 cleanup(); throw std::runtime_error("File corrupted: zero dimension with vectors");
90 }
91 if (dim_ > SIZE_MAX / sizeof(float)) {
92 cleanup(); throw std::runtime_error("File corrupted: size overflow");
93 }
94 size_t vec_bytes_per = dim_ * sizeof(float); // Safe due to check above
95 if (vec_bytes_per > 0 && num_vectors_ > SIZE_MAX / vec_bytes_per) {
96 cleanup(); throw std::runtime_error("File corrupted: size overflow");
97 }
98 size_t ids_size = num_vectors_ * sizeof(uint64_t);
99 size_t vecs_size = num_vectors_ * vec_bytes_per;
100 if (ids_size > SIZE_MAX - HEADER_SIZE || vecs_size > SIZE_MAX - HEADER_SIZE - ids_size) {
101 cleanup(); throw std::runtime_error("File corrupted: size overflow");
102 }
103 size_t expected = HEADER_SIZE + ids_size + vecs_size;
104 if (file_size_ < expected) { cleanup(); throw std::runtime_error("File truncated"); }
105
106 ids_ptr_ = reinterpret_cast<const uint64_t*>(static_cast<const uint8_t*>(mapped_) + HEADER_SIZE);
107 vectors_ptr_ = reinterpret_cast<const float*>(
108 static_cast<const uint8_t*>(mapped_) + HEADER_SIZE + num_vectors_ * sizeof(uint64_t));
109 dist_ = DistanceComputer(metric_, dim_);
110 try {
111 id_map_.reserve(num_vectors_);
112 for (size_t i = 0; i < num_vectors_; ++i) id_map_[ids_ptr_[i]] = i;
113 } catch (...) { cleanup(); throw; }
114 }
115
// Unmaps the file and closes the underlying handle(s) via cleanup().
116 ~MMapVectorStore() { cleanup(); }
119
121#ifdef VANEDB_WINDOWS
122 file_handle_(o.file_handle_), mapping_handle_(o.mapping_handle_),
123#else
124 fd_(o.fd_),
125#endif
126 mapped_(o.mapped_), file_size_(o.file_size_), dim_(o.dim_), num_vectors_(o.num_vectors_),
127 metric_(o.metric_), dist_(o.dist_), ids_ptr_(o.ids_ptr_), vectors_ptr_(o.vectors_ptr_), id_map_(std::move(o.id_map_)) {
128#ifdef VANEDB_WINDOWS
129 o.file_handle_ = INVALID_HANDLE_VALUE; o.mapping_handle_ = nullptr;
130#else
131 o.fd_ = -1;
132#endif
133 o.mapped_ = nullptr;
134 }
135
137 if (this != &o) {
138 cleanup();
139#ifdef VANEDB_WINDOWS
140 file_handle_ = o.file_handle_; mapping_handle_ = o.mapping_handle_;
141 o.file_handle_ = INVALID_HANDLE_VALUE; o.mapping_handle_ = nullptr;
142#else
143 fd_ = o.fd_; o.fd_ = -1;
144#endif
145 mapped_ = o.mapped_; file_size_ = o.file_size_; dim_ = o.dim_; num_vectors_ = o.num_vectors_;
146 metric_ = o.metric_; dist_ = o.dist_; ids_ptr_ = o.ids_ptr_; vectors_ptr_ = o.vectors_ptr_;
147 id_map_ = std::move(o.id_map_); o.mapped_ = nullptr;
148 }
149 return *this;
150 }
151
152 const float* get(uint64_t id) const {
153 auto it = id_map_.find(id);
154 return it == id_map_.end() ? nullptr : vectors_ptr_ + it->second * dim_;
155 }
156
157 bool contains(uint64_t id) const { return id_map_.count(id); }
158
159 std::vector<SearchResult> search(const float* query, size_t k) const {
160 if (!query) throw std::invalid_argument("Query must not be null");
161 if (k == 0) throw std::invalid_argument("k must be > 0");
162 std::vector<SearchResult> res;
163 res.reserve(num_vectors_);
164 for (size_t i = 0; i < num_vectors_; ++i)
165 res.push_back({ids_ptr_[i], dist_(query, vectors_ptr_ + i * dim_)});
166 size_t n = std::min(k, res.size());
167 std::partial_sort(res.begin(), res.begin() + n, res.end());
168 res.resize(n);
169 return res;
170 }
171
// Number of vectors declared in the file header.
172 size_t size() const { return num_vectors_; }
// Number of float components per vector, as declared in the file header.
173 size_t dimension() const { return dim_; }
// Distance metric recorded in the file header.
174 DistanceMetric metric() const { return metric_; }
175
176private:
177 void cleanup() {
178#ifdef VANEDB_WINDOWS
179 if (mapped_) { UnmapViewOfFile(mapped_); mapped_ = nullptr; }
180 if (mapping_handle_) { CloseHandle(mapping_handle_); mapping_handle_ = nullptr; }
181 if (file_handle_ != INVALID_HANDLE_VALUE) { CloseHandle(file_handle_); file_handle_ = INVALID_HANDLE_VALUE; }
182#else
183 if (mapped_ && mapped_ != MAP_FAILED) { munmap(mapped_, file_size_); mapped_ = nullptr; }
184 if (fd_ >= 0) { close(fd_); fd_ = -1; }
185#endif
186 }
187
// OS handles backing the mapping; the sentinel defaults mean "not open".
188#ifdef VANEDB_WINDOWS
189 HANDLE file_handle_ = INVALID_HANDLE_VALUE;
190 HANDLE mapping_handle_ = nullptr;
191#else
192 int fd_ = -1;
193#endif
// Base address of the read-only file mapping, or nullptr when unmapped.
194 void* mapped_ = nullptr;
// Size of the mapped file in bytes, and the dimension / vector count read from its header.
195 size_t file_size_ = 0, dim_ = 0, num_vectors_ = 0;
// Distance functor built from (metric_, dim_) in the constructor; used by search().
197 DistanceComputer dist_;
// Start of the id array inside the mapping (immediately after the header).
198 const uint64_t* ids_ptr_ = nullptr;
// Start of the float array inside the mapping (immediately after the ids).
199 const float* vectors_ptr_ = nullptr;
// id -> row index into the id/vector arrays; filled by the constructor.
200 std::unordered_map<uint64_t, size_t> id_map_;
200 std::unordered_map<uint64_t, size_t> id_map_;
201};
202
204public:
206 : dim_(dimension), metric_(metric) {
207 if (dimension == 0) throw std::invalid_argument("Dimension must be > 0");
208 }
209
210 void add(uint64_t id, const float* vec) {
211 if (!vec) throw std::invalid_argument("Vector must not be null");
212 if (id_set_.count(id)) throw std::invalid_argument("Duplicate ID: " + std::to_string(id));
213 ids_.push_back(id);
214 vectors_.insert(vectors_.end(), vec, vec + dim_);
215 id_set_.insert(id);
216 }
217
218 void reserve(size_t cap) { ids_.reserve(cap); vectors_.reserve(cap * dim_); }
219
/// Writes the collected vectors to `filename` in the layout read by
/// MMapVectorStore: a 32-byte header (magic, version, dimension, count,
/// metric, reserved) followed by all ids and then all float components.
///
/// The data is first written to `filename + ".tmp"` and then renamed over
/// `filename`, so a failed write does not leave a partial file under the
/// final name. Values are written as raw in-memory bytes (host byte order).
///
/// @param filename Destination path.
/// @throws std::runtime_error if the temporary file cannot be opened, a
///         write fails, or the rename fails.
220 void save(const std::string& filename) const {
221 std::string tmp = filename + ".tmp";
222 std::ofstream f(tmp, std::ios::binary);
223 if (!f) throw std::runtime_error("Cannot open: " + tmp);
// Header fields are staged in fixed-width locals so the on-disk widths do
// not depend on the platform's size_t.
224 uint32_t magic = MMapVectorStore::MAGIC, ver = MMapVectorStore::VERSION;
225 uint64_t dim = dim_, nv = ids_.size();
226 uint32_t met = static_cast<uint32_t>(metric_), reserved = 0;
227 f.write(reinterpret_cast<const char*>(&magic), 4);
228 f.write(reinterpret_cast<const char*>(&ver), 4);
229 f.write(reinterpret_cast<const char*>(&dim), 8);
230 f.write(reinterpret_cast<const char*>(&nv), 8);
231 f.write(reinterpret_cast<const char*>(&met), 4);
232 f.write(reinterpret_cast<const char*>(&reserved), 4);
// Payload: all ids first, then the vectors back to back.
233 f.write(reinterpret_cast<const char*>(ids_.data()), ids_.size() * sizeof(uint64_t));
234 f.write(reinterpret_cast<const char*>(vectors_.data()), vectors_.size() * sizeof(float));
235 f.flush();
// NOTE(review): the stream is still open when std::remove runs here; removing
// an open file may fail on Windows - confirm whether f should be closed first.
236 if (!f) { std::remove(tmp.c_str()); throw std::runtime_error("Write failed"); }
237 f.close(); // close before fsync_file (see file_utils.h: Windows lock contract)
// NOTE(review): the comment above refers to an fsync_file call that is not
// visible at this point in the listing - confirm it is present in the header.
239 if (std::rename(tmp.c_str(), filename.c_str()) != 0) {
240 std::remove(tmp.c_str()); throw std::runtime_error("Rename failed");
241 }
242 }
243
// Number of vectors added so far.
244 size_t size() const { return ids_.size(); }
// Number of float components per vector, fixed at construction.
245 size_t dimension() const { return dim_; }
246
247private:
// Components per vector; every add() copies exactly this many floats.
248 size_t dim_;
// Metric written into the file header by save().
249 DistanceMetric metric_;
// Ids in insertion order; ids_[i] labels the i-th vector in vectors_.
250 std::vector<uint64_t> ids_;
// All vector components stored back to back, dim_ floats per vector.
251 std::vector<float> vectors_;
// Ids already added, used by add() to reject duplicates.
252 std::unordered_set<uint64_t> id_set_;
253};
254
255} // namespace vanedb
void add(uint64_t id, const float *vec)
void save(const std::string &filename) const
MMapVectorStoreBuilder(size_t dimension, DistanceMetric metric=DistanceMetric::L2)
static constexpr uint32_t MAGIC
DistanceMetric metric() const
bool contains(uint64_t id) const
static constexpr uint32_t VERSION
MMapVectorStore & operator=(const MMapVectorStore &)=delete
MMapVectorStore & operator=(MMapVectorStore &&o) noexcept
MMapVectorStore(MMapVectorStore &&o) noexcept
std::vector< SearchResult > search(const float *query, size_t k) const
const float * get(uint64_t id) const
MMapVectorStore(const std::string &filename)
MMapVectorStore(const MMapVectorStore &)=delete
static constexpr size_t HEADER_SIZE
void fsync_file(const std::string &path) noexcept
Definition file_utils.h:24