Skip to content

Commit b21704e

Browse files
authored
perf: Replace unordered_map with bitset for vocabulary lookups (#2040)
Signed-off-by: Syed Azeez <syedazeez337@gmail.com>
1 parent eababe4 commit b21704e

19 files changed

+497
-129
lines changed

src/core/jsonschema/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ include(./official_resolver.cmake)
44

55
sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME jsonschema
66
PRIVATE_HEADERS bundle.h walker.h frame.h error.h
7-
types.h transform.h
8-
SOURCES jsonschema.cc official_walker.cc frame.cc
9-
walker.cc bundle.cc transformer.cc format.cc
7+
types.h transform.h vocabularies.h
8+
SOURCES jsonschema.cc vocabularies.cc official_walker.cc
9+
frame.cc walker.cc bundle.cc transformer.cc format.cc
1010
"${CMAKE_CURRENT_BINARY_DIR}/official_resolver.cc")
1111

1212
if(SOURCEMETA_CORE_INSTALL)

src/core/jsonschema/include/sourcemeta/core/jsonschema_types.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,17 @@
33

44
#include <sourcemeta/core/json.h>
55
#include <sourcemeta/core/jsonpointer.h>
6+
#include <sourcemeta/core/jsonschema_vocabularies.h>
67

7-
#include <cstdint> // std::uint8_t
8-
#include <functional> // std::function, std::reference_wrapper
9-
#include <optional> // std::optional
10-
#include <set> // std::set
11-
#include <string> // std::string
12-
#include <string_view> // std::string_view
13-
#include <unordered_map> // std::unordered_map
8+
#include <cstdint> // std::uint8_t
9+
#include <functional> // std::function, std::reference_wrapper
10+
#include <optional> // std::optional
11+
#include <set> // std::set
12+
#include <string> // std::string
13+
#include <string_view> // std::string_view
1414

1515
namespace sourcemeta::core {
1616

17-
/// @ingroup jsonschema
18-
/// A set of vocabularies
19-
using Vocabularies = std::unordered_map<JSON::String, bool>;
20-
2117
// Take a URI and get back a schema
2218
/// @ingroup jsonschema
2319
///
src/core/jsonschema/include/sourcemeta/core/jsonschema_vocabularies.h

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#ifndef SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_
#define SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_

#ifndef SOURCEMETA_CORE_JSONSCHEMA_EXPORT
#include <sourcemeta/core/jsonschema_export.h>
#endif

#include <sourcemeta/core/json.h>

#include <bitset>           // std::bitset
#include <cassert>          // assert
#include <cstddef>          // std::size_t
#include <cstdint>          // std::uint8_t
#include <initializer_list> // std::initializer_list
#include <optional>         // std::optional
#include <stdexcept>        // std::out_of_range
#include <string>           // std::string
#include <string_view>      // std::string_view
#include <unordered_map>    // std::unordered_map
#include <utility>          // std::pair
#include <vector>           // std::vector

namespace sourcemeta::core {

/// @ingroup jsonschema
/// Optimized vocabulary set using bitflags for known vocabularies
/// and a fallback `std::unordered_map` for custom vocabularies.
///
/// Every vocabulary carries a boolean status: `true` means required and
/// `false` means optional.
///
/// TODO: To maximize performance gains, convert string-based vocabulary checks
/// throughout the codebase to use enum-based methods.
struct SOURCEMETA_CORE_JSONSCHEMA_EXPORT Vocabularies {
  /// Vocabularies that are identified by an enumerator instead of a URI.
  /// The numeric values are contiguous and start at zero, as they size the
  /// bitsets declared at the bottom of this type.
  enum class Known : std::uint8_t {
    // Pre-vocabulary dialects (treated as vocabularies)
    JSON_Schema_Draft_0 = 0,
    JSON_Schema_Draft_0_Hyper = 1,
    JSON_Schema_Draft_1 = 2,
    JSON_Schema_Draft_1_Hyper = 3,
    JSON_Schema_Draft_2 = 4,
    JSON_Schema_Draft_2_Hyper = 5,
    JSON_Schema_Draft_3 = 6,
    JSON_Schema_Draft_3_Hyper = 7,
    JSON_Schema_Draft_4 = 8,
    JSON_Schema_Draft_4_Hyper = 9,
    JSON_Schema_Draft_6 = 10,
    JSON_Schema_Draft_6_Hyper = 11,
    JSON_Schema_Draft_7 = 12,
    JSON_Schema_Draft_7_Hyper = 13,
    // 2019-09 vocabularies
    JSON_Schema_2019_09_Core = 14,
    JSON_Schema_2019_09_Applicator = 15,
    JSON_Schema_2019_09_Validation = 16,
    JSON_Schema_2019_09_Meta_Data = 17,
    JSON_Schema_2019_09_Format = 18,
    JSON_Schema_2019_09_Content = 19,
    JSON_Schema_2019_09_Hyper_Schema = 20,
    // 2020-12 vocabularies
    JSON_Schema_2020_12_Core = 21,
    JSON_Schema_2020_12_Applicator = 22,
    JSON_Schema_2020_12_Unevaluated = 23,
    JSON_Schema_2020_12_Validation = 24,
    JSON_Schema_2020_12_Meta_Data = 25,
    JSON_Schema_2020_12_Format_Annotation = 26,
    JSON_Schema_2020_12_Format_Assertion = 27,
    JSON_Schema_2020_12_Content = 28
  };

  /// Number of enumerators in `Known`, i.e. the width of the bitsets below.
  /// Derived from the last enumerator rather than spelled as a literal.
  // NOTE: The expression must name the LAST enumerator of `Known`. When a
  // new enumerator is appended to the enum, reference it here instead
  static constexpr std::size_t KNOWN_VOCABULARY_COUNT =
      static_cast<std::size_t>(Known::JSON_Schema_2020_12_Content) + 1;

public:
  Vocabularies() = default;
  Vocabularies(const Vocabularies &) = default;
  Vocabularies(Vocabularies &&) noexcept = default;
  auto operator=(const Vocabularies &) -> Vocabularies & = default;
  auto operator=(Vocabularies &&) noexcept -> Vocabularies & = default;
  ~Vocabularies() = default;

  /// Construct from initializer list
  /// @param init Pairs of vocabulary URI and required (`true`) / optional
  /// (`false`) status
  Vocabularies(std::initializer_list<std::pair<JSON::String, bool>> init);

  /// Construct from initializer list using known vocabulary enums
  /// @param init Pairs of known vocabulary and required (`true`) / optional
  /// (`false`) status
  Vocabularies(std::initializer_list<std::pair<Known, bool>> init);

  /// Check if a vocabulary is enabled
  /// @param uri The vocabulary URI to look up
  [[nodiscard]] auto contains(const JSON::String &uri) const noexcept -> bool;

  /// Check if a known vocabulary is enabled
  /// @param vocabulary The known vocabulary to look up
  [[nodiscard]] auto contains(Known vocabulary) const noexcept -> bool;

  /// Insert a vocabulary with its required/optional status
  /// @param uri The vocabulary URI
  /// @param required `true` if the vocabulary is required, `false` if optional
  // NOTE(review): If this overload stores unrecognised URIs in `custom`, the
  // insertion may allocate, and an allocation failure under `noexcept`
  // terminates the program. Confirm against the definition that this is
  // intended
  auto insert(const JSON::String &uri, bool required) noexcept -> void;

  /// Insert a known vocabulary with its required/optional status
  /// @param vocabulary The known vocabulary
  /// @param required `true` if the vocabulary is required, `false` if optional
  auto insert(Known vocabulary, bool required) noexcept -> void;

  /// Get vocabulary status by URI
  /// @param uri The vocabulary URI to look up
  /// @return The required (`true`) / optional (`false`) status. Presumably
  /// an empty optional when the vocabulary is not present (to be confirmed
  /// against the definition)
  [[nodiscard]] auto get(const JSON::String &uri) const noexcept
      -> std::optional<bool>;

  /// Get known vocabulary status
  /// @param vocabulary The known vocabulary to look up
  /// @return The required (`true`) / optional (`false`) status. Presumably
  /// an empty optional when the vocabulary is not present (to be confirmed
  /// against the definition)
  [[nodiscard]] auto get(Known vocabulary) const noexcept
      -> std::optional<bool>;

  /// Get the number of vocabularies (required + optional + custom)
  [[nodiscard]] auto size() const noexcept -> std::size_t;

  /// Check if there are no vocabularies
  [[nodiscard]] auto empty() const noexcept -> bool;

private:
  // Invariant: required_known and optional_known must be mutually exclusive
  // A vocabulary can be either required (true) OR optional (false), never both
  //
  // The data members are standard library types inside an exported type,
  // which is what MSVC warning C4251 reports, so it is silenced around them
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4251)
#endif
  // One bit per `Known` enumerator, for vocabularies marked as required
  std::bitset<KNOWN_VOCABULARY_COUNT> required_known{};
  // One bit per `Known` enumerator, for vocabularies marked as optional
  std::bitset<KNOWN_VOCABULARY_COUNT> optional_known{};
  // Fallback storage keyed by URI, mapping to the required/optional status
  std::unordered_map<JSON::String, bool> custom;
#ifdef _MSC_VER
#pragma warning(pop)
#endif
};

} // namespace sourcemeta::core

#endif

src/core/jsonschema/jsonschema.cc

Lines changed: 86 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -259,18 +259,67 @@ auto sourcemeta::core::base_dialect(
259259
}
260260

261261
namespace {
262-
auto core_vocabulary(std::string_view base_dialect) -> std::string {
262+
// Map a base dialect URI to the enumerator of its core vocabulary.
// Only the 2020-12 and 2019-09 base dialects (plain or hyper-schema) are
// recognised. Any other input raises `SchemaBaseDialectError`.
// In this rendered diff, a line that follows an `N-` gutter belongs to the
// removed variant that returned the vocabulary URI as a string, and a line
// that follows an `N+` gutter belongs to the enum-returning replacement.
auto core_vocabulary_known(std::string_view base_dialect)
263+
-> sourcemeta::core::Vocabularies::Known {
263264
if (base_dialect == "https://json-schema.org/draft/2020-12/schema" ||
264265
base_dialect == "https://json-schema.org/draft/2020-12/hyper-schema") {
265-
return "https://json-schema.org/draft/2020-12/vocab/core";
266+
return sourcemeta::core::Vocabularies::Known::JSON_Schema_2020_12_Core;
266267
} else if (base_dialect == "https://json-schema.org/draft/2019-09/schema" ||
267268
base_dialect ==
268269
"https://json-schema.org/draft/2019-09/hyper-schema") {
269-
return "https://json-schema.org/draft/2019-09/vocab/core";
270+
return sourcemeta::core::Vocabularies::Known::JSON_Schema_2019_09_Core;
270271
} else {
271272
// No core vocabulary is mapped for this base dialect, so report it back
// to the caller through the exception
throw sourcemeta::core::SchemaBaseDialectError(std::string{base_dialect});
272273
}
273274
}
275+
276+
auto dialect_to_known(std::string_view dialect)
277+
-> std::optional<sourcemeta::core::Vocabularies::Known> {
278+
using sourcemeta::core::Vocabularies;
279+
if (dialect == "http://json-schema.org/draft-07/schema#") {
280+
return Vocabularies::Known::JSON_Schema_Draft_7;
281+
}
282+
if (dialect == "http://json-schema.org/draft-07/hyper-schema#") {
283+
return Vocabularies::Known::JSON_Schema_Draft_7_Hyper;
284+
}
285+
if (dialect == "http://json-schema.org/draft-06/schema#") {
286+
return Vocabularies::Known::JSON_Schema_Draft_6;
287+
}
288+
if (dialect == "http://json-schema.org/draft-06/hyper-schema#") {
289+
return Vocabularies::Known::JSON_Schema_Draft_6_Hyper;
290+
}
291+
if (dialect == "http://json-schema.org/draft-04/schema#") {
292+
return Vocabularies::Known::JSON_Schema_Draft_4;
293+
}
294+
if (dialect == "http://json-schema.org/draft-04/hyper-schema#") {
295+
return Vocabularies::Known::JSON_Schema_Draft_4_Hyper;
296+
}
297+
if (dialect == "http://json-schema.org/draft-03/schema#") {
298+
return Vocabularies::Known::JSON_Schema_Draft_3;
299+
}
300+
if (dialect == "http://json-schema.org/draft-03/hyper-schema#") {
301+
return Vocabularies::Known::JSON_Schema_Draft_3_Hyper;
302+
}
303+
if (dialect == "http://json-schema.org/draft-02/schema#") {
304+
return Vocabularies::Known::JSON_Schema_Draft_2;
305+
}
306+
if (dialect == "http://json-schema.org/draft-02/hyper-schema#") {
307+
return Vocabularies::Known::JSON_Schema_Draft_2_Hyper;
308+
}
309+
if (dialect == "http://json-schema.org/draft-01/schema#") {
310+
return Vocabularies::Known::JSON_Schema_Draft_1;
311+
}
312+
if (dialect == "http://json-schema.org/draft-01/hyper-schema#") {
313+
return Vocabularies::Known::JSON_Schema_Draft_1_Hyper;
314+
}
315+
if (dialect == "http://json-schema.org/draft-00/schema#") {
316+
return Vocabularies::Known::JSON_Schema_Draft_0;
317+
}
318+
if (dialect == "http://json-schema.org/draft-00/hyper-schema#") {
319+
return Vocabularies::Known::JSON_Schema_Draft_0_Hyper;
320+
}
321+
return std::nullopt;
322+
}
274323
} // namespace
275324

276325
auto sourcemeta::core::vocabularies(
@@ -304,21 +353,22 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
304353
// As a performance optimization shortcut
305354
if (base_dialect == dialect) {
306355
if (dialect == "https://json-schema.org/draft/2020-12/schema") {
307-
return {{"https://json-schema.org/draft/2020-12/vocab/core", true},
308-
{"https://json-schema.org/draft/2020-12/vocab/applicator", true},
309-
{"https://json-schema.org/draft/2020-12/vocab/unevaluated", true},
310-
{"https://json-schema.org/draft/2020-12/vocab/validation", true},
311-
{"https://json-schema.org/draft/2020-12/vocab/meta-data", true},
312-
{"https://json-schema.org/draft/2020-12/vocab/format-annotation",
313-
true},
314-
{"https://json-schema.org/draft/2020-12/vocab/content", true}};
356+
return Vocabularies{
357+
{Vocabularies::Known::JSON_Schema_2020_12_Core, true},
358+
{Vocabularies::Known::JSON_Schema_2020_12_Applicator, true},
359+
{Vocabularies::Known::JSON_Schema_2020_12_Unevaluated, true},
360+
{Vocabularies::Known::JSON_Schema_2020_12_Validation, true},
361+
{Vocabularies::Known::JSON_Schema_2020_12_Meta_Data, true},
362+
{Vocabularies::Known::JSON_Schema_2020_12_Format_Annotation, true},
363+
{Vocabularies::Known::JSON_Schema_2020_12_Content, true}};
315364
} else if (dialect == "https://json-schema.org/draft/2019-09/schema") {
316-
return {{"https://json-schema.org/draft/2019-09/vocab/core", true},
317-
{"https://json-schema.org/draft/2019-09/vocab/applicator", true},
318-
{"https://json-schema.org/draft/2019-09/vocab/validation", true},
319-
{"https://json-schema.org/draft/2019-09/vocab/meta-data", true},
320-
{"https://json-schema.org/draft/2019-09/vocab/format", false},
321-
{"https://json-schema.org/draft/2019-09/vocab/content", true}};
365+
return Vocabularies{
366+
{Vocabularies::Known::JSON_Schema_2019_09_Core, true},
367+
{Vocabularies::Known::JSON_Schema_2019_09_Applicator, true},
368+
{Vocabularies::Known::JSON_Schema_2019_09_Validation, true},
369+
{Vocabularies::Known::JSON_Schema_2019_09_Meta_Data, true},
370+
{Vocabularies::Known::JSON_Schema_2019_09_Format, false},
371+
{Vocabularies::Known::JSON_Schema_2019_09_Content, true}};
322372
}
323373
}
324374

@@ -336,7 +386,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
336386
dialect == "http://json-schema.org/draft-02/schema#" ||
337387
dialect == "http://json-schema.org/draft-01/schema#" ||
338388
dialect == "http://json-schema.org/draft-00/schema#") {
339-
return {{dialect, true}};
389+
const auto known = dialect_to_known(dialect);
390+
if (known.has_value()) {
391+
return Vocabularies{{known.value(), true}};
392+
}
393+
return Vocabularies{{dialect, true}};
340394
}
341395

342396
/*
@@ -356,7 +410,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
356410
base_dialect == "http://json-schema.org/draft-02/hyper-schema#" ||
357411
base_dialect == "http://json-schema.org/draft-01/hyper-schema#" ||
358412
base_dialect == "http://json-schema.org/draft-00/hyper-schema#") {
359-
return {{base_dialect, true}};
413+
const auto known = dialect_to_known(base_dialect);
414+
if (known.has_value()) {
415+
return Vocabularies{{known.value(), true}};
416+
}
417+
return Vocabularies{{base_dialect, true}};
360418
}
361419

362420
/*
@@ -384,25 +442,28 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
384442
*/
385443

386444
Vocabularies result;
387-
const std::string core{core_vocabulary(base_dialect)};
445+
const auto core{core_vocabulary_known(base_dialect)};
388446
if (schema_dialect.defines("$vocabulary")) {
389447
const sourcemeta::core::JSON &vocabularies{
390448
schema_dialect.at("$vocabulary")};
391449
assert(vocabularies.is_object());
392450
for (const auto &entry : vocabularies.as_object()) {
393-
result.insert({entry.first, entry.second.to_boolean()});
451+
result.insert(entry.first, entry.second.to_boolean());
394452
}
395453
} else {
396-
result.insert({core, true});
454+
result.insert(core, true);
397455
}
398456

399457
// The specification recommends these checks
400458
if (!result.contains(core)) {
401459
throw sourcemeta::core::SchemaError(
402460
"The core vocabulary must always be present");
403-
} else if (!result.at(core)) {
404-
throw sourcemeta::core::SchemaError(
405-
"The core vocabulary must always be required");
461+
} else {
462+
const auto core_status{result.get(core)};
463+
if (core_status.has_value() && !core_status.value()) {
464+
throw sourcemeta::core::SchemaError(
465+
"The core vocabulary must always be required");
466+
}
406467
}
407468

408469
return result;

0 commit comments

Comments
 (0)