/tmp/bitcoin/src/util/asmap.cpp
Line | Count | Source |
1 | | // Copyright (c) 2019-present The Bitcoin Core developers |
2 | | // Distributed under the MIT software license, see the accompanying |
3 | | // file COPYING or http://www.opensource.org/licenses/mit-license.php. |
4 | | |
5 | | #include <util/asmap.h> |
6 | | |
7 | | #include <hash.h> |
8 | | #include <streams.h> |
9 | | #include <uint256.h> |
10 | | #include <util/check.h> |
11 | | #include <util/fs.h> |
12 | | #include <util/log.h> |
13 | | |
14 | | #include <bit> |
15 | | #include <cstddef> |
16 | | #include <cstdio> |
17 | | #include <span> |
18 | | #include <string> |
19 | | #include <utility> |
20 | | #include <vector> |
21 | | |
22 | | /* |
23 | | * ASMap (Autonomous System Map) Implementation |
24 | | * |
25 | | * Provides a compressed mapping from IP address prefixes to Autonomous System Numbers (ASNs). |
26 | | * Uses a binary trie structure encoded as bytecode instructions that are interpreted |
27 | | * at runtime to find the ASN for a given IP address. |
28 | | * |
29 | | * The format of the asmap data is a bit-packed binary format where the entire mapping |
30 | | * is treated as a continuous sequence of bits. Instructions and their arguments are |
31 | | * encoded using variable numbers of bits and concatenated together without regard for |
32 | | * byte boundaries. The bits are stored in bytes using little-endian bit ordering. |
33 | | * |
34 | | * The data structure internally represents the mapping as a binary trie where: |
35 | | * - Unassigned subnets (no ASN mapping present) map to 0 |
36 | | * - Subnets mapped entirely to one ASN become leaf nodes |
37 | | * - Subnets whose lower and upper halves have different mappings branch into subtrees |
38 | | * |
39 | | * The encoding uses variable-length integers and four instruction types (RETURN, JUMP, |
40 | | * MATCH, DEFAULT) to efficiently represent the trie. |
41 | | */ |
42 | | |
43 | | namespace { |
44 | | |
45 | | // Indicates decoding errors or invalid data |
46 | | constexpr uint32_t INVALID = 0xFFFFFFFF; |
47 | | |
48 | | /** |
49 | | * Extract a single bit from byte array using little-endian bit ordering (LSB first). |
50 | | * Used for ASMap data. |
51 | | */ |
52 | | inline bool ConsumeBitLE(size_t& bitpos, std::span<const std::byte> bytes) noexcept |
53 | 37.5M | { |
54 | 37.5M | const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (bitpos % 8)) & 1; |
55 | 37.5M | ++bitpos; |
56 | 37.5M | return bit; |
57 | 37.5M | } |
58 | | |
59 | | /** |
60 | | * Extract a single bit from byte array using big-endian bit ordering (MSB first). |
61 | | * Used for IP addresses to match network byte order conventions. |
62 | | */ |
63 | | inline bool ConsumeBitBE(uint8_t& bitpos, std::span<const std::byte> bytes) noexcept |
64 | 412k | { |
65 | 412k | const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (7 - (bitpos % 8))) & 1; |
66 | 412k | ++bitpos; |
67 | 412k | return bit; |
68 | 412k | } |
69 | | |
70 | | /** |
71 | | * Variable-length integer decoder using a custom encoding scheme. |
72 | | * |
73 | | * The encoding is easiest to describe using an example. Let's say minval=100 and |
74 | | * bit_sizes=[4,2,2,3]. In that case: |
75 | | * - x in [100..115]: encoded as [0] + [4-bit BE encoding of (x-100)] |
76 | | * - x in [116..119]: encoded as [1,0] + [2-bit BE encoding of (x-116)] |
77 | | * - x in [120..123]: encoded as [1,1,0] + [2-bit BE encoding of (x-120)] |
78 | | * - x in [124..131]: encoded as [1,1,1] + [3-bit BE encoding of (x-124)] |
79 | | * |
80 | | * In general, every number is encoded as: |
81 | | * - First, k "1"-bits, where k is the class the number falls in |
82 | | * - Then, a "0"-bit, unless k is the highest class |
83 | | * - Lastly, bit_sizes[k] bits encoding in big endian the position within that class |
84 | | */ |
85 | | uint32_t DecodeBits(size_t& bitpos, const std::span<const std::byte> data, uint8_t minval, const std::span<const uint8_t> bit_sizes) |
86 | 5.58M | { |
87 | 5.58M | uint32_t val = minval; // Start with minimum encodable value |
88 | 5.58M | bool bit; |
89 | 12.2M | for (auto bit_sizes_it = bit_sizes.begin(); bit_sizes_it != bit_sizes.end(); ++bit_sizes_it) { |
90 | | // Read continuation bit to determine if we're in this class |
91 | 12.2M | if (bit_sizes_it + 1 != bit_sizes.end()) { // Unless we're in the last class |
92 | 10.7M | if (bitpos >= data.size() * 8) break; |
93 | 10.7M | bit = ConsumeBitLE(bitpos, data); |
94 | 10.7M | } else { |
95 | 1.55M | bit = 0; // Last class has no continuation bit |
96 | 1.55M | } |
97 | 12.2M | if (bit) { |
98 | | // If the value will not fit in this class, subtract its range from val, |
99 | | // emit a "1" bit and continue with the next class |
100 | 6.68M | val += (1 << *bit_sizes_it); // Add size of this class |
101 | 6.68M | } else { |
102 | | // Decode the position within this class in big endian |
103 | 32.3M | for (int b = 0; b < *bit_sizes_it; b++) { |
104 | 26.8M | if (bitpos >= data.size() * 8) return INVALID; // Reached EOF in mantissa |
105 | 26.8M | bit = ConsumeBitLE(bitpos, data); |
106 | 26.8M | val += bit << (*bit_sizes_it - 1 - b); // Big-endian within the class |
107 | 26.8M | } |
108 | 5.58M | return val; |
109 | 5.58M | } |
110 | 12.2M | } |
111 | 0 | return INVALID; // Reached EOF in exponent |
112 | 5.58M | } |
113 | | |
114 | | /** |
115 | | * Instruction Set |
116 | | * |
117 | | * The instruction set is designed to efficiently encode a binary trie |
118 | | * that maps IP prefixes to ASNs. Each instruction type serves a specific |
119 | | * role in trie traversal and evaluation. |
120 | | */ |
121 | | enum class Instruction : uint32_t |
122 | | { |
123 | | // A return instruction, encoded as [0], returns a constant ASN. |
124 | | // It is followed by an integer using the ASN encoding. |
125 | | RETURN = 0, |
126 | | // A jump instruction, encoded as [1,0], inspects the next unused bit in the input |
127 | | // and either continues execution (if 0), or skips a specified number of bits (if 1). |
128 | | // It is followed by an integer using jump encoding. |
129 | | JUMP = 1, |
130 | | // A match instruction, encoded as [1,1,0], inspects 1 or more of the next unused bits |
131 | | // in the input. If they all match, execution continues. If not, the default ASN is returned |
132 | | // (or 0 if unset). The match value encodes both the pattern and its length. |
133 | | MATCH = 2, |
134 | | // A default instruction, encoded as [1,1,1], sets the default variable to its argument, |
135 | | // and continues execution. It is followed by an integer in ASN encoding. |
136 | | DEFAULT = 3, |
137 | | }; |
138 | | |
139 | | // Instruction type encoding: RETURN=[0], JUMP=[1,0], MATCH=[1,1,0], DEFAULT=[1,1,1] |
140 | | constexpr uint8_t TYPE_BIT_SIZES[]{0, 0, 1}; |
141 | | Instruction DecodeType(size_t& bitpos, const std::span<const std::byte> data) |
142 | 2.79M | { |
143 | 2.79M | return Instruction(DecodeBits(bitpos, data, 0, TYPE_BIT_SIZES)); |
144 | 2.79M | } |
145 | | |
146 | | // ASN encoding: Can encode ASNs from 1 to ~16.7 million. |
147 | | // Uses variable-length encoding optimized for real-world ASN distribution. |
148 | | // ASN 0 is reserved and used if there isn't a match. |
149 | | constexpr uint8_t ASN_BIT_SIZES[]{15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; |
150 | | uint32_t DecodeASN(size_t& bitpos, const std::span<const std::byte> data) |
151 | 1.20M | { |
152 | 1.20M | return DecodeBits(bitpos, data, 1, ASN_BIT_SIZES); |
153 | 1.20M | } |
154 | | |
155 | | // MATCH argument: Values in [2, 511]. The highest set bit determines the match length |
156 | | // n ∈ [1,8]; the lower n-1 bits are the pattern to compare. |
157 | | constexpr uint8_t MATCH_BIT_SIZES[]{1, 2, 3, 4, 5, 6, 7, 8}; |
158 | | uint32_t DecodeMatch(size_t& bitpos, const std::span<const std::byte> data) |
159 | 930k | { |
160 | 930k | return DecodeBits(bitpos, data, 2, MATCH_BIT_SIZES); |
161 | 930k | } |
162 | | |
163 | | // JUMP offset: Minimum value 17. Variable-length coded and may be large |
164 | | // for skipping big subtrees. |
165 | | constexpr uint8_t JUMP_BIT_SIZES[]{5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}; |
166 | | uint32_t DecodeJump(size_t& bitpos, const std::span<const std::byte> data) |
167 | 657k | { |
168 | 657k | return DecodeBits(bitpos, data, 17, JUMP_BIT_SIZES); |
169 | 657k | } |
170 | | |
171 | | } // anonymous namespace |
172 | | |
173 | | /** |
174 | | * Execute the ASMap bytecode to find the ASN for an IP |
175 | | * |
176 | | * This function interprets the asmap bytecode and uses bits from the IP |
177 | | * address to navigate through the encoded trie structure, ultimately |
178 | | * returning an ASN value. |
179 | | */ |
180 | | uint32_t Interpret(const std::span<const std::byte> asmap, const std::span<const std::byte> ip) |
181 | 4.21k | { |
182 | 4.21k | size_t pos{0}; |
183 | 4.21k | const size_t endpos{asmap.size() * 8}; |
184 | 4.21k | uint8_t ip_bit{0}; |
185 | 4.21k | const uint8_t ip_bits_end = ip.size() * 8; |
186 | 4.21k | uint32_t default_asn = 0; |
187 | 59.6k | while (pos < endpos) { |
188 | 59.6k | Instruction opcode = DecodeType(pos, asmap); |
189 | 59.6k | if (opcode == Instruction::RETURN) { |
190 | | // Found leaf node - return the ASN |
191 | 3.72k | uint32_t asn = DecodeASN(pos, asmap); |
192 | 3.72k | if (asn == INVALID) break; // ASN straddles EOF |
193 | 3.72k | return asn; |
194 | 55.8k | } else if (opcode == Instruction::JUMP) { |
195 | | // Binary branch: if IP bit is 1, jump forward; else continue |
196 | 4.60k | uint32_t jump = DecodeJump(pos, asmap); |
197 | 4.60k | if (jump == INVALID) break; // Jump offset straddles EOF |
198 | 4.60k | if (ip_bit == ip_bits_end) break; // No input bits left |
199 | 4.60k | if (int64_t{jump} >= static_cast<int64_t>(endpos - pos)) break; // Jumping past EOF |
200 | 4.60k | if (ConsumeBitBE(ip_bit, ip)) { // Check next IP bit (big-endian) |
201 | 3.68k | pos += jump; // Bit = 1: skip to right subtree |
202 | 3.68k | } |
203 | | // Bit = 0: fall through to left subtree |
204 | 51.2k | } else if (opcode == Instruction::MATCH) { |
205 | | // Compare multiple IP bits against a pattern |
206 | | // The match value encodes both length and pattern: |
207 | | // - highest set bit position determines length (bit_width - 1) |
208 | | // - lower bits contain the pattern to compare |
209 | 51.2k | uint32_t match = DecodeMatch(pos, asmap); |
210 | 51.2k | if (match == INVALID) break; // Match bits straddle EOF |
211 | 51.2k | int matchlen = std::bit_width(match) - 1; // An n-bit value matches n-1 input bits |
212 | 51.2k | if ((ip_bits_end - ip_bit) < matchlen) break; // Not enough input bits |
213 | 458k | for (int bit = 0; bit < matchlen; bit++) { |
214 | 407k | if (ConsumeBitBE(ip_bit, ip) != ((match >> (matchlen - 1 - bit)) & 1)) { |
215 | 486 | return default_asn; // Pattern mismatch - use default |
216 | 486 | } |
217 | 407k | } |
218 | | // Pattern matched - continue execution |
219 | 51.2k | } else if (opcode == Instruction::DEFAULT) { |
220 | | // Update the default ASN for subsequent MATCH failures |
221 | 22 | default_asn = DecodeASN(pos, asmap); |
222 | 22 | if (default_asn == INVALID) break; // ASN straddles EOF |
223 | 22 | } else { |
224 | 0 | break; // Instruction straddles EOF |
225 | 0 | } |
226 | 59.6k | } |
227 | | // Reached EOF without RETURN, or aborted (see any of the breaks above) |
228 | | // - should have been caught by SanityCheckAsmap below |
229 | 4.21k | assert(false); |
230 | 0 | return 0; // 0 is not a valid ASN |
231 | 0 | } |
232 | | |
233 | | /** |
234 | | * Validates ASMap structure by simulating all possible execution paths. |
235 | | * Ensures well-formed bytecode, valid jumps, and proper termination. |
236 | | */ |
237 | | bool SanityCheckAsmap(const std::span<const std::byte> asmap, int bits) |
238 | 9 | { |
239 | 9 | size_t pos{0}; |
240 | 9 | const size_t endpos{asmap.size() * 8}; |
241 | 9 | std::vector<std::pair<uint32_t, int>> jumps; // All future positions we may jump to (bit offset in asmap -> bits to consume left) |
242 | 9 | jumps.reserve(bits); |
243 | 9 | Instruction prevopcode = Instruction::JUMP; |
244 | 9 | bool had_incomplete_match = false; // Track <8 bit matches for efficiency check |
245 | | |
246 | 2.73M | while (pos != endpos) { |
247 | | // There was a jump into the middle of the previous instruction |
248 | 2.73M | if (!jumps.empty() && pos >= jumps.back().first) return false; |
249 | | |
250 | 2.73M | Instruction opcode = DecodeType(pos, asmap); |
251 | 2.73M | if (opcode == Instruction::RETURN) { |
252 | | // There should not be any RETURN immediately after a DEFAULT (could be combined into just RETURN) |
253 | 652k | if (prevopcode == Instruction::DEFAULT) return false; |
254 | 652k | uint32_t asn = DecodeASN(pos, asmap); |
255 | 652k | if (asn == INVALID) return false; // ASN straddles EOF |
256 | 652k | if (jumps.empty()) { |
257 | | // Nothing to execute anymore |
258 | 8 | if (endpos - pos > 7) return false; // Excessive padding |
259 | 30 | while (pos != endpos) { |
260 | 22 | if (ConsumeBitLE(pos, asmap)) return false; // Nonzero padding bit |
261 | 22 | } |
262 | 8 | return true; // Sanely reached EOF |
263 | 652k | } else { |
264 | | // Continue by pretending we jumped to the next instruction |
265 | 652k | if (pos != jumps.back().first) return false; // Unreachable code |
266 | 652k | bits = jumps.back().second; // Restore the number of bits we would have had left after this jump |
267 | 652k | jumps.pop_back(); |
268 | 652k | prevopcode = Instruction::JUMP; |
269 | 652k | } |
270 | 2.07M | } else if (opcode == Instruction::JUMP) { |
271 | 652k | uint32_t jump = DecodeJump(pos, asmap); |
272 | 652k | if (jump == INVALID) return false; // Jump offset straddles EOF |
273 | 652k | if (int64_t{jump} > static_cast<int64_t>(endpos - pos)) return false; // Jump out of range |
274 | 652k | if (bits == 0) return false; // Consuming bits past the end of the input |
275 | 652k | --bits; |
276 | 652k | uint32_t jump_offset = pos + jump; |
277 | 652k | if (!jumps.empty() && jump_offset >= jumps.back().first) return false; // Intersecting jumps |
278 | 652k | jumps.emplace_back(jump_offset, bits); // Queue jump target for validation |
279 | 652k | prevopcode = Instruction::JUMP; |
280 | 1.42M | } else if (opcode == Instruction::MATCH) { |
281 | 879k | uint32_t match = DecodeMatch(pos, asmap); |
282 | 879k | if (match == INVALID) return false; // Match bits straddle EOF |
283 | 879k | int matchlen = std::bit_width(match) - 1; |
284 | 879k | if (prevopcode != Instruction::MATCH) had_incomplete_match = false; |
285 | | // Within a sequence of matches only at most one should be incomplete |
286 | 879k | if (matchlen < 8 && had_incomplete_match) return false; |
287 | 879k | had_incomplete_match = (matchlen < 8); |
288 | 879k | if (bits < matchlen) return false; // Consuming bits past the end of the input |
289 | 879k | bits -= matchlen; |
290 | 879k | prevopcode = Instruction::MATCH; |
291 | 879k | } else if (opcode == Instruction::DEFAULT) { |
292 | | // There should not be two successive DEFAULTs (they could be combined into one) |
293 | 546k | if (prevopcode == Instruction::DEFAULT) return false; |
294 | 546k | uint32_t asn = DecodeASN(pos, asmap); |
295 | 546k | if (asn == INVALID) return false; // ASN straddles EOF |
296 | 546k | prevopcode = Instruction::DEFAULT; |
297 | 546k | } else { |
298 | 0 | return false; // Instruction straddles EOF |
299 | 0 | } |
300 | 2.73M | } |
301 | 1 | return false; // Reached EOF without RETURN instruction |
302 | 9 | } |
303 | | |
304 | | /** |
305 | | * Provides a safe interface for validating ASMap data before use. |
306 | | * Returns true if the data is valid for 128 bits long inputs. |
307 | | */ |
308 | | bool CheckStandardAsmap(const std::span<const std::byte> data) |
309 | 9 | { |
310 | 9 | if (!SanityCheckAsmap(data, 128)) { |
311 | 1 | LogWarning("Sanity check of asmap data failed\n"); |
312 | 1 | return false; |
313 | 1 | } |
314 | 8 | return true; |
315 | 9 | } |
316 | | |
317 | | /** |
318 | | * Loads an ASMap file from disk and validates it. |
319 | | */ |
320 | | std::vector<std::byte> DecodeAsmap(fs::path path) |
321 | 6 | { |
322 | 6 | FILE *filestr = fsbridge::fopen(path, "rb"); |
323 | 6 | AutoFile file{filestr}; |
324 | 6 | if (file.IsNull()) { |
325 | 0 | LogWarning("Failed to open asmap file from disk"); |
326 | 0 | return {}; |
327 | 0 | } |
328 | 6 | int64_t length{file.size()}; |
329 | 6 | LogInfo("Opened asmap file %s (%d bytes) from disk", fs::quoted(fs::PathToString(path)), length); |
330 | | |
331 | | // Read entire file into memory |
332 | 6 | std::vector<std::byte> buffer(length); |
333 | 6 | file.read(buffer); |
334 | | |
335 | 6 | if (!CheckStandardAsmap(buffer)) { |
336 | 1 | LogWarning("Sanity check of asmap file %s failed", fs::quoted(fs::PathToString(path))); |
337 | 1 | return {}; |
338 | 1 | } |
339 | | |
340 | 5 | return buffer; |
341 | 6 | } |
342 | | |
343 | | /** |
344 | | * Computes SHA256 hash of ASMap data for versioning and consistency checks. |
345 | | */ |
346 | | uint256 AsmapVersion(const std::span<const std::byte> data) |
347 | 2.06k | { |
348 | 2.06k | if (data.empty()) return {}; |
349 | | |
350 | 26 | HashWriter asmap_hasher; |
351 | 26 | asmap_hasher << data; |
352 | 26 | return asmap_hasher.GetHash(); |
353 | 2.06k | } |