/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line | Count | Source |
1 | | // Copyright (c) 2022-present The Bitcoin Core developers |
2 | | // Distributed under the MIT software license, see the accompanying |
3 | | // file COPYING or http://www.opensource.org/licenses/mit-license.php. |
4 | | // |
5 | | // Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c, |
6 | | // Written and placed in public domain by Jeffrey Walton. |
7 | | // Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and |
8 | | // Barry O'Rourke for the mbedTLS project. |
9 | | // Variant specialized for 64-byte inputs added by Pieter Wuille. |
10 | | |
11 | | #ifdef ENABLE_ARM_SHANI |
12 | | |
13 | | #include <array> |
14 | | #include <cstdint> |
15 | | #include <cstddef> |
16 | | #include <arm_neon.h> |
17 | | |
namespace {
/** SHA-256 round constants K[0..63]: the first 32 bits of the fractional
 *  parts of the cube roots of the first 64 prime numbers (FIPS 180-4,
 *  section 4.2.2). Aligned so the transform code can load four constants at
 *  a time with 128-bit vector loads (vld1q_u32). */
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K = {
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
} // namespace
39 | | |
namespace sha256_arm_shani {
/** Perform `blocks` iterations of the SHA-256 compression function using the
 *  ARMv8 Crypto Extension (SHA2) intrinsics.
 *
 *  @param s      Pointer to the 8-word hash state (uint32_t[8]); updated in place.
 *  @param chunk  Pointer to `blocks` consecutive 64-byte message blocks.
 *  @param blocks Number of 64-byte blocks to compress.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    // As reflected by the save-register names below, STATE0 carries the
    // {a,b,e,f} hash words and STATE1 the {c,d,g,h} words — the lane layout
    // consumed by the vsha256hq/vsha256h2q instruction pair.
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state, so it can be added back in after the 64 rounds
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Original implementation preloaded message and constant addition which was 1-3% slower.
        // Now included as first step in quad round code saving one Q Neon register
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"

        // Each "Rounds i..i+3" group below performs four SHA-256 rounds
        // (vsha256hq_u32/vsha256h2q_u32 on the round-constant-plus-message
        // sum TMP0) and, through round 48, extends the message schedule for
        // later rounds with vsha256su0q_u32/vsha256su1q_u32.

        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // From round 49 on, the remaining schedule words are already
        // available, so no further vsha256su0q/vsha256su1q extension is done.

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state: add the saved pre-block state (Davies-Meyer feed-forward)
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
} // namespace sha256_arm_shani
198 | | |
199 | | namespace sha256d64_arm_shani { |
200 | | void Transform_2way(unsigned char* output, const unsigned char* input) |
201 | 173k | { |
202 | | /* Initial state. */ |
203 | 173k | alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = { |
204 | 173k | 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, |
205 | 173k | 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 |
206 | 173k | }; |
207 | | |
208 | | /* Precomputed message schedule for the 2nd transform. */ |
209 | 173k | alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = { |
210 | 173k | 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
211 | 173k | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
212 | 173k | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
213 | 173k | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, |
214 | 173k | 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, |
215 | 173k | 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, |
216 | 173k | 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, |
217 | 173k | 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, |
218 | 173k | 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, |
219 | 173k | 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, |
220 | 173k | 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, |
221 | 173k | 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, |
222 | 173k | 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, |
223 | 173k | 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, |
224 | 173k | 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, |
225 | 173k | 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 |
226 | 173k | }; |
227 | | |
228 | | /* A few precomputed message schedule values for the 3rd transform. */ |
229 | 173k | alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = { |
230 | 173k | 0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
231 | 173k | 0x80000000, 0x00000000, 0x00000000, 0x00000000, |
232 | 173k | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274 |
233 | 173k | }; |
234 | | |
235 | | /* Padding processed in the 3rd transform (byteswapped). */ |
236 | 173k | alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100}; |
237 | | |
238 | 173k | uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB; |
239 | 173k | uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B; |
240 | 173k | uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP; |
241 | | |
242 | | // Transform 1: Load state |
243 | 173k | STATE0A = vld1q_u32(&INIT[0]); |
244 | 173k | STATE0B = STATE0A; |
245 | 173k | STATE1A = vld1q_u32(&INIT[4]); |
246 | 173k | STATE1B = STATE1A; |
247 | | |
248 | | // Transform 1: Load and convert input chunk to Big Endian |
249 | 173k | MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0))); |
250 | 173k | MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16))); |
251 | 173k | MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32))); |
252 | 173k | MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48))); |
253 | 173k | MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64))); |
254 | 173k | MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80))); |
255 | 173k | MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96))); |
256 | 173k | MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112))); |
257 | | |
258 | | // Transform 1: Rounds 1-4 |
259 | 173k | TMP = vld1q_u32(&K[0]); |
260 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
261 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
262 | 173k | TMP2A = STATE0A; |
263 | 173k | TMP2B = STATE0B; |
264 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
265 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
266 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
267 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
268 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
269 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
270 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
271 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
272 | | |
273 | | // Transform 1: Rounds 5-8 |
274 | 173k | TMP = vld1q_u32(&K[4]); |
275 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
276 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
277 | 173k | TMP2A = STATE0A; |
278 | 173k | TMP2B = STATE0B; |
279 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
280 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
281 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
282 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
283 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
284 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
285 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
286 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
287 | | |
288 | | // Transform 1: Rounds 9-12 |
289 | 173k | TMP = vld1q_u32(&K[8]); |
290 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
291 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
292 | 173k | TMP2A = STATE0A; |
293 | 173k | TMP2B = STATE0B; |
294 | 173k | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
295 | 173k | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
296 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
297 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
298 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
299 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
300 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
301 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
302 | | |
303 | | // Transform 1: Rounds 13-16 |
304 | 173k | TMP = vld1q_u32(&K[12]); |
305 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
306 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
307 | 173k | TMP2A = STATE0A; |
308 | 173k | TMP2B = STATE0B; |
309 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
310 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
311 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
312 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
313 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
314 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
315 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
316 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
317 | | |
318 | | // Transform 1: Rounds 17-20 |
319 | 173k | TMP = vld1q_u32(&K[16]); |
320 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
321 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
322 | 173k | TMP2A = STATE0A; |
323 | 173k | TMP2B = STATE0B; |
324 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
325 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
326 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
327 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
328 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
329 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
330 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
331 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
332 | | |
333 | | // Transform 1: Rounds 21-24 |
334 | 173k | TMP = vld1q_u32(&K[20]); |
335 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
336 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
337 | 173k | TMP2A = STATE0A; |
338 | 173k | TMP2B = STATE0B; |
339 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
340 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
341 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
342 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
343 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
344 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
345 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
346 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
347 | | |
348 | | // Transform 1: Rounds 25-28 |
349 | 173k | TMP = vld1q_u32(&K[24]); |
350 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
351 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
352 | 173k | TMP2A = STATE0A; |
353 | 173k | TMP2B = STATE0B; |
354 | 173k | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
355 | 173k | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
356 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
357 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
358 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
359 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
360 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
361 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
362 | | |
363 | | // Transform 1: Rounds 29-32 |
364 | 173k | TMP = vld1q_u32(&K[28]); |
365 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
366 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
367 | 173k | TMP2A = STATE0A; |
368 | 173k | TMP2B = STATE0B; |
369 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
370 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
371 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
372 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
373 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
374 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
375 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
376 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
377 | | |
378 | | // Transform 1: Rounds 33-36 |
379 | 173k | TMP = vld1q_u32(&K[32]); |
380 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
381 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
382 | 173k | TMP2A = STATE0A; |
383 | 173k | TMP2B = STATE0B; |
384 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
385 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
386 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
387 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
388 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
389 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
390 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
391 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
392 | | |
393 | | // Transform 1: Rounds 37-40 |
394 | 173k | TMP = vld1q_u32(&K[36]); |
395 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
396 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
397 | 173k | TMP2A = STATE0A; |
398 | 173k | TMP2B = STATE0B; |
399 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
400 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
401 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
402 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
403 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
404 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
405 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
406 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
407 | | |
408 | | // Transform 1: Rounds 41-44 |
409 | 173k | TMP = vld1q_u32(&K[40]); |
410 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
411 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
412 | 173k | TMP2A = STATE0A; |
413 | 173k | TMP2B = STATE0B; |
414 | 173k | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
415 | 173k | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
416 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
417 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
418 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
419 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
420 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
421 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
422 | | |
423 | | // Transform 1: Rounds 45-48 |
424 | 173k | TMP = vld1q_u32(&K[44]); |
425 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
426 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
427 | 173k | TMP2A = STATE0A; |
428 | 173k | TMP2B = STATE0B; |
429 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
430 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
431 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
432 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
433 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
434 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
435 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
436 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
437 | | |
438 | | // Transform 1: Rounds 49-52 |
439 | 173k | TMP = vld1q_u32(&K[48]); |
440 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
441 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
442 | 173k | TMP2A = STATE0A; |
443 | 173k | TMP2B = STATE0B; |
444 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
445 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
446 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
447 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
448 | | |
449 | | // Transform 1: Rounds 53-56 |
450 | 173k | TMP = vld1q_u32(&K[52]); |
451 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
452 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
453 | 173k | TMP2A = STATE0A; |
454 | 173k | TMP2B = STATE0B; |
455 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
456 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
457 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
458 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
459 | | |
460 | | // Transform 1: Rounds 57-60 |
461 | 173k | TMP = vld1q_u32(&K[56]); |
462 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
463 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
464 | 173k | TMP2A = STATE0A; |
465 | 173k | TMP2B = STATE0B; |
466 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
467 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
468 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
469 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
470 | | |
471 | | // Transform 1: Rounds 61-64 |
472 | 173k | TMP = vld1q_u32(&K[60]); |
473 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
474 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
475 | 173k | TMP2A = STATE0A; |
476 | 173k | TMP2B = STATE0B; |
477 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
478 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
479 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
480 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
481 | | |
482 | | // Transform 1: Update state |
483 | 173k | TMP = vld1q_u32(&INIT[0]); |
484 | 173k | STATE0A = vaddq_u32(STATE0A, TMP); |
485 | 173k | STATE0B = vaddq_u32(STATE0B, TMP); |
486 | 173k | TMP = vld1q_u32(&INIT[4]); |
487 | 173k | STATE1A = vaddq_u32(STATE1A, TMP); |
488 | 173k | STATE1B = vaddq_u32(STATE1B, TMP); |
489 | | |
490 | | // Transform 2: Save state |
491 | 173k | ABEF_SAVEA = STATE0A; |
492 | 173k | ABEF_SAVEB = STATE0B; |
493 | 173k | CDGH_SAVEA = STATE1A; |
494 | 173k | CDGH_SAVEB = STATE1B; |
495 | | |
496 | | // Transform 2: Rounds 1-4 |
497 | 173k | TMP = vld1q_u32(&MIDS[0]); |
498 | 173k | TMP2A = STATE0A; |
499 | 173k | TMP2B = STATE0B; |
500 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
501 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
502 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
503 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
504 | | |
505 | | // Transform 2: Rounds 5-8 |
506 | 173k | TMP = vld1q_u32(&MIDS[4]); |
507 | 173k | TMP2A = STATE0A; |
508 | 173k | TMP2B = STATE0B; |
509 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
510 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
511 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
512 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
513 | | |
514 | | // Transform 2: Rounds 9-12 |
515 | 173k | TMP = vld1q_u32(&MIDS[8]); |
516 | 173k | TMP2A = STATE0A; |
517 | 173k | TMP2B = STATE0B; |
518 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
519 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
520 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
521 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
522 | | |
523 | | // Transform 2: Rounds 13-16 |
524 | 173k | TMP = vld1q_u32(&MIDS[12]); |
525 | 173k | TMP2A = STATE0A; |
526 | 173k | TMP2B = STATE0B; |
527 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
528 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
529 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
530 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
531 | | |
532 | | // Transform 2: Rounds 17-20 |
533 | 173k | TMP = vld1q_u32(&MIDS[16]); |
534 | 173k | TMP2A = STATE0A; |
535 | 173k | TMP2B = STATE0B; |
536 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
537 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
538 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
539 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
540 | | |
541 | | // Transform 2: Rounds 21-24 |
542 | 173k | TMP = vld1q_u32(&MIDS[20]); |
543 | 173k | TMP2A = STATE0A; |
544 | 173k | TMP2B = STATE0B; |
545 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
546 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
547 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
548 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
549 | | |
550 | | // Transform 2: Rounds 25-28 |
551 | 173k | TMP = vld1q_u32(&MIDS[24]); |
552 | 173k | TMP2A = STATE0A; |
553 | 173k | TMP2B = STATE0B; |
554 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
555 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
556 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
557 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
558 | | |
559 | | // Transform 2: Rounds 29-32 |
560 | 173k | TMP = vld1q_u32(&MIDS[28]); |
561 | 173k | TMP2A = STATE0A; |
562 | 173k | TMP2B = STATE0B; |
563 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
564 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
565 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
566 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
567 | | |
568 | | // Transform 2: Rounds 33-36 |
569 | 173k | TMP = vld1q_u32(&MIDS[32]); |
570 | 173k | TMP2A = STATE0A; |
571 | 173k | TMP2B = STATE0B; |
572 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
573 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
574 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
575 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
576 | | |
577 | | // Transform 2: Rounds 37-40 |
578 | 173k | TMP = vld1q_u32(&MIDS[36]); |
579 | 173k | TMP2A = STATE0A; |
580 | 173k | TMP2B = STATE0B; |
581 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
582 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
583 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
584 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
585 | | |
586 | | // Transform 2: Rounds 41-44 |
587 | 173k | TMP = vld1q_u32(&MIDS[40]); |
588 | 173k | TMP2A = STATE0A; |
589 | 173k | TMP2B = STATE0B; |
590 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
591 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
592 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
593 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
594 | | |
595 | | // Transform 2: Rounds 45-48 |
596 | 173k | TMP = vld1q_u32(&MIDS[44]); |
597 | 173k | TMP2A = STATE0A; |
598 | 173k | TMP2B = STATE0B; |
599 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
600 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
601 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
602 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
603 | | |
604 | | // Transform 2: Rounds 49-52 |
605 | 173k | TMP = vld1q_u32(&MIDS[48]); |
606 | 173k | TMP2A = STATE0A; |
607 | 173k | TMP2B = STATE0B; |
608 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
609 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
610 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
611 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
612 | | |
613 | | // Transform 2: Rounds 53-56 |
614 | 173k | TMP = vld1q_u32(&MIDS[52]); |
615 | 173k | TMP2A = STATE0A; |
616 | 173k | TMP2B = STATE0B; |
617 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
618 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
619 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
620 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
621 | | |
622 | | // Transform 2: Rounds 57-60 |
623 | 173k | TMP = vld1q_u32(&MIDS[56]); |
624 | 173k | TMP2A = STATE0A; |
625 | 173k | TMP2B = STATE0B; |
626 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
627 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
628 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
629 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
630 | | |
631 | | // Transform 2: Rounds 61-64 |
632 | 173k | TMP = vld1q_u32(&MIDS[60]); |
633 | 173k | TMP2A = STATE0A; |
634 | 173k | TMP2B = STATE0B; |
635 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
636 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
637 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
638 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
639 | | |
640 | | // Transform 2: Update state |
641 | 173k | STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA); |
642 | 173k | STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB); |
643 | 173k | STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA); |
644 | 173k | STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB); |
645 | | |
646 | | // Transform 3: Pad previous output |
647 | 173k | MSG0A = STATE0A; |
648 | 173k | MSG0B = STATE0B; |
649 | 173k | MSG1A = STATE1A; |
650 | 173k | MSG1B = STATE1B; |
651 | 173k | MSG2A = vld1q_u32(&FINAL[0]); |
652 | 173k | MSG2B = MSG2A; |
653 | 173k | MSG3A = vld1q_u32(&FINAL[4]); |
654 | 173k | MSG3B = MSG3A; |
655 | | |
656 | | // Transform 3: Load state |
657 | 173k | STATE0A = vld1q_u32(&INIT[0]); |
658 | 173k | STATE0B = STATE0A; |
659 | 173k | STATE1A = vld1q_u32(&INIT[4]); |
660 | 173k | STATE1B = STATE1A; |
661 | | |
662 | | // Transform 3: Rounds 1-4 |
663 | 173k | TMP = vld1q_u32(&K[0]); |
664 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
665 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
666 | 173k | TMP2A = STATE0A; |
667 | 173k | TMP2B = STATE0B; |
668 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
669 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
670 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
671 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
672 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
673 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
674 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
675 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
676 | | |
677 | | // Transform 3: Rounds 5-8 |
678 | 173k | TMP = vld1q_u32(&K[4]); |
679 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
680 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
681 | 173k | TMP2A = STATE0A; |
682 | 173k | TMP2B = STATE0B; |
683 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
684 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
685 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
686 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
687 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
688 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
689 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
690 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
691 | | |
692 | | // Transform 3: Rounds 9-12 |
693 | 173k | TMP = vld1q_u32(&FINS[0]); |
694 | 173k | TMP2A = STATE0A; |
695 | 173k | TMP2B = STATE0B; |
696 | 173k | MSG2A = vld1q_u32(&FINS[4]); |
697 | 173k | MSG2B = MSG2A; |
698 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
699 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
700 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
701 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
702 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
703 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
704 | | |
705 | | // Transform 3: Rounds 13-16 |
706 | 173k | TMP = vld1q_u32(&FINS[8]); |
707 | 173k | TMP2A = STATE0A; |
708 | 173k | TMP2B = STATE0B; |
709 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
710 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
711 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP); |
712 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP); |
713 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP); |
714 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP); |
715 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
716 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
717 | | |
718 | | // Transform 3: Rounds 17-20 |
719 | 173k | TMP = vld1q_u32(&K[16]); |
720 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
721 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
722 | 173k | TMP2A = STATE0A; |
723 | 173k | TMP2B = STATE0B; |
724 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
725 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
726 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
727 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
728 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
729 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
730 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
731 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
732 | | |
733 | | // Transform 3: Rounds 21-24 |
734 | 173k | TMP = vld1q_u32(&K[20]); |
735 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
736 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
737 | 173k | TMP2A = STATE0A; |
738 | 173k | TMP2B = STATE0B; |
739 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
740 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
741 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
742 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
743 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
744 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
745 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
746 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
747 | | |
748 | | // Transform 3: Rounds 25-28 |
749 | 173k | TMP = vld1q_u32(&K[24]); |
750 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
751 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
752 | 173k | TMP2A = STATE0A; |
753 | 173k | TMP2B = STATE0B; |
754 | 173k | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
755 | 173k | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
756 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
757 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
758 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
759 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
760 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
761 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
762 | | |
763 | | // Transform 3: Rounds 29-32 |
764 | 173k | TMP = vld1q_u32(&K[28]); |
765 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
766 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
767 | 173k | TMP2A = STATE0A; |
768 | 173k | TMP2B = STATE0B; |
769 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
770 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
771 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
772 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
773 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
774 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
775 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
776 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
777 | | |
778 | | // Transform 3: Rounds 33-36 |
779 | 173k | TMP = vld1q_u32(&K[32]); |
780 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
781 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
782 | 173k | TMP2A = STATE0A; |
783 | 173k | TMP2B = STATE0B; |
784 | 173k | MSG0A = vsha256su0q_u32(MSG0A, MSG1A); |
785 | 173k | MSG0B = vsha256su0q_u32(MSG0B, MSG1B); |
786 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
787 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
788 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
789 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
790 | 173k | MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A); |
791 | 173k | MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B); |
792 | | |
793 | | // Transform 3: Rounds 37-40 |
794 | 173k | TMP = vld1q_u32(&K[36]); |
795 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
796 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
797 | 173k | TMP2A = STATE0A; |
798 | 173k | TMP2B = STATE0B; |
799 | 173k | MSG1A = vsha256su0q_u32(MSG1A, MSG2A); |
800 | 173k | MSG1B = vsha256su0q_u32(MSG1B, MSG2B); |
801 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
802 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
803 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
804 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
805 | 173k | MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A); |
806 | 173k | MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B); |
807 | | |
808 | | // Transform 3: Rounds 41-44 |
809 | 173k | TMP = vld1q_u32(&K[40]); |
810 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
811 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
812 | 173k | TMP2A = STATE0A; |
813 | 173k | TMP2B = STATE0B; |
814 | 173k | MSG2A = vsha256su0q_u32(MSG2A, MSG3A); |
815 | 173k | MSG2B = vsha256su0q_u32(MSG2B, MSG3B); |
816 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
817 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
818 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
819 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
820 | 173k | MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A); |
821 | 173k | MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B); |
822 | | |
823 | | // Transform 3: Rounds 45-48 |
824 | 173k | TMP = vld1q_u32(&K[44]); |
825 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
826 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
827 | 173k | TMP2A = STATE0A; |
828 | 173k | TMP2B = STATE0B; |
829 | 173k | MSG3A = vsha256su0q_u32(MSG3A, MSG0A); |
830 | 173k | MSG3B = vsha256su0q_u32(MSG3B, MSG0B); |
831 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
832 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
833 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
834 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
835 | 173k | MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A); |
836 | 173k | MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B); |
837 | | |
838 | | // Transform 3: Rounds 49-52 |
839 | 173k | TMP = vld1q_u32(&K[48]); |
840 | 173k | TMP0A = vaddq_u32(MSG0A, TMP); |
841 | 173k | TMP0B = vaddq_u32(MSG0B, TMP); |
842 | 173k | TMP2A = STATE0A; |
843 | 173k | TMP2B = STATE0B; |
844 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
845 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
846 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
847 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
848 | | |
849 | | // Transform 3: Rounds 53-56 |
850 | 173k | TMP = vld1q_u32(&K[52]); |
851 | 173k | TMP0A = vaddq_u32(MSG1A, TMP); |
852 | 173k | TMP0B = vaddq_u32(MSG1B, TMP); |
853 | 173k | TMP2A = STATE0A; |
854 | 173k | TMP2B = STATE0B; |
855 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
856 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
857 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
858 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
859 | | |
860 | | // Transform 3: Rounds 57-60 |
861 | 173k | TMP = vld1q_u32(&K[56]); |
862 | 173k | TMP0A = vaddq_u32(MSG2A, TMP); |
863 | 173k | TMP0B = vaddq_u32(MSG2B, TMP); |
864 | 173k | TMP2A = STATE0A; |
865 | 173k | TMP2B = STATE0B; |
866 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
867 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
868 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
869 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
870 | | |
871 | | // Transform 3: Rounds 61-64 |
872 | 173k | TMP = vld1q_u32(&K[60]); |
873 | 173k | TMP0A = vaddq_u32(MSG3A, TMP); |
874 | 173k | TMP0B = vaddq_u32(MSG3B, TMP); |
875 | 173k | TMP2A = STATE0A; |
876 | 173k | TMP2B = STATE0B; |
877 | 173k | STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A); |
878 | 173k | STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B); |
879 | 173k | STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A); |
880 | 173k | STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B); |
881 | | |
882 | | // Transform 3: Update state |
883 | 173k | TMP = vld1q_u32(&INIT[0]); |
884 | 173k | STATE0A = vaddq_u32(STATE0A, TMP); |
885 | 173k | STATE0B = vaddq_u32(STATE0B, TMP); |
886 | 173k | TMP = vld1q_u32(&INIT[4]); |
887 | 173k | STATE1A = vaddq_u32(STATE1A, TMP); |
888 | 173k | STATE1B = vaddq_u32(STATE1B, TMP); |
889 | | |
890 | | // Store result |
891 | 173k | vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A))); |
892 | 173k | vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A))); |
893 | 173k | vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B))); |
894 | | vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B))); |
895 | 173k | } |
896 | | } |
897 | | |
898 | | #endif |