/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */
| 22 | |
.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables used by the round macros. They are declared .extern, i.e.
 * provided by another object; each gets a short alias so the macro bodies
 * stay readable.
 */
.extern camellia_sp10011110;
#define sp10011110	camellia_sp10011110

.extern camellia_sp22000222;
#define sp22000222	camellia_sp22000222

.extern camellia_sp03303033;
#define sp03303033	camellia_sp03303033

.extern camellia_sp00444404;
#define sp00444404	camellia_sp00444404

.extern camellia_sp02220222;
#define sp02220222	camellia_sp02220222

.extern camellia_sp30333033;
#define sp30333033	camellia_sp30333033

.extern camellia_sp44044404;
#define sp44044404	camellia_sp44044404

.extern camellia_sp11101110;
#define sp11101110	camellia_sp11101110
| 43 | |
/*
 * struct camellia_ctx layout, expressed as byte displacements off CTX:
 * the key table starts at offset 0 and the key length field follows
 * immediately after its CAMELLIA_TABLE_BYTE_LEN bytes.
 */
#define CAMELLIA_TABLE_BYTE_LEN	272

#define key_table		0
#define key_length		CAMELLIA_TABLE_BYTE_LEN
| 49 | |
/* register macros */

/*
 * Context pointer and I/O pointer. RIO shares %rsi with the scratch
 * register RT0, so it is only valid until the first round macro runs.
 */
#define CTX	%rdi
#define RIO	%rsi
#define RIOd	%esi

/*
 * Block 0 state: two 64-bit halves, each with 32-bit ("d"), low-byte ("bl")
 * and high-byte ("bh") views that the macros reach by token pasting.
 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah

#define RCD0	%rcx
#define RCD0d	%ecx
#define RCD0bl	%cl
#define RCD0bh	%ch

/* Block 1 state: referenced only by the 2-way macros. */
#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh

#define RCD1	%rdx
#define RCD1d	%edx
#define RCD1bl	%dl
#define RCD1bh	%dh

/* Scratch registers. RT1 is %rbp, which the functions save in RRBP. */
#define RT0	%rsi
#define RT0d	%esi

#define RT1	%rbp
#define RT1d	%ebp

#define RT2	%r8
#define RT2d	%r8d
#define RT2bl	%r8b

/* Values kept alive across the whole function body. */
#define RXOR	%r9
#define RXORd	%r9d
#define RXORbl	%r9b

#define RRBP	%r10
#define RDST	%r11
| 91 | |
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Folds two table lookups into dst and advances ab by 16 bits:
 *	dst ^= T0[bits 0..7 of ab]
 *	dst ^= T1[bits 8..15 of ab]
 *	ab   = ab rotated right by 16
 *
 * T0 and T1 are symbols of tables with 8-byte entries. tmp1 and tmp2 are
 * 64-bit scratch registers and are clobbered; each needs a "d" 32-bit view
 * reachable by token pasting. tmp1 receives the high-byte register
 * (ab ## bh), so it must be a register that encodes without a REX prefix.
 *
 * The table bases are loaded RIP-relative with leaq rather than being used
 * as absolute 32-bit displacements, so the object also links as
 * position-independent code. leaq does not modify the flags, so the flags
 * left behind are still those of the final xorq.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip),			tmp1; \
	movzbl ab ## bl,		tmp2 ## d; \
	xorq (tmp1, tmp2, 8),		dst; \
	leaq T1(%rip),			tmp2; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq (tmp2, tmp1, 8),		dst;
| 98 | |
/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm(src, skey, tgt) - one round on block 0.
 *
 * Computes tgt0 ^= key_table[skey] ^ (eight table lookups, one per byte of
 * src0). RT2 starts out holding the subkey and collects the lookups for
 * bytes 2/3 and 6/7; the lookups for bytes 0/1 and 4/5 go straight into
 * tgt0. The four 16-bit rotations add up to 64, so src0 is unchanged on
 * exit. Clobbers RT0, RT1, RT2 and flags.
 */
#define roundsm(src, skey, tgt) \
	movq (key_table + (skey) * 8)(CTX),				RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, tgt ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, tgt ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
	\
	xorq RT2,							tgt ## 0;
| 111 | |
/*
 * fls(lhs, rhs, kl, kr) - keyed mixing step applied between round groups
 * (presumably Camellia's FL / FL^-1 layer - confirm against the cipher
 * specification).
 *
 * With K = key_table[kl] for lhs0 and K = key_table[kr] for rhs0, in order:
 *	hi32(lhs0) ^= rol32(lo32(lhs0) & lo32(key_table[kl]), 1)
 *	lo32(rhs0) ^= hi32(rhs0) | hi32(key_table[kr])
 *	lo32(lhs0) ^= hi32(lhs0) | hi32(key_table[kl])
 *	hi32(rhs0) ^= rol32(lo32(rhs0) & lo32(key_table[kr]), 1)
 * Each step sees the result of the earlier ones. Clobbers RT0, RT1, RT2
 * and flags.
 */
#define fls(lhs, rhs, kl, kr) \
	/* hi32(lhs) ^= rol32(lo32(lhs) & lo32(kl), 1) */ \
	movl (key_table + (kl) * 8)(CTX),	RT0d; \
	andl lhs ## 0d,				RT0d; \
	roll $1,				RT0d; \
	shlq $32,				RT0; \
	xorq RT0,				lhs ## 0; \
	/* lo32(rhs) ^= hi32(rhs | kr) */ \
	movq (key_table + (kr) * 8)(CTX),	RT1; \
	orq rhs ## 0,				RT1; \
	shrq $32,				RT1; \
	xorq RT1,				rhs ## 0; \
	\
	/* lo32(lhs) ^= hi32(lhs | kl) */ \
	movq (key_table + (kl) * 8)(CTX),	RT2; \
	orq lhs ## 0,				RT2; \
	shrq $32,				RT2; \
	xorq RT2,				lhs ## 0; \
	/* hi32(rhs) ^= rol32(lo32(rhs) & lo32(kr), 1) */ \
	movl (key_table + (kr) * 8)(CTX),	RT0d; \
	andl rhs ## 0d,				RT0d; \
	roll $1,				RT0d; \
	shlq $32,				RT0; \
	xorq RT0,				rhs ## 0;
| 132 | |
/*
 * enc_rounds(base) - six rounds with subkeys base+2 .. base+7 in ascending
 * order, swapping the roles of RAB and RCD on every round.
 */
#define enc_rounds(base) \
	roundsm(RAB, (base) + 2, RCD); \
	roundsm(RCD, (base) + 3, RAB); \
	roundsm(RAB, (base) + 4, RCD); \
	roundsm(RCD, (base) + 5, RAB); \
	roundsm(RAB, (base) + 6, RCD); \
	roundsm(RCD, (base) + 7, RAB);

/*
 * enc_fls(base) - fls step for encryption: RAB is mixed with subkey base,
 * RCD with subkey base+1.
 */
#define enc_fls(base) \
	fls(RAB, RCD, (base) + 0, (base) + 1);
| 143 | |
/*
 * enc_inpack() - load one 16-byte block from RIO into RAB0/RCD0.
 *
 * Each 8-byte half is byte-swapped and then rotated by 32 bits (swapping
 * its two 32-bit words); key_table[0] is XORed into RAB0 last.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rolq $32,			RAB0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack(insn, last) - write one block back to RIO.
 *
 * insn is pasted with "q" to form the store instruction, so "mov"
 * overwrites the destination and "xor" XORs into it. last is a register
 * holding the key_table index that is XORed into RCD0 first. The halves
 * are stored swapped relative to the load: RCD0 at offset 0, RAB0 at
 * offset 8.
 */
#define enc_outunpack(insn, last) \
	xorq key_table(CTX, last, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	insn ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	insn ## q RAB0,			8(RIO);
| 161 | |
/*
 * dec_rounds(base) - six rounds with subkeys base+7 .. base+2 in descending
 * order, swapping the roles of RAB and RCD on every round.
 */
#define dec_rounds(base) \
	roundsm(RAB, (base) + 7, RCD); \
	roundsm(RCD, (base) + 6, RAB); \
	roundsm(RAB, (base) + 5, RCD); \
	roundsm(RCD, (base) + 4, RAB); \
	roundsm(RAB, (base) + 3, RCD); \
	roundsm(RCD, (base) + 2, RAB);

/*
 * dec_fls(base) - fls step for decryption: the two subkeys are swapped
 * compared to enc_fls (RAB gets base+1, RCD gets base).
 */
#define dec_fls(base) \
	fls(RAB, RCD, (base) + 1, (base) + 0);
| 172 | |
/*
 * dec_inpack(last) - load one 16-byte block from RIO into RAB0/RCD0.
 *
 * Same byte swap and 32-bit rotation as enc_inpack, but the key XORed into
 * RAB0 is key_table[last], where last is a register holding the index.
 */
#define dec_inpack(last) \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rolq $32,			RAB0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, last, 8),	RAB0;

/*
 * dec_outunpack() - XOR key_table[0] into RCD0, then store RCD0 at offset 0
 * and RAB0 at offset 8 of RIO, undoing the rotation and byte swap applied
 * on input. Always overwrites the destination.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			8(RIO);
| 190 | |
/*
 * __camellia_enc_blk - encrypt one 16-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor
 *
 * If the low byte of xor is zero the result overwrites dst, otherwise it is
 * XORed into the 16 bytes already at dst. A key_length of 16 stops after
 * three six-round groups and finishes with key_table[24]; any other value
 * runs one more fls + round group and finishes with key_table[32].
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and flags. %rbp doubles as scratch
 * (RT1) and is restored from RRBP before returning.
 */
.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

__camellia_enc_blk:
	movq %rbp, RRBP;	/* RT1 is %rbp: park the caller's value */

	movq %rcx, RXOR;
	movq %rsi, RDST;	/* RIO/RT0 share %rsi, so keep dst elsewhere */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* 32-bit compare, the same width the decrypt routines use */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
.size __camellia_enc_blk, . - __camellia_enc_blk
| 239 | |
/*
 * camellia_dec_blk - decrypt one 16-byte block.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * A key_length of 16 starts from key_table[24] and skips the first
 * round group; any other value starts from key_table[32] and runs it.
 * The result always overwrites dst.
 *
 * Writes %rax, %rcx, %rsi, %r8-%r11 and flags. %rbp doubles as scratch
 * (RT1) and is restored from RRBP before returning.
 */
.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

camellia_dec_blk:
	/* RT2d = (key_length == 16) ? 24 : 32; the movl's keep the flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;	/* RT1 is %rbp: park the caller's value */
	movq %rsi, RDST;	/* RIO/RT0 share %rsi, so keep dst elsewhere */
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is clobbered by the rounds, so branch on it before they run */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
.size camellia_dec_blk, . - camellia_dec_blk
| 279 | |
/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2(src, skey, tgt) - one round on both blocks with the same subkey.
 *
 * Block 0 follows the 1-way scheme: RT2 holds the subkey plus the lookups
 * for bytes 2/3 and 6/7 of src0 and is folded into tgt0 afterwards. For
 * block 1 the subkey is XORed into tgt1 up front and all eight lookups go
 * directly into tgt1. The fold of RT2 into tgt0 is issued after the first
 * block-1 lookup; block 1 never writes RT2, so the value is unaffected.
 * src0 and src1 are unchanged on exit. Clobbers RT0, RT1, RT2 and flags.
 */
#define roundsm2(src, skey, tgt) \
	movq (key_table + (skey) * 8)(CTX),				RT2; \
	xorq RT2,							tgt ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 0, tgt ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 0, tgt ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, src ## 1, tgt ## 1); \
		xorq RT2,						tgt ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, src ## 1, tgt ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, src ## 1, tgt ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, src ## 1, tgt ## 1);
| 297 | |
/*
 * fls2(lhs, rhs, kl, kr) - the fls step applied to both blocks.
 *
 * Per block n (0 and 1), in this order:
 *	hi32(lhs_n) ^= rol32(lo32(lhs_n) & lo32(key_table[kl]), 1)
 *	lo32(rhs_n) ^= hi32(rhs_n) | hi32(key_table[kr])
 *	lo32(lhs_n) ^= hi32(lhs_n) | hi32(key_table[kl])
 *	hi32(rhs_n) ^= rol32(lo32(rhs_n) & lo32(key_table[kr]), 1)
 * The first two steps are done for block 0, then block 1; the last two
 * likewise. Scratch use rotates through RT0, RT1 and RT2, all of which are
 * clobbered together with the flags.
 */
#define fls2(lhs, rhs, kl, kr) \
	/* block 0: hi32(lhs) and lo32(rhs) */ \
	movl (key_table + (kl) * 8)(CTX),	RT0d; \
	andl lhs ## 0d,				RT0d; \
	roll $1,				RT0d; \
	shlq $32,				RT0; \
	xorq RT0,				lhs ## 0; \
	movq (key_table + (kr) * 8)(CTX),	RT1; \
	orq rhs ## 0,				RT1; \
	shrq $32,				RT1; \
	xorq RT1,				rhs ## 0; \
	\
		/* block 1: hi32(lhs) and lo32(rhs) */ \
		movl (key_table + (kl) * 8)(CTX),	RT2d; \
		andl lhs ## 1d,				RT2d; \
		roll $1,				RT2d; \
		shlq $32,				RT2; \
		xorq RT2,				lhs ## 1; \
		movq (key_table + (kr) * 8)(CTX),	RT0; \
		orq rhs ## 1,				RT0; \
		shrq $32,				RT0; \
		xorq RT0,				rhs ## 1; \
	\
	/* block 0: lo32(lhs) and hi32(rhs) */ \
	movq (key_table + (kl) * 8)(CTX),	RT1; \
	orq lhs ## 0,				RT1; \
	shrq $32,				RT1; \
	xorq RT1,				lhs ## 0; \
	movl (key_table + (kr) * 8)(CTX),	RT2d; \
	andl rhs ## 0d,				RT2d; \
	roll $1,				RT2d; \
	shlq $32,				RT2; \
	xorq RT2,				rhs ## 0; \
	\
		/* block 1: lo32(lhs) and hi32(rhs) */ \
		movq (key_table + (kl) * 8)(CTX),	RT0; \
		orq lhs ## 1,				RT0; \
		shrq $32,				RT0; \
		xorq RT0,				lhs ## 1; \
		movl (key_table + (kr) * 8)(CTX),	RT1d; \
		andl rhs ## 1d,				RT1d; \
		roll $1,				RT1d; \
		shlq $32,				RT1; \
		xorq RT1,				rhs ## 1;
| 338 | |
/*
 * enc_rounds2(base) - 2-way version of enc_rounds: six roundsm2 steps with
 * subkeys base+2 .. base+7 in ascending order.
 */
#define enc_rounds2(base) \
	roundsm2(RAB, (base) + 2, RCD); \
	roundsm2(RCD, (base) + 3, RAB); \
	roundsm2(RAB, (base) + 4, RCD); \
	roundsm2(RCD, (base) + 5, RAB); \
	roundsm2(RAB, (base) + 6, RCD); \
	roundsm2(RCD, (base) + 7, RAB);

/*
 * enc_fls2(base) - 2-way fls step for encryption: RAB uses subkey base,
 * RCD uses subkey base+1.
 */
#define enc_fls2(base) \
	fls2(RAB, RCD, (base) + 0, (base) + 1);
| 349 | |
/*
 * enc_inpack2() - load two consecutive 16-byte blocks from RIO.
 *
 * Bytes 0..15 go to RAB0/RCD0 and bytes 16..31 to RAB1/RCD1. Every 8-byte
 * half is byte-swapped and rotated by 32 bits; key_table[0] is XORed into
 * both RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rorq $32,			RAB0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 16(RIO),			RAB1; \
		movq 24(RIO),			RCD1; \
		bswapq				RAB1; \
		bswapq				RCD1; \
		rorq $32,			RAB1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2(insn, last) - write both blocks back to RIO.
 *
 * insn is pasted with "q" to form the store ("mov" overwrites, "xor" XORs
 * into the destination). last is a register holding the key_table index
 * XORed into RCD0 and RCD1. Per block the halves are stored swapped
 * relative to the load: RCDn first, RABn second.
 */
#define enc_outunpack2(insn, last) \
	xorq key_table(CTX, last, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	insn ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	insn ## q RAB0,			8(RIO); \
	\
		xorq key_table(CTX, last, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		insn ## q RCD1,			16(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		insn ## q RAB1,			24(RIO);
| 383 | |
/*
 * dec_rounds2(base) - 2-way version of dec_rounds: six roundsm2 steps with
 * subkeys base+7 .. base+2 in descending order.
 */
#define dec_rounds2(base) \
	roundsm2(RAB, (base) + 7, RCD); \
	roundsm2(RCD, (base) + 6, RAB); \
	roundsm2(RAB, (base) + 5, RCD); \
	roundsm2(RCD, (base) + 4, RAB); \
	roundsm2(RAB, (base) + 3, RCD); \
	roundsm2(RCD, (base) + 2, RAB);

/*
 * dec_fls2(base) - 2-way fls step for decryption: subkeys swapped relative
 * to enc_fls2 (RAB uses base+1, RCD uses base).
 */
#define dec_fls2(base) \
	fls2(RAB, RCD, (base) + 1, (base) + 0);
| 394 | |
/*
 * dec_inpack2(last) - load two consecutive 16-byte blocks from RIO.
 *
 * Same layout and transformations as enc_inpack2, except that the key
 * XORed into RAB0 and RAB1 is key_table[last], with last a register
 * holding the index.
 */
#define dec_inpack2(last) \
	movq (RIO),			RAB0; \
	movq 8(RIO),			RCD0; \
	bswapq				RAB0; \
	bswapq				RCD0; \
	rorq $32,			RAB0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, last, 8),	RAB0; \
	\
		movq 16(RIO),			RAB1; \
		movq 24(RIO),			RCD1; \
		bswapq				RAB1; \
		bswapq				RCD1; \
		rorq $32,			RAB1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, last, 8),	RAB1;

/*
 * dec_outunpack2() - XOR key_table[0] into RCD0 and RCD1, then store both
 * blocks to RIO (RCDn first, RABn second per block), undoing the rotation
 * and byte swap applied on input. Always overwrites the destination.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			8(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			16(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			24(RIO);
| 428 | |
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool xor
 *
 * If the low byte of xor is zero the 32 result bytes overwrite dst,
 * otherwise they are XORed into what is already there. A key_length of 16
 * stops after three round groups and finishes with key_table[24]; any
 * other value runs one more fls + round group and finishes with
 * key_table[32].
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags. %rbx (RAB1) is saved
 * on the stack and %rbp (RT1) in RRBP; both are restored before returning.
 */
.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

__camellia_enc_blk_2way:
	pushq %rbx;		/* RAB1 is %rbx (callee-saved) */

	movq %rbp, RRBP;	/* RT1 is %rbp: park the caller's value */
	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* RIO/RT0 share %rsi, so keep dst elsewhere */
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* 32-bit compare, the same width the decrypt routines use */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
.size __camellia_enc_blk_2way, . - __camellia_enc_blk_2way
| 479 | |
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src
 *
 * A key_length of 16 starts from key_table[24] and skips the first
 * round group; any other value starts from key_table[32] and runs it.
 * The 32 result bytes always overwrite dst.
 *
 * Writes %rax, %rcx, %rdx, %rsi, %r8-%r11 and flags. %rbx (RAB1) is kept
 * in RXOR and %rbp (RT1) in RRBP; both are restored before returning.
 */
.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

camellia_dec_blk_2way:
	/* RT2d = (key_length == 16) ? 24 : 32; the movl's keep the flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;	/* RXOR is free again: use it to save %rbx */
	movq %rbp, RRBP;	/* RT1 is %rbp: park the caller's value */
	movq %rsi, RDST;	/* RIO/RT0 share %rsi, so keep dst elsewhere */
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is clobbered by the rounds, so branch on it before they run */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
.size camellia_dec_blk_2way, . - camellia_dec_blk_2way