|
267 | 267 |
|
268 | 268 | #define aria_ark_8way(x0, x1, x2, x3, \ |
269 | 269 | x4, x5, x6, x7, \ |
270 | | - t0, rk, idx, round) \ |
| 270 | + t0, t1, t2, rk, \ |
| 271 | + idx, round) \ |
271 | 272 | /* AddRoundKey */ \ |
272 | | - vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \ |
273 | | - vpxor t0, x0, x0; \ |
274 | | - vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \ |
275 | | - vpxor t0, x1, x1; \ |
276 | | - vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \ |
277 | | - vpxor t0, x2, x2; \ |
278 | | - vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \ |
279 | | - vpxor t0, x3, x3; \ |
280 | | - vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \ |
281 | | - vpxor t0, x4, x4; \ |
282 | | - vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \ |
283 | | - vpxor t0, x5, x5; \ |
284 | | - vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \ |
285 | | - vpxor t0, x6, x6; \ |
286 | | - vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ |
287 | | - vpxor t0, x7, x7; |
| 273 | + vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ |
| 274 | + vpsrld $24, t0, t2; \ |
| 275 | + vpshufb t1, t2, t2; \ |
| 276 | + vpxor t2, x0, x0; \ |
| 277 | + vpsrld $16, t0, t2; \ |
| 278 | + vpshufb t1, t2, t2; \ |
| 279 | + vpxor t2, x1, x1; \ |
| 280 | + vpsrld $8, t0, t2; \ |
| 281 | + vpshufb t1, t2, t2; \ |
| 282 | + vpxor t2, x2, x2; \ |
| 283 | + vpshufb t1, t0, t2; \ |
| 284 | + vpxor t2, x3, x3; \ |
| 285 | + vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ |
| 286 | + vpsrld $24, t0, t2; \ |
| 287 | + vpshufb t1, t2, t2; \ |
| 288 | + vpxor t2, x4, x4; \ |
| 289 | + vpsrld $16, t0, t2; \ |
| 290 | + vpshufb t1, t2, t2; \ |
| 291 | + vpxor t2, x5, x5; \ |
| 292 | + vpsrld $8, t0, t2; \ |
| 293 | + vpshufb t1, t2, t2; \ |
| 294 | + vpxor t2, x6, x6; \ |
| 295 | + vpshufb t1, t0, t2; \ |
| 296 | + vpxor t2, x7, x7; |
288 | 297 |
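
vpbroadcastb is an AVX2 instruction, so the AVX-only path cannot use it. The rewritten aria_ark_8way instead loads four round-key bytes at once with vbroadcastss (plain AVX), isolates each byte with vpsrld, and fans it out to all sixteen lanes with vpshufb against an all-zero mask held in t1: a vpshufb mask of zeroes selects byte 0 of the source into every lane. The byte order of the old code is preserved (x0 is still XORed with the broadcast of rk[idx+3], x1 with rk[idx+2], and so on). A minimal C intrinsics sketch of one such step, with a hypothetical helper name:

    #include <stdint.h>
    #include <immintrin.h>

    /* Broadcast round-key byte rk[3] to all 16 byte lanes using AVX/SSE
     * only, mirroring the vbroadcastss + vpsrld + vpshufb sequence above. */
    static __m128i broadcast_rk_byte3(const uint8_t *rk)
    {
        __m128i mask = _mm_setzero_si128();           /* t1: all-zero mask */
        __m128i t0 = _mm_castps_si128(
                _mm_broadcast_ss((const float *)rk)); /* vbroadcastss */
        __m128i t2 = _mm_srli_epi32(t0, 24); /* vpsrld $24: rk[3] -> byte 0 */
        return _mm_shuffle_epi8(t2, mask);   /* broadcast byte 0 everywhere */
    }
    /* Same result as _mm_set1_epi8(rk[3]), i.e. the removed vpbroadcastb. */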
|
289 | 298 | #ifdef CONFIG_AS_GFNI |
290 | 299 | #define aria_sbox_8way_gfni(x0, x1, x2, x3, \ |
291 | 300 | x4, x5, x6, x7, \ |
292 | 301 | t0, t1, t2, t3, \ |
293 | 302 | t4, t5, t6, t7) \ |
294 | | - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ |
295 | | - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ |
296 | | - vpbroadcastq .Ltf_id_bitmatrix, t2; \ |
297 | | - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ |
298 | | - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ |
| 303 | + vmovdqa .Ltf_s2_bitmatrix, t0; \ |
| 304 | + vmovdqa .Ltf_inv_bitmatrix, t1; \ |
| 305 | + vmovdqa .Ltf_id_bitmatrix, t2; \ |
| 306 | + vmovdqa .Ltf_aff_bitmatrix, t3; \ |
| 307 | + vmovdqa .Ltf_x2_bitmatrix, t4; \ |
299 | 308 | vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ |
300 | 309 | vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ |
301 | 310 | vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ |
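
vpbroadcastq is likewise AVX2-only, so the GFNI S-box now fetches each bit-matrix with a plain vmovdqa. That is why every .Ltf_*_bitmatrix constant gains a second, identical .quad further down in this diff: the 16-byte load must present the same 8x8 matrix in both 64-bit lanes. A sketch of the idea, using a placeholder matrix value rather than the kernel's actual encoding:

    #include <stdint.h>
    #include <immintrin.h>

    /* Two identical copies of a (placeholder) 8x8 bit-matrix; one aligned
     * 16-byte load then stands in for the AVX2 vpbroadcastq. Note that
     * vmovdqa requires 16-byte-aligned data. */
    static const uint64_t tf_matrix_dup[2] __attribute__((aligned(16))) = {
        0x8040201008040201ULL, 0x8040201008040201ULL,
    };

    static __m128i load_bitmatrix(void)
    {
        return _mm_load_si128((const __m128i *)tf_matrix_dup); /* vmovdqa */
    }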
|
315 | 324 | x4, x5, x6, x7, \ |
316 | 325 | t0, t1, t2, t3, \ |
317 | 326 | t4, t5, t6, t7) \ |
318 | | - vpxor t7, t7, t7; \ |
319 | 327 | vmovdqa .Linv_shift_row, t0; \ |
320 | 328 | vmovdqa .Lshift_row, t1; \ |
321 | | - vpbroadcastd .L0f0f0f0f, t6; \ |
| 329 | + vbroadcastss .L0f0f0f0f, t6; \ |
322 | 330 | vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ |
323 | 331 | vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ |
324 | 332 | vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ |
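
Same pattern once more: vpbroadcastd is AVX2, while vbroadcastss performs an identical 32-bit broadcast from memory under plain AVX (the bit pattern is the same; only the nominal element type differs). The vpxor t7, t7, t7 dropped from this macro is not lost: as the round macros below show, callers now zero y7 up front, and that single zeroed register serves both as the vpshufb mask for aria_ark_8way and as the zero temporary this S-box macro relied on. A one-line sketch of the substitution:

    #include <stdint.h>
    #include <immintrin.h>

    /* vbroadcastss as an AVX drop-in for the removed vpbroadcastd. */
    static __m128i broadcast_dword(const uint32_t *p)
    {
        return _mm_castps_si128(_mm_broadcast_ss((const float *)p));
    }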
|
413 | 421 | y0, y1, y2, y3, \ |
414 | 422 | y4, y5, y6, y7, \ |
415 | 423 | mem_tmp, rk, round) \ |
| 424 | + vpxor y7, y7, y7; \ |
416 | 425 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
417 | | - y0, rk, 8, round); \ |
| 426 | + y0, y7, y2, rk, 8, round); \ |
418 | 427 | \ |
419 | 428 | aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ |
420 | 429 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
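
Every round macro, both the plain AVX ones here and the GFNI variants below, follows the same recipe: zero y7 once at the top, then pass it as the new t1 argument so aria_ark_8way has its all-zero vpshufb mask, with y2 as the t2 scratch. Because aria_ark_8way only reads t1, y7 is still zero when the subsequent aria_sbox_8way call receives it as t7. A quick self-check of the mask property the code depends on:

    #include <assert.h>
    #include <immintrin.h>

    int main(void)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i x = _mm_set1_epi32(0x11223344); /* arbitrary test data */
        __m128i b = _mm_shuffle_epi8(x, zero);  /* all-zero shuffle mask */
        /* Every lane now holds byte 0 of x (0x44); the mask register is
         * only read, never modified. */
        assert(_mm_extract_epi8(b, 0) == 0x44);
        assert(_mm_extract_epi8(b, 15) == 0x44);
        return 0;
    }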
|
429 | 438 | x4, x5, x6, x7, \ |
430 | 439 | mem_tmp, 0); \ |
431 | 440 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
432 | | - y0, rk, 0, round); \ |
| 441 | + y0, y7, y2, rk, 0, round); \ |
433 | 442 | \ |
434 | 443 | aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ |
435 | 444 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
|
467 | 476 | y0, y1, y2, y3, \ |
468 | 477 | y4, y5, y6, y7, \ |
469 | 478 | mem_tmp, rk, round) \ |
| 479 | + vpxor y7, y7, y7; \ |
470 | 480 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
471 | | - y0, rk, 8, round); \ |
| 481 | + y0, y7, y2, rk, 8, round); \ |
472 | 482 | \ |
473 | 483 | aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
474 | 484 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
|
483 | 493 | x4, x5, x6, x7, \ |
484 | 494 | mem_tmp, 0); \ |
485 | 495 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
486 | | - y0, rk, 0, round); \ |
| 496 | + y0, y7, y2, rk, 0, round); \ |
487 | 497 | \ |
488 | 498 | aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
489 | 499 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
|
521 | 531 | y0, y1, y2, y3, \ |
522 | 532 | y4, y5, y6, y7, \ |
523 | 533 | mem_tmp, rk, round, last_round) \ |
| 534 | + vpxor y7, y7, y7; \ |
524 | 535 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
525 | | - y0, rk, 8, round); \ |
| 536 | + y0, y7, y2, rk, 8, round); \ |
526 | 537 | \ |
527 | 538 | aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ |
528 | 539 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
529 | 540 | \ |
530 | 541 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
531 | | - y0, rk, 8, last_round); \ |
| 542 | + y0, y7, y2, rk, 8, last_round); \ |
532 | 543 | \ |
533 | 544 | aria_store_state_8way(x0, x1, x2, x3, \ |
534 | 545 | x4, x5, x6, x7, \ |
|
538 | 549 | x4, x5, x6, x7, \ |
539 | 550 | mem_tmp, 0); \ |
540 | 551 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
541 | | - y0, rk, 0, round); \ |
| 552 | + y0, y7, y2, rk, 0, round); \ |
542 | 553 | \ |
543 | 554 | aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ |
544 | 555 | y0, y1, y2, y3, y4, y5, y6, y7); \ |
545 | 556 | \ |
546 | 557 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
547 | | - y0, rk, 0, last_round); \ |
| 558 | + y0, y7, y2, rk, 0, last_round); \ |
548 | 559 | \ |
549 | 560 | aria_load_state_8way(y0, y1, y2, y3, \ |
550 | 561 | y4, y5, y6, y7, \ |
|
556 | 567 | y0, y1, y2, y3, \ |
557 | 568 | y4, y5, y6, y7, \ |
558 | 569 | mem_tmp, rk, round) \ |
| 570 | + vpxor y7, y7, y7; \ |
559 | 571 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
560 | | - y0, rk, 8, round); \ |
| 572 | + y0, y7, y2, rk, 8, round); \ |
561 | 573 | \ |
562 | 574 | aria_sbox_8way_gfni(x2, x3, x0, x1, \ |
563 | 575 | x6, x7, x4, x5, \ |
|
574 | 586 | x4, x5, x6, x7, \ |
575 | 587 | mem_tmp, 0); \ |
576 | 588 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
577 | | - y0, rk, 0, round); \ |
| 589 | + y0, y7, y2, rk, 0, round); \ |
578 | 590 | \ |
579 | 591 | aria_sbox_8way_gfni(x2, x3, x0, x1, \ |
580 | 592 | x6, x7, x4, x5, \ |
|
614 | 626 | y0, y1, y2, y3, \ |
615 | 627 | y4, y5, y6, y7, \ |
616 | 628 | mem_tmp, rk, round) \ |
| 629 | + vpxor y7, y7, y7; \ |
617 | 630 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
618 | | - y0, rk, 8, round); \ |
| 631 | + y0, y7, y2, rk, 8, round); \ |
619 | 632 | \ |
620 | 633 | aria_sbox_8way_gfni(x0, x1, x2, x3, \ |
621 | 634 | x4, x5, x6, x7, \ |
|
632 | 645 | x4, x5, x6, x7, \ |
633 | 646 | mem_tmp, 0); \ |
634 | 647 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
635 | | - y0, rk, 0, round); \ |
| 648 | + y0, y7, y2, rk, 0, round); \ |
636 | 649 | \ |
637 | 650 | aria_sbox_8way_gfni(x0, x1, x2, x3, \ |
638 | 651 | x4, x5, x6, x7, \ |
|
672 | 685 | y0, y1, y2, y3, \ |
673 | 686 | y4, y5, y6, y7, \ |
674 | 687 | mem_tmp, rk, round, last_round) \ |
| 688 | + vpxor y7, y7, y7; \ |
675 | 689 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
676 | | - y0, rk, 8, round); \ |
| 690 | + y0, y7, y2, rk, 8, round); \ |
677 | 691 | \ |
678 | 692 | aria_sbox_8way_gfni(x2, x3, x0, x1, \ |
679 | 693 | x6, x7, x4, x5, \ |
680 | 694 | y0, y1, y2, y3, \ |
681 | 695 | y4, y5, y6, y7); \ |
682 | 696 | \ |
683 | 697 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
684 | | - y0, rk, 8, last_round); \ |
| 698 | + y0, y7, y2, rk, 8, last_round); \ |
685 | 699 | \ |
686 | 700 | aria_store_state_8way(x0, x1, x2, x3, \ |
687 | 701 | x4, x5, x6, x7, \ |
|
691 | 705 | x4, x5, x6, x7, \ |
692 | 706 | mem_tmp, 0); \ |
693 | 707 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
694 | | - y0, rk, 0, round); \ |
| 708 | + y0, y7, y2, rk, 0, round); \ |
695 | 709 | \ |
696 | 710 | aria_sbox_8way_gfni(x2, x3, x0, x1, \ |
697 | 711 | x6, x7, x4, x5, \ |
698 | 712 | y0, y1, y2, y3, \ |
699 | 713 | y4, y5, y6, y7); \ |
700 | 714 | \ |
701 | 715 | aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ |
702 | | - y0, rk, 0, last_round); \ |
| 716 | + y0, y7, y2, rk, 0, last_round); \ |
703 | 717 | \ |
704 | 718 | aria_load_state_8way(y0, y1, y2, y3, \ |
705 | 719 | y4, y5, y6, y7, \ |
|
772 | 786 | BV8(0, 1, 1, 1, 1, 1, 0, 0), |
773 | 787 | BV8(0, 0, 1, 1, 1, 1, 1, 0), |
774 | 788 | BV8(0, 0, 0, 1, 1, 1, 1, 1)) |
| 789 | + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), |
| 790 | + BV8(1, 1, 0, 0, 0, 1, 1, 1), |
| 791 | + BV8(1, 1, 1, 0, 0, 0, 1, 1), |
| 792 | + BV8(1, 1, 1, 1, 0, 0, 0, 1), |
| 793 | + BV8(1, 1, 1, 1, 1, 0, 0, 0), |
| 794 | + BV8(0, 1, 1, 1, 1, 1, 0, 0), |
| 795 | + BV8(0, 0, 1, 1, 1, 1, 1, 0), |
| 796 | + BV8(0, 0, 0, 1, 1, 1, 1, 1)) |
775 | 797 |
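
The duplicated .quad pairs here and below follow from vgf2p8affineqb's semantics: the instruction applies a separate 8x8 bit-matrix per 64-bit lane of its matrix operand, so both halves of the vmovdqa'd constant must hold the same matrix for all 16 bytes to be transformed alike. In intrinsics form (compile with -mgfni; the helper name is made up):

    #include <stdint.h>
    #include <immintrin.h>

    /* Apply one 8x8 GF(2) bit-matrix to all 16 bytes of x. matrix_dup
     * must contain two identical copies of the matrix. */
    static __m128i affine_all_bytes(__m128i x, const uint64_t matrix_dup[2])
    {
        __m128i A = _mm_load_si128((const __m128i *)matrix_dup);
        return _mm_gf2p8affine_epi64_epi8(x, A, 0 /* no affine constant */);
    }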
|
776 | 798 | /* AES inverse affine: */ |
777 | 799 | #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) |
|
784 | 806 | BV8(0, 0, 1, 0, 1, 0, 0, 1), |
785 | 807 | BV8(1, 0, 0, 1, 0, 1, 0, 0), |
786 | 808 | BV8(0, 1, 0, 0, 1, 0, 1, 0)) |
| 809 | + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), |
| 810 | + BV8(1, 0, 0, 1, 0, 0, 1, 0), |
| 811 | + BV8(0, 1, 0, 0, 1, 0, 0, 1), |
| 812 | + BV8(1, 0, 1, 0, 0, 1, 0, 0), |
| 813 | + BV8(0, 1, 0, 1, 0, 0, 1, 0), |
| 814 | + BV8(0, 0, 1, 0, 1, 0, 0, 1), |
| 815 | + BV8(1, 0, 0, 1, 0, 1, 0, 0), |
| 816 | + BV8(0, 1, 0, 0, 1, 0, 1, 0)) |
787 | 817 |
|
788 | 818 | /* S2: */ |
789 | 819 | #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) |
|
796 | 826 | BV8(1, 1, 0, 0, 1, 1, 1, 0), |
797 | 827 | BV8(0, 1, 1, 0, 0, 0, 1, 1), |
798 | 828 | BV8(1, 1, 1, 1, 0, 1, 1, 0)) |
| 829 | + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), |
| 830 | + BV8(0, 0, 1, 1, 1, 1, 1, 1), |
| 831 | + BV8(1, 1, 1, 0, 1, 1, 0, 1), |
| 832 | + BV8(1, 1, 0, 0, 0, 0, 1, 1), |
| 833 | + BV8(0, 1, 0, 0, 0, 0, 1, 1), |
| 834 | + BV8(1, 1, 0, 0, 1, 1, 1, 0), |
| 835 | + BV8(0, 1, 1, 0, 0, 0, 1, 1), |
| 836 | + BV8(1, 1, 1, 1, 0, 1, 1, 0)) |
799 | 837 |
|
800 | 838 | /* X2: */ |
801 | 839 | #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) |
|
808 | 846 | BV8(0, 1, 1, 0, 1, 0, 1, 1), |
809 | 847 | BV8(1, 0, 1, 1, 1, 1, 0, 1), |
810 | 848 | BV8(1, 0, 0, 1, 0, 0, 1, 1)) |
| 849 | + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), |
| 850 | + BV8(0, 0, 1, 0, 0, 1, 1, 0), |
| 851 | + BV8(0, 0, 0, 0, 1, 0, 1, 0), |
| 852 | + BV8(1, 1, 1, 0, 0, 0, 1, 1), |
| 853 | + BV8(1, 1, 1, 0, 1, 1, 0, 0), |
| 854 | + BV8(0, 1, 1, 0, 1, 0, 1, 1), |
| 855 | + BV8(1, 0, 1, 1, 1, 1, 0, 1), |
| 856 | + BV8(1, 0, 0, 1, 0, 0, 1, 1)) |
811 | 857 |
|
812 | 858 | /* Identity matrix: */ |
813 | 859 | .Ltf_id_bitmatrix: |
|
819 | 865 | BV8(0, 0, 0, 0, 0, 1, 0, 0), |
820 | 866 | BV8(0, 0, 0, 0, 0, 0, 1, 0), |
821 | 867 | BV8(0, 0, 0, 0, 0, 0, 0, 1)) |
| 868 | + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), |
| 869 | + BV8(0, 1, 0, 0, 0, 0, 0, 0), |
| 870 | + BV8(0, 0, 1, 0, 0, 0, 0, 0), |
| 871 | + BV8(0, 0, 0, 1, 0, 0, 0, 0), |
| 872 | + BV8(0, 0, 0, 0, 1, 0, 0, 0), |
| 873 | + BV8(0, 0, 0, 0, 0, 1, 0, 0), |
| 874 | + BV8(0, 0, 0, 0, 0, 0, 1, 0), |
| 875 | + BV8(0, 0, 0, 0, 0, 0, 0, 1)) |
822 | 876 | #endif /* CONFIG_AS_GFNI */ |
823 | 877 |
|
824 | 878 | /* 4-bit mask */ |
|