// NOTE(review): the enclosing function signature and the declaration of `result` are not visible in
// this excerpt. The visible code computes a 4x4 inverse: four cofactor vectors (minor0..minor3) are
// accumulated and then scaled by the reciprocal of a determinant. No test for a zero determinant is
// visible here, so a singular input would presumably yield inf/NaN lanes - TODO confirm with callers.
531#if defined(JPH_USE_SSE)
// Gather the four column registers into row vectors:
//   row0 = (c0.x, c1.x, c2.x, c3.x)    row1 = (c2.y, c3.y, c0.y, c1.y)
//   row2 = (c0.z, c1.z, c2.z, c3.z)    row3 = (c2.w, c3.w, c0.w, c1.w)
// row1 and row3 are deliberately left rotated by two lanes; the later shuffles rely on that layout.
537 __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
538 __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
539 __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
540 row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
541 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
542 __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
543 __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
544 row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
// In every stage below, _MM_SHUFFLE(2, 3, 0, 1) swaps the lanes inside each pair and
// _MM_SHUFFLE(1, 0, 3, 2) swaps the two 64-bit halves of the register.
// Stage row2 * row3: starts minor0 and minor1.
546 tmp1 = _mm_mul_ps(row2, row3);
547 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
548 __m128 minor0 = _mm_mul_ps(row1, tmp1);
549 __m128 minor1 = _mm_mul_ps(row0, tmp1);
550 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
551 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
552 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
553 minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row1 * row2: updates minor0 and starts minor3.
555 tmp1 = _mm_mul_ps(row1, row2);
556 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
557 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
558 __m128 minor3 = _mm_mul_ps(row0, tmp1);
559 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
560 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
561 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
562 minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
// Stage (half-swapped row1) * row3: updates minor0 and starts minor2.
// row2 is overwritten with its half-swapped form here; every later use of row2 sees the swapped value.
564 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
565 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
566 row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
567 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
568 __m128 minor2 = _mm_mul_ps(row0, tmp1);
569 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
570 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
571 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
572 minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row0 * row1: updates minor2 and minor3.
574 tmp1 = _mm_mul_ps(row0, row1);
575 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
576 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
577 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
578 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
579 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
580 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
// Stage row0 * row3: updates minor1 and minor2.
582 tmp1 = _mm_mul_ps(row0, row3);
583 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
584 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
585 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
586 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
587 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
588 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
// Stage row0 * row2: final updates to minor1 and minor3.
590 tmp1 = _mm_mul_ps(row0, row2);
591 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
592 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
593 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
594 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
595 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
596 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
// Determinant = horizontal sum of row0 * minor0. The two shuffle/add steps leave the full sum in
// lane 0; _mm_div_ss turns lane 0 into 1 / det, and the final shuffle broadcasts it to all lanes.
598 __m128 det = _mm_mul_ps(row0, minor0);
599 det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
600 det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
601 det = _mm_div_ss(_mm_set_ss(1.0f), det);
602 det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
// Each output column is the matching cofactor vector scaled by 1 / det.
605 result.mCol[0].
mValue = _mm_mul_ps(det, minor0);
606 result.mCol[1].
mValue = _mm_mul_ps(det, minor1);
607 result.mCol[2].
mValue = _mm_mul_ps(det, minor2);
608 result.mCol[3].
mValue = _mm_mul_ps(det, minor3);
610#elif defined(JPH_USE_NEON)
// NEON path: same statement sequence as the SSE path above, expressed with JPH_NEON_SHUFFLE_F32x4.
// NOTE(review): the macro is defined outside this excerpt; presumably indices 0-3 select lanes of the
// first operand and 4-7 lanes of the second, so (1, 0, 3, 2) swaps within pairs and (2, 3, 0, 1) swaps
// halves - TODO confirm against the macro definition.
612 Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
613 Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
614 Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
615 row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
616 tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
617 Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
618 Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
619 row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
// Stage row2 * row3: starts minor0 and minor1.
621 tmp1 = vmulq_f32(row2, row3);
622 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
623 Type minor0 = vmulq_f32(row1, tmp1);
624 Type minor1 = vmulq_f32(row0, tmp1);
625 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
626 minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
627 minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
628 minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
// Stage row1 * row2: updates minor0 and starts minor3.
630 tmp1 = vmulq_f32(row1, row2);
631 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
632 minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
633 Type minor3 = vmulq_f32(row0, tmp1);
634 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
635 minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
636 minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
637 minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
// Stage (shuffled row1) * row3: updates minor0 and starts minor2; row2 is overwritten with its
// shuffled form and stays that way for the remaining stages.
639 tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
640 tmp1 = vmulq_f32(tmp1, row3);
641 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
642 row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
643 minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
644 Type minor2 = vmulq_f32(row0, tmp1);
645 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
646 minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
647 minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
648 minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
// Stage row0 * row1: updates minor2 and minor3.
650 tmp1 = vmulq_f32(row0, row1);
651 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
652 minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
653 minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
654 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
655 minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
656 minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
// Stage row0 * row3: updates minor1 and minor2.
658 tmp1 = vmulq_f32(row0, row3);
659 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
660 minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
661 minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
662 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
663 minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
664 minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
// Stage row0 * row2: final updates to minor1 and minor3.
666 tmp1 = vmulq_f32(row0, row2);
667 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
668 minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
669 minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
670 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
671 minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
672 minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
// Determinant: vaddvq_f32 sums the four lanes of row0 * minor0, the sum is broadcast to all lanes,
// and vdivq_f32 produces 1 / det in every lane.
674 Type det = vmulq_f32(row0, minor0);
675 det = vdupq_n_f32(vaddvq_f32(det));
676 det = vdivq_f32(vdupq_n_f32(1.0f), det);
// Each output column is the matching cofactor vector scaled by 1 / det.
679 result.mCol[0].
mValue = vmulq_f32(det, minor0);
680 result.mCol[1].
mValue = vmulq_f32(det, minor1);
681 result.mCol[2].
mValue = vmulq_f32(det, minor2);
682 result.mCol[3].
mValue = vmulq_f32(det, minor3);
// NOTE(review): the lines between the NEON branch and this point are not visible in this excerpt -
// presumably an `#else` plus the definitions of the scalars m00..m33 used below; TODO confirm.
// Scalar path: first precompute the 2x2 sub-determinants that the cofactors share.
690 float m10211120 = m10 * m21 - m11 * m20;
691 float m10221220 = m10 * m22 - m12 * m20;
692 float m10231320 = m10 * m23 - m13 * m20;
693 float m10311130 = m10 * m31 - m11 * m30;
694 float m10321230 = m10 * m32 - m12 * m30;
695 float m10331330 = m10 * m33 - m13 * m30;
696 float m11221221 = m11 * m22 - m12 * m21;
697 float m11231321 = m11 * m23 - m13 * m21;
698 float m11321231 = m11 * m32 - m12 * m31;
699 float m11331331 = m11 * m33 - m13 * m31;
700 float m12231322 = m12 * m23 - m13 * m22;
701 float m12331332 = m12 * m33 - m13 * m32;
702 float m20312130 = m20 * m31 - m21 * m30;
703 float m20322230 = m20 * m32 - m22 * m30;
704 float m20332330 = m20 * m33 - m23 * m30;
705 float m21322231 = m21 * m32 - m22 * m31;
706 float m21332331 = m21 * m33 - m23 * m31;
707 float m22332332 = m22 * m33 - m23 * m32;
// Each component of col0..col3 is a signed three-term expansion over the 2x2 values above.
709 Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
710 Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
711 Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
712 Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
// Determinant = (m00, m01, m02, m03) dotted with col0; the result divides every column by it.
714 float det = m00 * col0.
mF32[0] + m01 * col0.
mF32[1] + m02 * col0.
mF32[2] + m03 * col0.
mF32[3];
716 return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
// NOTE(review): the enclosing function signature and the declaration of `result` are not visible in
// this excerpt. The visible code accumulates three cofactor vectors (minor0..minor2) with the input's
// 4th column replaced by the constant (0, 0, 0, 1), stores them unscaled (no determinant division
// appears in this block), and sets result column 3 to (0, 0, 0, 1). This looks like an adjugate of
// the upper-left 3x3 part - TODO confirm against the function name.
741#if defined(JPH_USE_SSE)
// Gather rows as in a full 4x4 transpose, but feed constants where mCol[3] would be read:
// zeros for its x/y lanes and _mm_set_ps(1, 0, 0, 0), i.e. lanes (0, 0, 0, 1), for its z/w lanes.
//   row0 = (c0.x, c1.x, c2.x, 0)    row1 = (c2.y, 0, c0.y, c1.y)
//   row2 = (c0.z, c1.z, c2.z, 0)    row3 = (c2.w, 1, c0.w, c1.w)
// Note that the w lanes of columns 0..2 are read as-is; they are not forced to zero here.
742 __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
743 __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
744 __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
745 row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
746 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
747 __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
748 __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
749 row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
// In every stage below, _MM_SHUFFLE(2, 3, 0, 1) swaps the lanes inside each pair and
// _MM_SHUFFLE(1, 0, 3, 2) swaps the two 64-bit halves of the register.
// Stage row2 * row3: starts minor0 and minor1.
751 tmp1 = _mm_mul_ps(row2, row3);
752 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
753 __m128 minor0 = _mm_mul_ps(row1, tmp1);
754 __m128 minor1 = _mm_mul_ps(row0, tmp1);
755 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
756 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
757 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
758 minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row1 * row2: updates minor0 only (no fourth cofactor vector is built in this block).
760 tmp1 = _mm_mul_ps(row1, row2);
761 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
762 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
763 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
764 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
// Stage (half-swapped row1) * row3: updates minor0 and starts minor2.
// row2 is overwritten with its half-swapped form here; every later use of row2 sees the swapped value.
766 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
767 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
768 row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
769 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
770 __m128 minor2 = _mm_mul_ps(row0, tmp1);
771 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
772 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
773 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
774 minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row0 * row1: updates minor2.
776 tmp1 = _mm_mul_ps(row0, row1);
777 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
778 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
779 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
780 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
// Stage row0 * row3: updates minor1 and minor2.
782 tmp1 = _mm_mul_ps(row0, row3);
783 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
784 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
785 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
786 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
787 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
788 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
// Stage row0 * row2: final update to minor1.
790 tmp1 = _mm_mul_ps(row0, row2);
791 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
792 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
793 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
794 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
// Store the unscaled cofactor vectors; the last column is the constant (0, 0, 0, 1).
797 result.mCol[0].
mValue = minor0;
798 result.mCol[1].
mValue = minor1;
799 result.mCol[2].
mValue = minor2;
800 result.mCol[3] =
Vec4(0, 0, 0, 1);
802#elif defined(JPH_USE_NEON)
// NEON path: same statement sequence as the SSE path above. v0001 holds lanes (0, 0, 0, 1) and is
// used both as the stand-in for mCol[3] during the row gather and as result column 3.
// NOTE(review): JPH_NEON_SHUFFLE_F32x4 is defined outside this excerpt; presumably indices 0-3 select
// lanes of the first operand and 4-7 lanes of the second - TODO confirm against the macro definition.
803 Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
804 Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
805 Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
806 Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
807 row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
808 tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
809 Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
810 Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
811 row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
// Stage row2 * row3: starts minor0 and minor1.
813 tmp1 = vmulq_f32(row2, row3);
814 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
815 Type minor0 = vmulq_f32(row1, tmp1);
816 Type minor1 = vmulq_f32(row0, tmp1);
817 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
818 minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
819 minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
820 minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
// Stage row1 * row2: updates minor0 only.
822 tmp1 = vmulq_f32(row1, row2);
823 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
824 minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
825 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
826 minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
// Stage (shuffled row1) * row3: updates minor0 and starts minor2; row2 is overwritten with its
// shuffled form and stays that way for the remaining stages.
828 tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
829 tmp1 = vmulq_f32(tmp1, row3);
830 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
831 row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
832 minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
833 Type minor2 = vmulq_f32(row0, tmp1);
834 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
835 minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
836 minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
837 minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
// Stage row0 * row1: updates minor2.
839 tmp1 = vmulq_f32(row0, row1);
840 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
841 minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
842 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
843 minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
// Stage row0 * row3: updates minor1 and minor2.
845 tmp1 = vmulq_f32(row0, row3);
846 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
847 minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
848 minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
849 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
850 minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
851 minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
// Stage row0 * row2: final update to minor1.
853 tmp1 = vmulq_f32(row0, row2);
854 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
855 minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
856 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
857 minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
// Store the unscaled cofactor vectors; the last column reuses the (0, 0, 0, 1) constant.
860 result.mCol[0].
mValue = minor0;
861 result.mCol[1].
mValue = minor1;
862 result.mCol[2].
mValue = minor2;
863 result.mCol[3].
mValue = v0001;
// NOTE(review): the enclosing function signature, the declaration of `result`, and whatever follows
// the NEON branch (`#else` / `#endif` / return) are not visible in this excerpt. The visible code
// accumulates three cofactor vectors (minor0..minor2) with the input's 4th column replaced by the
// constant (0, 0, 0, 1), scales them by the reciprocal of a determinant, and sets result column 3 to
// (0, 0, 0, 1). No test for a zero determinant is visible here.
892#if defined(JPH_USE_SSE)
// Gather rows as in a full 4x4 transpose, but feed constants where mCol[3] would be read:
// zeros for its x/y lanes and _mm_set_ps(1, 0, 0, 0), i.e. lanes (0, 0, 0, 1), for its z/w lanes.
//   row0 = (c0.x, c1.x, c2.x, 0)    row1 = (c2.y, 0, c0.y, c1.y)
//   row2 = (c0.z, c1.z, c2.z, 0)    row3 = (c2.w, 1, c0.w, c1.w)
// Note that the w lanes of columns 0..2 are read as-is; they are not forced to zero here.
893 __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
894 __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
895 __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
896 row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
897 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
898 __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
899 __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
900 row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
// In every stage below, _MM_SHUFFLE(2, 3, 0, 1) swaps the lanes inside each pair and
// _MM_SHUFFLE(1, 0, 3, 2) swaps the two 64-bit halves of the register.
// Stage row2 * row3: starts minor0 and minor1.
902 tmp1 = _mm_mul_ps(row2, row3);
903 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
904 __m128 minor0 = _mm_mul_ps(row1, tmp1);
905 __m128 minor1 = _mm_mul_ps(row0, tmp1);
906 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
907 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
908 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
909 minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row1 * row2: updates minor0 only (no fourth cofactor vector is built in this block).
911 tmp1 = _mm_mul_ps(row1, row2);
912 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
913 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
914 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
915 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
// Stage (half-swapped row1) * row3: updates minor0 and starts minor2.
// row2 is overwritten with its half-swapped form here; every later use of row2 sees the swapped value.
917 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
918 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
919 row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
920 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
921 __m128 minor2 = _mm_mul_ps(row0, tmp1);
922 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
923 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
924 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
925 minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
// Stage row0 * row1: updates minor2.
927 tmp1 = _mm_mul_ps(row0, row1);
928 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
929 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
930 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
931 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
// Stage row0 * row3: updates minor1 and minor2.
933 tmp1 = _mm_mul_ps(row0, row3);
934 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
935 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
936 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
937 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
938 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
939 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
// Stage row0 * row2: final update to minor1.
941 tmp1 = _mm_mul_ps(row0, row2);
942 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
943 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
944 tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
945 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
// Determinant = horizontal sum of row0 * minor0. The two shuffle/add steps leave the full sum in
// lane 0; _mm_div_ss turns lane 0 into 1 / det, and the final shuffle broadcasts it to all lanes.
947 __m128 det = _mm_mul_ps(row0, minor0);
948 det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
949 det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
950 det = _mm_div_ss(_mm_set_ss(1.0f), det);
951 det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
// Columns 0..2 are the cofactor vectors scaled by 1 / det; the last column is the constant (0, 0, 0, 1).
954 result.mCol[0].
mValue = _mm_mul_ps(det, minor0);
955 result.mCol[1].
mValue = _mm_mul_ps(det, minor1);
956 result.mCol[2].
mValue = _mm_mul_ps(det, minor2);
957 result.mCol[3] =
Vec4(0, 0, 0, 1);
959#elif defined(JPH_USE_NEON)
// NEON path: same statement sequence as the SSE path above. v0001 holds lanes (0, 0, 0, 1) and is
// used both as the stand-in for mCol[3] during the row gather and as result column 3.
// NOTE(review): JPH_NEON_SHUFFLE_F32x4 is defined outside this excerpt; presumably indices 0-3 select
// lanes of the first operand and 4-7 lanes of the second - TODO confirm against the macro definition.
960 Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
961 Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
962 Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
963 Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
964 row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
965 tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
966 Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
967 Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
968 row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
// Stage row2 * row3: starts minor0 and minor1.
970 tmp1 = vmulq_f32(row2, row3);
971 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
972 Type minor0 = vmulq_f32(row1, tmp1);
973 Type minor1 = vmulq_f32(row0, tmp1);
974 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
975 minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
976 minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
977 minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
// Stage row1 * row2: updates minor0 only.
979 tmp1 = vmulq_f32(row1, row2);
980 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
981 minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
982 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
983 minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
// Stage (shuffled row1) * row3: updates minor0 and starts minor2; row2 is overwritten with its
// shuffled form and stays that way for the remaining stages.
985 tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
986 tmp1 = vmulq_f32(tmp1, row3);
987 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
988 row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
989 minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
990 Type minor2 = vmulq_f32(row0, tmp1);
991 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
992 minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
993 minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
994 minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
// Stage row0 * row1: updates minor2.
996 tmp1 = vmulq_f32(row0, row1);
997 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
998 minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
999 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
1000 minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
// Stage row0 * row3: updates minor1 and minor2.
1002 tmp1 = vmulq_f32(row0, row3);
1003 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
1004 minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
1005 minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
1006 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
1007 minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
1008 minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
// Stage row0 * row2: final update to minor1.
1010 tmp1 = vmulq_f32(row0, row2);
1011 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
1012 minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
1013 tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
1014 minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
// Determinant: vaddvq_f32 sums the four lanes of row0 * minor0, the sum is broadcast to all lanes,
// and vdivq_f32 produces 1 / det in every lane.
1016 Type det = vmulq_f32(row0, minor0);
1017 det = vdupq_n_f32(vaddvq_f32(det));
1018 det = vdivq_f32(vdupq_n_f32(1.0f), det);
// Columns 0..2 are the cofactor vectors scaled by 1 / det; the last column reuses the (0, 0, 0, 1) constant.
1021 result.mCol[0].
mValue = vmulq_f32(det, minor0);
1022 result.mCol[1].
mValue = vmulq_f32(det, minor1);
1023 result.mCol[2].
mValue = vmulq_f32(det, minor2);
1024 result.mCol[3].
mValue = v0001;