/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for float array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */
private import util.cpuid;
private import core.stdc.stdio : printf;

/* This is so unit tests will test every CPU variant
 */
const int CPUID_MAX = 5;
bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
alias util.cpuid.mmx mmx;
alias util.cpuid.sse sse;
alias util.cpuid.sse2 sse2;
alias util.cpuid.amd3dnow amd3dnow;
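
/* Note (added for clarity): in unittest builds, the wrappers above gate
 * each feature on the global `cpuid` selector, so the test loops below
 * (for cpuid = 0 .. CPUID_MAX) exercise every code path (generic, MMX,
 * SSE, SSE2, 3DNow!) on a single machine. Release builds use the plain
 * aliases, i.e. real CPU detection.
 */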
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
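
/* A minimal sketch (added, not part of the original file) of the overlap
 * rule the assertions below rely on: two slices are disjoint exactly when
 * one ends at or before the start of the other.
 */
unittest
{
    float[8] buf;
    assert( disjoint(buf[0 .. 4], buf[4 .. 8])); // adjacent, no overlap
    assert(!disjoint(buf[0 .. 5], buf[4 .. 8])); // element 4 is shared
}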
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)

    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));

    //printf("_arraySliceSliceAddSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 834% faster
        if (sse() && b.length >= 16)

            version (log) printf("\tsse unaligned\n");
            auto n = aptr + (b.length & ~15);
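            // (added note) n rounds the length down to a multiple of 16
            // elements: the SSE loop consumes 16 floats (64 bytes) per
            // iteration; the remainder is left for the scalar tail below.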
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
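                // (added note) the -64 displacements work because ESI is
                // advanced by 64 bytes (one full iteration) before the
                // adds and stores, which shortens the loop-carried
                // dependency on the pointer increment.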
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)

            version (log) printf("\tamd3dnow\n");
            auto n = aptr + (b.length & ~7);
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
    version (log) if (aptr < aend) printf("\tbase\n");
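    // Scalar tail: finish the elements (fewer than one full SIMD block)
    // that the vector loops above did not cover.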
        *aptr++ = *bptr++ + *cptr++;
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
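
    /* Illustrative sketch (added): this routine is the runtime entry the
     * compiler calls when lowering a float array op, e.g.
     *
     *     float[] x = [1, 2, 3], y = [4, 5, 6];
     *     auto z = new float[3];
     *     z[] = x[] + y[];   // lowered to _arraySliceSliceAddSliceAssign_f
     *     assert(z == [5.0f, 7.0f, 9.0f]);
     */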
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)

    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)

        // SSE version is 834% faster
        if (sse() && b.length >= 16)

            auto n = aptr + (b.length & ~15);

                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)

            auto n = aptr + (b.length & ~7);

                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
        *aptr++ = *bptr++ - *cptr++;
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] - b[i]))
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpAddSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 665% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;
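                // (added note) shuffle-with-self with selector 0
                // replicates lane 0 of XMM4 into all four lanes, i.e. it
                // broadcasts the scalar `value` across the register.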
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);
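            // (added note) v holds two copies of value's 32-bit pattern
            // side by side, so the movq below broadcasts the scalar into
            // both single-precision lanes of the 64-bit MMX register.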
                movq MM4, qword ptr [v];

        *aptr++ = *bptr++ + value;
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_f(T[] a, T value)

    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 302% faster
        if (sse() && a.length >= 16)

            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);

            n = cast(T*)((cast(uint)aend) & ~15);
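            // (added note) this variant uses aligned movaps, so the first
            // line rounds aptr up to the next 16-byte boundary (leading
            // elements are handled in scalar code) and the second rounds
            // the end down, leaving only aligned, full 64-byte blocks.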
                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_f(T[] a, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceSliceAddass_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 468% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
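                // (added note) ESI/ECX leave the loop pointing one past
                // the last element processed; writing them back into aptr
                // and bptr lets the scalar tail resume exactly where the
                // SIMD loop stopped.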
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpMinSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 622% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
        *aptr++ = *bptr++ - value;
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 690% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;

                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
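                // (added note) unlike the other variants, the minuend
                // here is the broadcast scalar, and subps destroys its
                // left operand; XMM5/XMM6 are therefore refreshed from
                // XMM4 between the two store pairs and reused for both
                // halves of the 64-byte block.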
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                movq MM4, qword ptr [v];

        *aptr++ = value - *bptr++;
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(6 - a[i]))
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_f(T[] a, T value)

    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 304% faster
        if (sse() && a.length >= 16)

            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);

            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_f(T[] a, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceSliceMinass_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 468% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpMulSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 607% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [v];

        *aptr++ = *bptr++ * value;
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)

    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));

    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 833% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is only 13% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov EAX, dword ptr [bptr]; // left operand
                mov ECX, dword ptr [cptr]; // right operand

                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];

        *aptr++ = *bptr++ * *cptr++;
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] * b[i]))
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_f(T[] a, T value)

    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 303% faster
        if (sse() && a.length >= 16)

            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);

            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_f(T[] a, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceSliceMulass_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)

        // SSE version is 525% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
    printf("_arraySliceSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpDivSliceAssign_f()\n");
    auto aend = aptr + a.length;
    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;
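    /* Illustrative sketch (added): the reciprocal trick rounds twice
     * (once forming 1/value, once multiplying), so a result can differ
     * from true division by an ulp. The unittest below divides by 8;
     * since 1/8 is exactly representable, its equality check still holds:
     *
     *     float v = 8.0f;
     *     assert(10.0f * (1.0f / v) == 10.0f / v); // exact: power of two
     *     // for v = 3.0f the two sides may differ in the last bit
     */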
    version (D_InlineAsm_X86)

        // SSE version is 587% faster
        if (sse() && a.length >= 16)

            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 72% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;

        *aptr++ = *bptr++ * recip;
    printf("_arraySliceExpDivSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] / 8))
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_f(T[] a, T value)

    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;
    version (D_InlineAsm_X86)

        // SSE version is 245% faster
        if (sse() && a.length >= 16)

            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);

            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)

            auto n = aptr + (a.length & ~7);

            w[0] = w[1] = recip;

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceDivass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                if (c[i] != cast(T)(a[i] / 8))
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 */

T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAddass_f(a, -value, b);
}
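
// (added note) subtraction reuses the multiply-add routine with the
// scalar negated: a[] -= b[] * v is the same as a[] += b[] * (-v).
// This is exact in IEEE arithmetic, since negation only flips the sign
// bit and rounding is symmetric under sign change.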
/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)

    assert(a.length == b.length);
    assert(disjoint(a, b));

    auto aend = aptr + a.length;

        *aptr++ += *bptr++ * value;
    printf("_arraySliceExpMulSliceAddass_f unittest\n");

        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)

            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // j is always 0 here, so only the aligned case is tested
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)

                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)

                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);