/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for float array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */
private import util.cpuid;

alias float T;
/* This is so unit tests will test every CPU variant
 */
int cpuid;
const int CPUID_MAX = 5;
bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
alias util.cpuid.mmx mmx;
alias util.cpuid.sse sse;
alias util.cpuid.sse2 sse2;
alias util.cpuid.amd3dnow amd3dnow;
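// Under unittest, the cpuid counter above steps through 0 .. CPUID_MAX-1 so
// each pass exercises exactly one variant (base, MMX, SSE, SSE2, 3DNow!);
// otherwise the aliases resolve directly to the util.cpuid feature tests.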
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
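/* Illustrative example: given float[8] buf,
 *      disjoint(buf[0 .. 4], buf[4 .. 8])   // true: no element shared
 *      disjoint(buf[0 .. 4], buf[2 .. 6])   // false: both cover buf[2 .. 4]
 */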
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
    //printf("_arraySliceSliceAddSliceAssign_f()\n");
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
            version (log) printf("\tsse unaligned\n");
            auto n = aptr + (b.length & ~15);
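            // b.length & ~15 rounds the count down to a multiple of 16
            // floats: each pass of the SSE loop moves 64 bytes (four XMM
            // registers) per operand, and the scalar loop at the end of
            // the function picks up the remaining 0..15 elements.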
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
            version (log) printf("\tamd3dnow\n");
            auto n = aptr + (b.length & ~7);

                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
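                // Each 3DNow! pass handles 8 floats: four 8-byte MMX
                // registers (two packed floats each) combined with pfadd,
                // mirroring the pfsub/pfmul loops later in this module.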
    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
            auto n = aptr + (b.length & ~15);

                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
            auto n = aptr + (b.length & ~7);

                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - b[i]))
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceExpAddSliceAssign_f()\n");
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE version is 665% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;
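                // shufps with selector 0 copies lane 0 of XMM4 into all
                // four lanes, broadcasting the scalar value so one addps
                // adds it to four floats at a time.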
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);
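            // Splat: the 32-bit pattern of value is duplicated into both
            // halves of a 64-bit word, so a single movq fills an MMX
            // register with the pair (value, value) for packed arithmetic.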
                movq MM4, qword ptr [v];

    while (aptr < aend)
        *aptr++ = *bptr++ + value;
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_f(T[] a, T value)
    //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE version is 302% faster
        if (sse() && a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            n = cast(T*)((cast(uint)aend) & ~15);
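            // aptr is rounded up and aend rounded down to 16-byte
            // boundaries so the loop can use the aligned movaps forms;
            // the unaligned head and tail elements are handled outside
            // the vector loop.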
                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceSliceAddass_f()\n");
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceExpMinSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 622% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
    while (aptr < aend)
        *aptr++ = *bptr++ - value;
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 690% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;
                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
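                // Note the reversed subtraction (value - b[]): the
                // broadcast value in XMM4 is copied into XMM5/XMM6, the
                // loaded b[] values are subtracted from those copies, and
                // the stores come from XMM5/XMM6 (reused for both halves)
                // rather than from XMM0..XMM3.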
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                movq MM4, qword ptr [v];
    while (aptr < aend)
        *aptr++ = value - *bptr++;
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(6 - a[i]))
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_f(T[] a, T value)
    //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 304% faster
        if (sse() && a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceSliceMinass_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                pfsub MM2, [ECX+16];
                pfsub MM3, [ECX+24];
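                // pfsub subtracts two packed single-precision floats per
                // 8-byte MMX register; MM0..MM3 at offsets 0, 8, 16, 24
                // together cover the 32 bytes (8 floats) consumed per pass.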
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceExpMulSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 607% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [v];
    while (aptr < aend)
        *aptr++ = *bptr++ * value;
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
    //printf("_arraySliceSliceMulSliceAssign_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 833% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is only 13% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov EAX, dword ptr [bptr]; // left operand
                mov ECX, dword ptr [cptr]; // right operand

                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];
    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] * b[i]))
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_f(T[] a, T value)
    //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 303% faster
        if (sse() && a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceSliceMulass_f()\n");
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
        // SSE version is 525% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];

                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                pfmul MM2, [ECX+16];
                pfmul MM3, [ECX+24];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
    printf("_arrayExpSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] * 6))
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    //printf("_arraySliceExpDivSliceAssign_f()\n");
    auto aend = aptr + a.length;
    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;
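    // For example, with value = 10.0f the reciprocal 0.1f is not exactly
    // representable in binary floating point, so b * recip can differ from
    // b / 10 in the last unit of precision.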
    version (D_InlineAsm_X86)
        // SSE version is 587% faster
        if (sse() && a.length >= 16)
            auto n = aptr + (a.length & ~15);

                shufps XMM4, XMM4, 0;

                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];

                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;

        // 3DNow! version is 72% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                mov EAX, dword ptr [bptr];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], EAX;
    while (aptr < aend)
        *aptr++ = *bptr++ * recip;
    printf("_arraySliceExpDivSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] / 8))
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
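            // Note: the test divisor 8 is a power of two, so recip is
            // exact and multiplying by it matches true division bit for
            // bit.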
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_f(T[] a, T value)
    //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aend = aptr + a.length;
    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;
    version (D_InlineAsm_X86)
        // SSE version is 245% faster
        if (sse() && a.length >= 16)
            auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
            n = cast(T*)((cast(uint)aend) & ~15);

                shufps XMM4, XMM4, 0;

                movaps XMM1, [ESI+16];
                movaps XMM2, [ESI+32];
                movaps XMM3, [ESI+48];

                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM1;
                movaps [ESI+32-64], XMM2;
                movaps [ESI+48-64], XMM3;

        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
            auto n = aptr + (a.length & ~7);

            w[0] = w[1] = recip;
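            // w serves as a two-element staging buffer: both lanes receive
            // recip so the movq below loads the pair (recip, recip) for
            // the packed pfmul.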
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [w];

                mov dword ptr [aptr], ESI;
    printf("_arrayExpSliceDivass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] / 8))
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 */

T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAddass_f(a, -value, b);
}
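// The subtract-accumulate case simply negates value and forwards to the
// fused multiply-add routine below, so only the add-accumulate version
// needs its own loop.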
/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
    auto aend = aptr + a.length;

    while (aptr < aend)
        *aptr++ += *bptr++ * value;
    printf("_arraySliceExpMulSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);