1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for double array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
10 private import util.cpuid;
14 /* This is so unit tests will test every CPU variant
// Unit-test shims: each predicate below reports a CPU feature only when the
// test-selected `cpuid` index matches that variant AND util.cpuid confirms the
// hardware supports it. With cpuid == 0 all four predicates return false.
// NOTE(review): the version(...) / else lines that select between these shims
// and the aliases further down, and the declaration of `cpuid`, are not
// visible in this excerpt — confirm against the full file.
17 const int CPUID_MAX = 5;
18 bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
19 bool sse() { return cpuid == 2 && util.cpuid.sse(); }
20 bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
21 bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
// Plain forwarding: the same four names resolve directly to util.cpuid.
25 alias util.cpuid.mmx mmx;
26 alias util.cpuid.sse sse;
27 alias util.cpuid.sse2 sse2;
28 alias util.cpuid.amd3dnow amd3dnow;
// Returns true when slices a and b do not overlap in memory: either a ends at
// or before the start of b, or b ends at or before the start of a.
33 bool disjoint(T)(T[] a, T[] b)
35 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
38 /* Performance figures measured by Burton Radons
45 /* ======================================================================== */
47 /***********************
// a[] = b[] + c[], element-wise (scalar form: *aptr++ = *bptr++ + *cptr++).
52 T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
// Contract: all three slices have equal length and no two of them overlap.
55 assert(a.length == b.length && b.length == c.length);
56 assert(disjoint(a, b));
57 assert(disjoint(a, c));
58 assert(disjoint(b, c));
63 auto aend = aptr + a.length;
67 version (D_InlineAsm_X86)
69 // SSE2 version is 333% faster
// The loop below moves four 16-byte XMM registers per operand per pass, i.e.
// 8 doubles, so 8 elements are enough to enter the SIMD path and the bound is
// rounded down to a multiple of 8. (The former 16 / ~15 was stricter than the
// loop requires and pushed up to 15 trailing elements to the scalar code.)
70 if (sse2() && b.length >= 8)
72 auto n = aptr + (b.length & ~7);
77 mov EAX, bptr; // left operand
78 mov ECX, cptr; // right operand
79 mov ESI, aptr; // destination operand
80 mov EDI, n; // end comparison
85 movupd XMM1, [EAX+16];
86 movupd XMM2, [EAX+32];
87 movupd XMM3, [EAX+48];
90 movupd XMM5, [ECX+16];
91 movupd XMM6, [ECX+32];
92 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
99 movupd [ESI+ 0-64], XMM0;
100 movupd [ESI+16-64], XMM1;
101 movupd [ESI+32-64], XMM2;
102 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
115 *aptr++ = *bptr++ + *cptr++;
123 printf("_arraySliceSliceAddSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
124 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
126 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned destination; j == 1: destination misaligned by one element.
128 for (int j = 0; j < 2; j++)
131 T[] a = new T[dim + j]; // aligned on 16 byte boundary
132 a = a[j .. dim + j]; // misalign for second iteration
133 T[] b = new T[dim + j];
135 T[] c = new T[dim + j];
138 for (int i = 0; i < dim; i++)
140 b[i] = cast(T)(i + 7);
141 c[i] = cast(T)(i * 2);
// Verify every element of the result against the scalar expression.
146 for (int i = 0; i < dim; i++)
148 if (c[i] != cast(T)(a[i] + b[i]))
150 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
158 /* ======================================================================== */
160 /***********************
// a[] = b[] - c[], element-wise (scalar form: *aptr++ = *bptr++ - *cptr++).
165 T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
// Contract: all three slices have equal length and no two of them overlap.
168 assert(a.length == b.length && b.length == c.length);
169 assert(disjoint(a, b));
170 assert(disjoint(a, c));
171 assert(disjoint(b, c));
176 auto aend = aptr + a.length;
180 version (D_InlineAsm_X86)
182 // SSE2 version is 324% faster
183 if (sse2() && b.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
185 auto n = aptr + (b.length & ~7);
190 mov EAX, bptr; // left operand
191 mov ECX, cptr; // right operand
192 mov ESI, aptr; // destination operand
193 mov EDI, n; // end comparison
198 movupd XMM1, [EAX+16];
199 movupd XMM2, [EAX+32];
200 movupd XMM3, [EAX+48];
203 movupd XMM5, [ECX+16];
204 movupd XMM6, [ECX+32];
205 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
212 movupd [ESI+ 0-64], XMM0;
213 movupd [ESI+16-64], XMM1;
214 movupd [ESI+32-64], XMM2;
215 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
228 *aptr++ = *bptr++ - *cptr++;
236 printf("_arraySliceSliceMinSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
237 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
239 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned destination; j == 1: destination misaligned by one element.
241 for (int j = 0; j < 2; j++)
244 T[] a = new T[dim + j]; // aligned on 16 byte boundary
245 a = a[j .. dim + j]; // misalign for second iteration
246 T[] b = new T[dim + j];
248 T[] c = new T[dim + j];
251 for (int i = 0; i < dim; i++)
253 b[i] = cast(T)(i + 7);
254 c[i] = cast(T)(i * 2);
// Verify every element of the result against the scalar expression.
259 for (int i = 0; i < dim; i++)
261 if (c[i] != cast(T)(a[i] - b[i]))
263 printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
272 /* ======================================================================== */
274 /***********************
// a[] = b[] + value, element-wise (scalar form: *aptr++ = *bptr++ + value).
279 T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
// Contract: a and b have equal length and do not overlap.
282 assert(a.length == b.length);
283 assert(disjoint(a, b));
287 //printf("_arraySliceExpAddSliceAssign_d()\n");
289 auto aend = aptr + a.length;
292 version (D_InlineAsm_X86)
294 // SSE2 version is 305% faster
295 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
297 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
306 shufpd XMM4, XMM4, 0;
312 movupd XMM1, [EAX+16];
313 movupd XMM2, [EAX+32];
314 movupd XMM3, [EAX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
320 movupd [ESI+ 0-64], XMM0;
321 movupd [ESI+16-64], XMM1;
322 movupd [ESI+32-64], XMM2;
323 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
334 *aptr++ = *bptr++ + value;
341 printf("_arraySliceExpAddSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
342 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
344 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned destination; j == 1: destination misaligned by one element.
346 for (int j = 0; j < 2; j++)
349 T[] a = new T[dim + j]; // aligned on 16 byte boundary
350 a = a[j .. dim + j]; // misalign for second iteration
351 T[] b = new T[dim + j];
353 T[] c = new T[dim + j];
356 for (int i = 0; i < dim; i++)
358 b[i] = cast(T)(i + 7);
359 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
364 for (int i = 0; i < dim; i++)
366 if (c[i] != cast(T)(a[i] + 6))
368 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
376 /* ======================================================================== */
378 /***********************
// In-place scalar add on a double slice (presumably a[] += value, per the name
// and the unittest's `a[i] + 6` check; the scalar statement is not visible here).
383 T[] _arrayExpSliceAddass_d(T[] a, T value)
385 //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
387 auto aend = aptr + a.length;
389 version (D_InlineAsm_X86)
391 // SSE2 version is 114% faster
392 if (sse2() && a.length >= 8)
// Bound the SIMD loop to a whole number of 8-element (64-byte) groups.
// The former cast(T*)((cast(uint)aend) & ~7) only rounded the end *address*
// down to 8 bytes, so the bound was not a multiple of the 64-byte stride and
// the loop could read and write past the end of a[].
394 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
403 shufpd XMM4, XMM4, 0;
408 movupd XMM1, [ESI+16];
409 movupd XMM2, [ESI+32];
410 movupd XMM3, [ESI+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
416 movupd [ESI+ 0-64], XMM0;
417 movupd [ESI+16-64], XMM1;
418 movupd [ESI+32-64], XMM2;
419 movupd [ESI+48-64], XMM3;
436 printf("_arrayExpSliceAddass_d unittest\n");
// Repeat the test for every simulated CPU variant.
437 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
439 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slice; j == 1: slice misaligned by one element.
441 for (int j = 0; j < 2; j++)
444 T[] a = new T[dim + j]; // aligned on 16 byte boundary
445 a = a[j .. dim + j]; // misalign for second iteration
446 T[] b = new T[dim + j];
448 T[] c = new T[dim + j];
451 for (int i = 0; i < dim; i++)
453 b[i] = cast(T)(i + 7);
454 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
460 for (int i = 0; i < dim; i++)
462 if (c[i] != cast(T)(a[i] + 6))
464 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
472 /* ======================================================================== */
474 /***********************
// In-place slice add (presumably a[] += b[], per the name and the unittest's
// `a[i] + b[i]` check; the scalar statement is not visible here).
479 T[] _arraySliceSliceAddass_d(T[] a, T[] b)
// Contract: a and b have equal length and do not overlap.
482 assert (a.length == b.length);
483 assert (disjoint(a, b));
487 //printf("_arraySliceSliceAddass_d()\n");
489 auto aend = aptr + a.length;
492 version (D_InlineAsm_X86)
494 // SSE2 version is 183% faster
495 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
497 auto n = aptr + (a.length & ~7);
502 mov ECX, bptr; // right operand
503 mov ESI, aptr; // destination operand
504 mov EDI, n; // end comparison
// The left operand is read from the destination itself (ESI).
509 movupd XMM1, [ESI+16];
510 movupd XMM2, [ESI+32];
511 movupd XMM3, [ESI+48];
514 movupd XMM5, [ECX+16];
515 movupd XMM6, [ECX+32];
516 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
522 movupd [ESI+ 0-64], XMM0;
523 movupd [ESI+16-64], XMM1;
524 movupd [ESI+32-64], XMM2;
525 movupd [ESI+48-64], XMM3;
543 printf("_arraySliceSliceAddass_d unittest\n");
// Repeat the test for every simulated CPU variant.
544 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
546 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slice; j == 1: slice misaligned by one element.
548 for (int j = 0; j < 2; j++)
551 T[] a = new T[dim + j]; // aligned on 16 byte boundary
552 a = a[j .. dim + j]; // misalign for second iteration
553 T[] b = new T[dim + j];
555 T[] c = new T[dim + j];
558 for (int i = 0; i < dim; i++)
560 b[i] = cast(T)(i + 7);
561 c[i] = cast(T)(i * 2);
// Verify every element of the result against the scalar expression.
567 for (int i = 0; i < dim; i++)
569 if (c[i] != cast(T)(a[i] + b[i]))
571 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
579 /* ======================================================================== */
581 /***********************
// a[] = b[] - value, element-wise (scalar form: *aptr++ = *bptr++ - value).
586 T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
// Contract: a and b have equal length and do not overlap.
589 assert (a.length == b.length);
590 assert (disjoint(a, b));
594 //printf("_arraySliceExpMinSliceAssign_d()\n");
596 auto aend = aptr + a.length;
599 version (D_InlineAsm_X86)
601 // SSE2 version is 305% faster
602 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
604 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
613 shufpd XMM4, XMM4, 0;
619 movupd XMM1, [EAX+16];
620 movupd XMM2, [EAX+32];
621 movupd XMM3, [EAX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
627 movupd [ESI+ 0-64], XMM0;
628 movupd [ESI+16-64], XMM1;
629 movupd [ESI+32-64], XMM2;
630 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
641 *aptr++ = *bptr++ - value;
648 printf("_arraySliceExpMinSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
649 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
651 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned destination; j == 1: destination misaligned by one element.
653 for (int j = 0; j < 2; j++)
656 T[] a = new T[dim + j]; // aligned on 16 byte boundary
657 a = a[j .. dim + j]; // misalign for second iteration
658 T[] b = new T[dim + j];
660 T[] c = new T[dim + j];
663 for (int i = 0; i < dim; i++)
665 b[i] = cast(T)(i + 7);
666 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
671 for (int i = 0; i < dim; i++)
673 if (c[i] != cast(T)(a[i] - 6))
675 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
683 /* ======================================================================== */
685 /***********************
// a[] = value - b[], element-wise (scalar form: *aptr++ = value - *bptr++).
// Note the operand order: the scalar is the left-hand side of the subtraction.
690 T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
// Contract: a and b have equal length and do not overlap.
693 assert (a.length == b.length);
694 assert (disjoint(a, b));
698 //printf("_arrayExpSliceMinSliceAssign_d()\n");
700 auto aend = aptr + a.length;
703 version (D_InlineAsm_X86)
705 // SSE2 version is 66% faster
706 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
708 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
717 shufpd XMM4, XMM4, 0;
725 movupd XMM1, [EAX+16];
726 movupd XMM2, [EAX+32];
727 movupd XMM3, [EAX+48];
// Results are written from XMM5/XMM6 in two pairs, so those registers are
// reused for the second half of each 64-byte group (presumably they hold
// copies of the broadcast value minus the loaded operands — confirm).
731 movupd [ESI+ 0-64], XMM5;
732 movupd [ESI+16-64], XMM6;
737 movupd [ESI+32-64], XMM5;
738 movupd [ESI+48-64], XMM6;
// Scalar form of the operation (presumably the tail loop up to aend).
749 *aptr++ = value - *bptr++;
756 printf("_arrayExpSliceMinSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
757 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
759 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned destination; j == 1: destination misaligned by one element.
761 for (int j = 0; j < 2; j++)
764 T[] a = new T[dim + j]; // aligned on 16 byte boundary
765 a = a[j .. dim + j]; // misalign for second iteration
766 T[] b = new T[dim + j];
768 T[] c = new T[dim + j];
771 for (int i = 0; i < dim; i++)
773 b[i] = cast(T)(i + 7);
774 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
779 for (int i = 0; i < dim; i++)
781 if (c[i] != cast(T)(6 - a[i]))
783 printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
791 /* ======================================================================== */
793 /***********************
// In-place scalar subtract on a double slice (presumably a[] -= value, per the
// name and the unittest's `a[i] - 6` check; the scalar statement is not
// visible here).
798 T[] _arrayExpSliceMinass_d(T[] a, T value)
800 //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
802 auto aend = aptr + a.length;
804 version (D_InlineAsm_X86)
806 // SSE2 version is 115% faster
807 if (sse2() && a.length >= 8)
// Bound the SIMD loop to a whole number of 8-element (64-byte) groups.
// The former cast(T*)((cast(uint)aend) & ~7) only rounded the end *address*
// down to 8 bytes, so the bound was not a multiple of the 64-byte stride and
// the loop could read and write past the end of a[].
809 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
818 shufpd XMM4, XMM4, 0;
823 movupd XMM1, [ESI+16];
824 movupd XMM2, [ESI+32];
825 movupd XMM3, [ESI+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
831 movupd [ESI+ 0-64], XMM0;
832 movupd [ESI+16-64], XMM1;
833 movupd [ESI+32-64], XMM2;
834 movupd [ESI+48-64], XMM3;
851 printf("_arrayExpSliceMinass_d unittest\n");
// Repeat the test for every simulated CPU variant.
852 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
854 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slice; j == 1: slice misaligned by one element.
856 for (int j = 0; j < 2; j++)
859 T[] a = new T[dim + j]; // aligned on 16 byte boundary
860 a = a[j .. dim + j]; // misalign for second iteration
861 T[] b = new T[dim + j];
863 T[] c = new T[dim + j];
866 for (int i = 0; i < dim; i++)
868 b[i] = cast(T)(i + 7);
869 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
875 for (int i = 0; i < dim; i++)
877 if (c[i] != cast(T)(a[i] - 6))
879 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
887 /* ======================================================================== */
889 /***********************
// In-place slice subtract (presumably a[] -= b[], per the name; the scalar
// statement is not visible here).
894 T[] _arraySliceSliceMinass_d(T[] a, T[] b)
// Contract: a and b have equal length and do not overlap.
897 assert (a.length == b.length);
898 assert (disjoint(a, b));
902 //printf("_arraySliceSliceMinass_d()\n");
904 auto aend = aptr + a.length;
907 version (D_InlineAsm_X86)
909 // SSE2 version is 183% faster
910 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
912 auto n = aptr + (a.length & ~7);
917 mov ECX, bptr; // right operand
918 mov ESI, aptr; // destination operand
919 mov EDI, n; // end comparison
// The left operand is read from the destination itself (ESI).
924 movupd XMM1, [ESI+16];
925 movupd XMM2, [ESI+32];
926 movupd XMM3, [ESI+48];
929 movupd XMM5, [ECX+16];
930 movupd XMM6, [ECX+32];
931 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
937 movupd [ESI+ 0-64], XMM0;
938 movupd [ESI+16-64], XMM1;
939 movupd [ESI+32-64], XMM2;
940 movupd [ESI+48-64], XMM3;
// Banner now names this function; it previously printed
// "_arrayExpSliceMinass_d", which misattributed the test output.
958 printf("_arraySliceSliceMinass_d unittest\n");
// Repeat the test for every simulated CPU variant.
959 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
961 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slice; j == 1: slice misaligned by one element.
963 for (int j = 0; j < 2; j++)
966 T[] a = new T[dim + j]; // aligned on 16 byte boundary
967 a = a[j .. dim + j]; // misalign for second iteration
968 T[] b = new T[dim + j];
970 T[] c = new T[dim + j];
973 for (int i = 0; i < dim; i++)
975 b[i] = cast(T)(i + 7);
976 c[i] = cast(T)(i * 2);
// NOTE(review): the check below compares against `a[i] - 6`, a scalar
// operand, not `b[i]`. The statement that performs the operation is not
// visible here — confirm this test really exercises the slice-slice form.
982 for (int i = 0; i < dim; i++)
984 if (c[i] != cast(T)(a[i] - 6))
986 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
994 /* ======================================================================== */
996 /***********************
// a[] = b[] * value, element-wise (scalar form: *aptr++ = *bptr++ * value).
1001 T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
// Contract: a and b have equal length and do not overlap.
1004 assert(a.length == b.length);
1005 assert(disjoint(a, b));
1009 //printf("_arraySliceExpMulSliceAssign_d()\n");
1011 auto aend = aptr + a.length;
1014 version (D_InlineAsm_X86)
1016 // SSE2 version is 304% faster
1017 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
1019 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
1028 shufpd XMM4, XMM4, 0;
1034 movupd XMM1, [EAX+16];
1035 movupd XMM2, [EAX+32];
1036 movupd XMM3, [EAX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1042 movupd [ESI+ 0-64], XMM0;
1043 movupd [ESI+16-64], XMM1;
1044 movupd [ESI+32-64], XMM2;
1045 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
1056 *aptr++ = *bptr++ * value;
1063 printf("_arraySliceExpMulSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
1064 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1066 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1068 for (int j = 0; j < 2; j++)
1071 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1072 a = a[j .. dim + j]; // misalign for second iteration
1073 T[] b = new T[dim + j];
1074 b = b[j .. dim + j];
1075 T[] c = new T[dim + j];
1076 c = c[j .. dim + j];
1078 for (int i = 0; i < dim; i++)
1080 b[i] = cast(T)(i + 7);
1081 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
1086 for (int i = 0; i < dim; i++)
1088 if (c[i] != cast(T)(a[i] * 6))
1090 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1098 /* ======================================================================== */
1100 /***********************
// a[] = b[] * c[], element-wise (scalar form: *aptr++ = *bptr++ * *cptr++).
1105 T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
// Contract: all three slices have equal length and no two of them overlap.
1108 assert(a.length == b.length && b.length == c.length);
1109 assert(disjoint(a, b));
1110 assert(disjoint(a, c));
1111 assert(disjoint(b, c));
1115 //printf("_arraySliceSliceMulSliceAssign_d()\n");
1117 auto aend = aptr + a.length;
1121 version (D_InlineAsm_X86)
1123 // SSE2 version is 329% faster
1124 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
1126 auto n = aptr + (a.length & ~7);
1131 mov EAX, bptr; // left operand
1132 mov ECX, cptr; // right operand
1133 mov ESI, aptr; // destination operand
1134 mov EDI, n; // end comparison
1139 movupd XMM1, [EAX+16];
1140 movupd XMM2, [EAX+32];
1141 movupd XMM3, [EAX+48];
1144 movupd XMM5, [ECX+16];
1145 movupd XMM6, [ECX+32];
1146 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1153 movupd [ESI+ 0-64], XMM0;
1154 movupd [ESI+16-64], XMM1;
1155 movupd [ESI+32-64], XMM2;
1156 movupd [ESI+48-64], XMM3;
// Scalar form of the operation (presumably the tail loop up to aend).
1168 *aptr++ = *bptr++ * *cptr++;
1175 printf("_arraySliceSliceMulSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
1176 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1178 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1180 for (int j = 0; j < 2; j++)
1183 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1184 a = a[j .. dim + j]; // misalign for second iteration
1185 T[] b = new T[dim + j];
1186 b = b[j .. dim + j];
1187 T[] c = new T[dim + j];
1188 c = c[j .. dim + j];
1190 for (int i = 0; i < dim; i++)
1192 b[i] = cast(T)(i + 7);
1193 c[i] = cast(T)(i * 2);
// Verify every element of the result against the scalar expression.
1198 for (int i = 0; i < dim; i++)
1200 if (c[i] != cast(T)(a[i] * b[i]))
1202 printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
1210 /* ======================================================================== */
1212 /***********************
// In-place scalar multiply on a double slice (presumably a[] *= value, per the
// name and the unittest's `a[i] * 6` check; the scalar statement is not
// visible here).
1217 T[] _arrayExpSliceMulass_d(T[] a, T value)
1219 //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1221 auto aend = aptr + a.length;
1223 version (D_InlineAsm_X86)
1225 // SSE2 version is 109% faster
1226 if (sse2() && a.length >= 8)
// Bound the SIMD loop to a whole number of 8-element (64-byte) groups.
// The former cast(T*)((cast(uint)aend) & ~7) only rounded the end *address*
// down to 8 bytes, so the bound was not a multiple of the 64-byte stride and
// the loop could read and write past the end of a[].
1228 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (presumably `value`, loaded on
// a line not visible here).
1237 shufpd XMM4, XMM4, 0;
1242 movupd XMM1, [ESI+16];
1243 movupd XMM2, [ESI+32];
1244 movupd XMM3, [ESI+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1250 movupd [ESI+ 0-64], XMM0;
1251 movupd [ESI+16-64], XMM1;
1252 movupd [ESI+32-64], XMM2;
1253 movupd [ESI+48-64], XMM3;
1270 printf("_arrayExpSliceMulass_d unittest\n");
// Repeat the test for every simulated CPU variant.
1271 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1273 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1275 for (int j = 0; j < 2; j++)
1278 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1279 a = a[j .. dim + j]; // misalign for second iteration
1280 T[] b = new T[dim + j];
1281 b = b[j .. dim + j];
1282 T[] c = new T[dim + j];
1283 c = c[j .. dim + j];
1285 for (int i = 0; i < dim; i++)
1287 b[i] = cast(T)(i + 7);
1288 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test uses 6 as the scalar operand.
1294 for (int i = 0; i < dim; i++)
1296 if (c[i] != cast(T)(a[i] * 6))
1298 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1306 /* ======================================================================== */
1308 /***********************
// In-place slice multiply (presumably a[] *= b[], per the name; the scalar
// statement is not visible here).
1313 T[] _arraySliceSliceMulass_d(T[] a, T[] b)
// Contract: a and b have equal length and do not overlap.
1316 assert (a.length == b.length);
1317 assert (disjoint(a, b));
1321 //printf("_arraySliceSliceMulass_d()\n");
1323 auto aend = aptr + a.length;
1326 version (D_InlineAsm_X86)
1328 // SSE2 version is 205% faster
1329 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
1331 auto n = aptr + (a.length & ~7);
1336 mov ECX, bptr; // right operand
1337 mov ESI, aptr; // destination operand
1338 mov EDI, n; // end comparison
// The left operand is read from the destination itself (ESI).
1343 movupd XMM1, [ESI+16];
1344 movupd XMM2, [ESI+32];
1345 movupd XMM3, [ESI+48];
1348 movupd XMM5, [ECX+16];
1349 movupd XMM6, [ECX+32];
1350 movupd XMM7, [ECX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1356 movupd [ESI+ 0-64], XMM0;
1357 movupd [ESI+16-64], XMM1;
1358 movupd [ESI+32-64], XMM2;
1359 movupd [ESI+48-64], XMM3;
// Banner now names this function; it previously printed
// "_arrayExpSliceMulass_d", which misattributed the test output.
1377 printf("_arraySliceSliceMulass_d unittest\n");
// Repeat the test for every simulated CPU variant.
1378 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1380 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1382 for (int j = 0; j < 2; j++)
1385 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1386 a = a[j .. dim + j]; // misalign for second iteration
1387 T[] b = new T[dim + j];
1388 b = b[j .. dim + j];
1389 T[] c = new T[dim + j];
1390 c = c[j .. dim + j];
1392 for (int i = 0; i < dim; i++)
1394 b[i] = cast(T)(i + 7);
1395 c[i] = cast(T)(i * 2);
// NOTE(review): the check below compares against `a[i] * 6`, a scalar
// operand, not `b[i]`. The statement that performs the operation is not
// visible here — confirm this test really exercises the slice-slice form.
1401 for (int i = 0; i < dim; i++)
1403 if (c[i] != cast(T)(a[i] * 6))
1405 printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1413 /* ======================================================================== */
1415 /***********************
// a[] = b[] / value, element-wise (scalar form: *aptr++ = *bptr++ / value).
1420 T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
// Contract: a and b have equal length and do not overlap.
1423 assert(a.length == b.length);
1424 assert(disjoint(a, b));
1428 //printf("_arraySliceExpDivSliceAssign_d()\n");
1430 auto aend = aptr + a.length;
1433 /* Multiplying by the reciprocal is faster, but does
1434 * not produce as accurate an answer.
// NOTE(review): recip is computed here, while the scalar statement further
// down divides by `value` (its recip variant is commented out). Whether the
// SIMD path uses recip is not visible in this excerpt — confirm both paths
// produce the same results.
1436 T recip = cast(T)1 / value;
1438 version (D_InlineAsm_X86)
1440 // SSE2 version is 299% faster
1441 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
1443 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (the value loaded into XMM4 is
// on a line not visible here).
1454 shufpd XMM4, XMM4, 0;
1460 movupd XMM1, [EAX+16];
1461 movupd XMM2, [EAX+32];
1462 movupd XMM3, [EAX+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1472 movupd [ESI+ 0-64], XMM0;
1473 movupd [ESI+16-64], XMM1;
1474 movupd [ESI+32-64], XMM2;
1475 movupd [ESI+48-64], XMM3;
// Scalar form of the operation: true division, not multiplication by recip.
1487 *aptr++ = *bptr++ / value;
1488 //*aptr++ = *bptr++ * recip;
1496 printf("_arraySliceExpDivSliceAssign_d unittest\n");
// Repeat the test for every simulated CPU variant.
1497 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1499 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1501 for (int j = 0; j < 2; j++)
1504 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1505 a = a[j .. dim + j]; // misalign for second iteration
1506 T[] b = new T[dim + j];
1507 b = b[j .. dim + j];
1508 T[] c = new T[dim + j];
1509 c = c[j .. dim + j];
1511 for (int i = 0; i < dim; i++)
1513 b[i] = cast(T)(i + 7);
1514 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test divides by 8.
1519 for (int i = 0; i < dim; i++)
1521 //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
1522 if (c[i] != cast(T)(a[i] / 8))
1524 printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1532 /* ======================================================================== */
1534 /***********************
// In-place scalar divide on a double slice (presumably a[] /= value, per the
// name and the unittest's `a[i] / 8` check; the scalar statement is not
// visible here).
1539 T[] _arrayExpSliceDivass_d(T[] a, T value)
1541 //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1543 auto aend = aptr + a.length;
1545 /* Multiplying by the reciprocal is faster, but does
1546 * not produce as accurate an answer.
// NOTE(review): where recip is consumed is not visible in this excerpt —
// confirm the SIMD and scalar paths agree on division vs. reciprocal multiply.
1548 T recip = cast(T)1 / value;
1550 version (D_InlineAsm_X86)
1552 // SSE2 version is 65% faster
1553 if (sse2() && a.length >= 8)
// n bounds the part of a[] that is a whole number of 8-element groups.
1555 auto n = aptr + (a.length & ~7);
// Copy the low double of XMM4 into both lanes (the value loaded into XMM4 is
// on a line not visible here).
1565 shufpd XMM4, XMM4, 0;
1570 movupd XMM1, [ESI+16];
1571 movupd XMM2, [ESI+32];
1572 movupd XMM3, [ESI+48];
// Stores use a -64 displacement: they address the 64 bytes just behind ESI.
1582 movupd [ESI+ 0-64], XMM0;
1583 movupd [ESI+16-64], XMM1;
1584 movupd [ESI+32-64], XMM2;
1585 movupd [ESI+48-64], XMM3;
1603 printf("_arrayExpSliceDivass_d unittest\n");
// Repeat the test for every simulated CPU variant.
1604 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1606 version (log) printf(" cpuid %d\n", cpuid);
// j == 0: aligned slices; j == 1: all three slices start one element in.
1608 for (int j = 0; j < 2; j++)
1611 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1612 a = a[j .. dim + j]; // misalign for second iteration
1613 T[] b = new T[dim + j];
1614 b = b[j .. dim + j];
1615 T[] c = new T[dim + j];
1616 c = c[j .. dim + j];
1618 for (int i = 0; i < dim; i++)
1620 b[i] = cast(T)(i + 7);
1621 c[i] = cast(T)(i * 2);
// Verify every element of the result: the test divides by 8.
1627 for (int i = 0; i < dim; i++)
1629 if (c[i] != cast(T)(a[i] / 8))
1631 printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1640 /* ======================================================================== */
1642 /***********************
1644 * a[] -= b[] * value
// a[] -= b[] * value, implemented by delegating to the add-assign variant
// with the scalar negated: a[] += b[] * (-value).
1647 T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
1649 return _arraySliceExpMulSliceAddass_d(a, -value, b);
1652 /***********************
1654 * a[] += b[] * value
1657 T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
1660 assert(a.length == b.length);
1661 assert(disjoint(a, b));
1666 auto aend = aptr + a.length;
1671 *aptr++ += *bptr++ * value;
1678 printf("_arraySliceExpMulSliceAddass_d unittest\n");
1682 version (log) printf(" cpuid %d\n", cpuid);
1684 for (int j = 0; j < 1; j++)
1687 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1688 a = a[j .. dim + j]; // misalign for second iteration
1689 T[] b = new T[dim + j];
1690 b = b[j .. dim + j];
1691 T[] c = new T[dim + j];
1692 c = c[j .. dim + j];
1694 for (int i = 0; i < dim; i++)
1696 b[i] = cast(T)(i + 7);
1697 c[i] = cast(T)(i * 2);
1703 for (int i = 0; i < dim; i++)
1705 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1706 if (c[i] != cast(T)(b[i] + a[i] * 6))
1708 printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);