/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for short array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for wchar, short,
 * and ushort ('u', 's' and 't' suffixes).
 */
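/* For example (illustrative only): given
 *      short[] a, b;
 *      a[] = b[] + 3;
 * the compiler emits a call equivalent to
 *      _arraySliceExpAddSliceAssign_s(a, 3, b);
 * with the '_t' and '_u' variants selected for ushort and wchar operands.
 */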
private import util.cpuid;
version (unittests)
{
    private import core.stdc.stdio : printf;

    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
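/* During a unit-test build, each pass of the unittest loop
 *      for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 * allows at most one of the predicates above to return true (cpuid 0 forces
 * the plain D fallback), so every code path below gets exercised on hardware
 * that supports the corresponding instruction set.
 */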
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
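/* Illustration (hypothetical values): given
 *      short[8] buf;
 * disjoint(buf[0 .. 4], buf[4 .. 8]) is true (adjacent but non-overlapping),
 * while disjoint(buf[0 .. 5], buf[4 .. 8]) is false since both slices
 * contain buf[4].
 */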
alias short T;

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);
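            // a.length & ~15 rounds the element count down to a multiple of
            // 16, so [aptr, n) is handled 16 shorts (32 bytes) per iteration
            // and the scalar loop at the end picks up the remainder.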
            uint l = cast(ushort) value;
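            // The 16-bit operand is widened so it can be moved into an XMM
            // register and broadcast to all eight lanes (PSHUFD with imm8 = 0).
            // The OR-of-pointers test below sets low bits if either operand is
            // not 16-byte aligned, selecting MOVDQU instead of MOVDQA.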
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                movdqu XMM1, [EAX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [EAX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;
        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                movdqu XMM1, [EAX+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [EAX+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                pshufd XMM2, XMM2, 0;
                movdqu XMM1, [ESI+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [ESI+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}
unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                movdqu XMM1, [ESI+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [ESI+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = b[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                pshufd XMM2, XMM2, 0;
                movdqu XMM1, [EAX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [EAX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}
unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                pshufd XMM2, XMM2, 0;
                pshufd XMM3, XMM3, 0;
                movdqu XMM1, [EAX+16];
                movdqu [ESI   -32], XMM2;
                movdqu [ESI+16-32], XMM3;
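                // Note the stores come from XMM2/XMM3 here, not XMM0/XMM1:
                // the broadcast value is the minuend, so the subtraction
                // (PSUBW in the full routine) leaves (value - b[..]) in the
                // registers that held the value.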
                pshufd XMM2, XMM2, 0;
                pshufd XMM3, XMM3, 0;
                movdqa XMM1, [EAX+16];
                movdqa [ESI   -32], XMM2;
                movdqa [ESI+16-32], XMM3;

        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}
unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                movdqu XMM1, [EAX+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [EAX+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                pshufd XMM2, XMM2, 0;
                movdqu XMM1, [ESI+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [ESI+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                movdqu XMM1, [ESI+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [ESI+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = b[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */
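/* Note: a 16-bit SIMD multiply (PMULLW, assumed for the routines below)
 * keeps only the low 16 bits of each product, which matches the scalar
 * fallback exactly: cast(T)(b[i] * value) likewise truncates the product.
 */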
T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                pshufd XMM2, XMM2, 0;
                movdqu XMM1, [EAX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [EAX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}
unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                movdqu XMM1, [EAX+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [EAX+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                pshufd XMM2, XMM2, 0;
                movdqu XMM1, [ESI+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                pshufd XMM2, XMM2, 0;
                movdqa XMM1, [ESI+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = b[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                movdqu XMM1, [ESI+16];
                movdqu XMM3, [ECX+16];
                movdqu [ESI   -32], XMM0;
                movdqu [ESI+16-32], XMM1;

                movdqa XMM1, [ESI+16];
                movdqa XMM3, [ECX+16];
                movdqa [ESI   -32], XMM0;
                movdqa [ESI+16-32], XMM1;

        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

                movq [ESI   -16], MM0;
                movq [ESI+8-16], MM1;
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = b[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}