1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for byte array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
8 /* Contains MMX versions of certain operations for dchar, int,
9 * and uint ('w', 'i' and 'k' suffixes).
14 private import util.cpuid;
18 private import core.stdc.stdio : printf;
19 /* This is so unit tests will test every CPU variant
// Number of CPU variants the unit tests cycle through: 0 = generic,
// 1 = MMX, 2 = SSE, 3 = SSE2, 4 = AMD 3DNow!.
// Must be one past the highest cpuid value gated below; with the old
// value of 4 the loops `for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)`
// stopped at 3 and the amd3dnow() variant (cpuid == 4) was never tested.
const int CPUID_MAX = 5;
// Unit-test shim: report MMX support only while the harness is on
// CPU variant 1, and only if the host CPU actually has MMX.
bool mmx()
{
    if (cpuid != 1)
        return false;
    return util.cpuid.mmx();
}
// Unit-test shim: report SSE support only while the harness is on
// CPU variant 2, and only if the host CPU actually has SSE.
bool sse()
{
    if (cpuid != 2)
        return false;
    return util.cpuid.sse();
}
// Unit-test shim: report SSE2 support only while the harness is on
// CPU variant 3, and only if the host CPU actually has SSE2.
bool sse2()
{
    if (cpuid != 3)
        return false;
    return util.cpuid.sse2();
}
// Unit-test shim: report AMD 3DNow! support only while the harness is
// on CPU variant 4, and only if the host CPU actually has 3DNow!.
bool amd3dnow()
{
    if (cpuid != 4)
        return false;
    return util.cpuid.amd3dnow();
}
// Non-unittest builds: forward the feature queries straight to the
// runtime CPU detection in util.cpuid, with no per-variant gating.
alias util.cpuid.mmx mmx;
alias util.cpuid.sse sse;
alias util.cpuid.sse2 sse2;
alias util.cpuid.amd3dnow amd3dnow;
38 bool disjoint(T)(T[] a, T[] b)
40 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
47 /* ======================================================================== */
49 /***********************
54 T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
56 return _arraySliceExpAddSliceAssign_i(a, value, b);
59 T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
61 return _arraySliceExpAddSliceAssign_i(a, value, b);
64 T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
67 assert(a.length == b.length);
68 assert(disjoint(a, b));
72 //printf("_arraySliceExpAddSliceAssign_i()\n");
74 auto aend = aptr + a.length;
77 version (D_InlineAsm_X86)
79 // SSE2 aligned version is 380% faster
80 if (sse2() && a.length >= 8)
82 auto n = aptr + (a.length & ~7);
86 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
100 movdqu XMM1, [EAX+16];
104 movdqu [ESI -32], XMM0;
105 movdqu [ESI+16-32], XMM1;
121 pshufd XMM2, XMM2, 0;
127 movdqa XMM1, [EAX+16];
131 movdqa [ESI -32], XMM0;
132 movdqa [ESI+16-32], XMM1;
142 // MMX version is 298% faster
143 if (mmx() && a.length >= 4)
145 auto n = aptr + (a.length & ~3);
147 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
165 movq [ESI+8-16], MM1;
177 auto n = aptr + (a.length & ~1);
206 *aptr++ = *bptr++ + value;
213 printf("_arraySliceExpAddSliceAssign_i unittest\n");
215 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
217 version (log) printf(" cpuid %d\n", cpuid);
219 for (int j = 0; j < 2; j++)
222 T[] a = new T[dim + j]; // aligned on 16 byte boundary
223 a = a[j .. dim + j]; // misalign for second iteration
224 T[] b = new T[dim + j];
226 T[] c = new T[dim + j];
229 for (int i = 0; i < dim; i++)
231 b[i] = cast(T)(i + 7);
232 c[i] = cast(T)(i * 2);
237 for (int i = 0; i < dim; i++)
239 if (c[i] != cast(T)(a[i] + 6))
241 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
250 /* ======================================================================== */
252 /***********************
257 T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
259 return _arraySliceSliceAddSliceAssign_i(a, c, b);
262 T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
264 return _arraySliceSliceAddSliceAssign_i(a, c, b);
267 T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
270 assert(a.length == b.length && b.length == c.length);
271 assert(disjoint(a, b));
272 assert(disjoint(a, c));
273 assert(disjoint(b, c));
277 //printf("_arraySliceSliceAddSliceAssign_i()\n");
279 auto aend = aptr + a.length;
283 version (D_InlineAsm_X86)
285 // SSE2 aligned version is 1710% faster
286 if (sse2() && a.length >= 8)
288 auto n = aptr + (a.length & ~7);
290 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
292 asm // unaligned case
304 movdqu XMM1, [EAX+16];
305 movdqu XMM3, [ECX+16];
310 movdqu [ESI -32], XMM0;
311 movdqu [ESI+16-32], XMM1;
334 movdqa XMM1, [EAX+16];
335 movdqa XMM3, [ECX+16];
340 movdqa [ESI -32], XMM0;
341 movdqa [ESI+16-32], XMM1;
352 // MMX version is 995% faster
353 if (mmx() && a.length >= 4)
355 auto n = aptr + (a.length & ~3);
376 movq [ESI+8-16], MM1;
390 *aptr++ = *bptr++ + *cptr++;
397 printf("_arraySliceSliceAddSliceAssign_i unittest\n");
399 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
401 version (log) printf(" cpuid %d\n", cpuid);
403 for (int j = 0; j < 2; j++)
406 T[] a = new T[dim + j]; // aligned on 16 byte boundary
407 a = a[j .. dim + j]; // misalign for second iteration
408 T[] b = new T[dim + j];
410 T[] c = new T[dim + j];
413 for (int i = 0; i < dim; i++)
415 b[i] = cast(T)(i + 7);
416 c[i] = cast(T)(i * 2);
421 for (int i = 0; i < dim; i++)
423 if (c[i] != cast(T)(a[i] + b[i]))
425 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
434 /* ======================================================================== */
436 /***********************
441 T[] _arrayExpSliceAddass_w(T[] a, T value)
443 return _arrayExpSliceAddass_i(a, value);
446 T[] _arrayExpSliceAddass_k(T[] a, T value)
448 return _arrayExpSliceAddass_i(a, value);
451 T[] _arrayExpSliceAddass_i(T[] a, T value)
453 //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
455 auto aend = aptr + a.length;
457 version (D_InlineAsm_X86)
459 // SSE2 aligned version is 83% faster
460 if (sse2() && a.length >= 8)
462 auto n = aptr + (a.length & ~7);
466 if (((cast(uint) aptr) & 15) != 0)
468 asm // unaligned case
473 pshufd XMM2, XMM2, 0;
478 movdqu XMM1, [ESI+16];
482 movdqu [ESI -32], XMM0;
483 movdqu [ESI+16-32], XMM1;
497 pshufd XMM2, XMM2, 0;
502 movdqa XMM1, [ESI+16];
506 movdqa [ESI -32], XMM0;
507 movdqa [ESI+16-32], XMM1;
516 // MMX version is 81% faster
517 if (mmx() && a.length >= 4)
519 auto n = aptr + (a.length & ~3);
521 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
537 movq [ESI+8-16], MM1;
548 auto n = aptr + (a.length & ~1);
581 printf("_arrayExpSliceAddass_i unittest\n");
583 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
585 version (log) printf(" cpuid %d\n", cpuid);
587 for (int j = 0; j < 2; j++)
590 T[] a = new T[dim + j]; // aligned on 16 byte boundary
591 a = a[j .. dim + j]; // misalign for second iteration
592 T[] b = new T[dim + j];
594 T[] c = new T[dim + j];
597 for (int i = 0; i < dim; i++)
599 b[i] = cast(T)(i + 7);
600 c[i] = cast(T)(i * 2);
606 for (int i = 0; i < dim; i++)
608 if (a[i] != cast(T)(c[i] + 6))
610 printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
619 /* ======================================================================== */
621 /***********************
626 T[] _arraySliceSliceAddass_w(T[] a, T[] b)
628 return _arraySliceSliceAddass_i(a, b);
631 T[] _arraySliceSliceAddass_k(T[] a, T[] b)
633 return _arraySliceSliceAddass_i(a, b);
636 T[] _arraySliceSliceAddass_i(T[] a, T[] b)
639 assert (a.length == b.length);
640 assert (disjoint(a, b));
644 //printf("_arraySliceSliceAddass_i()\n");
646 auto aend = aptr + a.length;
649 version (D_InlineAsm_X86)
651 // SSE2 aligned version is 695% faster
652 if (sse2() && a.length >= 8)
654 auto n = aptr + (a.length & ~7);
656 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
658 asm // unaligned case
668 movdqu XMM1, [ESI+16];
669 movdqu XMM3, [ECX+16];
674 movdqu [ESI -32], XMM0;
675 movdqu [ESI+16-32], XMM1;
695 movdqa XMM1, [ESI+16];
696 movdqa XMM3, [ECX+16];
701 movdqa [ESI -32], XMM0;
702 movdqa [ESI+16-32], XMM1;
712 // MMX version is 471% faster
713 if (mmx() && a.length >= 4)
715 auto n = aptr + (a.length & ~3);
734 movq [ESI+8-16], MM1;
754 printf("_arraySliceSliceAddass_i unittest\n");
756 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
758 version (log) printf(" cpuid %d\n", cpuid);
760 for (int j = 0; j < 2; j++)
763 T[] a = new T[dim + j]; // aligned on 16 byte boundary
764 a = a[j .. dim + j]; // misalign for second iteration
765 T[] b = new T[dim + j];
767 T[] c = new T[dim + j];
770 for (int i = 0; i < dim; i++)
772 b[i] = cast(T)(i + 7);
773 c[i] = cast(T)(i * 2);
779 for (int i = 0; i < dim; i++)
781 if (c[i] != cast(T)(b[i] + a[i]))
783 printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
792 /* ======================================================================== */
794 /***********************
799 T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
801 return _arraySliceExpMinSliceAssign_i(a, value, b);
804 T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
806 return _arraySliceExpMinSliceAssign_i(a, value, b);
809 T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
812 assert(a.length == b.length);
813 assert(disjoint(a, b));
817 //printf("_arraySliceExpMinSliceAssign_i()\n");
819 auto aend = aptr + a.length;
822 version (D_InlineAsm_X86)
824 // SSE2 aligned version is 400% faster
825 if (sse2() && a.length >= 8)
827 auto n = aptr + (a.length & ~7);
831 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
833 asm // unaligned case
839 pshufd XMM2, XMM2, 0;
845 movdqu XMM1, [EAX+16];
849 movdqu [ESI -32], XMM0;
850 movdqu [ESI+16-32], XMM1;
866 pshufd XMM2, XMM2, 0;
872 movdqa XMM1, [EAX+16];
876 movdqa [ESI -32], XMM0;
877 movdqa [ESI+16-32], XMM1;
887 // MMX version is 315% faster
888 if (mmx() && a.length >= 4)
890 auto n = aptr + (a.length & ~3);
892 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
910 movq [ESI+8-16], MM1;
922 auto n = aptr + (a.length & ~1);
951 *aptr++ = *bptr++ - value;
958 printf("_arraySliceExpMinSliceAssign_i unittest\n");
960 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
962 version (log) printf(" cpuid %d\n", cpuid);
964 for (int j = 0; j < 2; j++)
967 T[] a = new T[dim + j]; // aligned on 16 byte boundary
968 a = a[j .. dim + j]; // misalign for second iteration
969 T[] b = new T[dim + j];
971 T[] c = new T[dim + j];
974 for (int i = 0; i < dim; i++)
976 b[i] = cast(T)(i + 7);
977 c[i] = cast(T)(i * 2);
982 for (int i = 0; i < dim; i++)
984 if (c[i] != cast(T)(a[i] - 6))
986 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
995 /* ======================================================================== */
997 /***********************
1002 T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
1004 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1007 T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
1009 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1012 T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
1015 assert(a.length == b.length);
1016 assert(disjoint(a, b));
1020 //printf("_arrayExpSliceMinSliceAssign_i()\n");
1022 auto aend = aptr + a.length;
1025 version (D_InlineAsm_X86)
1027 // SSE2 aligned version is 1812% faster
1028 if (sse2() && a.length >= 8)
1030 auto n = aptr + (a.length & ~7);
1034 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1036 asm // unaligned case
1042 pshufd XMM4, XMM4, 0;
1048 movdqu XMM3, [EAX+16];
1054 movdqu [ESI -32], XMM0;
1055 movdqu [ESI+16-32], XMM1;
1071 pshufd XMM4, XMM4, 0;
1077 movdqa XMM3, [EAX+16];
1083 movdqa [ESI -32], XMM0;
1084 movdqa [ESI+16-32], XMM1;
1094 // MMX version is 1077% faster
1095 if (mmx() && a.length >= 4)
1097 auto n = aptr + (a.length & ~3);
1099 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1118 movq [ESI -16], MM0;
1119 movq [ESI+8-16], MM1;
1131 *aptr++ = value - *bptr++;
1138 printf("_arrayExpSliceMinSliceAssign_i unittest\n");
1140 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1142 version (log) printf(" cpuid %d\n", cpuid);
1144 for (int j = 0; j < 2; j++)
1147 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1148 a = a[j .. dim + j]; // misalign for second iteration
1149 T[] b = new T[dim + j];
1150 b = b[j .. dim + j];
1151 T[] c = new T[dim + j];
1152 c = c[j .. dim + j];
1154 for (int i = 0; i < dim; i++)
1156 b[i] = cast(T)(i + 7);
1157 c[i] = cast(T)(i * 2);
1162 for (int i = 0; i < dim; i++)
1164 if (c[i] != cast(T)(6 - a[i]))
1166 printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1175 /* ======================================================================== */
1177 /***********************
1182 T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
1184 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1187 T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
1189 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1192 T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
1195 assert(a.length == b.length && b.length == c.length);
1196 assert(disjoint(a, b));
1197 assert(disjoint(a, c));
1198 assert(disjoint(b, c));
1203 auto aend = aptr + a.length;
1207 version (D_InlineAsm_X86)
1209 // SSE2 aligned version is 1721% faster
1210 if (sse2() && a.length >= 8)
1212 auto n = aptr + (a.length & ~7);
1214 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1216 asm // unaligned case
1228 movdqu XMM1, [EAX+16];
1229 movdqu XMM3, [ECX+16];
1234 movdqu [ESI -32], XMM0;
1235 movdqu [ESI+16-32], XMM1;
1258 movdqa XMM1, [EAX+16];
1259 movdqa XMM3, [ECX+16];
1264 movdqa [ESI -32], XMM0;
1265 movdqa [ESI+16-32], XMM1;
1276 // MMX version is 1002% faster
1277 if (mmx() && a.length >= 4)
1279 auto n = aptr + (a.length & ~3);
1299 movq [ESI -16], MM0;
1300 movq [ESI+8-16], MM1;
1313 *aptr++ = *bptr++ - *cptr++;
1320 printf("_arraySliceSliceMinSliceAssign_i unittest\n");
1322 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1324 version (log) printf(" cpuid %d\n", cpuid);
1326 for (int j = 0; j < 2; j++)
1329 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1330 a = a[j .. dim + j]; // misalign for second iteration
1331 T[] b = new T[dim + j];
1332 b = b[j .. dim + j];
1333 T[] c = new T[dim + j];
1334 c = c[j .. dim + j];
1336 for (int i = 0; i < dim; i++)
1338 b[i] = cast(T)(i + 7);
1339 c[i] = cast(T)(i * 2);
1344 for (int i = 0; i < dim; i++)
1346 if (c[i] != cast(T)(a[i] - b[i]))
1348 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1357 /* ======================================================================== */
1359 /***********************
1364 T[] _arrayExpSliceMinass_w(T[] a, T value)
1366 return _arrayExpSliceMinass_i(a, value);
1369 T[] _arrayExpSliceMinass_k(T[] a, T value)
1371 return _arrayExpSliceMinass_i(a, value);
1374 T[] _arrayExpSliceMinass_i(T[] a, T value)
1376 //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1378 auto aend = aptr + a.length;
1380 version (D_InlineAsm_X86)
1382 // SSE2 aligned version is 81% faster
1383 if (sse2() && a.length >= 8)
1385 auto n = aptr + (a.length & ~7);
1389 if (((cast(uint) aptr) & 15) != 0)
1391 asm // unaligned case
1396 pshufd XMM2, XMM2, 0;
1401 movdqu XMM1, [ESI+16];
1405 movdqu [ESI -32], XMM0;
1406 movdqu [ESI+16-32], XMM1;
1420 pshufd XMM2, XMM2, 0;
1425 movdqa XMM1, [ESI+16];
1429 movdqa [ESI -32], XMM0;
1430 movdqa [ESI+16-32], XMM1;
1439 // MMX version is 81% faster
1440 if (mmx() && a.length >= 4)
1442 auto n = aptr + (a.length & ~3);
1444 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1459 movq [ESI -16], MM0;
1460 movq [ESI+8-16], MM1;
1471 auto n = aptr + (a.length & ~1);
1504 printf("_arrayExpSliceMinass_i unittest\n");
1506 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1508 version (log) printf(" cpuid %d\n", cpuid);
1510 for (int j = 0; j < 2; j++)
1513 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1514 a = a[j .. dim + j]; // misalign for second iteration
1515 T[] b = new T[dim + j];
1516 b = b[j .. dim + j];
1517 T[] c = new T[dim + j];
1518 c = c[j .. dim + j];
1520 for (int i = 0; i < dim; i++)
1522 b[i] = cast(T)(i + 7);
1523 c[i] = cast(T)(i * 2);
1529 for (int i = 0; i < dim; i++)
1531 if (a[i] != cast(T)(c[i] - 6))
1533 printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1542 /* ======================================================================== */
1544 /***********************
1549 T[] _arraySliceSliceMinass_w(T[] a, T[] b)
1551 return _arraySliceSliceMinass_i(a, b);
1554 T[] _arraySliceSliceMinass_k(T[] a, T[] b)
1556 return _arraySliceSliceMinass_i(a, b);
1559 T[] _arraySliceSliceMinass_i(T[] a, T[] b)
1562 assert (a.length == b.length);
1563 assert (disjoint(a, b));
1567 //printf("_arraySliceSliceMinass_i()\n");
1569 auto aend = aptr + a.length;
1572 version (D_InlineAsm_X86)
1574 // SSE2 aligned version is 731% faster
1575 if (sse2() && a.length >= 8)
1577 auto n = aptr + (a.length & ~7);
1579 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1581 asm // unaligned case
1591 movdqu XMM1, [ESI+16];
1592 movdqu XMM3, [ECX+16];
1597 movdqu [ESI -32], XMM0;
1598 movdqu [ESI+16-32], XMM1;
1618 movdqa XMM1, [ESI+16];
1619 movdqa XMM3, [ECX+16];
1624 movdqa [ESI -32], XMM0;
1625 movdqa [ESI+16-32], XMM1;
1635 // MMX version is 441% faster
1636 if (mmx() && a.length >= 4)
1638 auto n = aptr + (a.length & ~3);
1656 movq [ESI -16], MM0;
1657 movq [ESI+8-16], MM1;
1676 printf("_arraySliceSliceMinass_i unittest\n");
1678 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1680 version (log) printf(" cpuid %d\n", cpuid);
1682 for (int j = 0; j < 2; j++)
1685 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1686 a = a[j .. dim + j]; // misalign for second iteration
1687 T[] b = new T[dim + j];
1688 b = b[j .. dim + j];
1689 T[] c = new T[dim + j];
1690 c = c[j .. dim + j];
1692 for (int i = 0; i < dim; i++)
1694 b[i] = cast(T)(i + 7);
1695 c[i] = cast(T)(i * 2);
1701 for (int i = 0; i < dim; i++)
1703 if (c[i] != cast(T)(b[i] - a[i]))
1705 printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1714 /* ======================================================================== */
1716 /***********************
1721 T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
1723 return _arraySliceExpMulSliceAssign_i(a, value, b);
1726 T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
1728 return _arraySliceExpMulSliceAssign_i(a, value, b);
1731 T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
1734 assert(a.length == b.length);
1735 assert(disjoint(a, b));
1739 //printf("_arraySliceExpMulSliceAssign_i()\n");
1741 auto aend = aptr + a.length;
1744 version (none) // multiplying a pair is not supported by MMX
1746 version (D_InlineAsm_X86)
1748 // SSE2 aligned version is 1380% faster
1749 if (sse2() && a.length >= 8)
1751 auto n = aptr + (a.length & ~7);
1755 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1763 pshufd XMM2, XMM2, 0;
1769 movdqu XMM1, [EAX+16];
1773 movdqu [ESI -32], XMM0;
1774 movdqu [ESI+16-32], XMM1;
1790 pshufd XMM2, XMM2, 0;
1796 movdqa XMM1, [EAX+16];
1800 movdqa [ESI -32], XMM0;
1801 movdqa [ESI+16-32], XMM1;
1812 // MMX version is 1380% faster
1813 if (mmx() && a.length >= 4)
1815 auto n = aptr + (a.length & ~3);
1817 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1832 pmuludq MM0, MM2; // only multiplies low 32 bits
1834 movq [ESI -16], MM0;
1835 movq [ESI+8-16], MM1;
1849 *aptr++ = *bptr++ * value;
// Fixed copy-paste typo in the unittest banner: the function under
// test is the int variant (_i), not the short variant (_s).
printf("_arraySliceExpMulSliceAssign_i unittest\n");
1858 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1860 version (log) printf(" cpuid %d\n", cpuid);
1862 for (int j = 0; j < 2; j++)
1865 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1866 a = a[j .. dim + j]; // misalign for second iteration
1867 T[] b = new T[dim + j];
1868 b = b[j .. dim + j];
1869 T[] c = new T[dim + j];
1870 c = c[j .. dim + j];
1872 for (int i = 0; i < dim; i++)
1874 b[i] = cast(T)(i + 7);
1875 c[i] = cast(T)(i * 2);
1880 for (int i = 0; i < dim; i++)
1882 //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
1883 if (c[i] != cast(T)(a[i] * 6))
1885 printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1894 /* ======================================================================== */
1896 /***********************
1901 T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
1903 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1906 T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
1908 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1911 T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
1914 assert(a.length == b.length && b.length == c.length);
1915 assert(disjoint(a, b));
1916 assert(disjoint(a, c));
1917 assert(disjoint(b, c));
1921 //printf("_arraySliceSliceMulSliceAssign_i()\n");
1923 auto aend = aptr + a.length;
1929 version (D_InlineAsm_X86)
1931 // SSE2 aligned version is 1407% faster
1932 if (sse2() && a.length >= 8)
1934 auto n = aptr + (a.length & ~7);
1936 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1950 movdqu XMM1, [EAX+16];
1951 movdqu XMM3, [ECX+16];
1956 movdqu [ESI -32], XMM0;
1957 movdqu [ESI+16-32], XMM1;
1980 movdqa XMM1, [EAX+16];
1981 movdqa XMM3, [ECX+16];
1986 movdqa [ESI -32], XMM0;
1987 movdqa [ESI+16-32], XMM1;
1998 // MMX version is 1029% faster
1999 if (mmx() && a.length >= 4)
2001 auto n = aptr + (a.length & ~3);
2021 movq [ESI -16], MM0;
2022 movq [ESI+8-16], MM1;
2036 *aptr++ = *bptr++ * *cptr++;
2043 printf("_arraySliceSliceMulSliceAssign_i unittest\n");
2045 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2047 version (log) printf(" cpuid %d\n", cpuid);
2049 for (int j = 0; j < 2; j++)
2052 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2053 a = a[j .. dim + j]; // misalign for second iteration
2054 T[] b = new T[dim + j];
2055 b = b[j .. dim + j];
2056 T[] c = new T[dim + j];
2057 c = c[j .. dim + j];
2059 for (int i = 0; i < dim; i++)
2061 b[i] = cast(T)(i + 7);
2062 c[i] = cast(T)(i * 2);
2067 for (int i = 0; i < dim; i++)
2069 if (c[i] != cast(T)(a[i] * b[i]))
2071 printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
2080 /* ======================================================================== */
2082 /***********************
2087 T[] _arrayExpSliceMulass_w(T[] a, T value)
2089 return _arrayExpSliceMulass_i(a, value);
2092 T[] _arrayExpSliceMulass_k(T[] a, T value)
2094 return _arrayExpSliceMulass_i(a, value);
2097 T[] _arrayExpSliceMulass_i(T[] a, T value)
2099 //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
2101 auto aend = aptr + a.length;
2105 version (D_InlineAsm_X86)
2107 // SSE2 aligned version is 400% faster
2108 if (sse2() && a.length >= 8)
2110 auto n = aptr + (a.length & ~7);
2114 if (((cast(uint) aptr) & 15) != 0)
2121 pshufd XMM2, XMM2, 0;
2126 movdqu XMM1, [ESI+16];
2130 movdqu [ESI -32], XMM0;
2131 movdqu [ESI+16-32], XMM1;
2145 pshufd XMM2, XMM2, 0;
2150 movdqa XMM1, [ESI+16];
2154 movdqa [ESI -32], XMM0;
2155 movdqa [ESI+16-32], XMM1;
2164 // MMX version is 402% faster
2165 if (mmx() && a.length >= 4)
2167 auto n = aptr + (a.length & ~3);
2169 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
2184 movq [ESI -16], MM0;
2185 movq [ESI+8-16], MM1;
2204 printf("_arrayExpSliceMulass_i unittest\n");
2206 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2208 version (log) printf(" cpuid %d\n", cpuid);
2210 for (int j = 0; j < 2; j++)
2213 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2214 a = a[j .. dim + j]; // misalign for second iteration
2215 T[] b = new T[dim + j];
2216 b = b[j .. dim + j];
2217 T[] c = new T[dim + j];
2218 c = c[j .. dim + j];
2220 for (int i = 0; i < dim; i++)
2222 b[i] = cast(T)(i + 7);
2223 c[i] = cast(T)(i * 2);
2229 for (int i = 0; i < dim; i++)
2231 if (a[i] != cast(T)(b[i] * 6))
2233 printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2242 /* ======================================================================== */
2244 /***********************
2249 T[] _arraySliceSliceMulass_w(T[] a, T[] b)
2251 return _arraySliceSliceMulass_i(a, b);
2254 T[] _arraySliceSliceMulass_k(T[] a, T[] b)
2256 return _arraySliceSliceMulass_i(a, b);
2259 T[] _arraySliceSliceMulass_i(T[] a, T[] b)
2262 assert (a.length == b.length);
2263 assert (disjoint(a, b));
2267 //printf("_arraySliceSliceMulass_i()\n");
2269 auto aend = aptr + a.length;
2274 version (D_InlineAsm_X86)
2276 // SSE2 aligned version is 873% faster
2277 if (sse2() && a.length >= 8)
2279 auto n = aptr + (a.length & ~7);
2281 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2293 movdqu XMM1, [ESI+16];
2294 movdqu XMM3, [ECX+16];
2299 movdqu [ESI -32], XMM0;
2300 movdqu [ESI+16-32], XMM1;
2320 movdqa XMM1, [ESI+16];
2321 movdqa XMM3, [ECX+16];
2326 movdqa [ESI -32], XMM0;
2327 movdqa [ESI+16-32], XMM1;
2336 /+ BUG: comment out this section until we figure out what is going
2337 wrong with the invalid pshufd instructions.
2340 // MMX version is 573% faster
2341 if (mmx() && a.length >= 4)
2343 auto n = aptr + (a.length & ~3);
2364 pshufd MM4, MM4, 8; // ?
2365 movq [ESI -16], MM4;
2371 pshufd MM4, MM4, 8; // ?
2372 movq [ESI+8-16], MM4;
2393 printf("_arraySliceSliceMulass_i unittest\n");
2395 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2397 version (log) printf(" cpuid %d\n", cpuid);
2399 for (int j = 0; j < 2; j++)
2402 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2403 a = a[j .. dim + j]; // misalign for second iteration
2404 T[] b = new T[dim + j];
2405 b = b[j .. dim + j];
2406 T[] c = new T[dim + j];
2407 c = c[j .. dim + j];
2409 for (int i = 0; i < dim; i++)
2411 b[i] = cast(T)(i + 7);
2412 c[i] = cast(T)(i * 2);
2418 for (int i = 0; i < dim; i++)
2420 if (a[i] != cast(T)(b[i] * c[i]))
2422 printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);