1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for byte array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
8 /* Contains MMX versions of certain operations for dchar, int,
9 * and uint ('w', 'i' and 'k' suffixes).
14 private import util.cpuid;
// Unit-test CPU-dispatch harness. In unittest builds the feature predicates
// below shadow util.cpuid so a loop over cpuid = 0 .. CPUID_MAX (see the
// unittests further down) forces every code path in turn:
// 0 = scalar, 1 = MMX, 2 = SSE, 3 = SSE2, 4 = AMD 3DNow!.
// NOTE(review): the declaration of the `cpuid` loop variable itself is on a
// line elided from this listing — confirm against the full source.
18 /* This is so unit tests will test every CPU variant
21 const int CPUID_MAX = 4;
// Each predicate is true only when the test loop selects that variant AND
// the host CPU actually supports it (queried via util.cpuid).
22 bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
23 bool sse() { return cpuid == 2 && util.cpuid.sse(); }
24 bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
25 bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
// Non-unittest builds: forward straight to the real hardware feature tests.
29 alias util.cpuid.mmx mmx;
30 alias util.cpuid.sse sse;
31 alias util.cpuid.sse2 sse2;
32 alias util.cpuid.amd3dnow amd3dnow;
// True when slices a and b occupy non-overlapping memory: a ends at or
// before b begins, or b ends at or before a begins. Used by the array-op
// in-contracts below to reject aliased operands.
37 bool disjoint(T)(T[] a, T[] b)
39 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
46 /* ======================================================================== */
48 /***********************
// a[] = b[] + value for 32-bit elements (see scalar tail at original line
// 205). The 'w' (dchar) and 'k' (uint) wrappers delegate to the 'i' (int)
// core: a 32-bit integer add is bit-pattern identical across these types.
// NOTE(review): this listing is sparsely sampled — braces, asm loop labels
// and the core arithmetic instructions sit on elided lines.
53 T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
55 return _arraySliceExpAddSliceAssign_i(a, value, b);
58 T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
60 return _arraySliceExpAddSliceAssign_i(a, value, b);
63 T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
// in-contract: operand lengths match and destination does not alias source.
66 assert(a.length == b.length);
67 assert(disjoint(a, b));
71 //printf("_arraySliceExpAddSliceAssign_i()\n");
73 auto aend = aptr + a.length;
76 version (D_InlineAsm_X86)
78 // SSE2 aligned version is 380% faster
// SSE2 path: 8 ints (two XMM registers) per iteration; n is the end of the
// vectorizable prefix (length rounded down to a multiple of 8).
79 if (sse2() && a.length >= 8)
81 auto n = aptr + (a.length & ~7);
// If either pointer is not 16-byte aligned, fall to unaligned loads/stores.
85 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
99 movdqu XMM1, [EAX+16];
103 movdqu [ESI -32], XMM0;
104 movdqu [ESI+16-32], XMM1;
// Aligned case: pshufd broadcasts value to all 4 lanes, movdqa moves data.
120 pshufd XMM2, XMM2, 0;
126 movdqa XMM1, [EAX+16];
130 movdqa [ESI -32], XMM0;
131 movdqa [ESI+16-32], XMM1;
141 // MMX version is 298% faster
142 if (mmx() && a.length >= 4)
144 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
146 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
164 movq [ESI+8-16], MM1;
176 auto n = aptr + (a.length & ~1);
// Scalar tail: handles the remainder and the no-SIMD fallback.
205 *aptr++ = *bptr++ + value;
// Unit test: iterates every cpuid variant, and j = 0/1 for aligned vs
// deliberately misaligned slices, so all three code paths are exercised.
212 printf("_arraySliceExpAddSliceAssign_i unittest\n");
214 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
216 version (log) printf(" cpuid %d\n", cpuid);
218 for (int j = 0; j < 2; j++)
221 T[] a = new T[dim + j]; // aligned on 16 byte boundary
222 a = a[j .. dim + j]; // misalign for second iteration
223 T[] b = new T[dim + j];
225 T[] c = new T[dim + j];
228 for (int i = 0; i < dim; i++)
230 b[i] = cast(T)(i + 7);
231 c[i] = cast(T)(i * 2);
236 for (int i = 0; i < dim; i++)
238 if (c[i] != cast(T)(a[i] + 6))
240 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
249 /* ======================================================================== */
251 /***********************
// a[] = b[] + c[] for 32-bit elements (scalar tail at original line 389:
// *aptr = *bptr + *cptr). 'w'/'k' wrappers delegate to the 'i' core.
// NOTE(review): parameters arrive as (a, c, b); listing is sparsely
// sampled — braces, loop labels and core arithmetic are on elided lines.
256 T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
258 return _arraySliceSliceAddSliceAssign_i(a, c, b);
261 T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
263 return _arraySliceSliceAddSliceAssign_i(a, c, b);
266 T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
// in-contract: all three operands equal length and pairwise non-aliasing.
269 assert(a.length == b.length && b.length == c.length);
270 assert(disjoint(a, b));
271 assert(disjoint(a, c));
272 assert(disjoint(b, c));
276 //printf("_arraySliceSliceAddSliceAssign_i()\n");
278 auto aend = aptr + a.length;
282 version (D_InlineAsm_X86)
284 // SSE2 aligned version is 1710% faster
285 if (sse2() && a.length >= 8)
287 auto n = aptr + (a.length & ~7);
// Unaligned path taken when any of the three pointers misses 16-byte alignment.
289 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
291 asm // unaligned case
303 movdqu XMM1, [EAX+16];
304 movdqu XMM3, [ECX+16];
309 movdqu [ESI -32], XMM0;
310 movdqu [ESI+16-32], XMM1;
333 movdqa XMM1, [EAX+16];
334 movdqa XMM3, [ECX+16];
339 movdqa [ESI -32], XMM0;
340 movdqa [ESI+16-32], XMM1;
351 // MMX version is 995% faster
352 if (mmx() && a.length >= 4)
354 auto n = aptr + (a.length & ~3);
375 movq [ESI+8-16], MM1;
// Scalar tail / no-SIMD fallback.
389 *aptr++ = *bptr++ + *cptr++;
// Unit test: every cpuid variant x aligned/misaligned slices.
396 printf("_arraySliceSliceAddSliceAssign_i unittest\n");
398 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
400 version (log) printf(" cpuid %d\n", cpuid);
402 for (int j = 0; j < 2; j++)
405 T[] a = new T[dim + j]; // aligned on 16 byte boundary
406 a = a[j .. dim + j]; // misalign for second iteration
407 T[] b = new T[dim + j];
409 T[] c = new T[dim + j];
412 for (int i = 0; i < dim; i++)
414 b[i] = cast(T)(i + 7);
415 c[i] = cast(T)(i * 2);
420 for (int i = 0; i < dim; i++)
422 if (c[i] != cast(T)(a[i] + b[i]))
424 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
433 /* ======================================================================== */
435 /***********************
// In-place a[] += value for 32-bit elements; 'w'/'k' delegate to 'i'.
// The scalar tail is elided from this listing; the unittest below checks
// a[i] == c[i] + 6 after the op, consistent with += semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
440 T[] _arrayExpSliceAddass_w(T[] a, T value)
442 return _arrayExpSliceAddass_i(a, value);
445 T[] _arrayExpSliceAddass_k(T[] a, T value)
447 return _arrayExpSliceAddass_i(a, value);
450 T[] _arrayExpSliceAddass_i(T[] a, T value)
452 //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
454 auto aend = aptr + a.length;
456 version (D_InlineAsm_X86)
458 // SSE2 aligned version is 83% faster
459 if (sse2() && a.length >= 8)
461 auto n = aptr + (a.length & ~7);
// Only one operand pointer here (read-modify-write through ESI).
465 if (((cast(uint) aptr) & 15) != 0)
467 asm // unaligned case
// Broadcast value to all 4 lanes of XMM2.
472 pshufd XMM2, XMM2, 0;
477 movdqu XMM1, [ESI+16];
481 movdqu [ESI -32], XMM0;
482 movdqu [ESI+16-32], XMM1;
496 pshufd XMM2, XMM2, 0;
501 movdqa XMM1, [ESI+16];
505 movdqa [ESI -32], XMM0;
506 movdqa [ESI+16-32], XMM1;
515 // MMX version is 81% faster
516 if (mmx() && a.length >= 4)
518 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
520 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
536 movq [ESI+8-16], MM1;
547 auto n = aptr + (a.length & ~1);
// Unit test: every cpuid variant x aligned/misaligned slices.
580 printf("_arrayExpSliceAddass_i unittest\n");
582 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
584 version (log) printf(" cpuid %d\n", cpuid);
586 for (int j = 0; j < 2; j++)
589 T[] a = new T[dim + j]; // aligned on 16 byte boundary
590 a = a[j .. dim + j]; // misalign for second iteration
591 T[] b = new T[dim + j];
593 T[] c = new T[dim + j];
596 for (int i = 0; i < dim; i++)
598 b[i] = cast(T)(i + 7);
599 c[i] = cast(T)(i * 2);
605 for (int i = 0; i < dim; i++)
607 if (a[i] != cast(T)(c[i] + 6))
609 printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
618 /* ======================================================================== */
620 /***********************
// In-place a[] += b[] for 32-bit elements; 'w'/'k' delegate to 'i'.
// Scalar tail elided from this listing; the unittest below verifies
// c[i] == b[i] + a[i] against saved inputs, consistent with += semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
625 T[] _arraySliceSliceAddass_w(T[] a, T[] b)
627 return _arraySliceSliceAddass_i(a, b);
630 T[] _arraySliceSliceAddass_k(T[] a, T[] b)
632 return _arraySliceSliceAddass_i(a, b);
635 T[] _arraySliceSliceAddass_i(T[] a, T[] b)
// in-contract: equal lengths, no aliasing between destination and addend.
638 assert (a.length == b.length);
639 assert (disjoint(a, b));
643 //printf("_arraySliceSliceAddass_i()\n");
645 auto aend = aptr + a.length;
648 version (D_InlineAsm_X86)
650 // SSE2 aligned version is 695% faster
651 if (sse2() && a.length >= 8)
653 auto n = aptr + (a.length & ~7);
655 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
657 asm // unaligned case
667 movdqu XMM1, [ESI+16];
668 movdqu XMM3, [ECX+16];
673 movdqu [ESI -32], XMM0;
674 movdqu [ESI+16-32], XMM1;
694 movdqa XMM1, [ESI+16];
695 movdqa XMM3, [ECX+16];
700 movdqa [ESI -32], XMM0;
701 movdqa [ESI+16-32], XMM1;
711 // MMX version is 471% faster
712 if (mmx() && a.length >= 4)
714 auto n = aptr + (a.length & ~3);
733 movq [ESI+8-16], MM1;
// Unit test: every cpuid variant x aligned/misaligned slices.
753 printf("_arraySliceSliceAddass_i unittest\n");
755 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
757 version (log) printf(" cpuid %d\n", cpuid);
759 for (int j = 0; j < 2; j++)
762 T[] a = new T[dim + j]; // aligned on 16 byte boundary
763 a = a[j .. dim + j]; // misalign for second iteration
764 T[] b = new T[dim + j];
766 T[] c = new T[dim + j];
769 for (int i = 0; i < dim; i++)
771 b[i] = cast(T)(i + 7);
772 c[i] = cast(T)(i * 2);
778 for (int i = 0; i < dim; i++)
780 if (c[i] != cast(T)(b[i] + a[i]))
782 printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
791 /* ======================================================================== */
793 /***********************
// a[] = b[] - value for 32-bit elements (scalar tail at original line 950).
// 'w'/'k' wrappers delegate to the 'i' core.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
798 T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
800 return _arraySliceExpMinSliceAssign_i(a, value, b);
803 T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
805 return _arraySliceExpMinSliceAssign_i(a, value, b);
808 T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
// in-contract: equal lengths, destination does not alias source.
811 assert(a.length == b.length);
812 assert(disjoint(a, b));
816 //printf("_arraySliceExpMinSliceAssign_i()\n");
818 auto aend = aptr + a.length;
821 version (D_InlineAsm_X86)
823 // SSE2 aligned version is 400% faster
824 if (sse2() && a.length >= 8)
826 auto n = aptr + (a.length & ~7);
830 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
832 asm // unaligned case
// Broadcast the subtrahend to all 4 lanes.
838 pshufd XMM2, XMM2, 0;
844 movdqu XMM1, [EAX+16];
848 movdqu [ESI -32], XMM0;
849 movdqu [ESI+16-32], XMM1;
865 pshufd XMM2, XMM2, 0;
871 movdqa XMM1, [EAX+16];
875 movdqa [ESI -32], XMM0;
876 movdqa [ESI+16-32], XMM1;
886 // MMX version is 315% faster
887 if (mmx() && a.length >= 4)
889 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
891 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
909 movq [ESI+8-16], MM1;
921 auto n = aptr + (a.length & ~1);
// Scalar tail / no-SIMD fallback.
950 *aptr++ = *bptr++ - value;
// Unit test: every cpuid variant x aligned/misaligned slices.
957 printf("_arraySliceExpMinSliceAssign_i unittest\n");
959 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
961 version (log) printf(" cpuid %d\n", cpuid);
963 for (int j = 0; j < 2; j++)
966 T[] a = new T[dim + j]; // aligned on 16 byte boundary
967 a = a[j .. dim + j]; // misalign for second iteration
968 T[] b = new T[dim + j];
970 T[] c = new T[dim + j];
973 for (int i = 0; i < dim; i++)
975 b[i] = cast(T)(i + 7);
976 c[i] = cast(T)(i * 2);
981 for (int i = 0; i < dim; i++)
983 if (c[i] != cast(T)(a[i] - 6))
985 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
994 /* ======================================================================== */
996 /***********************
// Reversed-operand subtraction: a[] = value - b[] (scalar tail at original
// line 1130: *aptr = value - *bptr). Distinct from the b[] - value variant
// above; note the (a, b, value) parameter order. 'w'/'k' delegate to 'i'.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
1001 T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
1003 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1006 T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
1008 return _arrayExpSliceMinSliceAssign_i(a, b, value);
1011 T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
// in-contract: equal lengths, destination does not alias source.
1014 assert(a.length == b.length);
1015 assert(disjoint(a, b));
1019 //printf("_arrayExpSliceMinSliceAssign_i()\n");
1021 auto aend = aptr + a.length;
1024 version (D_InlineAsm_X86)
1026 // SSE2 aligned version is 1812% faster
1027 if (sse2() && a.length >= 8)
1029 auto n = aptr + (a.length & ~7);
1033 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1035 asm // unaligned case
// Broadcast the minuend to all 4 lanes of XMM4.
1041 pshufd XMM4, XMM4, 0;
1047 movdqu XMM3, [EAX+16];
1053 movdqu [ESI -32], XMM0;
1054 movdqu [ESI+16-32], XMM1;
1070 pshufd XMM4, XMM4, 0;
1076 movdqa XMM3, [EAX+16];
1082 movdqa [ESI -32], XMM0;
1083 movdqa [ESI+16-32], XMM1;
1093 // MMX version is 1077% faster
1094 if (mmx() && a.length >= 4)
1096 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
1098 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1117 movq [ESI -16], MM0;
1118 movq [ESI+8-16], MM1;
// Scalar tail / no-SIMD fallback: note the reversed operand order.
1130 *aptr++ = value - *bptr++;
// Unit test: every cpuid variant x aligned/misaligned slices.
1137 printf("_arrayExpSliceMinSliceAssign_i unittest\n");
1139 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1141 version (log) printf(" cpuid %d\n", cpuid);
1143 for (int j = 0; j < 2; j++)
1146 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1147 a = a[j .. dim + j]; // misalign for second iteration
1148 T[] b = new T[dim + j];
1149 b = b[j .. dim + j];
1150 T[] c = new T[dim + j];
1151 c = c[j .. dim + j];
1153 for (int i = 0; i < dim; i++)
1155 b[i] = cast(T)(i + 7);
1156 c[i] = cast(T)(i * 2);
1161 for (int i = 0; i < dim; i++)
1163 if (c[i] != cast(T)(6 - a[i]))
1165 printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1174 /* ======================================================================== */
1176 /***********************
// a[] = b[] - c[] for 32-bit elements (scalar tail at original line 1312:
// *aptr = *bptr - *cptr). 'w'/'k' wrappers delegate to the 'i' core.
// NOTE(review): parameters arrive as (a, c, b); listing is sparsely
// sampled — braces, loop labels and core arithmetic are on elided lines.
1181 T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
1183 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1186 T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
1188 return _arraySliceSliceMinSliceAssign_i(a, c, b);
1191 T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
// in-contract: all three operands equal length and pairwise non-aliasing.
1194 assert(a.length == b.length && b.length == c.length);
1195 assert(disjoint(a, b));
1196 assert(disjoint(a, c));
1197 assert(disjoint(b, c));
1202 auto aend = aptr + a.length;
1206 version (D_InlineAsm_X86)
1208 // SSE2 aligned version is 1721% faster
1209 if (sse2() && a.length >= 8)
1211 auto n = aptr + (a.length & ~7);
1213 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1215 asm // unaligned case
1227 movdqu XMM1, [EAX+16];
1228 movdqu XMM3, [ECX+16];
1233 movdqu [ESI -32], XMM0;
1234 movdqu [ESI+16-32], XMM1;
1257 movdqa XMM1, [EAX+16];
1258 movdqa XMM3, [ECX+16];
1263 movdqa [ESI -32], XMM0;
1264 movdqa [ESI+16-32], XMM1;
1275 // MMX version is 1002% faster
1276 if (mmx() && a.length >= 4)
1278 auto n = aptr + (a.length & ~3);
1298 movq [ESI -16], MM0;
1299 movq [ESI+8-16], MM1;
// Scalar tail / no-SIMD fallback.
1312 *aptr++ = *bptr++ - *cptr++;
// Unit test: every cpuid variant x aligned/misaligned slices.
1319 printf("_arraySliceSliceMinSliceAssign_i unittest\n");
1321 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1323 version (log) printf(" cpuid %d\n", cpuid);
1325 for (int j = 0; j < 2; j++)
1328 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1329 a = a[j .. dim + j]; // misalign for second iteration
1330 T[] b = new T[dim + j];
1331 b = b[j .. dim + j];
1332 T[] c = new T[dim + j];
1333 c = c[j .. dim + j];
1335 for (int i = 0; i < dim; i++)
1337 b[i] = cast(T)(i + 7);
1338 c[i] = cast(T)(i * 2);
1343 for (int i = 0; i < dim; i++)
1345 if (c[i] != cast(T)(a[i] - b[i]))
1347 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1356 /* ======================================================================== */
1358 /***********************
// In-place a[] -= value for 32-bit elements; 'w'/'k' delegate to 'i'.
// Scalar tail elided from this listing; the unittest below checks
// a[i] == c[i] - 6 after the op, consistent with -= semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
1363 T[] _arrayExpSliceMinass_w(T[] a, T value)
1365 return _arrayExpSliceMinass_i(a, value);
1368 T[] _arrayExpSliceMinass_k(T[] a, T value)
1370 return _arrayExpSliceMinass_i(a, value);
1373 T[] _arrayExpSliceMinass_i(T[] a, T value)
1375 //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1377 auto aend = aptr + a.length;
1379 version (D_InlineAsm_X86)
1381 // SSE2 aligned version is 81% faster
1382 if (sse2() && a.length >= 8)
1384 auto n = aptr + (a.length & ~7);
// Single operand pointer (read-modify-write through ESI).
1388 if (((cast(uint) aptr) & 15) != 0)
1390 asm // unaligned case
// Broadcast the subtrahend to all 4 lanes.
1395 pshufd XMM2, XMM2, 0;
1400 movdqu XMM1, [ESI+16];
1404 movdqu [ESI -32], XMM0;
1405 movdqu [ESI+16-32], XMM1;
1419 pshufd XMM2, XMM2, 0;
1424 movdqa XMM1, [ESI+16];
1428 movdqa [ESI -32], XMM0;
1429 movdqa [ESI+16-32], XMM1;
1438 // MMX version is 81% faster
1439 if (mmx() && a.length >= 4)
1441 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
1443 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1458 movq [ESI -16], MM0;
1459 movq [ESI+8-16], MM1;
1470 auto n = aptr + (a.length & ~1);
// Unit test: every cpuid variant x aligned/misaligned slices.
1503 printf("_arrayExpSliceMinass_i unittest\n");
1505 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1507 version (log) printf(" cpuid %d\n", cpuid);
1509 for (int j = 0; j < 2; j++)
1512 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1513 a = a[j .. dim + j]; // misalign for second iteration
1514 T[] b = new T[dim + j];
1515 b = b[j .. dim + j];
1516 T[] c = new T[dim + j];
1517 c = c[j .. dim + j];
1519 for (int i = 0; i < dim; i++)
1521 b[i] = cast(T)(i + 7);
1522 c[i] = cast(T)(i * 2);
1528 for (int i = 0; i < dim; i++)
1530 if (a[i] != cast(T)(c[i] - 6))
1532 printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1541 /* ======================================================================== */
1543 /***********************
// In-place a[] -= b[] for 32-bit elements; 'w'/'k' delegate to 'i'.
// Scalar tail elided from this listing; the unittest below verifies
// c[i] == b[i] - a[i] against saved inputs, consistent with -= semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
1548 T[] _arraySliceSliceMinass_w(T[] a, T[] b)
1550 return _arraySliceSliceMinass_i(a, b);
1553 T[] _arraySliceSliceMinass_k(T[] a, T[] b)
1555 return _arraySliceSliceMinass_i(a, b);
1558 T[] _arraySliceSliceMinass_i(T[] a, T[] b)
// in-contract: equal lengths, no aliasing between destination and operand.
1561 assert (a.length == b.length);
1562 assert (disjoint(a, b));
1566 //printf("_arraySliceSliceMinass_i()\n");
1568 auto aend = aptr + a.length;
1571 version (D_InlineAsm_X86)
1573 // SSE2 aligned version is 731% faster
1574 if (sse2() && a.length >= 8)
1576 auto n = aptr + (a.length & ~7);
1578 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1580 asm // unaligned case
1590 movdqu XMM1, [ESI+16];
1591 movdqu XMM3, [ECX+16];
1596 movdqu [ESI -32], XMM0;
1597 movdqu [ESI+16-32], XMM1;
1617 movdqa XMM1, [ESI+16];
1618 movdqa XMM3, [ECX+16];
1623 movdqa [ESI -32], XMM0;
1624 movdqa [ESI+16-32], XMM1;
1634 // MMX version is 441% faster
1635 if (mmx() && a.length >= 4)
1637 auto n = aptr + (a.length & ~3);
1655 movq [ESI -16], MM0;
1656 movq [ESI+8-16], MM1;
// Unit test: every cpuid variant x aligned/misaligned slices.
1675 printf("_arraySliceSliceMinass_i unittest\n");
1677 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1679 version (log) printf(" cpuid %d\n", cpuid);
1681 for (int j = 0; j < 2; j++)
1684 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1685 a = a[j .. dim + j]; // misalign for second iteration
1686 T[] b = new T[dim + j];
1687 b = b[j .. dim + j];
1688 T[] c = new T[dim + j];
1689 c = c[j .. dim + j];
1691 for (int i = 0; i < dim; i++)
1693 b[i] = cast(T)(i + 7);
1694 c[i] = cast(T)(i * 2);
1700 for (int i = 0; i < dim; i++)
1702 if (c[i] != cast(T)(b[i] - a[i]))
1704 printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1713 /* ======================================================================== */
1715 /***********************
// a[] = b[] * value for 32-bit elements (scalar tail at original line 1848).
// 'w'/'k' wrappers delegate to the 'i' core.
// NOTE(review): the entire SIMD section is wrapped in version (none) — only
// the scalar loop is live. The comment at line 1831 explains why: pmuludq
// multiplies only the low 32 bits of each 64-bit lane, so a packed pair
// multiply is not directly expressible with MMX.
// Listing is sparsely sampled — braces/labels/arithmetic on elided lines.
1720 T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
1722 return _arraySliceExpMulSliceAssign_i(a, value, b);
1725 T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
1727 return _arraySliceExpMulSliceAssign_i(a, value, b);
1730 T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
// in-contract: equal lengths, destination does not alias source.
1733 assert(a.length == b.length);
1734 assert(disjoint(a, b));
1738 //printf("_arraySliceExpMulSliceAssign_i()\n");
1740 auto aend = aptr + a.length;
// Dead code below (version (none)): kept for reference.
1743 version (none) // multiplying a pair is not supported by MMX
1745 version (D_InlineAsm_X86)
1747 // SSE2 aligned version is 1380% faster
1748 if (sse2() && a.length >= 8)
1750 auto n = aptr + (a.length & ~7);
1754 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1762 pshufd XMM2, XMM2, 0;
1768 movdqu XMM1, [EAX+16];
1772 movdqu [ESI -32], XMM0;
1773 movdqu [ESI+16-32], XMM1;
1789 pshufd XMM2, XMM2, 0;
1795 movdqa XMM1, [EAX+16];
1799 movdqa [ESI -32], XMM0;
1800 movdqa [ESI+16-32], XMM1;
1811 // MMX version is 1380% faster
1812 if (mmx() && a.length >= 4)
1814 auto n = aptr + (a.length & ~3);
1816 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1831 pmuludq MM0, MM2; // only multiplies low 32 bits
1833 movq [ESI -16], MM0;
1834 movq [ESI+8-16], MM1;
// Live path: plain scalar multiply.
1848 *aptr++ = *bptr++ * value;
// Unit test: every cpuid variant x aligned/misaligned slices.
// NOTE(review): banner says "_s" but this is the _i unittest — copy/paste
// slip in the log string (runtime string; not changed in a doc-only edit).
1855 printf("_arraySliceExpMulSliceAssign_s unittest\n");
1857 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1859 version (log) printf(" cpuid %d\n", cpuid);
1861 for (int j = 0; j < 2; j++)
1864 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1865 a = a[j .. dim + j]; // misalign for second iteration
1866 T[] b = new T[dim + j];
1867 b = b[j .. dim + j];
1868 T[] c = new T[dim + j];
1869 c = c[j .. dim + j];
1871 for (int i = 0; i < dim; i++)
1873 b[i] = cast(T)(i + 7);
1874 c[i] = cast(T)(i * 2);
1879 for (int i = 0; i < dim; i++)
1881 //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
1882 if (c[i] != cast(T)(a[i] * 6))
1884 printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1893 /* ======================================================================== */
1895 /***********************
// a[] = b[] * c[] for 32-bit elements (scalar tail at original line 2035:
// *aptr = *bptr * *cptr). 'w'/'k' wrappers delegate to the 'i' core.
// NOTE(review): parameters arrive as (a, c, b); listing is sparsely
// sampled — braces, loop labels and core arithmetic are on elided lines.
1900 T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
1902 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1905 T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
1907 return _arraySliceSliceMulSliceAssign_i(a, c, b);
1910 T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
// in-contract: all three operands equal length and pairwise non-aliasing.
1913 assert(a.length == b.length && b.length == c.length);
1914 assert(disjoint(a, b));
1915 assert(disjoint(a, c));
1916 assert(disjoint(b, c));
1920 //printf("_arraySliceSliceMulSliceAssign_i()\n");
1922 auto aend = aptr + a.length;
1928 version (D_InlineAsm_X86)
1930 // SSE2 aligned version is 1407% faster
1931 if (sse2() && a.length >= 8)
1933 auto n = aptr + (a.length & ~7);
1935 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1949 movdqu XMM1, [EAX+16];
1950 movdqu XMM3, [ECX+16];
1955 movdqu [ESI -32], XMM0;
1956 movdqu [ESI+16-32], XMM1;
1979 movdqa XMM1, [EAX+16];
1980 movdqa XMM3, [ECX+16];
1985 movdqa [ESI -32], XMM0;
1986 movdqa [ESI+16-32], XMM1;
1997 // MMX version is 1029% faster
1998 if (mmx() && a.length >= 4)
2000 auto n = aptr + (a.length & ~3);
2020 movq [ESI -16], MM0;
2021 movq [ESI+8-16], MM1;
// Scalar tail / no-SIMD fallback.
2035 *aptr++ = *bptr++ * *cptr++;
// Unit test: every cpuid variant x aligned/misaligned slices.
2042 printf("_arraySliceSliceMulSliceAssign_i unittest\n");
2044 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2046 version (log) printf(" cpuid %d\n", cpuid);
2048 for (int j = 0; j < 2; j++)
2051 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2052 a = a[j .. dim + j]; // misalign for second iteration
2053 T[] b = new T[dim + j];
2054 b = b[j .. dim + j];
2055 T[] c = new T[dim + j];
2056 c = c[j .. dim + j];
2058 for (int i = 0; i < dim; i++)
2060 b[i] = cast(T)(i + 7);
2061 c[i] = cast(T)(i * 2);
2066 for (int i = 0; i < dim; i++)
2068 if (c[i] != cast(T)(a[i] * b[i]))
2070 printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
2079 /* ======================================================================== */
2081 /***********************
// In-place a[] *= value for 32-bit elements; 'w'/'k' delegate to 'i'.
// Scalar tail elided from this listing; the unittest below checks
// a[i] == b[i] * 6 after the op, consistent with *= semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
2086 T[] _arrayExpSliceMulass_w(T[] a, T value)
2088 return _arrayExpSliceMulass_i(a, value);
2091 T[] _arrayExpSliceMulass_k(T[] a, T value)
2093 return _arrayExpSliceMulass_i(a, value);
2096 T[] _arrayExpSliceMulass_i(T[] a, T value)
2098 //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
2100 auto aend = aptr + a.length;
2104 version (D_InlineAsm_X86)
2106 // SSE2 aligned version is 400% faster
2107 if (sse2() && a.length >= 8)
2109 auto n = aptr + (a.length & ~7);
// Single operand pointer (read-modify-write through ESI).
2113 if (((cast(uint) aptr) & 15) != 0)
// Broadcast the multiplier to all 4 lanes.
2120 pshufd XMM2, XMM2, 0;
2125 movdqu XMM1, [ESI+16];
2129 movdqu [ESI -32], XMM0;
2130 movdqu [ESI+16-32], XMM1;
2144 pshufd XMM2, XMM2, 0;
2149 movdqa XMM1, [ESI+16];
2153 movdqa [ESI -32], XMM0;
2154 movdqa [ESI+16-32], XMM1;
2163 // MMX version is 402% faster
2164 if (mmx() && a.length >= 4)
2166 auto n = aptr + (a.length & ~3);
// Replicate the 32-bit value into both halves of a 64-bit MMX operand.
2168 ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
2183 movq [ESI -16], MM0;
2184 movq [ESI+8-16], MM1;
// Unit test: every cpuid variant x aligned/misaligned slices.
2203 printf("_arrayExpSliceMulass_i unittest\n");
2205 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2207 version (log) printf(" cpuid %d\n", cpuid);
2209 for (int j = 0; j < 2; j++)
2212 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2213 a = a[j .. dim + j]; // misalign for second iteration
2214 T[] b = new T[dim + j];
2215 b = b[j .. dim + j];
2216 T[] c = new T[dim + j];
2217 c = c[j .. dim + j];
2219 for (int i = 0; i < dim; i++)
2221 b[i] = cast(T)(i + 7);
2222 c[i] = cast(T)(i * 2);
2228 for (int i = 0; i < dim; i++)
2230 if (a[i] != cast(T)(b[i] * 6))
2232 printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2241 /* ======================================================================== */
2243 /***********************
// In-place a[] *= b[] for 32-bit elements; 'w'/'k' delegate to 'i'.
// Scalar tail elided from this listing; the unittest below checks
// a[i] == b[i] * c[i] after the op, consistent with *= semantics.
// NOTE(review): listing is sparsely sampled — braces, loop labels and the
// core arithmetic instructions sit on elided lines.
2248 T[] _arraySliceSliceMulass_w(T[] a, T[] b)
2250 return _arraySliceSliceMulass_i(a, b);
2253 T[] _arraySliceSliceMulass_k(T[] a, T[] b)
2255 return _arraySliceSliceMulass_i(a, b);
2258 T[] _arraySliceSliceMulass_i(T[] a, T[] b)
// in-contract: equal lengths, no aliasing between destination and operand.
2261 assert (a.length == b.length);
2262 assert (disjoint(a, b));
2266 //printf("_arraySliceSliceMulass_i()\n");
2268 auto aend = aptr + a.length;
2273 version (D_InlineAsm_X86)
2275 // SSE2 aligned version is 873% faster
2276 if (sse2() && a.length >= 8)
2278 auto n = aptr + (a.length & ~7);
2280 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2292 movdqu XMM1, [ESI+16];
2293 movdqu XMM3, [ECX+16];
2298 movdqu [ESI -32], XMM0;
2299 movdqu [ESI+16-32], XMM1;
2319 movdqa XMM1, [ESI+16];
2320 movdqa XMM3, [ECX+16];
2325 movdqa [ESI -32], XMM0;
2326 movdqa [ESI+16-32], XMM1;
// MMX section below is commented out per the BUG note at line 2335.
// NOTE(review): pshufd operates on XMM (SSE2) registers, not MM registers —
// likely the "invalid pshufd instructions" referred to; confirm against the
// instruction set reference before re-enabling.
2335 /+ BUG: comment out this section until we figure out what is going
2336 wrong with the invalid pshufd instructions.
2339 // MMX version is 573% faster
2340 if (mmx() && a.length >= 4)
2342 auto n = aptr + (a.length & ~3);
2363 pshufd MM4, MM4, 8; // ?
2364 movq [ESI -16], MM4;
2370 pshufd MM4, MM4, 8; // ?
2371 movq [ESI+8-16], MM4;
// Unit test: every cpuid variant x aligned/misaligned slices.
2392 printf("_arraySliceSliceMulass_i unittest\n");
2394 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2396 version (log) printf(" cpuid %d\n", cpuid);
2398 for (int j = 0; j < 2; j++)
2401 T[] a = new T[dim + j]; // aligned on 16 byte boundary
2402 a = a[j .. dim + j]; // misalign for second iteration
2403 T[] b = new T[dim + j];
2404 b = b[j .. dim + j];
2405 T[] c = new T[dim + j];
2406 c = c[j .. dim + j];
2408 for (int i = 0; i < dim; i++)
2410 b[i] = cast(T)(i + 7);
2411 c[i] = cast(T)(i * 2);
2417 for (int i = 0; i < dim; i++)
2419 if (a[i] != cast(T)(b[i] * c[i]))
2421 printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);