/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for wchar, short, and ushort array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for wchar, short,
 * and ushort ('u', 's' and 't' suffixes).
 */

private import util.cpuid;

private import core.stdc.stdio : printf;
version (unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
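/* Illustrative sketch (not part of the original file): how the overrides
 * above drive every code path. A test loops cpuid from 0 to CPUID_MAX;
 * cpuid == 0 makes every predicate false, so only the scalar tail loops run,
 * while cpuid == 3 enables the SSE2 branches, and then only when the host
 * CPU actually reports SSE2.
 */
unittest
{
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        short[] a = new short[67];
        short[] b = new short[67];
        b[] = 2;
        a[] = b[] + 1;      // takes a different path inside the hook each pass
        assert(a[0] == 3);
    }
}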
/* True if slices a and b do not overlap in memory. */
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias short T;
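/* Illustrative check (not part of the original file): adjacent slices are
 * disjoint, overlapping ones are not.
 */
unittest
{
    short[8] buf;
    assert(disjoint(buf[0 .. 4], buf[4 .. 8]));     // touch, but no overlap
    assert(!disjoint(buf[0 .. 5], buf[4 .. 8]));    // both contain buf[4]
}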
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [EAX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [EAX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}
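/* Illustrative sketch (an assumption, not the original listing): the asm
 * fragments above survive from loops that all share one shape. Pointers go
 * into ESI (destination), EAX (source) and EDI (end of the vectorizable
 * region), the scalar operand is widened into a dword and splatted across an
 * XMM register, and each pass handles 32 bytes (16 shorts) in two registers
 * before the scalar tail loop finishes the remainder. The helper name below
 * is hypothetical.
 */
version (D_InlineAsm_X86) private void sketchAddLoop(T[] a, T value, T[] b)
{
    assert(a.length >= 16 && a.length == b.length);
    auto aptr = a.ptr;
    auto n = aptr + (a.length & ~15);
    auto bptr = b.ptr;
    uint l = cast(ushort) value;
    l |= (l << 16);                 // value now in both halves of a dword

    asm
    {
        mov ESI, aptr;
        mov EDI, n;
        mov EAX, bptr;
        movd XMM2, l;
        pshufd XMM2, XMM2, 0;       // splat value into all 8 words

        align 4;
    Lsketch:
        add ESI, 32;
        movdqu XMM0, [EAX];
        movdqu XMM1, [EAX+16];
        add EAX, 32;
        paddw XMM0, XMM2;           // eight 16-bit adds per register
        paddw XMM1, XMM2;
        movdqu [ESI   -32], XMM0;
        movdqu [ESI+16-32], XMM1;
        cmp ESI, EDI;
        jb Lsketch;

        mov aptr, ESI;              // hand the advanced pointers to the tail
        mov bptr, EAX;
    }
}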
unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
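/* Illustrative mapping (assumption, following D's type-mangle letters that
 * give the file its 'u', 's' and 't' suffixes):
 *
 *      short[]  a, b;      a[] = b[] + 1;      // _arraySliceExpAddSliceAssign_s
 *      ushort[] a, b;      a[] = b[] + 1;      // _arraySliceExpAddSliceAssign_t
 *      wchar[]  a, b;      a[] = b[] + 1;      // _arraySliceExpAddSliceAssign_u
 */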
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
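/* Note on the in-contract above: array operations on overlapping slices are
 * illegal in D, and the disjoint() asserts encode that assumption. Something
 * like
 *
 *      a[1 .. $] = a[0 .. $ - 1] + b[];    // a overlaps a: not allowed
 *
 * would violate the contract, because the vector loops read and write many
 * elements at a time and tolerate no overlap between source and destination.
 */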
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqu XMM1, [ESI+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [ESI+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}
unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
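/* Illustrative sketch (not part of the original tests): `+=` with a scalar
 * lowers to the in-place hook above, which needs no disjoint() check since
 * only one array is involved.
 */
unittest
{
    short[] a = new short[32];
    a[] = 5;
    a[] += 6;           // handled by _arrayExpSliceAddass_s
    assert(a[0] == 11 && a[31] == 11);
}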
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqu XMM1, [EAX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [EAX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}
unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    pshufd XMM3, XMM3, 0;
                    movdqu XMM1, [EAX+16];
                    movdqu [ESI   -32], XMM2;
                    movdqu [ESI+16-32], XMM3;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    pshufd XMM3, XMM3, 0;
                    movdqa XMM1, [EAX+16];
                    movdqa [ESI   -32], XMM2;
                    movdqa [ESI+16-32], XMM3;
                }
            }
        }

        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}
unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
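/* Design note: unlike the other routines in this file, the stores above come
 * from XMM2/XMM3 (and from MM0/MM1 copies of the splatted value). psubw only
 * computes dst = dst - src, so for `value - b[]` the loop must take fresh
 * copies of the value each pass and subtract the array elements from them;
 * there is no reversed-subtract instruction to store straight from the load
 * registers.
 */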
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqu XMM1, [ESI+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [ESI+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = b[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqu XMM1, [EAX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [EAX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}
unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
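/* Design note (sketch, not from the original tests): pmullw keeps only the
 * low 16 bits of each product, which matches the cast(T) truncation in the
 * scalar tail, so the vector and scalar paths agree even on overflow.
 */
unittest
{
    short[] b = [cast(short) 20000];
    short[] a = new short[1];
    a[] = b[] * 6;                      // 120000 wraps modulo 2^16
    assert(a[0] == cast(short) 120000); // both paths yield -11072
}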
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}
unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqu XMM1, [ESI+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    pshufd XMM2, XMM2, 0;
                    movdqa XMM1, [ESI+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}
unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = b[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                }
            }
            else
            {
                asm // aligned case
                {
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                }
            }
        }

        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}
unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = b[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
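/* Design note on all of the unittests above: allocating dim + j elements and
 * slicing off the first j exercises both asm paths. With j == 0 the arrays
 * keep the allocator's 16-byte alignment and the movdqa branches run; with
 * j == 1 every pointer is offset by one T (2 bytes), forcing the unaligned
 * checks and the movdqu branches.
 */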