1 /***************************
2 * D programming language http://www.digitalmars.com/d/
3 * Runtime support for byte array operations.
4 * Based on code originally written by Burton Radons.
5 * Placed in public domain.
8 /* Contains SSE2 and MMX versions of certain operations for char, byte,
9 * and ubyte ('a', 'g' and 'h' suffixes).
18 /* This is so the unit tests will exercise every CPU variant
21 const int CPUID_MAX = 4;
22 bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
23 bool sse() { return cpuid == 2 && util.cpuid.sse(); }
24 bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
25 bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
29 alias util.cpuid.mmx mmx;
30 alias util.cpuid.sse sse;
31 alias util.cpuid.sse2 sse2;
32 alias util.cpuid.amd3dnow amd3dnow;
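/* A minimal sketch, assuming the usual compiler lowering of array-wise
 * expressions on byte/ubyte/char operands: each statement below is expected
 * to end up in one of the runtime hooks defined in this module (the exact
 * argument order is left to the compiler).
 */
unittest
{
    byte[] a = new byte[64];
    byte[] b = new byte[64];
    byte[] c = new byte[64];

    a[] = b[] + 3;      // _arraySliceExpAddSliceAssign_g
    a[] = b[] + c[];    // _arraySliceSliceAddSliceAssign_g
    a[] += 3;           // _arrayExpSliceAddass_g
    a[] += b[];         // _arraySliceSliceAddass_g
    a[] = b[] - 3;      // _arraySliceExpMinSliceAssign_g
    a[] = 3 - b[];      // _arrayExpSliceMinSliceAssign_g
    a[] = b[] - c[];    // _arraySliceSliceMinSliceAssign_g
    a[] -= 3;           // _arrayExpSliceMinass_g
    a[] -= b[];         // _arraySliceSliceMinass_g
}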
37 bool disjoint(T)(T[] a, T[] b)
39 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
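/* A minimal illustration of what disjoint() guards against: slices that share
 * any element make an element-wise vectorised store unsafe, so the hooks below
 * assert disjointness of their operands.
 */
unittest
{
    byte[] buf = new byte[16];
    assert(disjoint(buf[0 .. 8], buf[8 .. 16]));   // adjacent, no overlap
    assert(!disjoint(buf[0 .. 9], buf[7 .. 16]));  // elements 7 and 8 are shared
}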
46 /* ======================================================================== */
49 /***********************
54 T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
56 return _arraySliceExpAddSliceAssign_g(a, value, b);
59 T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
61 return _arraySliceExpAddSliceAssign_g(a, value, b);
64 T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
67 assert(a.length == b.length);
68 assert(disjoint(a, b));
72 //printf("_arraySliceExpAddSliceAssign_g()\n");
74 auto aend = aptr + a.length;
77 version (D_InlineAsm_X86)
79 // SSE2 aligned version is 1088% faster
80 if (sse2() && a.length >= 64)
82 auto n = aptr + (a.length & ~63);
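// & ~63 rounds the length down to a whole number of 64-byte blocks: each pass
// of the unrolled SSE2 loop below moves four 16-byte XMM registers, and the
// remainder falls through to the MMX and scalar paths further down.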
84 uint l = cast(ubyte) value;
88 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
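// movdqa faults on operands that are not 16-byte aligned, so OR-ing the two
// pointers and testing the low four bits selects the movdqu variant whenever
// either array is misaligned.  In both variants the byte value is broadcast
// into XMM4 (pshufd) and, in the elided instructions, added to 16 bytes at a
// time; byte adds wrap around, matching the cast(T) in the scalar loop.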
102 movdqu XMM1, [EAX+16];
103 movdqu XMM2, [EAX+32];
104 movdqu XMM3, [EAX+48];
110 movdqu [ESI -64], XMM0;
111 movdqu [ESI+16-64], XMM1;
112 movdqu [ESI+32-64], XMM2;
113 movdqu [ESI+48-64], XMM3;
129 pshufd XMM4, XMM4, 0;
135 movdqa XMM1, [EAX+16];
136 movdqa XMM2, [EAX+32];
137 movdqa XMM3, [EAX+48];
143 movdqa [ESI -64], XMM0;
144 movdqa [ESI+16-64], XMM1;
145 movdqa [ESI+32-64], XMM2;
146 movdqa [ESI+48-64], XMM3;
156 // MMX version is 1000% faster
157 if (mmx() && a.length >= 32)
159 auto n = aptr + (a.length & ~31);
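// The MMX fallback works in 32-byte blocks (& ~31), four 8-byte MMX registers
// per pass, for CPUs that have MMX but no SSE2.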
161 uint l = cast(ubyte) value;
185 movq [ESI+8 -32], MM1;
186 movq [ESI+16-32], MM2;
187 movq [ESI+24-32], MM3;
196 /* trying to be fair and treat a normal 32-bit cpu the same way as we do
197 * the SIMD units, with unrolled asm. There aren't enough registers,
204 auto n = aptr + (a.length & ~3);
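// Plain x86 path: & ~3 lets the unrolled loop handle 4 bytes per pass; the
// last 0..3 bytes (or everything, if none of the paths above applied) are
// finished by the simple per-element loop at the bottom of the function.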
235 *aptr++ = cast(T)(*bptr++ + value);
242 printf("_arraySliceExpAddSliceAssign_g unittest\n");
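// Cycling cpuid from 0 to CPUID_MAX-1 makes the gating functions at the top
// of this module enable one variant at a time, so each of the baseline, MMX,
// SSE and SSE2 code paths is exercised (provided the host CPU supports it).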
244 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
246 version (log) printf(" cpuid %d\n", cpuid);
248 for (int j = 0; j < 2; j++)
251 T[] a = new T[dim + j]; // aligned on 16 byte boundary
252 a = a[j .. dim + j]; // misalign for second iteration
253 T[] b = new T[dim + j];
255 T[] c = new T[dim + j];
258 for (int i = 0; i < dim; i++)
260 b[i] = cast(T)(i + 7);
261 c[i] = cast(T)(i * 2);
266 for (int i = 0; i < dim; i++)
268 if (c[i] != cast(T)(a[i] + 6))
270 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
279 /* ======================================================================== */
281 /***********************
286 T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
288 return _arraySliceSliceAddSliceAssign_g(a, c, b);
291 T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
293 return _arraySliceSliceAddSliceAssign_g(a, c, b);
296 T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
299 assert(a.length == b.length && b.length == c.length);
300 assert(disjoint(a, b));
301 assert(disjoint(a, c));
302 assert(disjoint(b, c));
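// Slice-plus-slice form (e.g. a[] = b[] + c[]): all three slices are read or
// written element-wise, so they must occupy pairwise disjoint memory, hence
// the three disjoint() checks above.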
306 //printf("_arraySliceSliceAddSliceAssign_g()\n");
308 auto aend = aptr + a.length;
312 version (D_InlineAsm_X86)
314 // SSE2 aligned version is 5739% faster
315 if (sse2() && a.length >= 64)
317 auto n = aptr + (a.length & ~63);
319 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
321 version (log) printf("\tsse2 unaligned\n");
322 asm // unaligned case
333 movdqu XMM1, [EAX+16];
334 movdqu XMM2, [EAX+32];
335 movdqu XMM3, [EAX+48];
338 movdqu XMM5, [ECX+16];
339 movdqu XMM6, [ECX+32];
340 movdqu XMM7, [ECX+48];
346 movdqu [ESI -64], XMM0;
347 movdqu [ESI+16-64], XMM1;
348 movdqu [ESI+32-64], XMM2;
349 movdqu [ESI+48-64], XMM3;
360 version (log) printf("\tsse2 aligned\n");
372 movdqa XMM1, [EAX+16];
373 movdqa XMM2, [EAX+32];
374 movdqa XMM3, [EAX+48];
377 movdqa XMM5, [ECX+16];
378 movdqa XMM6, [ECX+32];
379 movdqa XMM7, [ECX+48];
385 movdqa [ESI -64], XMM0;
386 movdqa [ESI+16-64], XMM1;
387 movdqa [ESI+32-64], XMM2;
388 movdqa [ESI+48-64], XMM3;
399 // MMX version is 4428% faster
400 if (mmx() && a.length >= 32)
402 version (log) printf("\tmmx\n");
403 auto n = aptr + (a.length & ~31);
430 movq [ESI+8 -32], MM1;
431 movq [ESI+16-32], MM2;
432 movq [ESI+24-32], MM3;
444 version (log) if (aptr < aend) printf("\tbase\n");
446 *aptr++ = cast(T)(*bptr++ + *cptr++);
453 printf("_arraySliceSliceAddSliceAssign_g unittest\n");
455 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
457 version (log) printf(" cpuid %d\n", cpuid);
459 for (int j = 0; j < 2; j++)
462 T[] a = new T[dim + j]; // aligned on 16 byte boundary
463 a = a[j .. dim + j]; // misalign for second iteration
464 T[] b = new T[dim + j];
466 T[] c = new T[dim + j];
469 for (int i = 0; i < dim; i++)
471 b[i] = cast(T)(i + 7);
472 c[i] = cast(T)(i * 2);
477 for (int i = 0; i < dim; i++)
479 if (c[i] != cast(T)(a[i] + b[i]))
481 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
490 /* ======================================================================== */
492 /***********************
497 T[] _arrayExpSliceAddass_a(T[] a, T value)
499 return _arrayExpSliceAddass_g(a, value);
502 T[] _arrayExpSliceAddass_h(T[] a, T value)
504 return _arrayExpSliceAddass_g(a, value);
507 T[] _arrayExpSliceAddass_g(T[] a, T value)
509 //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
511 auto aend = aptr + a.length;
513 version (D_InlineAsm_X86)
515 // SSE2 aligned version is 1578% faster
516 if (sse2() && a.length >= 64)
518 auto n = aptr + (a.length & ~63);
520 uint l = cast(ubyte) value;
524 if (((cast(uint) aptr) & 15) != 0)
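// In-place form (a[] += value): ESI serves as both source and destination, so
// only the destination pointer's alignment decides between the movdqu and
// movdqa variants.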
526 asm // unaligned case
531 pshufd XMM4, XMM4, 0;
536 movdqu XMM1, [ESI+16];
537 movdqu XMM2, [ESI+32];
538 movdqu XMM3, [ESI+48];
544 movdqu [ESI -64], XMM0;
545 movdqu [ESI+16-64], XMM1;
546 movdqu [ESI+32-64], XMM2;
547 movdqu [ESI+48-64], XMM3;
561 pshufd XMM4, XMM4, 0;
566 movdqa XMM1, [ESI+16];
567 movdqa XMM2, [ESI+32];
568 movdqa XMM3, [ESI+48];
574 movdqa [ESI -64], XMM0;
575 movdqa [ESI+16-64], XMM1;
576 movdqa [ESI+32-64], XMM2;
577 movdqa [ESI+48-64], XMM3;
586 // MMX version is 1721% faster
587 if (mmx() && a.length >= 32)
590 auto n = aptr + (a.length & ~31);
592 uint l = cast(ubyte) value;
614 movq [ESI+8 -32], MM1;
615 movq [ESI+16-32], MM2;
616 movq [ESI+24-32], MM3;
634 printf("_arrayExpSliceAddass_g unittest\n");
636 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
638 version (log) printf(" cpuid %d\n", cpuid);
640 for (int j = 0; j < 2; j++)
643 T[] a = new T[dim + j]; // aligned on 16 byte boundary
644 a = a[j .. dim + j]; // misalign for second iteration
645 T[] b = new T[dim + j];
647 T[] c = new T[dim + j];
650 for (int i = 0; i < dim; i++)
652 b[i] = cast(T)(i + 7);
653 c[i] = cast(T)(i * 2);
659 for (int i = 0; i < dim; i++)
661 if (c[i] != cast(T)(a[i] + 6))
663 printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
672 /* ======================================================================== */
674 /***********************
679 T[] _arraySliceSliceAddass_a(T[] a, T[] b)
681 return _arraySliceSliceAddass_g(a, b);
684 T[] _arraySliceSliceAddass_h(T[] a, T[] b)
686 return _arraySliceSliceAddass_g(a, b);
689 T[] _arraySliceSliceAddass_g(T[] a, T[] b)
692 assert(a.length == b.length);
693 assert(disjoint(a, b));
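// In-place slice form (a[] += b[]): the destination is read, updated and
// written back, so the operands must not overlap.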
697 //printf("_arraySliceSliceAddass_g()\n");
699 auto aend = aptr + a.length;
702 version (D_InlineAsm_X86)
704 // SSE2 aligned version is 4727% faster
705 if (sse2() && a.length >= 64)
707 auto n = aptr + (a.length & ~63);
709 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
711 asm // unaligned case
720 movdqu XMM1, [ESI+16];
721 movdqu XMM2, [ESI+32];
722 movdqu XMM3, [ESI+48];
725 movdqu XMM5, [ECX+16];
726 movdqu XMM6, [ECX+32];
727 movdqu XMM7, [ECX+48];
733 movdqu [ESI -64], XMM0;
734 movdqu [ESI+16-64], XMM1;
735 movdqu [ESI+32-64], XMM2;
736 movdqu [ESI+48-64], XMM3;
738 jb startaddasslsse2u;
755 movdqa XMM1, [ESI+16];
756 movdqa XMM2, [ESI+32];
757 movdqa XMM3, [ESI+48];
760 movdqa XMM5, [ECX+16];
761 movdqa XMM6, [ECX+32];
762 movdqa XMM7, [ECX+48];
768 movdqa [ESI -64], XMM0;
769 movdqa [ESI+16-64], XMM1;
770 movdqa [ESI+32-64], XMM2;
771 movdqa [ESI+48-64], XMM3;
773 jb startaddasslsse2a;
781 // MMX version is 3059% faster
782 if (mmx() && a.length >= 32)
785 auto n = aptr + (a.length & ~31);
810 movq [ESI+8 -32], MM1;
811 movq [ESI+16-32], MM2;
812 movq [ESI+24-32], MM3;
831 printf("_arraySliceSliceAddass_g unittest\n");
833 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
835 version (log) printf(" cpuid %d\n", cpuid);
837 for (int j = 0; j < 2; j++)
840 T[] a = new T[dim + j]; // aligned on 16 byte boundary
841 a = a[j .. dim + j]; // misalign for second iteration
842 T[] b = new T[dim + j];
844 T[] c = new T[dim + j];
847 for (int i = 0; i < dim; i++)
849 b[i] = cast(T)(i + 7);
850 c[i] = cast(T)(i * 2);
856 for (int i = 0; i < dim; i++)
858 if (c[i] != cast(T)(a[i] + b[i]))
860 printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
869 /* ======================================================================== */
872 /***********************
877 T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
879 return _arraySliceExpMinSliceAssign_g(a, value, b);
882 T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
884 return _arraySliceExpMinSliceAssign_g(a, value, b);
887 T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
890 assert(a.length == b.length);
891 assert(disjoint(a, b));
895 //printf("_arraySliceExpMinSliceAssign_g()\n");
897 auto aend = aptr + a.length;
900 version (D_InlineAsm_X86)
902 // SSE2 aligned version is 1189% faster
903 if (sse2() && a.length >= 64)
905 auto n = aptr + (a.length & ~63);
907 uint l = cast(ubyte) value;
911 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
913 asm // unaligned case
919 pshufd XMM4, XMM4, 0;
925 movdqu XMM1, [EAX+16];
926 movdqu XMM2, [EAX+32];
927 movdqu XMM3, [EAX+48];
933 movdqu [ESI -64], XMM0;
934 movdqu [ESI+16-64], XMM1;
935 movdqu [ESI+32-64], XMM2;
936 movdqu [ESI+48-64], XMM3;
952 pshufd XMM4, XMM4, 0;
958 movdqa XMM1, [EAX+16];
959 movdqa XMM2, [EAX+32];
960 movdqa XMM3, [EAX+48];
966 movdqa [ESI -64], XMM0;
967 movdqa [ESI+16-64], XMM1;
968 movdqa [ESI+32-64], XMM2;
969 movdqa [ESI+48-64], XMM3;
979 // MMX version is 1079% faster
980 if (mmx() && a.length >= 32)
982 auto n = aptr + (a.length & ~31);
984 uint l = cast(ubyte) value;
1007 movq [ESI -32], MM0;
1008 movq [ESI+8 -32], MM1;
1009 movq [ESI+16-32], MM2;
1010 movq [ESI+24-32], MM3;
1019 // trying to be fair and treat a normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There aren't enough registers, really.
1023 auto n = aptr + (a.length & ~3);
1053 *aptr++ = cast(T)(*bptr++ - value);
1060 printf("_arraySliceExpMinSliceAssign_g unittest\n");
1062 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1064 version (log) printf(" cpuid %d\n", cpuid);
1066 for (int j = 0; j < 2; j++)
1069 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1070 a = a[j .. dim + j]; // misalign for second iteration
1071 T[] b = new T[dim + j];
1072 b = b[j .. dim + j];
1073 T[] c = new T[dim + j];
1074 c = c[j .. dim + j];
1076 for (int i = 0; i < dim; i++)
1078 b[i] = cast(T)(i + 7);
1079 c[i] = cast(T)(i * 2);
1085 for (int i = 0; i < dim; i++)
1087 if (c[i] != cast(T)(b[i] - 6))
1089 printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
1098 /* ======================================================================== */
1100 /***********************
1105 T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
1107 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1110 T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
1112 return _arrayExpSliceMinSliceAssign_g(a, b, value);
1115 T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
1118 assert(a.length == b.length);
1119 assert(disjoint(a, b));
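// Reverse subtraction (a[] = value - b[]): note the scalar operand comes last
// in the signature.  In the SIMD paths the broadcast value is the minuend,
// which is why copies of it (XMM5/XMM6, MM5/MM6) are what get stored rather
// than the loaded data.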
1123 //printf("_arrayExpSliceMinSliceAssign_g()\n");
1125 auto aend = aptr + a.length;
1128 version (D_InlineAsm_X86)
1130 // SSE2 aligned version is 8748% faster
1131 if (sse2() && a.length >= 64)
1133 auto n = aptr + (a.length & ~63);
1135 uint l = cast(ubyte) value;
1139 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1141 asm // unaligned case
1147 pshufd XMM4, XMM4, 0;
1155 movdqu XMM1, [EAX+16];
1158 movdqu [ESI -64], XMM5;
1159 movdqu [ESI+16-64], XMM6;
1162 movdqu XMM2, [EAX+32];
1163 movdqu XMM3, [EAX+48];
1167 movdqu [ESI+32-64], XMM5;
1168 movdqu [ESI+48-64], XMM6;
1184 pshufd XMM4, XMM4, 0;
1192 movdqa XMM1, [EAX+16];
1195 movdqa [ESI -64], XMM5;
1196 movdqa [ESI+16-64], XMM6;
1199 movdqa XMM2, [EAX+32];
1200 movdqa XMM3, [EAX+48];
1204 movdqa [ESI+32-64], XMM5;
1205 movdqa [ESI+48-64], XMM6;
1215 // MMX version is 7397% faster
1216 if (mmx() && a.length >= 32)
1218 auto n = aptr + (a.length & ~31);
1220 uint l = cast(ubyte) value;
1240 movq [ESI -32], MM5;
1241 movq [ESI+8 -32], MM6;
1249 movq [ESI+16-32], MM5;
1250 movq [ESI+24-32], MM6;
1263 *aptr++ = cast(T)(value - *bptr++);
1270 printf("_arrayExpSliceMinSliceAssign_g unittest\n");
1272 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1274 version (log) printf(" cpuid %d\n", cpuid);
1276 for (int j = 0; j < 2; j++)
1279 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1280 a = a[j .. dim + j]; // misalign for second iteration
1281 T[] b = new T[dim + j];
1282 b = b[j .. dim + j];
1283 T[] c = new T[dim + j];
1284 c = c[j .. dim + j];
1286 for (int i = 0; i < dim; i++)
1288 b[i] = cast(T)(i + 7);
1289 c[i] = cast(T)(i * 2);
1295 for (int i = 0; i < dim; i++)
1297 if (c[i] != cast(T)(6 - b[i]))
1299 printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
1308 /* ======================================================================== */
1310 /***********************
1315 T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
1317 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1320 T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
1322 return _arraySliceSliceMinSliceAssign_g(a, c, b);
1325 T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
1328 assert(a.length == b.length && b.length == c.length);
1329 assert(disjoint(a, b));
1330 assert(disjoint(a, c));
1331 assert(disjoint(b, c));
1336 auto aend = aptr + a.length;
1340 version (D_InlineAsm_X86)
1342 // SSE2 aligned version is 5756% faster
1343 if (sse2() && a.length >= 64)
1345 auto n = aptr + (a.length & ~63);
1347 if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1349 asm // unaligned case
1360 movdqu XMM1, [EAX+16];
1361 movdqu XMM2, [EAX+32];
1362 movdqu XMM3, [EAX+48];
1365 movdqu XMM5, [ECX+16];
1366 movdqu XMM6, [ECX+32];
1367 movdqu XMM7, [ECX+48];
1373 movdqu [ESI -64], XMM0;
1374 movdqu [ESI+16-64], XMM1;
1375 movdqu [ESI+32-64], XMM2;
1376 movdqu [ESI+48-64], XMM3;
1398 movdqa XMM1, [EAX+16];
1399 movdqa XMM2, [EAX+32];
1400 movdqa XMM3, [EAX+48];
1403 movdqa XMM5, [ECX+16];
1404 movdqa XMM6, [ECX+32];
1405 movdqa XMM7, [ECX+48];
1411 movdqa [ESI -64], XMM0;
1412 movdqa [ESI+16-64], XMM1;
1413 movdqa [ESI+32-64], XMM2;
1414 movdqa [ESI+48-64], XMM3;
1425 // MMX version is 4428% faster
1426 if (mmx() && a.length >= 32)
1428 auto n = aptr + (a.length & ~31);
1454 movq [ESI -32], MM0;
1455 movq [ESI+8 -32], MM1;
1456 movq [ESI+16-32], MM2;
1457 movq [ESI+24-32], MM3;
1470 *aptr++ = cast(T)(*bptr++ - *cptr++);
1477 printf("_arraySliceSliceMinSliceAssign_g unittest\n");
1479 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1481 version (log) printf(" cpuid %d\n", cpuid);
1483 for (int j = 0; j < 2; j++)
1486 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1487 a = a[j .. dim + j]; // misalign for second iteration
1488 T[] b = new T[dim + j];
1489 b = b[j .. dim + j];
1490 T[] c = new T[dim + j];
1491 c = c[j .. dim + j];
1493 for (int i = 0; i < dim; i++)
1495 b[i] = cast(T)(i + 7);
1496 c[i] = cast(T)(i * 2);
1501 for (int i = 0; i < dim; i++)
1503 if (c[i] != cast(T)(a[i] - b[i]))
1505 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1514 /* ======================================================================== */
1516 /***********************
1521 T[] _arrayExpSliceMinass_a(T[] a, T value)
1523 return _arrayExpSliceMinass_g(a, value);
1526 T[] _arrayExpSliceMinass_h(T[] a, T value)
1528 return _arrayExpSliceMinass_g(a, value);
1531 T[] _arrayExpSliceMinass_g(T[] a, T value)
1533 //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1535 auto aend = aptr + a.length;
1537 version (D_InlineAsm_X86)
1539 // SSE2 aligned version is 1577% faster
1540 if (sse2() && a.length >= 64)
1542 auto n = aptr + (a.length & ~63);
1544 uint l = cast(ubyte) value;
1548 if (((cast(uint) aptr) & 15) != 0)
1550 asm // unaligned case
1555 pshufd XMM4, XMM4, 0;
1560 movdqu XMM1, [ESI+16];
1561 movdqu XMM2, [ESI+32];
1562 movdqu XMM3, [ESI+48];
1568 movdqu [ESI -64], XMM0;
1569 movdqu [ESI+16-64], XMM1;
1570 movdqu [ESI+32-64], XMM2;
1571 movdqu [ESI+48-64], XMM3;
1573 jb startsubasssse2u;
1585 pshufd XMM4, XMM4, 0;
1590 movdqa XMM1, [ESI+16];
1591 movdqa XMM2, [ESI+32];
1592 movdqa XMM3, [ESI+48];
1598 movdqa [ESI -64], XMM0;
1599 movdqa [ESI+16-64], XMM1;
1600 movdqa [ESI+32-64], XMM2;
1601 movdqa [ESI+48-64], XMM3;
1603 jb startsubasssse2a;
1610 // MMX version is 1577% faster
1611 if (mmx() && a.length >= 32)
1614 auto n = aptr + (a.length & ~31);
1616 uint l = cast(ubyte) value;
1637 movq [ESI -32], MM0;
1638 movq [ESI+8 -32], MM1;
1639 movq [ESI+16-32], MM2;
1640 movq [ESI+24-32], MM3;
1658 printf("_arrayExpSliceMinass_g unittest\n");
1660 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1662 version (log) printf(" cpuid %d\n", cpuid);
1664 for (int j = 0; j < 2; j++)
1667 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1668 a = a[j .. dim + j]; // misalign for second iteration
1669 T[] b = new T[dim + j];
1670 b = b[j .. dim + j];
1671 T[] c = new T[dim + j];
1672 c = c[j .. dim + j];
1674 for (int i = 0; i < dim; i++)
1676 b[i] = cast(T)(i + 7);
1677 c[i] = cast(T)(i * 2);
1683 for (int i = 0; i < dim; i++)
1685 if (c[i] != cast(T)(a[i] - 6))
1687 printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
1696 /* ======================================================================== */
1698 /***********************
1703 T[] _arraySliceSliceMinass_a(T[] a, T[] b)
1705 return _arraySliceSliceMinass_g(a, b);
1708 T[] _arraySliceSliceMinass_h(T[] a, T[] b)
1710 return _arraySliceSliceMinass_g(a, b);
1713 T[] _arraySliceSliceMinass_g(T[] a, T[] b)
1716 assert(a.length == b.length);
1717 assert(disjoint(a, b));
1721 //printf("_arraySliceSliceMinass_g()\n");
1723 auto aend = aptr + a.length;
1726 version (D_InlineAsm_X86)
1728 // SSE2 aligned version is 4800% faster
1729 if (sse2() && a.length >= 64)
1731 auto n = aptr + (a.length & ~63);
1733 if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1735 asm // unaligned case
1744 movdqu XMM1, [ESI+16];
1745 movdqu XMM2, [ESI+32];
1746 movdqu XMM3, [ESI+48];
1749 movdqu XMM5, [ECX+16];
1750 movdqu XMM6, [ECX+32];
1751 movdqu XMM7, [ECX+48];
1757 movdqu [ESI -64], XMM0;
1758 movdqu [ESI+16-64], XMM1;
1759 movdqu [ESI+32-64], XMM2;
1760 movdqu [ESI+48-64], XMM3;
1762 jb startsubasslsse2u;
1779 movdqa XMM1, [ESI+16];
1780 movdqa XMM2, [ESI+32];
1781 movdqa XMM3, [ESI+48];
1784 movdqa XMM5, [ECX+16];
1785 movdqa XMM6, [ECX+32];
1786 movdqa XMM7, [ECX+48];
1792 movdqa [ESI -64], XMM0;
1793 movdqa [ESI+16-64], XMM1;
1794 movdqa [ESI+32-64], XMM2;
1795 movdqa [ESI+48-64], XMM3;
1797 jb startsubasslsse2a;
1805 // MMX version is 3107% faster
1806 if (mmx() && a.length >= 32)
1809 auto n = aptr + (a.length & ~31);
1833 movq [ESI -32], MM0;
1834 movq [ESI+8 -32], MM1;
1835 movq [ESI+16-32], MM2;
1836 movq [ESI+24-32], MM3;
1855 printf("_arraySliceSliceMinass_g unittest\n");
1857 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1859 version (log) printf(" cpuid %d\n", cpuid);
1861 for (int j = 0; j < 2; j++)
1864 T[] a = new T[dim + j]; // aligned on 16 byte boundary
1865 a = a[j .. dim + j]; // misalign for second iteration
1866 T[] b = new T[dim + j];
1867 b = b[j .. dim + j];
1868 T[] c = new T[dim + j];
1869 c = c[j .. dim + j];
1871 for (int i = 0; i < dim; i++)
1873 b[i] = cast(T)(i + 7);
1874 c[i] = cast(T)(i * 2);
1880 for (int i = 0; i < dim; i++)
1882 if (c[i] != cast(T)(a[i] - b[i]))
1884 printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);