/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for byte array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for char, byte,
 * and ubyte ('a', 'g' and 'h' suffixes).
 */
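/* A minimal usage sketch (assuming the usual dmd lowering of array vector
 * expressions onto these hooks; user code never calls them by name):
 */
version (none) unittest
{
    byte[]  b1 = new byte[64], b2 = new byte[64];
    ubyte[] u1 = new ubyte[64];

    b1[] = b2[] + cast(byte) 3;   // lowers to _arraySliceExpAddSliceAssign_g
    u1[] += cast(ubyte) 3;        // lowers to _arrayExpSliceAddass_h
    // char[] operands take the '_a' variants in the same way.
}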
private import core.stdc.stdio : printf;
version (unittest)
{
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx() { return cpuid == 1 && util.cpuid.mmx(); }
    bool sse() { return cpuid == 2 && util.cpuid.sse(); }
    bool sse2() { return cpuid == 3 && util.cpuid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
}
else
{
    alias util.cpuid.mmx mmx;
    alias util.cpuid.sse sse;
    alias util.cpuid.sse2 sse2;
    alias util.cpuid.amd3dnow amd3dnow;
}
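/* Sketch of how the unit tests below drive this switch (hypothetical values;
 * only meaningful in a unittest build, where the cpuid counter exists):
 */
version (none) unittest
{
    ubyte[] a = new ubyte[256];
    ubyte[] b = new ubyte[256];

    cpuid = 3;                   // route through the SSE2 branch (if the CPU has SSE2)
    a[] = b[] + cast(ubyte) 1;
    cpuid = 0;                   // force the plain scalar fallback
    a[] = b[] + cast(ubyte) 1;
}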
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}
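/* A couple of illustrative cases (minimal sketch):
 */
version (none) unittest
{
    int[] x = new int[8];
    assert( disjoint(x[0 .. 4], x[4 .. 8]));    // adjacent, no overlap
    assert(!disjoint(x[0 .. 5], x[4 .. 8]));    // element 4 is shared
}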
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}
T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpAddSliceAssign_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
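            // movdqa needs 16-byte alignment; if either operand pointer is
            // misaligned, take the movdqu (unaligned load/store) loop instead.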
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;

                    pshufd XMM4, XMM4, 0;

                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;

        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;

                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;
    /* trying to be fair and treat normal 32-bit cpu the same way as we do
     * the SIMD units, with unrolled asm. There aren't enough registers,
     * really.
     */
    auto n = aptr + (a.length & ~3);

        *aptr++ = cast(T)(*bptr++ + value);
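    /* Roughly what the unrolled fallback does, written as portable D
     * (a sketch for illustration only; the real path above is inline asm):
     */
    version (none)
    {
        while (aptr < n)            // n = aptr + (a.length & ~3) from above
        {
            aptr[0] = cast(T)(bptr[0] + value);
            aptr[1] = cast(T)(bptr[1] + value);
            aptr[2] = cast(T)(bptr[2] + value);
            aptr[3] = cast(T)(bptr[3] + value);
            aptr += 4;
            bptr += 4;
        }
    }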
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}
T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));

    //printf("_arraySliceSliceAddSliceAssign_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;

                version (log) printf("\tsse2 aligned\n");
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;

        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

    version (log) if (aptr < aend) printf("\tbase\n");
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}
T[] _arrayExpSliceAddass_g(T[] a, T value)
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;

            if (((cast(uint) aptr) & 15) != 0)
                asm // unaligned case
                    pshufd XMM4, XMM4, 0;

                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;

                    pshufd XMM4, XMM4, 0;

                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;

        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;

                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

    printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + 6))
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}
T[] _arraySliceSliceAddass_g(T[] a, T[] b)
    assert (a.length == b.length);
    assert (disjoint(a, b));

    //printf("_arraySliceSliceAddass_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                asm // unaligned case
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    jb startaddasslsse2u;

                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    jb startaddasslsse2a;

        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

    printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            T[] c = new T[dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] + b[i]))
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}
T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arraySliceExpMinSliceAssign_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                asm // unaligned case
                    pshufd XMM4, XMM4, 0;

                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;

                    pshufd XMM4, XMM4, 0;

                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;

        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;

                    movq [ESI -32], MM0;
                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;
    // trying to be fair and treat normal 32-bit cpu the same way as we do the
    // SIMD units, with unrolled asm. There aren't enough registers, really.
    auto n = aptr + (a.length & ~3);

        *aptr++ = cast(T)(*bptr++ - value);

    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(b[i] - 6))
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}
T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
    assert(a.length == b.length);
    assert(disjoint(a, b));

    //printf("_arrayExpSliceMinSliceAssign_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                asm // unaligned case
                    pshufd XMM4, XMM4, 0;

                    movdqu XMM1, [EAX+16];
                    movdqu [ESI -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;

                    pshufd XMM4, XMM4, 0;

                    movdqa XMM1, [EAX+16];
                    movdqa [ESI -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;

        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;

                    movq [ESI -32], MM5;
                    movq [ESI+8 -32], MM6;
                    movq [ESI+16-32], MM5;
                    movq [ESI+24-32], MM6;

        *aptr++ = cast(T)(value - *bptr++);

    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(6 - b[i]))
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}
T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
                asm // unaligned case
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;

                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;

        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

                    movq [ESI -32], MM0;
                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

        *aptr++ = cast(T)(*bptr++ - *cptr++);

    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - b[i]))
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}
T[] _arrayExpSliceMinass_g(T[] a, T value)
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;

            if (((cast(uint) aptr) & 15) != 0)
                asm // unaligned case
                    pshufd XMM4, XMM4, 0;

                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    jb startsubasssse2u;

                    pshufd XMM4, XMM4, 0;

                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    jb startsubasssse2a;

        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;

                    movq [ESI -32], MM0;
                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

    printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - 6))
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}
T[] _arraySliceSliceMinass_g(T[] a, T[] b)
    assert (a.length == b.length);
    assert (disjoint(a, b));

    //printf("_arraySliceSliceMinass_g()\n");

    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                asm // unaligned case
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    movdqu [ESI -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    jb startsubasslsse2u;

                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    movdqa [ESI -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    jb startsubasslsse2a;

        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
            auto n = aptr + (a.length & ~31);

                    movq [ESI -32], MM0;
                    movq [ESI+8 -32], MM1;
                    movq [ESI+16-32], MM2;
                    movq [ESI+24-32], MM3;

    printf("_arraySliceSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);

            for (int i = 0; i < dim; i++)
                if (c[i] != cast(T)(a[i] - b[i]))
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);