src/compiler/dmd/arrayfloat.d

   1 /***************************
   2  * D programming language http://www.digitalmars.com/d/
   3  * Runtime support for float array operations.
   4  * Based on code originally written by Burton Radons.
   5  * Placed in public domain.
   6  */
   7
   8 module rt.arrayfloat;
   9
  10 private import util.cpuid;
  11
  12 version (Unittest)
  13 {
  14     /* This is so unit tests will test every CPU variant
  15      */
  16     int cpuid;
  17     const int CPUID_MAX = 5;
  18     bool mmx()      { return cpuid == 1 && util.cpuid.mmx(); }
  19     bool sse()      { return cpuid == 2 && util.cpuid.sse(); }
  20     bool sse2()     { return cpuid == 3 && util.cpuid.sse2(); }
  21     bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); }
  22 }
  23 else
  24 {
  25     alias util.cpuid.mmx mmx;
  26     alias util.cpuid.sse sse;
  27     alias util.cpuid.sse2 sse2;
  28     alias util.cpuid.amd3dnow amd3dnow;
  29 }
  30
  31 //version = log;
  32
  33 bool disjoint(T)(T[] a, T[] b)
  34 {
  35     return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
  36 }
  37
  38 alias float T;
  39
  40 extern (C):
  41
  42 /* ======================================================================== */
  43
  44 /***********************
  45  * Computes:
  46  *      a[] = b[] + c[]
  47  */
  48
  49 T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
  50 in
  51 {
  52         assert(a.length == b.length && b.length == c.length);
  53         assert(disjoint(a, b));
  54         assert(disjoint(a, c));
  55         assert(disjoint(b, c));
  56 }
  57 body
  58 {
  59     //printf("_arraySliceSliceAddSliceAssign_f()\n");
  60     auto aptr = a.ptr;
  61     auto aend = aptr + a.length;
  62     auto bptr = b.ptr;
  63     auto cptr = c.ptr;
  64
  65     version (D_InlineAsm_X86)
  66     {
  67         // SSE version is 834% faster
  68         if (sse() && b.length >= 16)
  69         {
  70             version (log) printf("\tsse unaligned\n");
  71             auto n = aptr + (b.length & ~15);
  72
  73             // Unaligned case
  74             asm
  75             {
  76                 mov EAX, bptr; // left operand
  77                 mov ECX, cptr; // right operand
  78                 mov ESI, aptr; // destination operand
  79                 mov EDI, n;    // end comparison
  80
  81                 align 8;
  82             startsseloopb:
  83                 movups XMM0, [EAX];
  84                 movups XMM1, [EAX+16];
  85                 movups XMM2, [EAX+32];
  86                 movups XMM3, [EAX+48];
  87                 add EAX, 64;
  88                 movups XMM4, [ECX];
  89                 movups XMM5, [ECX+16];
  90                 movups XMM6, [ECX+32];
  91                 movups XMM7, [ECX+48];
  92                 add ESI, 64;
  93                 addps XMM0, XMM4;
  94                 addps XMM1, XMM5;
  95                 addps XMM2, XMM6;
  96                 addps XMM3, XMM7;
  97                 add ECX, 64;
  98                 movups [ESI+ 0-64], XMM0;
  99                 movups [ESI+16-64], XMM1;
 100                 movups [ESI+32-64], XMM2;
 101                 movups [ESI+48-64], XMM3;
 102                 cmp ESI, EDI;
 103                 jb startsseloopb;
 104
 105                 mov aptr, ESI;
 106                 mov bptr, EAX;
 107                 mov cptr, ECX;
 108             }
 109         }
 110         else
 111         // 3DNow! version is only 13% faster
 112         if (amd3dnow() && b.length >= 8)
 113         {
 114             version (log) printf("\tamd3dnow\n");
 115             auto n = aptr + (b.length & ~7);
 116
 117             asm
 118             {
 119                 mov ESI, aptr; // destination operand
 120                 mov EDI, n;    // end comparison
 121                 mov EAX, bptr; // left operand
 122                 mov ECX, cptr; // right operand
 123
 124                 align 4;
 125             start3dnow:
 126                 movq MM0, [EAX];
 127                 movq MM1, [EAX+8];
 128                 movq MM2, [EAX+16];
 129                 movq MM3, [EAX+24];
 130                 pfadd MM0, [ECX];
 131                 pfadd MM1, [ECX+8];
 132                 pfadd MM2, [ECX+16];
 133                 pfadd MM3, [ECX+24];
 134                 movq [ESI], MM0;
 135                 movq [ESI+8], MM1;
 136                 movq [ESI+16], MM2;
 137                 movq [ESI+24], MM3;
 138                 add ECX, 32;
 139                 add ESI, 32;
 140                 add EAX, 32;
 141                 cmp ESI, EDI;
 142                 jb start3dnow;
 143
 144                 emms;
 145                 mov aptr, ESI;
 146                 mov bptr, EAX;
 147                 mov cptr, ECX;
 148             }
 149         }
 150     }
 151
 152     // Handle remainder
 153     version (log) if (aptr < aend) printf("\tbase\n");
 154     while (aptr < aend)
 155         *aptr++ = *bptr++ + *cptr++;
 156
 157     return a;
 158 }
 159
 160
 161 unittest
 162 {
 163     printf("_arraySliceSliceAddSliceAssign_f unittest\n");
 164     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 165     {
 166         version (log) printf("    cpuid %d\n", cpuid);
 167
 168         for (int j = 0; j < 2; j++)
 169         {
 170             const int dim = 67;
 171             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 172             a = a[j .. dim + j];        // misalign for second iteration
 173             T[] b = new T[dim + j];
 174             b = b[j .. dim + j];
 175             T[] c = new T[dim + j];
 176             c = c[j .. dim + j];
 177
 178             for (int i = 0; i < dim; i++)
 179             {   a[i] = cast(T)i;
 180                 b[i] = cast(T)(i + 7);
 181                 c[i] = cast(T)(i * 2);
 182             }
 183
 184             c[] = a[] + b[];
 185
 186             for (int i = 0; i < dim; i++)
 187             {
 188                 if (c[i] != cast(T)(a[i] + b[i]))
 189                 {
 190                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
 191                     assert(0);
 192                 }
 193             }
 194         }
 195     }
 196 }
 197
 198 /* ======================================================================== */
 199
 200 /***********************
 201  * Computes:
 202  *      a[] = b[] - c[]
 203  */
 204
 205 T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
 206 in
 207 {
 208         assert(a.length == b.length && b.length == c.length);
 209         assert(disjoint(a, b));
 210         assert(disjoint(a, c));
 211         assert(disjoint(b, c));
 212 }
 213 body
 214 {
 215     auto aptr = a.ptr;
 216     auto aend = aptr + a.length;
 217     auto bptr = b.ptr;
 218     auto cptr = c.ptr;
 219
 220     version (D_InlineAsm_X86)
 221     {
 222         // SSE version is 834% faster
 223         if (sse() && b.length >= 16)
 224         {
 225             auto n = aptr + (b.length & ~15);
 226
 227             // Unaligned case
 228             asm
 229             {
 230                 mov EAX, bptr; // left operand
 231                 mov ECX, cptr; // right operand
 232                 mov ESI, aptr; // destination operand
 233                 mov EDI, n;    // end comparison
 234
 235                 align 8;
 236             startsseloopb:
 237                 movups XMM0, [EAX];
 238                 movups XMM1, [EAX+16];
 239                 movups XMM2, [EAX+32];
 240                 movups XMM3, [EAX+48];
 241                 add EAX, 64;
 242                 movups XMM4, [ECX];
 243                 movups XMM5, [ECX+16];
 244                 movups XMM6, [ECX+32];
 245                 movups XMM7, [ECX+48];
 246                 add ESI, 64;
 247                 subps XMM0, XMM4;
 248                 subps XMM1, XMM5;
 249                 subps XMM2, XMM6;
 250                 subps XMM3, XMM7;
 251                 add ECX, 64;
 252                 movups [ESI+ 0-64], XMM0;
 253                 movups [ESI+16-64], XMM1;
 254                 movups [ESI+32-64], XMM2;
 255                 movups [ESI+48-64], XMM3;
 256                 cmp ESI, EDI;
 257                 jb startsseloopb;
 258
 259                 mov aptr, ESI;
 260                 mov bptr, EAX;
 261                 mov cptr, ECX;
 262             }
 263         }
 264         else
 265         // 3DNow! version is only 13% faster
 266         if (amd3dnow() && b.length >= 8)
 267         {
 268             auto n = aptr + (b.length & ~7);
 269
 270             asm
 271             {
 272                 mov ESI, aptr; // destination operand
 273                 mov EDI, n;    // end comparison
 274                 mov EAX, bptr; // left operand
 275                 mov ECX, cptr; // right operand
 276
 277                 align 4;
 278             start3dnow:
 279                 movq MM0, [EAX];
 280                 movq MM1, [EAX+8];
 281                 movq MM2, [EAX+16];
 282                 movq MM3, [EAX+24];
 283                 pfsub MM0, [ECX];
 284                 pfsub MM1, [ECX+8];
 285                 pfsub MM2, [ECX+16];
 286                 pfsub MM3, [ECX+24];
 287                 movq [ESI], MM0;
 288                 movq [ESI+8], MM1;
 289                 movq [ESI+16], MM2;
 290                 movq [ESI+24], MM3;
 291                 add ECX, 32;
 292                 add ESI, 32;
 293                 add EAX, 32;
 294                 cmp ESI, EDI;
 295                 jb start3dnow;
 296
 297                 emms;
 298                 mov aptr, ESI;
 299                 mov bptr, EAX;
 300                 mov cptr, ECX;
 301             }
 302         }
 303     }
 304
 305     // Handle remainder
 306     while (aptr < aend)
 307         *aptr++ = *bptr++ - *cptr++;
 308
 309     return a;
 310 }
 311
 312
 313 unittest
 314 {
 315     printf("_arraySliceSliceMinSliceAssign_f unittest\n");
 316     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 317     {
 318         version (log) printf("    cpuid %d\n", cpuid);
 319
 320         for (int j = 0; j < 2; j++)
 321         {
 322             const int dim = 67;
 323             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 324             a = a[j .. dim + j];        // misalign for second iteration
 325             T[] b = new T[dim + j];
 326             b = b[j .. dim + j];
 327             T[] c = new T[dim + j];
 328             c = c[j .. dim + j];
 329
 330             for (int i = 0; i < dim; i++)
 331             {   a[i] = cast(T)i;
 332                 b[i] = cast(T)(i + 7);
 333                 c[i] = cast(T)(i * 2);
 334             }
 335
 336             c[] = a[] - b[];
 337
 338             for (int i = 0; i < dim; i++)
 339             {
 340                 if (c[i] != cast(T)(a[i] - b[i]))
 341                 {
 342                     printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]);
 343                     assert(0);
 344                 }
 345             }
 346         }
 347     }
 348 }
 349
 350 /* ======================================================================== */
 351
 352 /***********************
 353  * Computes:
 354  *      a[] = b[] + value
 355  */
 356
 357 T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
 358 in
 359 {
 360     assert(a.length == b.length);
 361     assert(disjoint(a, b));
 362 }
 363 body
 364 {
 365     //printf("_arraySliceExpAddSliceAssign_f()\n");
 366     auto aptr = a.ptr;
 367     auto aend = aptr + a.length;
 368     auto bptr = b.ptr;
 369
 370     version (D_InlineAsm_X86)
 371     {
 372         // SSE version is 665% faster
 373         if (sse() && a.length >= 16)
 374         {
 375             auto n = aptr + (a.length & ~15);
 376
 377             // Unaligned case
 378             asm
 379             {
 380                 mov EAX, bptr;
 381                 mov ESI, aptr;
 382                 mov EDI, n;
 383                 movss XMM4, value;
 384                 shufps XMM4, XMM4, 0;
 385
 386                 align 8;
 387             startsseloop:
 388                 add ESI, 64;
 389                 movups XMM0, [EAX];
 390                 movups XMM1, [EAX+16];
 391                 movups XMM2, [EAX+32];
 392                 movups XMM3, [EAX+48];
 393                 add EAX, 64;
 394                 addps XMM0, XMM4;
 395                 addps XMM1, XMM4;
 396                 addps XMM2, XMM4;
 397                 addps XMM3, XMM4;
 398                 movups [ESI+ 0-64], XMM0;
 399                 movups [ESI+16-64], XMM1;
 400                 movups [ESI+32-64], XMM2;
 401                 movups [ESI+48-64], XMM3;
 402                 cmp ESI, EDI;
 403                 jb startsseloop;
 404
 405                 mov aptr, ESI;
 406                 mov bptr, EAX;
 407             }
 408         }
 409         else
 410         // 3DNow! version is 69% faster
 411         if (amd3dnow() && a.length >= 8)
 412         {
 413             auto n = aptr + (a.length & ~7);
 414
 415             ulong w = *cast(uint *) &value;
 416             ulong v = w | (w << 32L);
 417
 418             asm
 419             {
 420                 mov ESI, aptr;
 421                 mov EDI, n;
 422                 mov EAX, bptr;
 423                 movq MM4, qword ptr [v];
 424
 425                 align 8;
 426             start3dnow:
 427                 movq MM0, [EAX];
 428                 movq MM1, [EAX+8];
 429                 movq MM2, [EAX+16];
 430                 movq MM3, [EAX+24];
 431                 pfadd MM0, MM4;
 432                 pfadd MM1, MM4;
 433                 pfadd MM2, MM4;
 434                 pfadd MM3, MM4;
 435                 movq [ESI],    MM0;
 436                 movq [ESI+8],  MM1;
 437                 movq [ESI+16], MM2;
 438                 movq [ESI+24], MM3;
 439                 add ESI, 32;
 440                 add EAX, 32;
 441                 cmp ESI, EDI;
 442                 jb start3dnow;
 443
 444                 emms;
 445                 mov aptr, ESI;
 446                 mov bptr, EAX;
 447             }
 448         }
 449     }
 450
 451     while (aptr < aend)
 452         *aptr++ = *bptr++ + value;
 453
 454     return a;
 455 }
 456
 457 unittest
 458 {
 459     printf("_arraySliceExpAddSliceAssign_f unittest\n");
 460     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 461     {
 462         version (log) printf("    cpuid %d\n", cpuid);
 463
 464         for (int j = 0; j < 2; j++)
 465         {
 466             const int dim = 67;
 467             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 468             a = a[j .. dim + j];        // misalign for second iteration
 469             T[] b = new T[dim + j];
 470             b = b[j .. dim + j];
 471             T[] c = new T[dim + j];
 472             c = c[j .. dim + j];
 473
 474             for (int i = 0; i < dim; i++)
 475             {   a[i] = cast(T)i;
 476                 b[i] = cast(T)(i + 7);
 477                 c[i] = cast(T)(i * 2);
 478             }
 479
 480             c[] = a[] + 6;
 481
 482             for (int i = 0; i < dim; i++)
 483             {
 484                 if (c[i] != cast(T)(a[i] + 6))
 485                 {
 486                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
 487                     assert(0);
 488                 }
 489             }
 490         }
 491     }
 492 }
 493
 494 /* ======================================================================== */
 495
 496 /***********************
 497  * Computes:
 498  *      a[] += value
 499  */
 500
 501 T[] _arrayExpSliceAddass_f(T[] a, T value)
 502 {
 503     //printf("_arrayExpSliceAddass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
 504     auto aptr = a.ptr;
 505     auto aend = aptr + a.length;
 506
 507     version (D_InlineAsm_X86)
 508     {
 509         // SSE version is 302% faster
 510         if (sse() && a.length >= 16)
 511         {
 512             // align pointer
 513             auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
 514             while (aptr < n)
 515                 *aptr++ += value;
 516             n = cast(T*)((cast(uint)aend) & ~15);
 517             if (aptr < n)
 518
 519             // Aligned case
 520             asm
 521             {
 522                 mov ESI, aptr;
 523                 mov EDI, n;
 524                 movss XMM4, value;
 525                 shufps XMM4, XMM4, 0;
 526
 527                 align 8;
 528             startsseloopa:
 529                 movaps XMM0, [ESI];
 530                 movaps XMM1, [ESI+16];
 531                 movaps XMM2, [ESI+32];
 532                 movaps XMM3, [ESI+48];
 533                 add ESI, 64;
 534                 addps XMM0, XMM4;
 535                 addps XMM1, XMM4;
 536                 addps XMM2, XMM4;
 537                 addps XMM3, XMM4;
 538                 movaps [ESI+ 0-64], XMM0;
 539                 movaps [ESI+16-64], XMM1;
 540                 movaps [ESI+32-64], XMM2;
 541                 movaps [ESI+48-64], XMM3;
 542                 cmp ESI, EDI;
 543                 jb startsseloopa;
 544
 545                 mov aptr, ESI;
 546             }
 547         }
 548         else
 549         // 3DNow! version is 63% faster
 550         if (amd3dnow() && a.length >= 8)
 551         {
 552             auto n = aptr + (a.length & ~7);
 553
 554             ulong w = *cast(uint *) &value;
 555             ulong v = w | (w << 32L);
 556
 557             asm
 558             {
 559                 mov ESI, dword ptr [aptr];
 560                 mov EDI, dword ptr [n];
 561                 movq MM4, qword ptr [v];
 562
 563                 align 8;
 564             start3dnow:
 565                 movq MM0, [ESI];
 566                 movq MM1, [ESI+8];
 567                 movq MM2, [ESI+16];
 568                 movq MM3, [ESI+24];
 569                 pfadd MM0, MM4;
 570                 pfadd MM1, MM4;
 571                 pfadd MM2, MM4;
 572                 pfadd MM3, MM4;
 573                 movq [ESI], MM0;
 574                 movq [ESI+8], MM1;
 575                 movq [ESI+16], MM2;
 576                 movq [ESI+24], MM3;
 577                 add ESI, 32;
 578                 cmp ESI, EDI;
 579                 jb start3dnow;
 580
 581                 emms;
 582                 mov dword ptr [aptr], ESI;
 583             }
 584         }
 585     }
 586
 587     while (aptr < aend)
 588         *aptr++ += value;
 589
 590     return a;
 591 }
 592
 593 unittest
 594 {
 595     printf("_arrayExpSliceAddass_f unittest\n");
 596     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 597     {
 598         version (log) printf("    cpuid %d\n", cpuid);
 599
 600         for (int j = 0; j < 2; j++)
 601         {
 602             const int dim = 67;
 603             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 604             a = a[j .. dim + j];        // misalign for second iteration
 605             T[] b = new T[dim + j];
 606             b = b[j .. dim + j];
 607             T[] c = new T[dim + j];
 608             c = c[j .. dim + j];
 609
 610             for (int i = 0; i < dim; i++)
 611             {   a[i] = cast(T)i;
 612                 b[i] = cast(T)(i + 7);
 613                 c[i] = cast(T)(i * 2);
 614             }
 615
 616             a[] = c[];
 617             c[] += 6;
 618
 619             for (int i = 0; i < dim; i++)
 620             {
 621                 if (c[i] != cast(T)(a[i] + 6))
 622                 {
 623                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
 624                     assert(0);
 625                 }
 626             }
 627         }
 628     }
 629 }
 630
 631 /* ======================================================================== */
 632
 633 /***********************
 634  * Computes:
 635  *      a[] += b[]
 636  */
 637
 638 T[] _arraySliceSliceAddass_f(T[] a, T[] b)
 639 in
 640 {
 641     assert (a.length == b.length);
 642     assert (disjoint(a, b));
 643 }
 644 body
 645 {
 646     //printf("_arraySliceSliceAddass_f()\n");
 647     auto aptr = a.ptr;
 648     auto aend = aptr + a.length;
 649     auto bptr = b.ptr;
 650
 651     version (D_InlineAsm_X86)
 652     {
 653         // SSE version is 468% faster
 654         if (sse() && a.length >= 16)
 655         {
 656             auto n = aptr + (a.length & ~15);
 657
 658             // Unaligned case
 659             asm
 660             {
 661                 mov ECX, bptr; // right operand
 662                 mov ESI, aptr; // destination operand
 663                 mov EDI, n; // end comparison
 664
 665                 align 8;
 666             startsseloopb:
 667                 movups XMM0, [ESI];
 668                 movups XMM1, [ESI+16];
 669                 movups XMM2, [ESI+32];
 670                 movups XMM3, [ESI+48];
 671                 add ESI, 64;
 672                 movups XMM4, [ECX];
 673                 movups XMM5, [ECX+16];
 674                 movups XMM6, [ECX+32];
 675                 movups XMM7, [ECX+48];
 676                 add ECX, 64;
 677                 addps XMM0, XMM4;
 678                 addps XMM1, XMM5;
 679                 addps XMM2, XMM6;
 680                 addps XMM3, XMM7;
 681                 movups [ESI+ 0-64], XMM0;
 682                 movups [ESI+16-64], XMM1;
 683                 movups [ESI+32-64], XMM2;
 684                 movups [ESI+48-64], XMM3;
 685                 cmp ESI, EDI;
 686                 jb startsseloopb;
 687
 688                 mov aptr, ESI;
 689                 mov bptr, ECX;
 690             }
 691         }
 692         else
 693         // 3DNow! version is 57% faster
 694         if (amd3dnow() && a.length >= 8)
 695         {
 696             auto n = aptr + (a.length & ~7);
 697
 698             asm
 699             {
 700                 mov ESI, dword ptr [aptr]; // destination operand
 701                 mov EDI, dword ptr [n];    // end comparison
 702                 mov ECX, dword ptr [bptr]; // right operand
 703
 704                 align 4;
 705             start3dnow:
 706                 movq MM0, [ESI];
 707                 movq MM1, [ESI+8];
 708                 movq MM2, [ESI+16];
 709                 movq MM3, [ESI+24];
 710                 pfadd MM0, [ECX];
 711                 pfadd MM1, [ECX+8];
 712                 pfadd MM2, [ECX+16];
 713                 pfadd MM3, [ECX+24];
 714                 movq [ESI], MM0;
 715                 movq [ESI+8], MM1;
 716                 movq [ESI+16], MM2;
 717                 movq [ESI+24], MM3;
 718                 add ESI, 32;
 719                 add ECX, 32;
 720                 cmp ESI, EDI;
 721                 jb start3dnow;
 722
 723                 emms;
 724                 mov dword ptr [aptr], ESI;
 725                 mov dword ptr [bptr], ECX;
 726             }
 727         }
 728     }
 729
 730     while (aptr < aend)
 731         *aptr++ += *bptr++;
 732
 733     return a;
 734 }
 735
 736 unittest
 737 {
 738     printf("_arraySliceSliceAddass_f unittest\n");
 739     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 740     {
 741         version (log) printf("    cpuid %d\n", cpuid);
 742
 743         for (int j = 0; j < 2; j++)
 744         {
 745             const int dim = 67;
 746             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 747             a = a[j .. dim + j];        // misalign for second iteration
 748             T[] b = new T[dim + j];
 749             b = b[j .. dim + j];
 750             T[] c = new T[dim + j];
 751             c = c[j .. dim + j];
 752
 753             for (int i = 0; i < dim; i++)
 754             {   a[i] = cast(T)i;
 755                 b[i] = cast(T)(i + 7);
 756                 c[i] = cast(T)(i * 2);
 757             }
 758
 759             a[] = c[];
 760             c[] += b[];
 761
 762             for (int i = 0; i < dim; i++)
 763             {
 764                 if (c[i] != cast(T)(a[i] + b[i]))
 765                 {
 766                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
 767                     assert(0);
 768                 }
 769             }
 770         }
 771     }
 772 }
 773
 774 /* ======================================================================== */
 775
 776 /***********************
 777  * Computes:
 778  *      a[] = b[] - value
 779  */
 780
 781 T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
 782 in
 783 {
 784     assert (a.length == b.length);
 785     assert (disjoint(a, b));
 786 }
 787 body
 788 {
 789     //printf("_arraySliceExpMinSliceAssign_f()\n");
 790     auto aptr = a.ptr;
 791     auto aend = aptr + a.length;
 792     auto bptr = b.ptr;
 793
 794     version (D_InlineAsm_X86)
 795     {
 796         // SSE version is 622% faster
 797         if (sse() && a.length >= 16)
 798         {
 799             auto n = aptr + (a.length & ~15);
 800
 801             // Unaligned case
 802             asm
 803             {
 804                 mov EAX, bptr;
 805                 mov ESI, aptr;
 806                 mov EDI, n;
 807                 movss XMM4, value;
 808                 shufps XMM4, XMM4, 0;
 809
 810                 align 8;
 811             startsseloop:
 812                 add ESI, 64;
 813                 movups XMM0, [EAX];
 814                 movups XMM1, [EAX+16];
 815                 movups XMM2, [EAX+32];
 816                 movups XMM3, [EAX+48];
 817                 add EAX, 64;
 818                 subps XMM0, XMM4;
 819                 subps XMM1, XMM4;
 820                 subps XMM2, XMM4;
 821                 subps XMM3, XMM4;
 822                 movups [ESI+ 0-64], XMM0;
 823                 movups [ESI+16-64], XMM1;
 824                 movups [ESI+32-64], XMM2;
 825                 movups [ESI+48-64], XMM3;
 826                 cmp ESI, EDI;
 827                 jb startsseloop;
 828
 829                 mov aptr, ESI;
 830                 mov bptr, EAX;
 831             }
 832         }
 833         else
 834         // 3DNow! version is 67% faster
 835         if (amd3dnow() && a.length >= 8)
 836         {
 837             auto n = aptr + (a.length & ~7);
 838
 839             T[2] w;
 840
 841             w[0] = w[1] = value;
 842
 843             asm
 844             {
 845                 mov ESI, dword ptr [aptr];
 846                 mov EDI, dword ptr [n];
 847                 mov EAX, dword ptr [bptr];
 848                 movq MM4, qword ptr [w];
 849
 850                 align 8;
 851             start3dnow:
 852                 movq MM0, [EAX];
 853                 movq MM1, [EAX+8];
 854                 movq MM2, [EAX+16];
 855                 movq MM3, [EAX+24];
 856                 pfsub MM0, MM4;
 857                 pfsub MM1, MM4;
 858                 pfsub MM2, MM4;
 859                 pfsub MM3, MM4;
 860                 movq [ESI], MM0;
 861                 movq [ESI+8], MM1;
 862                 movq [ESI+16], MM2;
 863                 movq [ESI+24], MM3;
 864                 add ESI, 32;
 865                 add EAX, 32;
 866                 cmp ESI, EDI;
 867                 jb start3dnow;
 868
 869                 emms;
 870                 mov dword ptr [aptr], ESI;
 871                 mov dword ptr [bptr], EAX;
 872             }
 873         }
 874     }
 875
 876     while (aptr < aend)
 877         *aptr++ = *bptr++ - value;
 878
 879     return a;
 880 }
 881
 882 unittest
 883 {
 884     printf("_arraySliceExpMinSliceAssign_f unittest\n");
 885     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
 886     {
 887         version (log) printf("    cpuid %d\n", cpuid);
 888
 889         for (int j = 0; j < 2; j++)
 890         {
 891             const int dim = 67;
 892             T[] a = new T[dim + j];     // aligned on 16 byte boundary
 893             a = a[j .. dim + j];        // misalign for second iteration
 894             T[] b = new T[dim + j];
 895             b = b[j .. dim + j];
 896             T[] c = new T[dim + j];
 897             c = c[j .. dim + j];
 898
 899             for (int i = 0; i < dim; i++)
 900             {   a[i] = cast(T)i;
 901                 b[i] = cast(T)(i + 7);
 902                 c[i] = cast(T)(i * 2);
 903             }
 904
 905             c[] = a[] - 6;
 906
 907             for (int i = 0; i < dim; i++)
 908             {
 909                 if (c[i] != cast(T)(a[i] - 6))
 910                 {
 911                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
 912                     assert(0);
 913                 }
 914             }
 915         }
 916     }
 917 }
 918
 919 /* ======================================================================== */
 920
 921 /***********************
 922  * Computes:
 923  *      a[] = value - b[]
 924  */
 925
 926 T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
 927 in
 928 {
 929     assert (a.length == b.length);
 930     assert (disjoint(a, b));
 931 }
 932 body
 933 {
 934     //printf("_arrayExpSliceMinSliceAssign_f()\n");
 935     auto aptr = a.ptr;
 936     auto aend = aptr + a.length;
 937     auto bptr = b.ptr;
 938
 939     version (D_InlineAsm_X86)
 940     {
 941         // SSE version is 690% faster
 942         if (sse() && a.length >= 16)
 943         {
 944             auto n = aptr + (a.length & ~15);
 945
 946             // Unaligned case
 947             asm
 948             {
 949                 mov EAX, bptr;
 950                 mov ESI, aptr;
 951                 mov EDI, n;
 952                 movss XMM4, value;
 953                 shufps XMM4, XMM4, 0;
 954
 955                 align 8;
 956             startsseloop:
 957                 add ESI, 64;
 958                 movaps XMM5, XMM4;
 959                 movaps XMM6, XMM4;
 960                 movups XMM0, [EAX];
 961                 movups XMM1, [EAX+16];
 962                 movups XMM2, [EAX+32];
 963                 movups XMM3, [EAX+48];
 964                 add EAX, 64;
 965                 subps XMM5, XMM0;
 966                 subps XMM6, XMM1;
 967                 movups [ESI+ 0-64], XMM5;
 968                 movups [ESI+16-64], XMM6;
 969                 movaps XMM5, XMM4;
 970                 movaps XMM6, XMM4;
 971                 subps XMM5, XMM2;
 972                 subps XMM6, XMM3;
 973                 movups [ESI+32-64], XMM5;
 974                 movups [ESI+48-64], XMM6;
 975                 cmp ESI, EDI;
 976                 jb startsseloop;
 977
 978                 mov aptr, ESI;
 979                 mov bptr, EAX;
 980             }
 981         }
 982         else
 983         // 3DNow! version is 67% faster
 984         if (amd3dnow() && a.length >= 8)
 985         {
 986             auto n = aptr + (a.length & ~7);
 987
 988             ulong w = *cast(uint *) &value;
 989             ulong v = w | (w << 32L);
 990
 991             asm
 992             {
 993                 mov ESI, aptr;
 994                 mov EDI, n;
 995                 mov EAX, bptr;
 996                 movq MM4, qword ptr [v];
 997
 998                 align 8;
 999             start3dnow:
1000                 movq MM0, [EAX];
1001                 movq MM1, [EAX+8];
1002                 movq MM2, [EAX+16];
1003                 movq MM3, [EAX+24];
1004                 pfsubr MM0, MM4;
1005                 pfsubr MM1, MM4;
1006                 pfsubr MM2, MM4;
1007                 pfsubr MM3, MM4;
1008                 movq [ESI], MM0;
1009                 movq [ESI+8], MM1;
1010                 movq [ESI+16], MM2;
1011                 movq [ESI+24], MM3;
1012                 add ESI, 32;
1013                 add EAX, 32;
1014                 cmp ESI, EDI;
1015                 jb start3dnow;
1016
1017                 emms;
1018                 mov aptr, ESI;
1019                 mov bptr, EAX;
1020             }
1021         }
1022     }
1023
1024     while (aptr < aend)
1025         *aptr++ = value - *bptr++;
1026
1027     return a;
1028 }
1029
1030 unittest
1031 {
1032     printf("_arrayExpSliceMinSliceAssign_f unittest\n");
1033     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1034     {
1035         version (log) printf("    cpuid %d\n", cpuid);
1036
1037         for (int j = 0; j < 2; j++)
1038         {
1039             const int dim = 67;
1040             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1041             a = a[j .. dim + j];        // misalign for second iteration
1042             T[] b = new T[dim + j];
1043             b = b[j .. dim + j];
1044             T[] c = new T[dim + j];
1045             c = c[j .. dim + j];
1046
1047             for (int i = 0; i < dim; i++)
1048             {   a[i] = cast(T)i;
1049                 b[i] = cast(T)(i + 7);
1050                 c[i] = cast(T)(i * 2);
1051             }
1052
1053             c[] = 6 - a[];
1054
1055             for (int i = 0; i < dim; i++)
1056             {
1057                 if (c[i] != cast(T)(6 - a[i]))
1058                 {
1059                     printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
1060                     assert(0);
1061                 }
1062             }
1063         }
1064     }
1065 }
1066
1067 /* ======================================================================== */
1068
1069 /***********************
1070  * Computes:
1071  *      a[] -= value
1072  */
1073
1074 T[] _arrayExpSliceMinass_f(T[] a, T value)
1075 {
1076     //printf("_arrayExpSliceMinass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1077     auto aptr = a.ptr;
1078     auto aend = aptr + a.length;
1079
1080     version (D_InlineAsm_X86)
1081     {
1082         // SSE version is 304% faster
1083         if (sse() && a.length >= 16)
1084         {
1085             // align pointer
1086             auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
1087             while (aptr < n)
1088                 *aptr++ -= value;
1089             n = cast(T*)((cast(uint)aend) & ~15);
1090             if (aptr < n)
1091
1092             // Aligned case
1093             asm
1094             {
1095                 mov ESI, aptr;
1096                 mov EDI, n;
1097                 movss XMM4, value;
1098                 shufps XMM4, XMM4, 0;
1099
1100                 align 8;
1101             startsseloopa:
1102                 movaps XMM0, [ESI];
1103                 movaps XMM1, [ESI+16];
1104                 movaps XMM2, [ESI+32];
1105                 movaps XMM3, [ESI+48];
1106                 add ESI, 64;
1107                 subps XMM0, XMM4;
1108                 subps XMM1, XMM4;
1109                 subps XMM2, XMM4;
1110                 subps XMM3, XMM4;
1111                 movaps [ESI+ 0-64], XMM0;
1112                 movaps [ESI+16-64], XMM1;
1113                 movaps [ESI+32-64], XMM2;
1114                 movaps [ESI+48-64], XMM3;
1115                 cmp ESI, EDI;
1116                 jb startsseloopa;
1117
1118                 mov aptr, ESI;
1119             }
1120         }
1121         else
1122         // 3DNow! version is 63% faster
1123         if (amd3dnow() && a.length >= 8)
1124         {
1125             auto n = aptr + (a.length & ~7);
1126
1127             ulong w = *cast(uint *) &value;
1128             ulong v = w | (w << 32L);
1129
1130             asm
1131             {
1132                 mov ESI, dword ptr [aptr];
1133                 mov EDI, dword ptr [n];
1134                 movq MM4, qword ptr [v];
1135
1136                 align 8;
1137             start:
1138                 movq MM0, [ESI];
1139                 movq MM1, [ESI+8];
1140                 movq MM2, [ESI+16];
1141                 movq MM3, [ESI+24];
1142                 pfsub MM0, MM4;
1143                 pfsub MM1, MM4;
1144                 pfsub MM2, MM4;
1145                 pfsub MM3, MM4;
1146                 movq [ESI], MM0;
1147                 movq [ESI+8], MM1;
1148                 movq [ESI+16], MM2;
1149                 movq [ESI+24], MM3;
1150                 add ESI, 32;
1151                 cmp ESI, EDI;
1152                 jb start;
1153
1154                 emms;
1155                 mov dword ptr [aptr], ESI;
1156             }
1157         }
1158     }
1159
1160     while (aptr < aend)
1161         *aptr++ -= value;
1162
1163     return a;
1164 }
1165
1166 unittest
1167 {
1168     printf("_arrayExpSliceminass_f unittest\n");
1169     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1170     {
1171         version (log) printf("    cpuid %d\n", cpuid);
1172
1173         for (int j = 0; j < 2; j++)
1174         {
1175             const int dim = 67;
1176             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1177             a = a[j .. dim + j];        // misalign for second iteration
1178             T[] b = new T[dim + j];
1179             b = b[j .. dim + j];
1180             T[] c = new T[dim + j];
1181             c = c[j .. dim + j];
1182
1183             for (int i = 0; i < dim; i++)
1184             {   a[i] = cast(T)i;
1185                 b[i] = cast(T)(i + 7);
1186                 c[i] = cast(T)(i * 2);
1187             }
1188
1189             a[] = c[];
1190             c[] -= 6;
1191
1192             for (int i = 0; i < dim; i++)
1193             {
1194                 if (c[i] != cast(T)(a[i] - 6))
1195                 {
1196                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
1197                     assert(0);
1198                 }
1199             }
1200         }
1201     }
1202 }
1203
1204 /* ======================================================================== */
1205
1206 /***********************
1207  * Computes:
1208  *      a[] -= b[]
1209  */
1210
1211 T[] _arraySliceSliceMinass_f(T[] a, T[] b)
1212 in
1213 {
1214     assert (a.length == b.length);
1215     assert (disjoint(a, b));
1216 }
1217 body
1218 {
1219     //printf("_arraySliceSliceMinass_f()\n");
1220     auto aptr = a.ptr;
1221     auto aend = aptr + a.length;
1222     auto bptr = b.ptr;
1223
1224     version (D_InlineAsm_X86)
1225     {
1226         // SSE version is 468% faster
1227         if (sse() && a.length >= 16)
1228         {
1229             auto n = aptr + (a.length & ~15);
1230
1231             // Unaligned case
1232             asm
1233             {
1234                 mov ECX, bptr; // right operand
1235                 mov ESI, aptr; // destination operand
1236                 mov EDI, n; // end comparison
1237
1238                 align 8;
1239             startsseloopb:
1240                 movups XMM0, [ESI];
1241                 movups XMM1, [ESI+16];
1242                 movups XMM2, [ESI+32];
1243                 movups XMM3, [ESI+48];
1244                 add ESI, 64;
1245                 movups XMM4, [ECX];
1246                 movups XMM5, [ECX+16];
1247                 movups XMM6, [ECX+32];
1248                 movups XMM7, [ECX+48];
1249                 add ECX, 64;
1250                 subps XMM0, XMM4;
1251                 subps XMM1, XMM5;
1252                 subps XMM2, XMM6;
1253                 subps XMM3, XMM7;
1254                 movups [ESI+ 0-64], XMM0;
1255                 movups [ESI+16-64], XMM1;
1256                 movups [ESI+32-64], XMM2;
1257                 movups [ESI+48-64], XMM3;
1258                 cmp ESI, EDI;
1259                 jb startsseloopb;
1260
1261                 mov aptr, ESI;
1262                 mov bptr, ECX;
1263             }
1264         }
1265         else
1266         // 3DNow! version is 57% faster
1267         if (amd3dnow() && a.length >= 8)
1268         {
1269             auto n = aptr + (a.length & ~7);
1270
1271             asm
1272             {
1273                 mov ESI, dword ptr [aptr]; // destination operand
1274                 mov EDI, dword ptr [n]; // end comparison
1275                 mov ECX, dword ptr [bptr]; // right operand
1276
1277                 align 4;
1278             start:
1279                 movq MM0, [ESI];
1280                 movq MM1, [ESI+8];
1281                 movq MM2, [ESI+16];
1282                 movq MM3, [ESI+24];
1283                 pfsub MM0, [ECX];
1284                 pfsub MM1, [ECX+8];
1285                 pfsub MM2, [ECX+16];
1286                 pfsub MM3, [ECX+24];
1287                 movq [ESI], MM0;
1288                 movq [ESI+8], MM1;
1289                 movq [ESI+16], MM2;
1290                 movq [ESI+24], MM3;
1291                 add ESI, 32;
1292                 add ECX, 32;
1293                 cmp ESI, EDI;
1294                 jb start;
1295
1296                 emms;
1297                 mov aptr, ESI;
1298                 mov bptr, ECX;
1299             }
1300         }
1301     }
1302
1303     while (aptr < aend)
1304         *aptr++ -= *bptr++;
1305
1306     return a;
1307 }
1308
1309 unittest
1310 {
1311     printf("_arrayExpSliceMinass_f unittest\n");
1312     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1313     {
1314         version (log) printf("    cpuid %d\n", cpuid);
1315
1316         for (int j = 0; j < 2; j++)
1317         {
1318             const int dim = 67;
1319             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1320             a = a[j .. dim + j];        // misalign for second iteration
1321             T[] b = new T[dim + j];
1322             b = b[j .. dim + j];
1323             T[] c = new T[dim + j];
1324             c = c[j .. dim + j];
1325
1326             for (int i = 0; i < dim; i++)
1327             {   a[i] = cast(T)i;
1328                 b[i] = cast(T)(i + 7);
1329                 c[i] = cast(T)(i * 2);
1330             }
1331
1332             a[] = c[];
1333             c[] -= 6;
1334
1335             for (int i = 0; i < dim; i++)
1336             {
1337                 if (c[i] != cast(T)(a[i] - 6))
1338                 {
1339                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
1340                     assert(0);
1341                 }
1342             }
1343         }
1344     }
1345 }
1346
1347 /* ======================================================================== */
1348
1349 /***********************
1350  * Computes:
1351  *      a[] = b[] * value
1352  */
1353
1354 T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
1355 in
1356 {
1357     assert(a.length == b.length);
1358     assert(disjoint(a, b));
1359 }
1360 body
1361 {
1362     //printf("_arraySliceExpMulSliceAssign_f()\n");
1363     auto aptr = a.ptr;
1364     auto aend = aptr + a.length;
1365     auto bptr = b.ptr;
1366
1367     version (D_InlineAsm_X86)
1368     {
1369         // SSE version is 607% faster
1370         if (sse() && a.length >= 16)
1371         {
1372             auto n = aptr + (a.length & ~15);
1373
1374             // Unaligned case
1375             asm
1376             {
1377                 mov EAX, bptr;
1378                 mov ESI, aptr;
1379                 mov EDI, n;
1380                 movss XMM4, value;
1381                 shufps XMM4, XMM4, 0;
1382
1383                 align 8;
1384             startsseloop:
1385                 add ESI, 64;
1386                 movups XMM0, [EAX];
1387                 movups XMM1, [EAX+16];
1388                 movups XMM2, [EAX+32];
1389                 movups XMM3, [EAX+48];
1390                 add EAX, 64;
1391                 mulps XMM0, XMM4;
1392                 mulps XMM1, XMM4;
1393                 mulps XMM2, XMM4;
1394                 mulps XMM3, XMM4;
1395                 movups [ESI+ 0-64], XMM0;
1396                 movups [ESI+16-64], XMM1;
1397                 movups [ESI+32-64], XMM2;
1398                 movups [ESI+48-64], XMM3;
1399                 cmp ESI, EDI;
1400                 jb startsseloop;
1401
1402                 mov aptr, ESI;
1403                 mov bptr, EAX;
1404             }
1405         }
1406         else
1407         // 3DNow! version is 69% faster
1408         if (amd3dnow() && a.length >= 8)
1409         {
1410             auto n = aptr + (a.length & ~7);
1411
1412             ulong w = *cast(uint *) &value;
1413             ulong v = w | (w << 32L);
1414
1415             asm
1416             {
1417                 mov ESI, dword ptr [aptr];
1418                 mov EDI, dword ptr [n];
1419                 mov EAX, dword ptr [bptr];
1420                 movq MM4, qword ptr [v];
1421
1422                 align 8;
1423             start:
1424                 movq MM0, [EAX];
1425                 movq MM1, [EAX+8];
1426                 movq MM2, [EAX+16];
1427                 movq MM3, [EAX+24];
1428                 pfmul MM0, MM4;
1429                 pfmul MM1, MM4;
1430                 pfmul MM2, MM4;
1431                 pfmul MM3, MM4;
1432                 movq [ESI], MM0;
1433                 movq [ESI+8], MM1;
1434                 movq [ESI+16], MM2;
1435                 movq [ESI+24], MM3;
1436                 add ESI, 32;
1437                 add EAX, 32;
1438                 cmp ESI, EDI;
1439                 jb start;
1440
1441                 emms;
1442                 mov aptr, ESI;
1443                 mov bptr, EAX;
1444             }
1445         }
1446     }
1447
1448     while (aptr < aend)
1449         *aptr++ = *bptr++ * value;
1450
1451     return a;
1452 }
1453
1454 unittest
1455 {
1456     printf("_arraySliceExpMulSliceAssign_f unittest\n");
1457     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1458     {
1459         version (log) printf("    cpuid %d\n", cpuid);
1460
1461         for (int j = 0; j < 2; j++)
1462         {
1463             const int dim = 67;
1464             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1465             a = a[j .. dim + j];        // misalign for second iteration
1466             T[] b = new T[dim + j];
1467             b = b[j .. dim + j];
1468             T[] c = new T[dim + j];
1469             c = c[j .. dim + j];
1470
1471             for (int i = 0; i < dim; i++)
1472             {   a[i] = cast(T)i;
1473                 b[i] = cast(T)(i + 7);
1474                 c[i] = cast(T)(i * 2);
1475             }
1476
1477             c[] = a[] * 6;
1478
1479             for (int i = 0; i < dim; i++)
1480             {
1481                 if (c[i] != cast(T)(a[i] * 6))
1482                 {
1483                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1484                     assert(0);
1485                 }
1486             }
1487         }
1488     }
1489 }
1490
1491 /* ======================================================================== */
1492
1493 /***********************
1494  * Computes:
1495  *      a[] = b[] * c[]
1496  */
1497
1498 T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
1499 in
1500 {
1501         assert(a.length == b.length && b.length == c.length);
1502         assert(disjoint(a, b));
1503         assert(disjoint(a, c));
1504         assert(disjoint(b, c));
1505 }
1506 body
1507 {
1508     //printf("_arraySliceSliceMulSliceAssign_f()\n");
1509     auto aptr = a.ptr;
1510     auto aend = aptr + a.length;
1511     auto bptr = b.ptr;
1512     auto cptr = c.ptr;
1513
1514     version (D_InlineAsm_X86)
1515     {
1516         // SSE version is 833% faster
1517         if (sse() && a.length >= 16)
1518         {
1519             auto n = aptr + (a.length & ~15);
1520
1521             // Unaligned case
1522             asm
1523             {
1524                 mov EAX, bptr; // left operand
1525                 mov ECX, cptr; // right operand
1526                 mov ESI, aptr; // destination operand
1527                 mov EDI, n; // end comparison
1528
1529                 align 8;
1530             startsseloopb:
1531                 movups XMM0, [EAX];
1532                 movups XMM1, [EAX+16];
1533                 movups XMM2, [EAX+32];
1534                 movups XMM3, [EAX+48];
1535                 add ESI, 64;
1536                 movups XMM4, [ECX];
1537                 movups XMM5, [ECX+16];
1538                 movups XMM6, [ECX+32];
1539                 movups XMM7, [ECX+48];
1540                 add EAX, 64;
1541                 mulps XMM0, XMM4;
1542                 mulps XMM1, XMM5;
1543                 mulps XMM2, XMM6;
1544                 mulps XMM3, XMM7;
1545                 add ECX, 64;
1546                 movups [ESI+ 0-64], XMM0;
1547                 movups [ESI+16-64], XMM1;
1548                 movups [ESI+32-64], XMM2;
1549                 movups [ESI+48-64], XMM3;
1550                 cmp ESI, EDI;
1551                 jb startsseloopb;
1552
1553                 mov aptr, ESI;
1554                 mov bptr, EAX;
1555                 mov cptr, ECX;
1556             }
1557         }
1558         else
1559         // 3DNow! version is only 13% faster
1560         if (amd3dnow() && a.length >= 8)
1561         {
1562             auto n = aptr + (a.length & ~7);
1563
1564             asm
1565             {
1566                 mov ESI, dword ptr [aptr]; // destination operand
1567                 mov EDI, dword ptr [n]; // end comparison
1568                 mov EAX, dword ptr [bptr]; // left operand
1569                 mov ECX, dword ptr [cptr]; // right operand
1570
1571                 align 4;
1572             start:
1573                 movq MM0, [EAX];
1574                 movq MM1, [EAX+8];
1575                 movq MM2, [EAX+16];
1576                 movq MM3, [EAX+24];
1577                 pfmul MM0, [ECX];
1578                 pfmul MM1, [ECX+8];
1579                 pfmul MM2, [ECX+16];
1580                 pfmul MM3, [ECX+24];
1581                 movq [ESI], MM0;
1582                 movq [ESI+8], MM1;
1583                 movq [ESI+16], MM2;
1584                 movq [ESI+24], MM3;
1585                 add ECX, 32;
1586                 add ESI, 32;
1587                 add EAX, 32;
1588                 cmp ESI, EDI;
1589                 jb start;
1590
1591                 emms;
1592                 mov aptr, ESI;
1593                 mov bptr, EAX;
1594                 mov cptr, ECX;
1595             }
1596         }
1597     }
1598
1599     while (aptr < aend)
1600         *aptr++ = *bptr++ * *cptr++;
1601
1602     return a;
1603 }
1604
1605 unittest
1606 {
1607     printf("_arraySliceSliceMulSliceAssign_f unittest\n");
1608     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1609     {
1610         version (log) printf("    cpuid %d\n", cpuid);
1611
1612         for (int j = 0; j < 2; j++)
1613         {
1614             const int dim = 67;
1615             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1616             a = a[j .. dim + j];        // misalign for second iteration
1617             T[] b = new T[dim + j];
1618             b = b[j .. dim + j];
1619             T[] c = new T[dim + j];
1620             c = c[j .. dim + j];
1621
1622             for (int i = 0; i < dim; i++)
1623             {   a[i] = cast(T)i;
1624                 b[i] = cast(T)(i + 7);
1625                 c[i] = cast(T)(i * 2);
1626             }
1627
1628             c[] = a[] * b[];
1629
1630             for (int i = 0; i < dim; i++)
1631             {
1632                 if (c[i] != cast(T)(a[i] * b[i]))
1633                 {
1634                     printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
1635                     assert(0);
1636                 }
1637             }
1638         }
1639     }
1640 }
1641
1642 /* ======================================================================== */
1643
1644 /***********************
1645  * Computes:
1646  *      a[] *= value
1647  */
1648
1649 T[] _arrayExpSliceMulass_f(T[] a, T value)
1650 {
1651     //printf("_arrayExpSliceMulass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1652     auto aptr = a.ptr;
1653     auto aend = aptr + a.length;
1654
1655     version (D_InlineAsm_X86)
1656     {
1657         // SSE version is 303% faster
1658         if (sse() && a.length >= 16)
1659         {
1660             // align pointer
1661             auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
1662             while (aptr < n)
1663                 *aptr++ *= value;
1664             n = cast(T*)((cast(uint)aend) & ~15);
1665             if (aptr < n)
1666
1667             // Aligned case
1668             asm
1669             {
1670                 mov ESI, aptr;
1671                 mov EDI, n;
1672                 movss XMM4, value;
1673                 shufps XMM4, XMM4, 0;
1674
1675                 align 8;
1676             startsseloopa:
1677                 movaps XMM0, [ESI];
1678                 movaps XMM1, [ESI+16];
1679                 movaps XMM2, [ESI+32];
1680                 movaps XMM3, [ESI+48];
1681                 add ESI, 64;
1682                 mulps XMM0, XMM4;
1683                 mulps XMM1, XMM4;
1684                 mulps XMM2, XMM4;
1685                 mulps XMM3, XMM4;
1686                 movaps [ESI+ 0-64], XMM0;
1687                 movaps [ESI+16-64], XMM1;
1688                 movaps [ESI+32-64], XMM2;
1689                 movaps [ESI+48-64], XMM3;
1690                 cmp ESI, EDI;
1691                 jb startsseloopa;
1692
1693                 mov aptr, ESI;
1694             }
1695         }
1696         else
1697         // 3DNow! version is 63% faster
1698         if (amd3dnow() && a.length >= 8)
1699         {
1700             auto n = aptr + (a.length & ~7);
1701
1702             ulong w = *cast(uint *) &value;
1703             ulong v = w | (w << 32L);
1704
1705             asm
1706             {
1707                 mov ESI, dword ptr [aptr];
1708                 mov EDI, dword ptr [n];
1709                 movq MM4, qword ptr [v];
1710
1711                 align 8;
1712             start:
1713                 movq MM0, [ESI];
1714                 movq MM1, [ESI+8];
1715                 movq MM2, [ESI+16];
1716                 movq MM3, [ESI+24];
1717                 pfmul MM0, MM4;
1718                 pfmul MM1, MM4;
1719                 pfmul MM2, MM4;
1720                 pfmul MM3, MM4;
1721                 movq [ESI], MM0;
1722                 movq [ESI+8], MM1;
1723                 movq [ESI+16], MM2;
1724                 movq [ESI+24], MM3;
1725                 add ESI, 32;
1726                 cmp ESI, EDI;
1727                 jb start;
1728
1729                 emms;
1730                 mov dword ptr [aptr], ESI;
1731             }
1732         }
1733     }
1734
1735     while (aptr < aend)
1736         *aptr++ *= value;
1737
1738     return a;
1739 }
1740
1741 unittest
1742 {
1743     printf("_arrayExpSliceMulass_f unittest\n");
1744     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1745     {
1746         version (log) printf("    cpuid %d\n", cpuid);
1747
1748         for (int j = 0; j < 2; j++)
1749         {
1750             const int dim = 67;
1751             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1752             a = a[j .. dim + j];        // misalign for second iteration
1753             T[] b = new T[dim + j];
1754             b = b[j .. dim + j];
1755             T[] c = new T[dim + j];
1756             c = c[j .. dim + j];
1757
1758             for (int i = 0; i < dim; i++)
1759             {   a[i] = cast(T)i;
1760                 b[i] = cast(T)(i + 7);
1761                 c[i] = cast(T)(i * 2);
1762             }
1763
1764             a[] = c[];
1765             c[] *= 6;
1766
1767             for (int i = 0; i < dim; i++)
1768             {
1769                 if (c[i] != cast(T)(a[i] * 6))
1770                 {
1771                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1772                     assert(0);
1773                 }
1774             }
1775         }
1776     }
1777 }
1778
1779 /* ======================================================================== */
1780
1781 /***********************
1782  * Computes:
1783  *      a[] *= b[]
1784  */
1785
1786 T[] _arraySliceSliceMulass_f(T[] a, T[] b)
1787 in
1788 {
1789     assert (a.length == b.length);
1790     assert (disjoint(a, b));
1791 }
1792 body
1793 {
1794     //printf("_arraySliceSliceMulass_f()\n");
1795     auto aptr = a.ptr;
1796     auto aend = aptr + a.length;
1797     auto bptr = b.ptr;
1798
1799     version (D_InlineAsm_X86)
1800     {
1801         // SSE version is 525% faster
1802         if (sse() && a.length >= 16)
1803         {
1804             auto n = aptr + (a.length & ~15);
1805
1806             // Unaligned case
1807             asm
1808             {
1809                 mov ECX, bptr; // right operand
1810                 mov ESI, aptr; // destination operand
1811                 mov EDI, n; // end comparison
1812
1813                 align 8;
1814             startsseloopb:
1815                 movups XMM0, [ESI];
1816                 movups XMM1, [ESI+16];
1817                 movups XMM2, [ESI+32];
1818                 movups XMM3, [ESI+48];
1819                 add ESI, 64;
1820                 movups XMM4, [ECX];
1821                 movups XMM5, [ECX+16];
1822                 movups XMM6, [ECX+32];
1823                 movups XMM7, [ECX+48];
1824                 add ECX, 64;
1825                 mulps XMM0, XMM4;
1826                 mulps XMM1, XMM5;
1827                 mulps XMM2, XMM6;
1828                 mulps XMM3, XMM7;
1829                 movups [ESI+ 0-64], XMM0;
1830                 movups [ESI+16-64], XMM1;
1831                 movups [ESI+32-64], XMM2;
1832                 movups [ESI+48-64], XMM3;
1833                 cmp ESI, EDI;
1834                 jb startsseloopb;
1835
1836                 mov aptr, ESI;
1837                 mov bptr, ECX;
1838             }
1839         }
1840         else
1841         // 3DNow! version is 57% faster
1842         if (amd3dnow() && a.length >= 8)
1843         {
1844             auto n = aptr + (a.length & ~7);
1845
1846             asm
1847             {
1848                 mov ESI, dword ptr [aptr]; // destination operand
1849                 mov EDI, dword ptr [n]; // end comparison
1850                 mov ECX, dword ptr [bptr]; // right operand
1851
1852                 align 4;
1853             start:
1854                 movq MM0, [ESI];
1855                 movq MM1, [ESI+8];
1856                 movq MM2, [ESI+16];
1857                 movq MM3, [ESI+24];
1858                 pfmul MM0, [ECX];
1859                 pfmul MM1, [ECX+8];
1860                 pfmul MM2, [ECX+16];
1861                 pfmul MM3, [ECX+24];
1862                 movq [ESI], MM0;
1863                 movq [ESI+8], MM1;
1864                 movq [ESI+16], MM2;
1865                 movq [ESI+24], MM3;
1866                 add ESI, 32;
1867                 add ECX, 32;
1868                 cmp ESI, EDI;
1869                 jb start;
1870
1871                 emms;
1872                 mov dword ptr [aptr], ESI;
1873                 mov dword ptr [bptr], ECX;
1874             }
1875         }
1876     }
1877
1878     while (aptr < aend)
1879         *aptr++ *= *bptr++;
1880
1881     return a;
1882 }
1883
1884 unittest
1885 {
1886     printf("_arrayExpSliceMulass_f unittest\n");
1887     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1888     {
1889         version (log) printf("    cpuid %d\n", cpuid);
1890
1891         for (int j = 0; j < 2; j++)
1892         {
1893             const int dim = 67;
1894             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1895             a = a[j .. dim + j];        // misalign for second iteration
1896             T[] b = new T[dim + j];
1897             b = b[j .. dim + j];
1898             T[] c = new T[dim + j];
1899             c = c[j .. dim + j];
1900
1901             for (int i = 0; i < dim; i++)
1902             {   a[i] = cast(T)i;
1903                 b[i] = cast(T)(i + 7);
1904                 c[i] = cast(T)(i * 2);
1905             }
1906
1907             a[] = c[];
1908             c[] *= 6;
1909
1910             for (int i = 0; i < dim; i++)
1911             {
1912                 if (c[i] != cast(T)(a[i] * 6))
1913                 {
1914                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1915                     assert(0);
1916                 }
1917             }
1918         }
1919     }
1920 }
1921
1922 /* ======================================================================== */
1923
1924 /***********************
1925  * Computes:
1926  *      a[] = b[] / value
1927  */
1928
1929 T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
1930 in
1931 {
1932     assert(a.length == b.length);
1933     assert(disjoint(a, b));
1934 }
1935 body
1936 {
1937     //printf("_arraySliceExpDivSliceAssign_f()\n");
1938     auto aptr = a.ptr;
1939     auto aend = aptr + a.length;
1940     auto bptr = b.ptr;
1941
1942     /* Multiplying by the reciprocal is faster, but does
1943      * not produce as accurate an answer.
1944      */
1945     T recip = cast(T)1 / value;
1946
1947     version (D_InlineAsm_X86)
1948     {
1949         // SSE version is 587% faster
1950         if (sse() && a.length >= 16)
1951         {
1952             auto n = aptr + (a.length & ~15);
1953
1954             // Unaligned case
1955             asm
1956             {
1957                 mov EAX, bptr;
1958                 mov ESI, aptr;
1959                 mov EDI, n;
1960                 movss XMM4, recip;
1961                 //movss XMM4, value
1962                 //rcpss XMM4, XMM4
1963                 shufps XMM4, XMM4, 0;
1964
1965                 align 8;
1966             startsseloop:
1967                 add ESI, 64;
1968                 movups XMM0, [EAX];
1969                 movups XMM1, [EAX+16];
1970                 movups XMM2, [EAX+32];
1971                 movups XMM3, [EAX+48];
1972                 add EAX, 64;
1973                 mulps XMM0, XMM4;
1974                 mulps XMM1, XMM4;
1975                 mulps XMM2, XMM4;
1976                 mulps XMM3, XMM4;
1977                 //divps XMM0, XMM4;
1978                 //divps XMM1, XMM4;
1979                 //divps XMM2, XMM4;
1980                 //divps XMM3, XMM4;
1981                 movups [ESI+ 0-64], XMM0;
1982                 movups [ESI+16-64], XMM1;
1983                 movups [ESI+32-64], XMM2;
1984                 movups [ESI+48-64], XMM3;
1985                 cmp ESI, EDI;
1986                 jb startsseloop;
1987
1988                 mov aptr, ESI;
1989                 mov bptr, EAX;
1990             }
1991         }
1992         else
1993         // 3DNow! version is 72% faster
1994         if (amd3dnow() && a.length >= 8)
1995         {
1996             auto n = aptr + (a.length & ~7);
1997
1998             T[2] w = void;
1999
2000             w[0] = recip;
2001             w[1] = recip;
2002
2003             asm
2004             {
2005                 mov ESI, dword ptr [aptr];
2006                 mov EDI, dword ptr [n];
2007                 mov EAX, dword ptr [bptr];
2008                 movq MM4, qword ptr [w];
2009
2010                 align 8;
2011             start:
2012                 movq MM0, [EAX];
2013                 movq MM1, [EAX+8];
2014                 movq MM2, [EAX+16];
2015                 movq MM3, [EAX+24];
2016                 pfmul MM0, MM4;
2017                 pfmul MM1, MM4;
2018                 pfmul MM2, MM4;
2019                 pfmul MM3, MM4;
2020                 movq [ESI], MM0;
2021                 movq [ESI+8], MM1;
2022                 movq [ESI+16], MM2;
2023                 movq [ESI+24], MM3;
2024                 add ESI, 32;
2025                 add EAX, 32;
2026                 cmp ESI, EDI;
2027                 jb start;
2028
2029                 emms;
2030                 mov dword ptr [aptr], ESI;
2031                 mov dword ptr [bptr], EAX;
2032             }
2033         }
2034     }
2035
2036     while (aptr < aend)
2037         *aptr++ = *bptr++ * recip;
2038
2039     return a;
2040 }
2041
2042 unittest
2043 {
2044     printf("_arraySliceExpDivSliceAssign_f unittest\n");
2045     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2046     {
2047         version (log) printf("    cpuid %d\n", cpuid);
2048
2049         for (int j = 0; j < 2; j++)
2050         {
2051             const int dim = 67;
2052             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2053             a = a[j .. dim + j];        // misalign for second iteration
2054             T[] b = new T[dim + j];
2055             b = b[j .. dim + j];
2056             T[] c = new T[dim + j];
2057             c = c[j .. dim + j];
2058
2059             for (int i = 0; i < dim; i++)
2060             {   a[i] = cast(T)i;
2061                 b[i] = cast(T)(i + 7);
2062                 c[i] = cast(T)(i * 2);
2063             }
2064
2065             c[] = a[] / 8;
2066
2067             for (int i = 0; i < dim; i++)
2068             {
2069                 if (c[i] != cast(T)(a[i] / 8))
2070                 {
2071                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
2072                     assert(0);
2073                 }
2074             }
2075         }
2076     }
2077 }
2078
2079 /* ======================================================================== */
2080
2081 /***********************
2082  * Computes:
2083  *      a[] /= value
2084  */
2085
2086 T[] _arrayExpSliceDivass_f(T[] a, T value)
2087 {
2088     //printf("_arrayExpSliceDivass_f(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
2089     auto aptr = a.ptr;
2090     auto aend = aptr + a.length;
2091
2092     /* Multiplying by the reciprocal is faster, but does
2093      * not produce as accurate an answer.
2094      */
2095     T recip = cast(T)1 / value;
2096
2097     version (D_InlineAsm_X86)
2098     {
2099         // SSE version is 245% faster
2100         if (sse() && a.length >= 16)
2101         {
2102             // align pointer
2103             auto n = cast(T*)((cast(uint)aptr + 15) & ~15);
2104             while (aptr < n)
2105                 *aptr++ *= recip;
2106             n = cast(T*)((cast(uint)aend) & ~15);
2107             if (aptr < n)
2108
2109             // Aligned case
2110             asm
2111             {
2112                 mov ESI, aptr;
2113                 mov EDI, n;
2114                 movss XMM4, recip;
2115                 //movss XMM4, value
2116                 //rcpss XMM4, XMM4
2117                 shufps XMM4, XMM4, 0;
2118
2119                 align 8;
2120             startsseloopa:
2121                 movaps XMM0, [ESI];
2122                 movaps XMM1, [ESI+16];
2123                 movaps XMM2, [ESI+32];
2124                 movaps XMM3, [ESI+48];
2125                 add ESI, 64;
2126                 mulps XMM0, XMM4;
2127                 mulps XMM1, XMM4;
2128                 mulps XMM2, XMM4;
2129                 mulps XMM3, XMM4;
2130                 //divps XMM0, XMM4;
2131                 //divps XMM1, XMM4;
2132                 //divps XMM2, XMM4;
2133                 //divps XMM3, XMM4;
2134                 movaps [ESI+ 0-64], XMM0;
2135                 movaps [ESI+16-64], XMM1;
2136                 movaps [ESI+32-64], XMM2;
2137                 movaps [ESI+48-64], XMM3;
2138                 cmp ESI, EDI;
2139                 jb startsseloopa;
2140
2141                 mov aptr, ESI;
2142             }
2143         }
2144         else
2145         // 3DNow! version is 57% faster
2146         if (amd3dnow() && a.length >= 8)
2147         {
2148             auto n = aptr + (a.length & ~7);
2149
2150             T[2] w = void;
2151
2152             w[0] = w[1] = recip;
2153
2154             asm
2155             {
2156                 mov ESI, dword ptr [aptr];
2157                 mov EDI, dword ptr [n];
2158                 movq MM4, qword ptr [w];
2159
2160                 align 8;
2161             start:
2162                 movq MM0, [ESI];
2163                 movq MM1, [ESI+8];
2164                 movq MM2, [ESI+16];
2165                 movq MM3, [ESI+24];
2166                 pfmul MM0, MM4;
2167                 pfmul MM1, MM4;
2168                 pfmul MM2, MM4;
2169                 pfmul MM3, MM4;
2170                 movq [ESI], MM0;
2171                 movq [ESI+8], MM1;
2172                 movq [ESI+16], MM2;
2173                 movq [ESI+24], MM3;
2174                 add ESI, 32;
2175                 cmp ESI, EDI;
2176                 jb start;
2177
2178                 emms;
2179                 mov dword ptr [aptr], ESI;
2180             }
2181         }
2182     }
2183
2184     while (aptr < aend)
2185         *aptr++ *= recip;
2186
2187     return a;
2188 }
2189
2190 unittest
2191 {
2192     printf("_arrayExpSliceDivass_f unittest\n");
2193     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2194     {
2195         version (log) printf("    cpuid %d\n", cpuid);
2196
2197         for (int j = 0; j < 2; j++)
2198         {
2199             const int dim = 67;
2200             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2201             a = a[j .. dim + j];        // misalign for second iteration
2202             T[] b = new T[dim + j];
2203             b = b[j .. dim + j];
2204             T[] c = new T[dim + j];
2205             c = c[j .. dim + j];
2206
2207             for (int i = 0; i < dim; i++)
2208             {   a[i] = cast(T)i;
2209                 b[i] = cast(T)(i + 7);
2210                 c[i] = cast(T)(i * 2);
2211             }
2212
2213             a[] = c[];
2214             c[] /= 8;
2215
2216             for (int i = 0; i < dim; i++)
2217             {
2218                 if (c[i] != cast(T)(a[i] / 8))
2219                 {
2220                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
2221                     assert(0);
2222                 }
2223             }
2224         }
2225     }
2226 }
2227
2228
2229 /* ======================================================================== */
2230
2231 /***********************
2232  * Computes:
2233  *      a[] -= b[] * value
2234  */
2235
2236 T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
2237 {
2238     return _arraySliceExpMulSliceAddass_f(a, -value, b);
2239 }
2240
2241 /***********************
2242  * Computes:
2243  *      a[] += b[] * value
2244  */
2245
2246 T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
2247 in
2248 {
2249         assert(a.length == b.length);
2250         assert(disjoint(a, b));
2251 }
2252 body
2253 {
2254     auto aptr = a.ptr;
2255     auto aend = aptr + a.length;
2256     auto bptr = b.ptr;
2257
2258     // Handle remainder
2259     while (aptr < aend)
2260         *aptr++ += *bptr++ * value;
2261
2262     return a;
2263 }
2264
2265 unittest
2266 {
2267     printf("_arraySliceExpMulSliceAddass_f unittest\n");
2268
2269     cpuid = 1;
2270     {
2271         version (log) printf("    cpuid %d\n", cpuid);
2272
2273         for (int j = 0; j < 1; j++)
2274         {
2275             const int dim = 67;
2276             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2277             a = a[j .. dim + j];        // misalign for second iteration
2278             T[] b = new T[dim + j];
2279             b = b[j .. dim + j];
2280             T[] c = new T[dim + j];
2281             c = c[j .. dim + j];
2282
2283             for (int i = 0; i < dim; i++)
2284             {   a[i] = cast(T)i;
2285                 b[i] = cast(T)(i + 7);
2286                 c[i] = cast(T)(i * 2);
2287             }
2288
2289             b[] = c[];
2290             c[] += a[] * 6;
2291
2292             for (int i = 0; i < dim; i++)
2293             {
2294                 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
2295                 if (c[i] != cast(T)(b[i] + a[i] * 6))
2296                 {
2297                     printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
2298                     assert(0);
2299                 }
2300             }
2301         }
2302     }
2303 }