| |||
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] |
for( i = 0; i < Size16; i += 16 ) { vSrc1_1 = vec_ld( 16 - 1, inSrc1 ); vSrc2_1 = vec_ld( 16 - 1, inSrc2 ); vSrc1_2 = vec_ld( 32 - 1, inSrc1 ); vSrc2_2 = vec_ld( 32 - 1, inSrc2 ); vSrc1_3 = vec_ld( 48 - 1, inSrc1 ); vSrc2_3 = vec_ld( 48 - 1, inSrc2 ); vSrc1_E = vec_ld( 64 - 1, inSrc1 ); vSrc2_E = vec_ld( 64 - 1, inSrc2 ); inSrc1 += 16; inSrc2 += 16;
vSrc1_0 = vec_perm( vSrc1_0, vSrc1_1, permSrc1Mask ); vSrc2_0 = vec_perm( vSrc2_0, vSrc2_1, permSrc2Mask ); vSrc1_1 = vec_perm( vSrc1_1, vSrc1_2, permSrc1Mask ); vSrc2_1 = vec_perm( vSrc2_1, vSrc2_2, permSrc2Mask ); vSrc1_2 = vec_perm( vSrc1_2, vSrc1_3, permSrc1Mask ); vSrc2_2 = vec_perm( vSrc2_2, vSrc2_3, permSrc2Mask ); vSrc1_3 = vec_perm( vSrc1_3, vSrc1_E, permSrc1Mask ); vSrc2_3 = vec_perm( vSrc2_3, vSrc2_E, permSrc2Mask );
vSum = vec_madd( vSrc1_0, vSrc2_0, vSum ); vSum = vec_madd( vSrc1_1, vSrc2_1, vSum ); vSum = vec_madd( vSrc1_2, vSrc2_2, vSum ); vSum = vec_madd( vSrc1_3, vSrc2_3, vSum );
vSrc1_0 = vSrc1_E; vSrc2_0 = vSrc2_E; } // for (vector)
for( i = Size16; i < Size4; i +=4 ) { vSrc1_E= vec_ld( 16 - 1, inSrc1 ); vSrc2_E = vec_ld( 16 - 1, inSrc2 ); inSrc1 += 4; inSrc2 += 4;
vSrc1_0 = vec_perm( vSrc1_0, vSrc1_E, permSrc1Mask ); vSrc2_0 = vec_perm( vSrc2_0, vSrc2_E, permSrc2Mask );
vSum = vec_madd( vSrc1_0, vSrc2_0, vSum );
vSrc1_0 = vSrc1_E; vSrc2_0 = vSrc2_E; } // for (vector)
3. Scalar "trailer" for any floats left
13.1% 0x2d460 lvx v13,r10,r3 5:1 Stall=2, Loop start[1] 0x2d464 lvx v0,r10,r4 5:1 0x2d468 lvx v10,r11,r3 5:1
0x2d46c vperm v11,v11,v13,v8 3:1
2.7% 0x2d470 lvx v12,r11,r4 5:1
0x2d474 vperm v1,v1,v0,v7 3:1 Stall=1
0x2d478 vperm v13,v13,v10,v8 3:1
0x2d47c vmaddfp v9,v11,v1,v9 8:1 Stall=3
11.6% 0x2d480 vperm v0,v0,v12,v7 3:1 0x2d484 lvx v11,r9,r3 5:1 Stall=3 0x2d488 lvx v1,r9,r4 5:1 Stall=2 0x2d48c vperm v10,v10,v11,v8 3:1
2.2% 0x2d490 vmaddfp v9,v13,v0,v9 8:1
0x2d494 vperm v12,v12,v1,v7 3:1
0x2d498 lvx v0,r2,r3 5:1
0x2d49c lvx v13,r2,r4 5:1
12.6% 0x2d4a0 addi r3,r3,64 2:1
0x2d4a4 addi r4,r4,64 2:1
0x2d4a8 vmaddfp v9,v10,v12,v9 8:1 Stall=3 0x2d4ac vperm v10,v11,v0,v8 3:1
15.5% 0x2d4b0 vperm v12,v1,v13,v7 3:1 0x2d4b4 vmr v11,v0 2:1
0x2d4b8 vsldoi v1,v13,v13,0 3:1 0x2d4bc vmaddfp v9,v10,v12,v9 8:1 0x2d4c0 bdnz $-96 <DotProductVecNew(float const*, float const*, int) + 96> 2:1 Loop end[1]
0.7% 0x2d4e0 lvx v13,r2,r3 5:1 Loop start[2]
0x2d4e4 addi r3,r3,16 2:1
0.7% 0x2d4e8 lvx v10,r2,r4 5:1
0x2d4ec addi r4,r4,16 2:1
0x2d4f0 vperm v0,v11,v13,v8 3:1
0x2d4f4 vmr v11,v13 2:1
1.0% 0x2d4f8 vperm v12,v1,v10,v7 3:1 Stall=1
0x2d4fc vmr v1,v10 2:1
0.7% 0x2d500 vmaddfp v9,v0,v12,v9 8:1
0x2d504 bdnz $-36 <DotProductVecNew(float const*, float const*, int) + 224> 2:1 Loop end[2]
-- Sincerely, Rustam Muginov
_______________________________________________ Do not post admin requests to the list. They will be ignored. PerfOptimization-dev mailing list (email@hidden) Help/Unsubscribe/Update your Subscription: http://lists.apple.com/mailman/options/perfoptimization-dev/email@hidden
| Home | Archives | FAQ | Terms/Conditions | Contact | RSS | Lists | About |
Visit the Apple Store online or at retail locations.
1-800-MY-APPLE
Contact Apple | Terms of Use | Privacy Policy
Copyright © 2007 Apple Inc. All rights reserved.