short->float conversion
short->float conversion
- Subject: short->float conversion
- From: Ian Ollmann <email@hidden>
- Date: Fri, 24 Aug 2001 00:21:21 -0700
I tried two methods of doing short->float conversion. I found that a
method similar to what Metrowerks does (see the PPC Compiler Writer's Guide,
p. 83) works best. Critical to its speed, however, was achieving
proper pipelining.
Even when executing nearly perfectly (two instructions dispatched and
retired each cycle), the method that operates strictly in the integer unit
was only able to get a throughput of about one conversion per ten cycles:
//Convert SInt16 sound samples to floats in the range -1.0 ... 1.0,
//building each IEEE-754 single-precision bit pattern with integer
//operations only.
//  src   - input samples (count entries)
//  dest  - output buffer (count entries); the raw 32-bit patterns are
//          stored through an SInt32 pointer, not as float values
//  count - number of samples to convert; count <= 0 converts nothing
//A zero sample yields 0.0f: the sign/exponent field is masked to zero when
//the magnitude is zero, at the cost of a few extra integer operations per
//sample.
//Relies on >> of a negative signed value being an arithmetic shift.
void Convert1( SInt16 *src, float *dest, int count )
{
    long loopCount;
    long value1, value2, mask1, mask2, zeros1, zeros2, exponent1, exponent2;
    long nonzero1, nonzero2;

    //A negative count would make loopCount negative, and the countdown
    //loop below only stops when it reaches zero.
    if( count <= 0 )
        return;

    loopCount = count / 2;

    //Two samples per iteration, interleaved so independent work can overlap
    while( loopCount-- )
    {
        value1 = src[0];
        value2 = src[1];

        //All ones when the sample is negative, zero otherwise
        mask1 = value1 >> 31;
        mask2 = value2 >> 31;

        //Branch-free absolute value
        value1 = ((-value1) & mask1) | (value1 & ~mask1);
        value2 = ((-value2) & mask2) | (value2 & ~mask2);

        //All ones when the magnitude is nonzero, zero otherwise
        nonzero1 = (-value1) >> 31;
        nonzero2 = (-value2) >> 31;

        zeros1 = __cntlzw( value1 );
        zeros2 = __cntlzw( value2 );

        //Bit 8 carries the sign; the low 8 bits are the biased exponent
        //(143 - zeros == 127 - 15 + position of the leading one bit).
        //The whole field is forced to 0 for a zero sample so the result is 0.0f.
        exponent1 = ((mask1 & 256) + (143 - zeros1)) & nonzero1;
        exponent2 = ((mask2 & 256) + (143 - zeros2)) & nonzero2;

        //Move the leading one to bit 23; the insert below overwrites it,
        //leaving only the 23 fraction bits of the magnitude.
        value1 <<= zeros1 - 8;
        value2 <<= zeros2 - 8;

        //Insert sign + exponent into the top nine bits of the word
        value1 = __rlwimi( value1, exponent1, 23, 0, 8 );
        value2 = __rlwimi( value2, exponent2, 23, 0, 8 );

        ((SInt32*) dest)[0] = value1;
        ((SInt32*) dest)[1] = value2;

        src+=2;
        dest+=2;
    }

    //Odd count: convert the one remaining sample the same way
    if( count & 1 )
    {
        value1 = src[0];
        mask1 = value1 >> 31;
        value1 = ((-value1) & mask1) | (value1 & ~mask1);
        nonzero1 = (-value1) >> 31;
        zeros1 = __cntlzw( value1 );
        exponent1 = ((mask1 & 256) + (143 - zeros1)) & nonzero1;
        value1 <<= zeros1 - 8;
        value1 = __rlwimi( value1, exponent1, 23, 0, 8 );
        ((SInt32*) dest)[0] = value1;
    }
}
3067:lha R9,0x4(R3) | 2494 | IIIDER................. | 2499
3068:lha R10,0x6(R3) | 2495 | .IIIDER................ | 2500
3069:addi R3,R3,0x8 | 2495 | .IIIDFR................ | 2500
3070:srawi R11,R9,31 | 2496 | ..IIIDFR............... | 2501
3071:neg R0,R9 | 2497 | ...IIDFR............... | 2501
3072:and R8,R0,R11 | 2497 | ...IIIDFR.............. | 2502
3073:andc R0,R9,R11 | 2498 | ....IIDFR.............. | 2502
3074:or R9,R8,R0 | 2499 | .....IIDFR............. | 2503
3075:cntlzw R8,R9 | 2499 | .....IIDDR............. | 2503
3076:addi R0,R8,0xfff8 | 2500 | ......IIDDR............ | 2504
3077:srawi R12,R10,31 | 2500 | ......IIIDR............ | 2504
3078:neg R7,R10 | 2501 | .......IIIDR........... | 2505
3079:slw R9,R9,R0 | 2501 | .......IIIDR........... | 2505
3080:and R7,R7,R12 | 2502 | ........IIIDR.......... | 2506
3081:andc R0,R10,R12 | 2502 | ........IIIDR.......... | 2506
3082:or R10,R7,R0 | 2503 | .........IIIDR......... | 2507
3083:subfic R0,R8,0x8f | 2504 | ..........IIDR......... | 2507
3084:rlwinm R7,R11,0,23,23 | 2505 | ...........IIDR........ | 2508
3085:add R0,R7,R0 | 2505 | ...........IIDDR....... | 2509
3086:cntlzw R31,R10 | 2506 | ............IIDR....... | 2509
3087:rlwimi R9,R0,23,0,8 | 2506 | ............IIIDR...... | 2510
3088:addi R8,R31,0xfff8 | 2507 | .............IIDR...... | 2510
3089:stw R9,0x8(R4) | 2507 | .............IIIDER.... | 2512
3090:rlwinm R7,R12,0,23,23 | 2508 | ..............IIDFR.... | 2512
3091:subfic R0,R31,0x8f | 2508 | ..............IIIDFR... | 2513
3092:add R0,R7,R0 | 2509 | ...............IIDDR... | 2513
3093:slw R10,R10,R8 | 2510 | ................IIDFR.. | 2514
3094:rlwimi R10,R0,23,0,8 | 2510 | ................IIIDR.. | 2514
3095:stw R10,0xc(R4) | 2511 | .................IIDER. | 2515
Because the second method involves fewer calculations per datum and has
longer pipelines, it can get a throughput of about one conversion per
7.5-8 cycles. It still has quite a few pipeline holes and may benefit from
reading and writing a whole cacheline at a time, especially on the 7450.
In addition, calling dcbt might help: cold-cache SimG4 simulations show a
lot of stalls. With a warm cache, however, the dcbt causes about a 5%
slowdown.
//Convert SInt16's into floats in the range -1.0 ... 1.0
//  src   - input samples (count entries)
//  dest  - output buffer (count entries)
//  count - number of samples to convert; count <= 0 converts nothing
//Method: 0x43808000 is the bit pattern of 257.0f. XORing it onto the
//zero-extended 16-bit sample flips the sample's sign bit (adding a 32768
//bias) and supplies the exponent, so the word read as a float is
//257 + sample/32768. Subtracting 257.0f leaves sample/32768.
//The int -> float reinterpretation goes through a union rather than a
//pointer cast on dest, so it does not depend on the compiler tolerating
//accesses to a float object through an SInt32 lvalue.
void Convert2( SInt16 *src, float *dest, int count )
{
    register long value1, value2, value3, value4, magic;
    register float temp1, temp2, temp3, temp4;
    register float sub;
    long loopCount;
    union
    {
        float f;
        SInt32 i;
    }constant, pun1, pun2, pun3, pun4;

    //A negative count would make loopCount negative, and the countdown
    //loop below only stops when it reaches zero.
    if( count <= 0 )
        return;

    loopCount = count / 4;

    constant.i = 0x43808000;
    sub = constant.f;       //257.0f, removed again after the reinterpretation
    magic = constant.i;     //same bits, XORed onto each sample

    //Four samples per iteration, processed stage by stage so the
    //independent operations can overlap
    while( loopCount-- )
    {
        value1 = src[0];
        value2 = src[1];
        value3 = src[2];
        value4 = src[3];

        //Drop the sign extension, keeping only the 16 sample bits
        value1 &= 0xFFFF;
        value2 &= 0xFFFF;
        value3 &= 0xFFFF;
        value4 &= 0xFFFF;

        value1 ^= magic;
        value2 ^= magic;
        value3 ^= magic;
        value4 ^= magic;

        pun1.i = value1;
        pun2.i = value2;
        pun3.i = value3;
        pun4.i = value4;

        temp1 = pun1.f;
        temp2 = pun2.f;
        temp3 = pun3.f;
        temp4 = pun4.f;

        temp1 -= sub;
        temp2 -= sub;
        temp3 -= sub;
        temp4 -= sub;

        dest[0] = temp1;
        dest[1] = temp2;
        dest[2] = temp3;
        dest[3] = temp4;

        src += 4;
        dest += 4;
    }

    //Two leftover samples
    if( count & 2 )
    {
        value1 = src[0];
        value2 = src[1];
        value1 &= 0xFFFF;
        value2 &= 0xFFFF;
        value1 ^= magic;
        value2 ^= magic;
        pun1.i = value1;
        pun2.i = value2;
        dest[0] = pun1.f - sub;
        dest[1] = pun2.f - sub;
        src += 2;
        dest += 2;
    }

    //One leftover sample
    if( count & 1 )
    {
        value1 = src[0];
        value1 &= 0xFFFF;
        value1 ^= magic;
        pun1.i = value1;
        dest[0] = pun1.f - sub;
    }
}
Convert2 (convert four shorts):
1389:lha R0,0x0(R3) | 1863 | IIIDDDDEFR.......................| 1872
1390:lha R8,0x2(R3) | 1864 | .IIIIIDDEFR......................| 1873
1391:rlwinm R0,R0,0,16,31 | 1865 | ..IIIIDDDFR......................| 1873
1392:lha R9,0x4(R3) | 1866 | ...IIIIDDEFR.....................| 1874
1393:lha R10,0x6(R3) | 1866 | ...IIIIIDDER.....................| 1874
1394:xor R0,R0,R7 | 1867 | ....IIIIIDFFR....................| 1875
1395:rlwinm R8,R8,0,16,31 | 1867 | ....IIIIIIDFR....................| 1875
1396:stw R0,0x0(R4) | 1870 | .......IIIDEER...................| 1876
1397:xor R8,R8,R7 | 1870 | .......IIIIDFR...................| 1876
1398:rlwinm R9,R9,0,16,31 | 1871 | ........IIIDFFR..................| 1877
1399:stw R8,0x4(R4) | 1872 | .........IIIDEER.................| 1878
1400:xor R9,R9,R7 | 1873 | ..........IIDFFR.................| 1878
1401:rlwinm R10,R10,0,16,31 | 1874 | ...........IIDFFR................| 1879
1402:stw R9,0x8(R4) | 1874 | ...........IIDEEER...............| 1880
1403:xor R10,R10,R7 | 1875 | ............IIDFFR...............| 1880
1404:stw R10,0xc(R4) | 1875 | ............IIDEEER..............| 1881
1405:lfs F1,0x0(R4) | 1876 | .............IIDDDER.............| 1882
1406:lfs F2,0x4(R4) | 1876 | .............IIIDDDER............| 1883
1407:lfs F3,0x8(R4) | 1877 | ..............IIIIDDER...........| 1884
1408:fsubs F1,F1,F0 | 1878 | ...............IIIDDEEFR.........| 1886
1409:lfs F4,0xc(R4) | 1878 | ...............IIIIDDDDDER.......| 1888
1410:fsubs F2,F2,F0 | 1878 | ...............IIIIIDEEFFFR......| 1889
1411:fsubs F3,F3,F0 | 1879 | ................IIIIIDEEFFR......| 1889
1412:stfs F1,0x0(R4) | 1880 | .................IIIIDDDDEER.....| 1890
1413:fsubs F4,F4,F0 | 1882 | ...................IIIIDDDEEFR...| 1892
1414:stfs F2,0x4(R4) | 1882 | ...................IIIIIDDEEEER..| 1893
1415:stfs F3,0x8(R4) | 1883 | ....................IIIIIDDEEEER.| 1894
1416:stfs F4,0xc(R4) | 1884 | .....................IIIIIDDDDEER| 1895
---------------------------------------------------
Ian Ollmann, Ph.D. email@hidden
---------------------------------------------------