# Slow conversion of float to 64-bit integer

Hi!

Since there is no FPU instruction for it, implicit or explicit conversion from float to signed or unsigned long long generates calls to the library functions __aeabi_f2lz() and __aeabi_f2ulz(). These are implemented in libgcc by the generic versions of __fixsfdi() and __fixunssfdi().

These functions start by converting their argument to double and do all operations in double precision. This pulls in the big and slow soft-float library, causing a conversion to take a whopping 350 CM4F cycles or more.

I've written a more optimized set of conversion routines that assumes IEEE754 format and performs the conversion in around 50 cycles. Can this, or something to the same effect, be included upstream? I guess I can post a proper patch if someone tells me how to correctly integrate it in libgcc.

Bonus question: Can someone explain where and why it is specified that the AEABI/GCC functions above that take a float as argument shall have this passed in a core register, even when building with the hard fp ABI? What kind of magic in libgcc causes the functions to be built as if they were passed an uint32_t?

## Question information

- Language:
- English

- Status:
- Expired

- Assignee:
- No assignee

- Last query:
- 2018-06-07

- Last reply:
- 2018-06-23

/**
 * @file aeabi_f2lz.c
 * @author Andreas Fritiofson <email address hidden>
 *
 * A replacement of the float to 64-bit integer routines in libgcc that
 * are suboptimal for a processor with a single-precision FPU, because
 * libgcc converts the argument and does the calculations in double
 * precision.
 *
 * On a Cortex-M4F, the libgcc routines take about 350 cycles, while
 * this version manages the same job in around 50. As a bonus, these
 * versions have the same nice saturating behaviour on inputs that are
 * out-of-range (undefined in C) as the FPU instructions for float to
 * integer conversion.
 */

#include <stdint.h>

/* Branch-prediction hint: the condition is expected to be false.
 * The macro body was truncated in the recovered source; this is the
 * standard GCC/Clang idiom for such a hint. */
#define unlikely(x) __builtin_expect(!!(x), 0)

/* IEEE754 single-precision exponent bias (127) minus one, so the range
 * checks below read naturally as "e < 1 + BIAS" for |x| < 1.0, etc. */
#define BIAS 126

/**
 * Convert an IEEE754 single-precision value to int64_t, truncating
 * toward zero.
 *
 * @param f  Raw bit pattern of the float argument. Per the ARM RTABI,
 *           the float is passed in a core register, so the helper can
 *           take it as a plain uint32_t and operate on the bits.
 * @return   The truncated integer value. Out-of-range inputs (including
 *           +/-Inf and, as a side effect of the exponent check, NaN)
 *           saturate to INT64_MAX / INT64_MIN, matching the behaviour
 *           of the FPU's float-to-integer instructions.
 *
 * NOTE(review): the function name was truncated in the recovered
 * source; __aeabi_f2lz is the AEABI name for float -> long long.
 */
int64_t __aeabi_f2lz(uint32_t f)
{
    /* Biased-exponent thresholds: IEEE754 single has bias 127, and the
     * implicit mantissa bit is aligned to bit 31 of 'u' below, giving a
     * shift pivot of 127 + 31 = 158. Values with e > 189 are >= 2^63. */
    enum { E_ONE = 127, E_PIVOT = 158, E_SAT = 189 };
    uint32_t e = (f >> 23) & 0xff;          /* biased exponent field */
    _Bool neg = (f & 0x80000000u) != 0;     /* sign bit */
    int64_t r;

    if (__builtin_expect(e < E_ONE, 0)) {
        /* |x| < 1.0 (zero and subnormals included): truncates to 0. */
        r = 0;
    } else if (__builtin_expect(e > E_SAT, 0)) {
        /* |x| >= 2^63, Inf or NaN: saturate like the FPU would. */
        r = neg ? INT64_MIN : INT64_MAX;
    } else {
        /* Mantissa with the implicit leading 1 aligned at bit 31. */
        uint32_t u = ((f & 0x007fffffu) << 8) | 0x80000000u;
        if (e > E_PIVOT) {
            /* Shift is at most 31, so no signed overflow: u < 2^32 and
             * u << 31 < 2^63. */
            r = (int64_t)u << (e - E_PIVOT);
            r = neg ? -r : r;
        } else {
            u >>= E_PIVOT - e;
            r = neg ? -(int64_t)u : (int64_t)u;
        }
    }
    return r;
}

/**
 * Convert an IEEE754 single-precision value to uint64_t, truncating
 * toward zero.
 *
 * @param f  Raw bit pattern of the float argument (passed in a core
 *           register per the ARM RTABI, hence the uint32_t type).
 * @return   The truncated integer value. Negative inputs yield 0 and
 *           inputs >= 2^64 (including +Inf and, via the exponent check,
 *           NaN) saturate to UINT64_MAX, matching the behaviour of the
 *           FPU's float-to-integer instructions.
 *
 * NOTE(review): the function name was truncated in the recovered
 * source; __aeabi_f2ulz is the AEABI name for float -> unsigned long long.
 */
uint64_t __aeabi_f2ulz(uint32_t f)
{
    /* Thresholds as in __aeabi_f2lz, but the unsigned range extends up
     * to 2^64, so saturation starts at e > 190 (= 127 + 63). */
    enum { E_ONE = 127, E_PIVOT = 158, E_SAT = 190 };
    uint32_t e = (f >> 23) & 0xff;          /* biased exponent field */
    _Bool neg = (f & 0x80000000u) != 0;     /* sign bit */
    uint64_t r;

    if (neg || __builtin_expect(e < E_ONE, 0)) {
        /* Negative values and |x| < 1.0 both truncate to 0. */
        r = 0;
    } else if (__builtin_expect(e > E_SAT, 0)) {
        /* x >= 2^64, +Inf or NaN: saturate like the FPU would. */
        r = UINT64_MAX;
    } else {
        /* Mantissa with the implicit leading 1 aligned at bit 31. */
        uint32_t u = ((f & 0x007fffffu) << 8) | 0x80000000u;
        if (e > E_PIVOT) {
            /* Shift is at most 32, well-defined on a 64-bit operand. */
            r = (uint64_t)u << (e - E_PIVOT);
        } else {
            r = u >> (E_PIVOT - e);
        }
    }
    return r;
}

#if 0
/* Attempt at an optimized variant of the f2ulz using the FPU to do the scaling
 * to the correct bit position and handling saturation. However the performance
 * is on par with the portable version above. Perhaps it can be optimized more.
 *
 * NOTE(review): the function name was truncated in the recovered source; per
 * the comment above this is presumably the f2ulz variant — confirm. The
 * result appears to be returned in r0/r1 (low/high) per the soft ABI.
 */
uint64_t __attribute__ ((naked, noinline)) __aeabi_f2ulz(uint32_t f)
{
    asm volatile (
        /* 0x2f800000 is 2^-32f: compute hi = (uint32_t)(f * 2^-32).
         * vcvt.u32 saturates: 0 for negative input, 0xffffffff on overflow. */
        "vldr s15, =0x2f800000 \n"
        "vmov s0, r0 \n"
        "vmul.f32 s15, s0, s15 \n"
        "vcvt.u32.f32 s15, s15 \n"
        "vmov r1, s15 \n"
        /* If hi == 0 or hi == 0xffffffff (i.e. r1 equals its own sign
         * extension), the high word already doubles as the low word:
         * 0:0 for small/negative inputs, all-ones for saturation. */
        "cmp r1, r1, ASR #31 \n"
        "beq.n 1f \n"
        /* 0x4f800000 is 2^32f: lo = (uint32_t)(f - hi * 2^32). */
        "vldr s14, =0x4f800000 \n"
        "vcvt.f32.u32 s15, s15 \n"
        "vfms.f32 s0, s15, s14 \n"
        "vcvt.u32.f32 s0, s0 \n"
        "vmov r0, s0 \n"
        "bx lr \n"
        "1: \n"
        "mov r0, r1 \n"
        "bx lr \n"
    );
}
#endif

Launchpad Janitor (janitor) said (#2):

This question was expired because it remained in the 'Open' state without activity for the last 15 days.