for some firmware running on an rpi pico, i had to find the number of consecutive bits set in some 32-bit integer.
because of realtime constraints, the upper limit on its execution time had to be low.
the native implementation looks like this:
because of realtime constraints, the upper limit on its execution time had to be low.
the native implementation looks like this:
int seq_bits32_naive(uint32_t x1) {
int i = 0;
while (x1) {
x1 &= x1 << 1;
i++;
}
return i;
}
however, on an cortex m0+, the code gcc generates takes at most 203 cycles, which is too high.
instead, we can do a sort of exponential search:
static inline int seq_bits32_2(int offset, uint32_t x1, uint32_t x2) {
uint32_t x3 = x2 >> 1 & x1;
if (!x3) return offset;
return offset + 1;
}
static inline int seq_bits32_4(int offset, uint32_t x1, uint32_t x2, uint32_t x4) {
uint32_t x6 = x4 >> 2 & x2;
if (!x6) return seq_bits32_2(offset, x1, x4);
return seq_bits32_2(offset + 2, x1, x6);
}
static inline int seq_bits32_8(int offset, uint32_t x1, uint32_t x2, uint32_t x4, uint32_t x8) {
uint32_t x12 = x8 >> 4 & x4;
if (!x12) return seq_bits32_4(offset, x1, x2, x8);
return seq_bits32_4(offset + 4, x1, x2, x12);
}
static inline int seq_bits32_16(int offset, uint32_t x1, uint32_t x2, uint32_t x4, uint32_t x8, uint32_t x16) {
uint32_t x24 = x16 >> 8 & x8;
if (!x24) return seq_bits32_8(offset, x1, x2, x4, x16);
return seq_bits32_8(offset + 8, x1, x2, x4, x24);
}
int seq_bits32(uint32_t x1) {
if (!x1) return 0;
uint32_t x2 = x1 >> 1 & x1;
if (!x2) return 1;
uint32_t x4 = x2 >> 2 & x2;
if (!x4) return seq_bits32_2(2, x1, x2);
uint32_t x8 = x4 >> 4 & x4;
if (!x8) return seq_bits32_4(4, x1, x2, x4);
uint32_t x16 = x8 >> 8 & x8;
if (!x16) return seq_bits32_8(8, x1, x2, x4, x8);
uint32_t x32 = x16 >> 16 & x16;
if (!x32) return seq_bits32_16(16, x1, x2, x4, x8, x16);
return 32;
}
this looks long, but note that there are actually no loops.
this is very branchy, however this is not really a problem for a cortex m0+ which has no branch prediction.
with this approach, the maximum number of cycles is just 61 for the generated assembly, less than a third of the original!!