A popcnt/test/jmp cycle should have roughly 4 cycles of latency: a test+jmp can be fused into one uop, and popcnt would be 3 cycles of latency. n&(n-1) == 0 would compile down into a DEC, TEST, JMP, which is 2 cycles of latency (again, test+jmp are fused into one uop), as dec is just 1 cycle of latency.
So n&(n-1) is faster for checking if it's a power of two.
In the following I confirm that n&(n-1) is faster than popcount (compiled with "cc -march=haswell check_pow2.c -O3"), at 699 vs. 1016 microseconds, respectively:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
const int N = 2*1000*1000;
int count_popcnt(unsigned int *data) {
int sum = 0;
int i;
for (i=0; i<N; i+=8) {
sum += (
(__builtin_popcount((data+i)[0]) == 1) +
(__builtin_popcount((data+i)[1]) == 1) +
(__builtin_popcount((data+i)[2]) == 1) +
(__builtin_popcount((data+i)[3]) == 1) +
(__builtin_popcount((data+i)[4]) == 1) +
(__builtin_popcount((data+i)[5]) == 1) +
(__builtin_popcount((data+i)[6]) == 1) +
(__builtin_popcount((data+i)[7]) == 1)
);
}
return sum;
}
int count_wegner(unsigned int *data) {
int sum = 0;
int i;
for (i=0; i<N; i+=8) {
sum += (
((((data+i)[0]) & ((data+i)[0]-1)) == 0) +
((((data+i)[1]) & ((data+i)[1]-1)) == 0) +
((((data+i)[2]) & ((data+i)[2]-1)) == 0) +
((((data+i)[3]) & ((data+i)[3]-1)) == 0) +
((((data+i)[4]) & ((data+i)[4]-1)) == 0) +
((((data+i)[5]) & ((data+i)[5]-1)) == 0) +
((((data+i)[6]) & ((data+i)[6]-1)) == 0) +
((((data+i)[7]) & ((data+i)[7]-1)) == 0)
);
}
return sum;
}
int main(void) {
unsigned int data[N];
FILE *f = fopen("/dev/urandom", "rb");
if (!f) {
perror("Cannot open /dev/urandom");
exit(1);
}
if (fread(data, sizeof(unsigned int), N, f) != N) {
perror("Cannot read enough items");
exit(1);
}
/* Use only a few bits */
for (int i=0; i<N; i++) {
data[i] = data[i] & 31;
/* Don't worry about handling 0 */
if (data[i] == 0) {
data[i] = i + 1;
}
}
struct timeval time1, time2;
int n;
gettimeofday(&time1, NULL);
n = count_popcnt(data);
gettimeofday(&time2, NULL);
printf(" popcnt: %d in %8ld us\n",
n,
(time2.tv_sec-time1.tv_sec) * 1000*1000 +
(time2.tv_usec-time1.tv_usec));
gettimeofday(&time1, NULL);
n = count_wegner(data);
gettimeofday(&time2, NULL);
printf("n&(n-1): %d in %8ld us\n",
n,
(time2.tv_sec-time1.tv_sec) * 1000*1000 +
(time2.tv_usec-time1.tv_usec));
return 0;
}
It's faster because both GCC and Clang now optimize loops with n&(n-1) to use AVX2 SIMD! I haven't looked closely to confirm, but I think they may in fact even do Harley-Seal for "naive popcount" loops.
C is no longer portable assembly. If you want to test whether a particular algorithm is faster than another, you probably need to write assembly --- or at least confirm that the compiler did what you thought it did.