实现以下目标最有效的算法是什么:

0010 0000 => 0000 0100

从MSB->LSB转换为LSB->MSB。所有位都必须反转;也就是说,这不是字节交换。


当前回答

您可能希望使用标准模板库。它可能比上面提到的代码慢。然而,在我看来,这似乎更清楚,更容易理解。

 #include<bitset>
 #include<iostream>


 // Returns a copy of `ordered` with the bit order mirrored: bit i of the
 // input ends up at position N-1-i of the result.
 template<size_t N>
 const std::bitset<N> reverse(const std::bitset<N>& ordered)
 {
      std::bitset<N> mirrored;
      for (size_t pos = 0; pos < N; ++pos)
      {
           // Source bit `pos` lands at the same distance from the other end.
           mirrored[N - 1 - pos] = ordered[pos];
      }
      return mirrored;
 }


 // Demo driver: reads one unsigned long from stdin and prints, in hex, the
 // value itself, its bit-reversal, and the reversal of that reversal.
 int main()
 {
      unsigned long value;
      const size_t N = sizeof(value)*8;   // width in bits, counting 8 bits per byte

      std::cin >> value;

      const std::bitset<N> once  = reverse<N>(value);
      const std::bitset<N> twice = reverse<N>(once);

      std::cout << std::showbase << std::hex
                << "ordered  = " << value << std::endl
                << "reversed = " << once.to_ulong()  << std::endl
                << "double_reversed = " << twice.to_ulong() << std::endl;
 }

其他回答

您可能希望使用标准模板库。它可能比上面提到的代码慢。然而,在我看来,这似乎更清楚,更容易理解。

 #include<bitset>
 #include<iostream>


 // Returns a copy of `ordered` with the bit order mirrored: bit i of the
 // input ends up at position N-1-i of the result.
 template<size_t N>
 const std::bitset<N> reverse(const std::bitset<N>& ordered)
 {
      std::bitset<N> mirrored;
      for (size_t pos = 0; pos < N; ++pos)
      {
           // Source bit `pos` lands at the same distance from the other end.
           mirrored[N - 1 - pos] = ordered[pos];
      }
      return mirrored;
 }


 // Demo driver: reads one unsigned long from stdin and prints, in hex, the
 // value itself, its bit-reversal, and the reversal of that reversal.
 int main()
 {
      unsigned long value;
      const size_t N = sizeof(value)*8;   // width in bits, counting 8 bits per byte

      std::cin >> value;

      const std::bitset<N> once  = reverse<N>(value);
      const std::bitset<N> twice = reverse<N>(once);

      std::cout << std::showbase << std::hex
                << "ordered  = " << value << std::endl
                << "reversed = " << once.to_ulong()  << std::endl
                << "double_reversed = " << twice.to_ulong() << std::endl;
 }

这个帖子引起了我的注意,因为它讨论的问题看似简单,但即使对现代CPU来说也需要大量的工作(CPU周期)。有一天我也遇到了同样的¤#%"#问题:我需要翻转数百万个字节的比特顺序。不过,我知道我所有的目标系统都基于现代英特尔处理器,所以让我们开始优化到极致吧!!

所以我使用了Matt J的查找代码作为基础。我正在基准测试的系统是i7 haswell 4700eq。

Matt J的查找表方法对4亿(400000000)字节做位翻转:大约0.272秒。

然后我继续尝试,看看英特尔的ISPC编译器能否将reverse.c中的算术运算向量化。

我不打算在这里用我的发现来烦你,因为我尝试了很多办法来帮助编译器找到优化点。总之,我最终得到了大约0.15秒对4亿(400000000)字节做位翻转的性能。这是很大的改进,但对我的应用来说,这仍然太慢了。

那么各位,请允许我展示世界上最快的基于英特尔处理器的位翻转程序。计时结果:

时间到bitflip 400000000字节:0.050082秒!!!!!

// Bitflip using AVX2 - The fastest Intel based bitflip in the world!!
// Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

using namespace std;

// Dimensions of the debug hex dump.
// NOTE(review): DISPLAY_HEIGHT is not referenced by the visible code; the dump
// loops in main() use a literal 4 for the row count.
#define DISPLAY_HEIGHT  4
#define DISPLAY_WIDTH   32
// Number of payload bytes that get bit-reversed and timed.
#define NUM_DATA_BYTES  400000000

// Constants handed to bitflipbyte() as its third argument: three rows of 32 bytes.
// Each 16-entry lookup table is written out twice to fill its 32-byte row.
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
        // Row 0: 0x0f in every byte (nibble mask).
        0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
        // Row 1: lookup table for the high-order nibble; entry i is the 4-bit reversal of i, in the low nibble.
        0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
        // Row 2: lookup table for the low-order nibble; entry i is the 4-bit reversal of i, in the high nibble.
        0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
};

// The data to be bitflipped. The extra 32 bytes are slack: main() passes a
// chunk count rounded up to whole 32-byte chunks, so the last chunk may extend
// past NUM_DATA_BYTES.
__attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};

// Declared with C linkage; main() calls it as
// bitflipbyte(buffer, number of 32-byte chunks, constants table).
extern "C" {
void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);
}

// Benchmark driver: fills the buffer with pseudo-random bytes, dumps its first
// rows before and after the conversion, and times one bitflipbyte() pass.
int main()
{
    // bitflipbyte() is given a count of 32-byte chunks, rounded up.
    const unsigned int chunk_count = (unsigned int)ceil(NUM_DATA_BYTES/32.0);
    // The hex dump covers four rows of DISPLAY_WIDTH bytes.
    const unsigned int dump_bytes = 4 * DISPLAY_WIDTH;

    for (unsigned int pos = 0; pos < NUM_DATA_BYTES; ++pos)
        data[pos] = rand();

    printf ("\r\nData in(start):\r\n");
    for (unsigned int pos = 0; pos < dump_bytes; ++pos)
    {
        printf ("0x%02x,",data[pos]);
        if ((pos + 1) % DISPLAY_WIDTH == 0)   // end of a dump row
            printf ("\r\n");
    }

    printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",chunk_count);

    const double start_time = omp_get_wtime();
    bitflipbyte(data,chunk_count,k1);
    const double end_time = omp_get_wtime();

    printf ("\r\nData out:\r\n");
    for (unsigned int pos = 0; pos < dump_bytes; ++pos)
    {
        printf ("0x%02x,",data[pos]);
        if ((pos + 1) % DISPLAY_WIDTH == 0)   // end of a dump row
            printf ("\r\n");
    }
    printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);

    // return with no errors
    return 0;
}

printf是用来调试的。

这是主要的工作:

; Reverses the bit order inside every byte of a buffer, in place, 32 bytes per
; loop iteration (AVX2).
;
; C prototype, as declared by the caller:
;     void bitflipbyte(unsigned char data[], unsigned int chunks, unsigned char k1[]);
;
; Register use follows the System V AMD64 calling convention:
;     rdi = data    buffer; accessed with aligned 32-byte loads/stores
;     esi = chunks  number of 32-byte chunks to convert
;     rdx = k1      three aligned 32-byte rows: the 0x0f nibble mask, the table
;                   applied to high nibbles, the table applied to low nibbles
bits 64
global bitflipbyte

bitflipbyte:
        ; The count is a 32-bit argument, so the upper half of rsi is not
        ; guaranteed to be zero on entry. Writing esi zero-extends into rsi.
        mov         esi, esi
        test        esi, esi
        jz          bitflipp_done           ; zero chunks: nothing to convert
        vmovdqa     ymm2, [rdx]             ; ymm2 = nibble mask
        vmovdqa     ymm3, [rdx+20h]         ; ymm3 = table indexed by the high nibble
        vmovdqa     ymm4, [rdx+40h]         ; ymm4 = table indexed by the low nibble
bitflipp_loop:
        vmovdqa     ymm0, [rdi]             ; load 32 input bytes
        vpand       ymm1, ymm2, ymm0        ; ymm1 = low nibble of every byte
        vpandn      ymm0, ymm2, ymm0        ; ymm0 = high nibble of every byte (low nibbles cleared)
        vpsrld      ymm0, ymm0, 4h          ; move each high nibble down into the low position
        vpshufb     ymm1, ymm4, ymm1        ; look up reversed low nibble, placed in the high position
        vpshufb     ymm0, ymm3, ymm0        ; look up reversed high nibble, placed in the low position
        vpor        ymm0, ymm0, ymm1        ; recombine both halves into the reversed byte
        vmovdqa     [rdi], ymm0             ; store the result over the input
        add         rdi, 20h                ; advance to the next 32-byte chunk
        dec         esi
        jnz         bitflipp_loop
bitflipp_done:
        vzeroupper                          ; clear upper YMM halves before returning to non-AVX code
        ret

代码每次取32个字节,然后用掩码分离出每个字节的两个半字节(nibble)。高半字节右移4位。然后使用vpshufb,并以ymm4 / ymm3作为查找表。我本可以只使用一张查找表,但那样就必须在把两个半字节重新OR到一起之前再做一次左移。

还有更快的翻转比特的方法。但我被绑定到单线程和CPU,所以这是我能实现的最快速度。你能做一个快一点的版本吗?

请不要就“改用Intel C/C++编译器的等效内联函数(intrinsic)”发表评论…

原生ARM指令“rbit”可以用1个cpu周期和1个额外的cpu寄存器来完成,不可能被击败。

对于其他可能遇到这个问题的网络搜索者,这里有一个总结(针对C和JavaScript)。

对于JavaScript的完整解决方案,我们可以首先生成表:

// Build the 256-entry lookup table: entry i holds byte i with its bit order reversed.
const BIT_REVERSAL_TABLE = new Array(256)

for (let byte = 0; byte < 256; byte += 1) {
  let remaining = byte >>> 1  // input bits that still have to be consumed
  let reversed = byte         // seeded with the input, so its lowest bit is already in place
  let shift = 7               // left shift still owed once the remaining input bits are all zero
  while (remaining) {
    reversed = (reversed << 1) | (remaining & 1)
    remaining >>>= 1
    shift -= 1
  }
  BIT_REVERSAL_TABLE[byte] = (reversed << shift) & 0xff
}

这给了我们BIT_REVERSAL_TABLE,这是@MattJ发布的:

// Precomputed byte bit-reversal table: the entry at index i is the byte i with
// its eight bits in reverse order (e.g. index 0x01 -> 0x80). 16 entries per row.
const BIT_REVERSAL_TABLE = new Uint8Array([      
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, 
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, 
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
])

8位、16位和32位无符号整数的算法可以在这里找到:

/**
 * Reverses the bit order of an 8-bit unsigned integer by table lookup.
 * @param {number} n - value used directly as the index into BIT_REVERSAL_TABLE
 * @returns {number} the table entry stored at index n
 */
function reverseBits8(n) {
  const mirrored = BIT_REVERSAL_TABLE[n]
  return mirrored
}

/**
 * Reverses the bit order of a 16-bit unsigned integer using two table lookups.
 * @param {number} n - input value; only its low 16 bits are used
 * @returns {number} the 16-bit value with its bits in reverse order
 */
function reverseBits16(n) {
  const highByte = (n >> 8) & 0xff
  const lowByte = n & 0xff
  // The reversed low byte becomes the new high byte, and vice versa.
  return (BIT_REVERSAL_TABLE[lowByte] << 8) | BIT_REVERSAL_TABLE[highByte]
}

/**
 * Reverses the bit order of a 32-bit unsigned integer using four table lookups.
 * @param {number} n - input value; only its low 32 bits are used
 * @returns {number} the bit-reversed value as an unsigned 32-bit number
 */
function reverseBits32(n) {
  const b0 = BIT_REVERSAL_TABLE[n & 0xff]
  const b1 = BIT_REVERSAL_TABLE[(n >>> 8) & 0xff]
  const b2 = BIT_REVERSAL_TABLE[(n >>> 16) & 0xff]
  const b3 = BIT_REVERSAL_TABLE[(n >>> 24) & 0xff]
  // `<<` and `|` produce a signed 32-bit integer, so any result with its top
  // bit set would come back negative. `>>> 0` reinterprets it as unsigned.
  return ((b0 << 24) | (b1 << 16) | (b2 << 8) | b3) >>> 0
}

注意,32位版本不能在JavaScript中工作(必须转换为使用bigint,这很简单),但应该可以在64位语言中工作:

// Demo: print each sample value next to its bit-reversed form.
log8(0b11000100)
log16(0b1110001001001100)
log32(0b11110010111110111100110010101011)

// Recorded output of the 8-bit and 16-bit calls:
// 0b11000100 => 0b00100011
// 0b1110001001001100 => 0b0011001001000111
// Recorded output of the 32-bit call; the '-' shows that the result was
// formatted as a negative number:
// 0b11110010111110111100110010101011 => 0b0-101010110011000010000010110001

/** Logs n and its 8-bit reversal as zero-padded binary literals. */
function log8(n) {
  const before = bits(n, 8)
  const after = bits(reverseBits8(n), 8)
  console.log(`${before} => ${after}`)
}

/** Logs n and its 16-bit reversal as zero-padded binary literals. */
function log16(n) {
  const before = bits(n, 16)
  const after = bits(reverseBits16(n), 16)
  console.log(`${before} => ${after}`)
}

/** Logs n and its 32-bit reversal as zero-padded binary literals. */
function log32(n) {
  const before = bits(n, 32)
  const after = bits(reverseBits32(n), 32)
  console.log(`${before} => ${after}`)
}

/**
 * Formats a number as a binary literal string.
 * @param {number} n - value to format
 * @param {number} size - minimum digit count; shorter output is left-padded with '0'
 * @returns {string} '0b' followed by the padded base-2 digits of n
 */
function bits(n, size) {
  const digits = n.toString(2).padStart(size, '0')
  return '0b' + digits
}

注意:这个解决方案适用于JavaScript的32位:

/**
 * Reverses the bit order of a 32-bit unsigned integer one bit at a time.
 * @param {number} n - input value; only its low 32 bits are used
 * @returns {number} the bit-reversed value as an unsigned 32-bit number
 */
function reverseBits32(n) {
  let reversed = 0
  let remaining = n
  for (let step = 32; step > 0; step--) {
    // Shift the collected bits up and append the current lowest input bit.
    reversed = (reversed << 1) + (remaining & 1)
    remaining >>>= 1
  }

  // Reinterpret the signed 32-bit accumulator as unsigned.
  return reversed >>> 0
}

所有3个基于表格的解决方案都可以在C中正常工作。下面是一个粗略的C版本:

#include <stdint.h>
#include <stdlib.h>

/* Lookup table indexed by byte value; main() assigns it the result of make_bit_reversal_table(). */
static uint8_t* BIT_REVERSAL_TABLE;

/*
 * Allocates and fills a 256-entry lookup table in which entry i is the byte i
 * with its eight bits in reverse order.
 *
 * Returns the heap-allocated table, or NULL if the allocation fails. The
 * caller owns the table and releases it with free().
 */
uint8_t* 
make_bit_reversal_table() {
  uint8_t *table = (uint8_t *)malloc(256 * sizeof(uint8_t));
  unsigned int i;
  if (table == NULL) {
    return NULL;
  }
  /* The counter must be wider than 8 bits: a uint8_t can never reach 256, so
     `i < 256` would always hold and the loop would never terminate. */
  for (i = 0; i < 256; ++i) {
    unsigned int v = i >> 1;  /* input bits still to be consumed */
    unsigned int r = i;       /* seeded with the input so its lowest bit is in place */
    unsigned int s = 7;       /* left shift still owed once v runs out of set bits */
    for (; v; v >>= 1) {
      r <<= 1;
      r |= v & 1;
      --s;
    }
    /* Only the low 8 bits of the shifted accumulator form the reversed byte. */
    table[i] = (uint8_t)((r << s) & 0xff);
  }
  return table;
}

/* Returns the BIT_REVERSAL_TABLE entry for n, i.e. n with its bit order reversed. */
uint8_t 
reverse_bits_8(uint8_t n) {
  const uint8_t *table = BIT_REVERSAL_TABLE;
  return table[n];
}

/* Reverses the bit order of a 16-bit value using two BIT_REVERSAL_TABLE lookups. */
uint16_t
reverse_bits_16(uint16_t n)
{
  /* The reversed low byte becomes the new high byte, and vice versa. */
  const uint8_t new_high = BIT_REVERSAL_TABLE[n & 0xff];
  const uint8_t new_low = BIT_REVERSAL_TABLE[(n >> 8) & 0xff];
  return (new_high << 8) | new_low;
}

/*
 * Reverses the bit order of a 32-bit value using four BIT_REVERSAL_TABLE
 * lookups: each input byte is reversed and placed at the mirrored byte position.
 */
uint32_t
reverse_bits_32(uint32_t n) {
  /* Widen every table entry to uint32_t before shifting. A uint8_t operand is
     promoted to signed int, and shifting an entry >= 0x80 left by 24 would
     overflow it, which is undefined behaviour. */
  const uint32_t b0 = BIT_REVERSAL_TABLE[n & 0xff];
  const uint32_t b1 = BIT_REVERSAL_TABLE[(n >> 8) & 0xff];
  const uint32_t b2 = BIT_REVERSAL_TABLE[(n >> 16) & 0xff];
  const uint32_t b3 = BIT_REVERSAL_TABLE[(n >> 24) & 0xff];
  return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}

/*
 * Builds the global lookup table, then releases it again before exiting.
 * Returns EXIT_FAILURE if the table could not be created.
 */
int 
main(void) {
  BIT_REVERSAL_TABLE = make_bit_reversal_table();
  if (BIT_REVERSAL_TABLE == NULL) {
    return EXIT_FAILURE;
  }

  /* The table is heap-allocated; free it and clear the pointer so no dangling
     reference remains. */
  free(BIT_REVERSAL_TABLE);
  BIT_REVERSAL_TABLE = NULL;
  return 0;
}

假设你有一个比特数组,怎么样: 1. 从MSB开始,将比特一个一个地推入堆栈。 2. 从这个堆栈弹出位到另一个数组(如果你想节省空间,也可以是同一个数组),将第一个弹出位放入MSB,然后从那里继续到较低的有效位。

// Pseudo-code sketch: reverse a bit array by pushing every bit onto a stack
// and then popping the bits back into the array.
Stack stack = new Stack();
Bit[] bits = new Bit[] { 0, 0, 1, 0, 0, 0, 0, 0 };

// Push in index order, so the last element ends up on top of the stack.
for (int i = 0; i < bits.Length; i++) 
{
    stack.push(bits[i]);
}

// Pop in the same index order: the element pushed last is written to index 0,
// so the array ends up reversed in place.
for (int i = 0; i < bits.Length; i++)
{
    bits[i] = stack.pop();
}