crc32 with lookup table - c++

// -- Edited
Currently, hardware functions (__builtin_ia32_crc32qi and __builtin_ia32_crc32di) are used for crc32 with __builtin_ia32_crc32di returning 64 bits. Then, 64-bits are trimmed to 32-bits. Existing data is based on this logic.
https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/X86-Built-in-Functions.html
/// Computes a CRC-32C checksum with the x86 SSE4.2 `crc32` instruction.
///
/// No pre- or post-inversion is applied: the caller supplies the starting
/// register value in `init` and gets the raw register value back.
///
/// @param init  Starting CRC register value.
/// @param buf   First byte of the data to checksum.
/// @param size  Number of bytes available at `buf`.
/// @return The updated CRC register (the 64-bit hardware result trimmed to 32 bits).
__attribute__((target("sse4.2")))  // the builtins then compile without a global -msse4.2
uint32_t calculateCrc32(uint32_t init, const uint8_t* buf, size_t size) {
    uint32_t crc32 = init;
    const uint8_t* pos = buf;
    const uint8_t* end = buf + size;

    // Byte-wise until `pos` sits on an 8-byte boundary (or the data runs out).
    while (reinterpret_cast<uintptr_t>(pos) % sizeof(uint64_t) != 0 && pos < end) {
        crc32 = __builtin_ia32_crc32qi(crc32, *pos);
        ++pos;
    }

    // Eight bytes at a time. The word is copied out of the buffer instead of
    // being read through a uint64_t*, which would break the aliasing rules.
    while (static_cast<size_t>(end - pos) >= sizeof(uint64_t)) {
        uint64_t word;
        __builtin_memcpy(&word, pos, sizeof(word));
        crc32 = static_cast<uint32_t>(__builtin_ia32_crc32di(crc32, word));
        pos += sizeof(uint64_t);
    }

    // Byte-wise for whatever is left after the last full word.
    while (pos < end) {
        crc32 = __builtin_ia32_crc32qi(crc32, *pos);
        ++pos;
    }
    return crc32;
}
I am trying to implement a lookup-table version. What I am doing is: 1) first generate a lookup table 2) do table lookup
uint8_t kCrc32tab[256];
for (int i=0; i < 256; ++i) {
uint8_t buf = i;
kCrc32tab[i] = calculateCrc32(0xFF, &buf, 1);
}
/// Table-driven CRC: folds each input byte into the register through kCrc32tab.
///
/// @param crc32_init  Starting register value.
/// @param buf         Data to checksum.
/// @param size        Number of bytes at `buf`.
/// @return The register after all bytes have been folded in.
uint32_t crc32WithLookup(uint32_t crc32_init, const uint8_t* buf, size_t size) {
    uint32_t reg = crc32_init;
    const uint8_t* const stop = buf + size;
    for (const uint8_t* cursor = buf; cursor != stop; ++cursor) {
        // The table index is the low register byte combined with the input byte.
        const uint8_t slot = static_cast<uint8_t>((reg ^ *cursor) & 0xFF);
        reg = (reg >> 8) ^ kCrc32tab[slot];
    }
    return reg;
}
However, crc32 outcome is different between crc32WithLookup and calculateCrc32. Any suggestions?
lookup example in redis:
https://github.com/redis/redis/blob/unstable/src/crc16.c

That CRC-32 is commonly referred to as the CRC-32C (where outside the provided code the initial value and final exclusive-or is 0xffffffff).
There are two errors in your code. The table must be 32-bit values, and the initial value for your CRCs is zero. So you need uint32_t kCrc32tab[256]; and kCrc32tab[i] = calculateCrc32(0, &buf, 1);.
This answer provides more advanced and faster code for both the hardware and software versions of that CRC calculation.

Related

Parallel CRC32 in C++

I am trying to implement CRC32 calculation of file by splitting it in parts. I used algorithms and ideas from CRC Calculation Of A Mostly Static Data Stream.
Unfortunately, my program gives incorrect answer, although it returns the same value of CRC regardless of number of parts.
Please, tell me where the mistake is and what I do wrong.
Here is the code of program:
#include <iostream>
#include <fstream>
#include <stdint.h>
#include <string>
#include <sstream>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
using namespace std;
// Describes the byte range one worker thread checksums and holds its result.
struct data {
    pthread_t id;        // thread handle filled in by pthread_create()
    uint8_t *buf;        // buffer the range indexes into (released by main, not here)
    long int start;      // index of the first byte of this part
    long int end;        // index one past the last byte of this part
    long int num_zeros;  // number of bytes that follow this part in the buffer
    uint32_t crc;        // result stored by do_work()
};
//Straight function
// Reference ("straight") CRC-32 of a whole stream: reflected form, polynomial
// 0xEDB88320, register starts at 0xffffffff and is inverted at the end.
// The stream is measured by seeking to its end, rewound, then read one byte
// at a time.
uint32_t crc_32(ifstream& input) {
    // Measure the stream, then go back to the beginning.
    input.seekg(0, input.end);
    const size_t total = input.tellg();
    input.seekg(0, input.beg);

    // Build the 256-entry table, shifting right (least significant bit first).
    const uint32_t poly = 0xEDB88320;
    uint32_t lut[256];
    for (uint32_t n = 0; n < 256; ++n) {
        uint32_t v = n;
        for (int bit = 0; bit < 8; ++bit) {
            const bool carry = (v & 1) != 0;
            v >>= 1;
            if (carry) {
                v ^= poly;
            }
        }
        lut[n] = v;
    }

    uint32_t reg = 0xffffffff;
    uint8_t octet;
    for (size_t done = 0; done != total; ++done) {
        input.read(reinterpret_cast<char*>(&octet), sizeof(octet));
        reg = lut[(reg ^ octet) & 0xff] ^ (reg >> 8);
    }
    return reg ^ 0xffffffff;
}
// generate crc
// Table-driven CRC over work->buf[beg, end), starting from a zero register and
// with no final inversion. The register shifts left and the table index comes
// from its top byte.
// NOTE(review): crc_32() in this file shifts right instead; the answers below
// attribute the mismatching results to that difference.
uint32_t GenCrc(data *work, long int beg, long int end, uint32_t *crctbl) {
    uint32_t reg = 0x00000000;
    const uint8_t *bytes = work->buf;
    long int pos = beg;
    while (pos < end) {
        const uint32_t top = reg >> 24;
        reg = crctbl[top ^ bytes[pos]] ^ (reg << 8);
        ++pos;
    }
    return reg;
}
// (a*b)%crc
// Carry-less multiply of a and b reduced modulo the CRC polynomial 0x04c11db7
// (left-shifting form: bit i stands for x^i). b is consumed from its top bit
// down; each step multiplies the running product by x, reduces it, and adds a
// when the current bit of b is set.
uint32_t MpyModCrc(uint32_t a, uint32_t b) {
    uint32_t acc = 0;
    for (int step = 0; step < 32; ++step) {
        const bool overflow = (acc >> 31) != 0;
        acc <<= 1;
        if (overflow) {
            acc ^= 0x04c11db7;
        }
        if (b & 0x80000000u) {
            acc ^= a;
        }
        b <<= 1;
    }
    return acc;
}
// pow(2,p)%crc
// Raises x (encoded as 0x2) to the power p modulo the CRC polynomial, by
// square-and-multiply over the bits of p from the lowest upward.
// NOTE(review): an answer below says these starting values are wrong for a
// reflected CRC, where 1 and x are 0x80000000 and 0x40000000.
uint32_t PowModCrc(uint32_t p) {
    uint32_t result = 0x1u;  // x^0
    uint32_t base = 0x2u;    // x^1, squared once per bit of p
    for (; p != 0; p >>= 1) {
        if (p & 1) {
            result = MpyModCrc(result, base);
        }
        base = MpyModCrc(base, base);
    }
    return result;
}
// Computes the CRC contribution of one part and stores it in work->crc:
// builds a lookup table, checksums bytes [start, end) of work->buf, then
// multiplies the result by x^(8 * num_zeros) modulo the polynomial.
void do_work(data *work) {
    // Left-shifting table for polynomial 0x04c11db7, rebuilt on every call.
    // NOTE(review): crc_32() in this file builds the right-shifting 0xEDB88320
    // table instead; the answers below attribute the wrong result to that.
    const uint32_t poly = 0x04c11db7;
    uint32_t lut[256];
    for (uint32_t n = 0; n < 256; ++n) {
        uint32_t v = n << 24;
        for (int bit = 0; bit < 8; ++bit) {
            const bool top = (v & 0x80000000u) != 0;
            v <<= 1;
            if (top) {
                v ^= poly;
            }
        }
        lut[n] = v;
    }

    uint32_t partial = GenCrc(work, work->start, work->end, lut);
    if (work->num_zeros > 0) {
        // Advance the partial CRC past the bytes that follow this part.
        partial = MpyModCrc(partial, PowModCrc((work->num_zeros) * 8));
    }
    work->crc = partial;
}
void *do_stuff(void *d) {
data *mydata = (data*)d;
do_work(mydata);
return 0;
}
// Reads a fixed file into memory, splits it into k byte ranges, checksums each
// range on its own thread, combines the partial CRCs, and prints the combined
// value next to the single-pass CRC from crc_32().
int main(int argc, char** argv) {
    ifstream input("8733718.zip", ios::binary);
    if(!input) {
        cerr << "Can't open file." <<endl;
        // Stop here: on a failed stream tellg() reports -1, and that value
        // would otherwise be used as the buffer size below.
        return -1;
    }

    // Load the whole file so every thread can index into the same buffer.
    input.seekg(0, input.end);
    long int len = input.tellg();
    input.seekg(0, input.beg);
    uint8_t *buf = new uint8_t[len];
    input.read( (char *) buf, len);

    int k;
    cout << "Enter number of parts: ";
    if(!(cin >> k) || k<1) {
        cout << "Error. We need at least one part!" <<endl;
        delete [] buf;
        return -1;
    }

    // Part i covers bytes [len*i/k, len*(i+1)/k); num_zeros counts the bytes
    // that follow it, so its CRC can be shifted into position afterwards.
    data *work = new data[k];
    for(int i=0; i < k; i++) {
        work[i].start = len*i / k;
        work[i].end = len*(i+1) / k;
        work[i].num_zeros = len - work[i].end;
        work[i].buf = buf;
    }

    for(int i=0; i < k; i++) {
        pthread_create(&work[i].id, 0, do_stuff, (void*)&work[i]);
    }
    for(int i=0; i<k; i++) {
        pthread_join(work[i].id, 0);
    }

    // The shifted partial CRCs combine by exclusive-or.
    uint32_t crc = work[0].crc;
    for(int i=1; i<k; i++) {
        crc ^= work[i].crc;
    }
    delete [] buf;
    delete [] work;

    cout << "Straigth CRC_32 = ";
    uint32_t result = crc_32(input);
    cout << hex << result;
    cout <<endl <<endl;
    cout << "Parallel CRC_32 = ";
    cout << hex << crc <<endl <<endl;
    cout <<endl <<endl;
    cout <<"=========================="<<endl<<endl;
    input.close();
    return 0;
}
"Straight" function gives the answer which coincides with the answer of, for example, website https://emn178.github.io/online-tools/crc32_checksum.html.
But "parallel" procedure gives another answer.
As rcgldr noted, you are mixing up reflected and non-reflected CRC calculations. The original CRC is reflected so you need to stick with that. You need to always be shifting right. You always need to use the reflected polynomial, as in the original, 0xedb88320.
So, GenCRC() needs to shift right, and use the low eight bits of the CRC (which you're calling init) instead of the high eight bits to get the index.
MpyModCrc() needs to shift right, use the low bit instead of the high bit for the decisions, and use the correct polynomial.
PowModCrc() is starting off with the wrong initial polynomials. Since the polynomials are reflected, the initial values need to be as well. So 1 (x0) is 0x80000000, and x (x1) is 0x40000000.
In do_work(), you need to generate the table exactly as you did crc_32(). Of course, why you're generating the exact same table in every thread, I have no idea. Do that once.
Lastly, having done all that, your threads will be computing the CRC with a zero initial value and zero final exclusive or. That's ok, so long as you then exclusive-or with the CRC of len zero bytes with an initial value of 0xffffffff, and then exclusive-or the final result with 0xffffffff to get the same effect. That is:
// Fold in the CRC of len zero bytes computed from an initial value of 0xffffffff, then apply the final inversion.
crc ^= MpyModCrc(0xffffffff, PowModCrc(len * 8)) ^ 0xffffffff;
Alternatively, you could start the first work unit, and only that one, with an initial CRC of 0xffffffff. And then exclusive-or the final result with 0xffffffff.
Another improvement is to not calculate the CRC of so many zeros (even though it is an O(log n) calculation), and to only calculate the power function once. You can combine the CRCs as you join each thread, only needing PowModCrc() of your chunk size, calculating that just once, and applying it each time. And once more for the final chunk which may be smaller.
You don't need to read in the entire file and then do the CRC calculation. You should be calculating in parallel with reading. Instead of deciding how many pieces, decide on a fixed chunk size. Then read chunks and fire off threads for CRC calculations as you read them. Combine the CRCs as the threads complete, joining them in order. Limit the number of threads to something like twice the number of cores. You want to keep the cores busy, but you don't want the overhead of too many threads and too much memory in use.
The final alternative would be to not do any of this, and simply use zlib which provides the CRC combination functions for you.
crc_32() is a right shifting CRC that initial CRC = 0xffffffff and final CRC ^= 0xffffffff, while do_work() gencrc(), ... are using a left shifting CRC.
Change do_work() and gencrc() to be a modified version of crc_32() with initial CRC = 0, and final CRC not changed.
Changes for CRC32 (which is a reflected CRC):
#define CRCPOLY 0xEDB88320u   // reflected CRC-32 polynomial
#define INITXOR 0xFFFFFFFFu   // starting register value used by Crc32()
#define FINALXOR 0xFFFFFFFFu  // value Crc32() exclusive-ors into its result
// Byte-wise lookup table; GenCrcTable() must run before it is read.
static uint32_t crc_table[256];

// Fills crc_table: entry n is n pushed through eight right-shifting CRC steps.
void GenCrcTable()
{
    for (uint32_t index = 0; index < 256u; ++index)
    {
        uint32_t value = index;
        for (int round = 0; round < 8; ++round)
        {
            const uint32_t shifted = value >> 1;
            value = (value & 1u) ? (shifted ^ CRCPOLY) : shifted;
        }
        crc_table[index] = value;
    }
}
// CRC-32 of buffer[0..length-1]: register starts at INITXOR and the result is
// exclusive-ored with FINALXOR. Reads crc_table, so GenCrcTable() must have
// run first.
// NOTE(review): the 32-bit result is returned through an int; callers
// presumably want to store it in a uint32_t.
int Crc32(uint8_t *buffer, size_t length)
{
    uint32_t reg = INITXOR;
    const uint8_t *const stop = buffer + length;
    while (buffer != stop)
    {
        const uint32_t slot = (reg & 0xFFu) ^ *buffer;
        ++buffer;
        reg = (reg >> 8) ^ crc_table[slot];
    }
    return reg ^ FINALXOR;
}
// use this one for the multi-thread code
// Same table-driven CRC as Crc32() but with a zero starting register and no
// final exclusive-or; this is the variant meant for the multi-thread code.
// NOTE(review): the 32-bit result is returned through an int.
int Crc320(uint8_t *buffer, size_t length)
{
    uint32_t reg = 0;
    const uint8_t *const stop = buffer + length;
    while (buffer != stop)
    {
        const uint32_t slot = (reg & 0xFFu) ^ *buffer;
        ++buffer;
        reg = (reg >> 8) ^ crc_table[slot];
    }
    return reg;
}
// Carry-less multiply of a and b modulo the CRC polynomial, reflected form:
// everything shifts right and b is consumed from its lowest bit upward.
uint32_t MpyModCrc(uint32_t a, uint32_t b) // (a*b)%crc
{
    uint32_t acc = 0;
    for (int step = 0; step < 32; ++step) {
        const bool wrap = (acc & 1u) != 0;
        acc >>= 1;
        if (wrap) {
            acc ^= CRCPOLY;
        }
        if (b & 1u) {
            acc ^= a;
        }
        b >>= 1;
    }
    return acc;
}
// Exponentiation by repeated squaring modulo the CRC polynomial, reflected
// form: 0x80000000 encodes 1 and 0x40000000 encodes x.
uint32_t PowModCrc(uint32_t p) // pow(2,p)%crc
{
    uint32_t result = 0x80000000u; // running product
    uint32_t base = 0x40000000u;   // running square
    for (; p != 0; p >>= 1) {
        if (p & 1) {
            result = MpyModCrc(result, base);
        }
        base = MpyModCrc(base, base);
    }
    return result;
}
After these changes try this code at the end (I haven't tested this yet):
uint32_t crc = work[0].crc;
for(int i=1; i<k; i++) {
crc ^= work[i].crc;
}
uint32_t pmc = PowModCrc(len*8); // add initial CRC of 0xffffffff
crc ^= MpyModCrc(0xffffffff, pmc);
crc ^= 0xffffffff; // final_xor = 0xffffffff
If running on a PC in 64-bit mode, you could use a PCLMULQDQ-based CRC to speed things up (probably to the point that multi-threading won't help much). You can search GitHub for examples of this. The assembly files are a bit over 500 lines. I have 6 sets of these for Visual Studio | MASM (ML64.EXE). Below is a link to my code for the reflected 32-bit CRC. If using my code, you need to change the if defines from 0 to 1 to use CRC32 instead of CRC32C.
https://github.com/jeffareid/crc/tree/master/crc32r

Why are there two variants of the implementation of CRCs in software

I'm digging into the subtleties of CRCs. If I understand correctly, every CRC polynomial is provided in at least two representations, the normal one and the reversed one.
The normal one targets implementations where the content is processed from most significant bit to least significant bit and shifted to the left (as, for example, on this Wikipedia page).
The reversed one aims to handle LSb to MSb interfaces. If you process LSb to MSb with the reversed polynomial and shift to the right, you get the same CRC value (also encoded LSb to MSb). This is described for example here. This is convenient for LSb to MSb communication interfaces.
What I don't understand is when you switch to software implementations. Why are there two variants of a software ie. byte implementation? (One for MSb to LSb, and one for the opposite bit order.)
You do not get the "same CRC value" (reflected or not) with the reflected calculation. It is an entirely different value, because the bits of the message are processed in the opposite order.
"when you switch": You simply use the CRC definition, reflected or not, that matches what the application is expecting. Whether the CRC is reflected is one of several parameters that define the CRC, along with the number of the bits in the CRC, the polynomial, the initial value, and the final exclusive or value. You can find the definition of over a hundred different CRCs here.
"why are there two": The forward implementation exists because that corresponds most closely to the mathematics, with the least significant term of the polynomial in the least significant bit of the binary representation of the polynomial. The reflected implementation exists because it was realized that it could be implemented in software a little more simply, with fewer instructions, but still have the same error-detection performance.
Here is an example for two common 32-bit CRCs with the same polynomial. Forward, CRC-32/BZIP bit-wise implementation:
// Bit-wise forward (left-shifting) CRC-32/BZIP2 update over mem[0..len-1].
// crc is the CRC of any preceding data (0 to start). It is inverted on entry
// and on exit, so calls can be chained. A NULL mem returns 0.
uint32_t crc32bzip2_bit(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = (unsigned char const *)mem;
    if (data == NULL)
        return 0;
    uint32_t reg = ~crc;
    for (unsigned char const *end = data + len; data != end; data++) {
        // The forward form has to move each byte up to the top of the register.
        reg ^= (uint32_t)*data << 24;
        for (int bit = 8; bit > 0; bit--) {
            uint32_t top = reg & 0x80000000;
            reg <<= 1;
            if (top)
                reg ^= 0x4c11db7;
        }
    }
    return ~reg;
}
Reflected CRC-32/ZIP bit-wise:
// Bit-wise reflected (right-shifting) CRC-32 update over mem[0..len-1].
// crc is the CRC of any preceding data (0 to start). It is inverted on entry
// and on exit, so calls can be chained. A NULL mem returns 0.
uint32_t crc32iso_hdlc_bit(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = (unsigned char const *)mem;
    if (data == NULL)
        return 0;
    uint32_t reg = ~crc;
    for (unsigned char const *end = data + len; data != end; data++) {
        // The reflected form takes the byte in at the bottom; no shift needed.
        reg ^= *data;
        for (int bit = 8; bit > 0; bit--) {
            uint32_t low = reg & 1;
            reg >>= 1;
            if (low)
                reg ^= 0xedb88320;
        }
    }
    return ~reg;
}
The main savings is one instruction, the shift up of the data byte, that you can get rid of with the reflected implementation. Also the constant that you & with (1 vs. 0x80000000) is smaller, which may also save an instruction or a register, or perhaps just result in a shorter instruction, depending on the size of immediate values supported in the instruction set.
The shift is avoided for byte-wise calculations as well:
// Byte-wise forward (left-shifting) CRC step over mem[0..len-1]: the table
// index comes from the top register byte, so the register is shifted down by
// 24 first. No inversion of crc is done here. A NULL mem returns 0.
// NOTE(review): table_byte is not defined in this excerpt; presumably it is
// the 256-entry table for the forward polynomial.
uint32_t crc32bzip2_byte(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = (unsigned char const *)mem;
    if (data == NULL)
        return 0;
    for (unsigned char const *end = data + len; data != end; data++) {
        uint32_t slot = ((crc >> 24) ^ *data) & 0xff;
        crc = table_byte[slot] ^ (crc << 8);
    }
    return crc;
}
vs.
// Byte-wise reflected (right-shifting) CRC step over mem[0..len-1]: the table
// index comes straight from the low register byte. No inversion of crc is
// done here. A NULL mem returns 0.
// NOTE(review): table_byte is not defined in this excerpt; presumably it is
// the 256-entry table for the reflected polynomial.
uint32_t crc32iso_hdlc_byte(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = (unsigned char const *)mem;
    if (data == NULL)
        return 0;
    for (unsigned char const *end = data + len; data != end; data++) {
        uint32_t slot = (crc ^ *data) & 0xff;
        crc = table_byte[slot] ^ (crc >> 8);
    }
    return crc;
}

CRC midstream instead of at the end

Normally one would add a CRC to the end of the data stream. The CRC check would include the CRC itself and return 0 if the CRC is correct.
I need to add a CRC to verify my embedded code. It needs to be checked in place, but the top word in memory space is for an interrupt vector. Is it possible to place a key value midstream such that the CRC check returns 0 for the whole code? (or is this unsolvable?)
It's definitely possible. You can run a CRC backwards, which would be fast and easy. Below is example code.
In fact, you can give me the locations of bits scattered wherever in the stream, and if you give me enough of them I can tell you what to set them to to get a zero CRC at the end, or any other CRC value for that matter. My spoof code solves the linear equations to come up with that answer.
However I would wonder why you'd want to do any of that. Why not just know where the CRC is stored and compute the CRC for everything but that, and then check the result against the stored CRC?
// Example of the generation of a "middle" CRC, which is inserted somewhere in
// the middle of a sequence, where the CRC is generated such that the CRC of
// the complete sequence will be zero. This particular CRC has no pre or post
// processing.
//
// Placed into the public domain by Mark Adler, 11 May 2016.
#include <stddef.h> // for size_t
#include <stdint.h> // for uint32_t and uint64_t
#define POLY 0xedb88320         // CRC polynomial

// Byte-wise CRC tables: one drives the CRC forward, the other runs it backward.
uint32_t crc_forward_table[256];
uint32_t crc_reverse_table[256];

// Fill both tables from a bit-wise calculation. The reverse table is indexed
// by the top byte of a forward entry and stores what undoes that forward step.
void crc32_make_tables(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t crc = n;
        for (int k = 8; k > 0; k--) {
            uint32_t low = crc & 1;
            crc >>= 1;
            if (low)
                crc ^= POLY;
        }
        crc_forward_table[n] = crc;
        crc_reverse_table[crc >> 24] = (crc << 8) ^ n;
    }
}
// Run the CRC forward over buf[0..len-1], starting from crc at the front, and
// return the resulting register. Needs crc32_make_tables() to have been called.
uint32_t crc32(uint32_t crc, unsigned char *buf, size_t len) {
    unsigned char *end = buf + len;
    for (unsigned char *at = buf; at != end; at++) {
        uint32_t slot = (crc ^ *at) & 0xff;
        crc = crc_forward_table[slot] ^ (crc >> 8);
    }
    return crc;
}
// Run the CRC backward over buf[0..len-1], starting from crc at the end and
// consuming bytes from the last to the first. Needs crc32_make_tables() to
// have been called.
uint32_t crc32_reverse(uint32_t crc, unsigned char *buf, size_t len) {
    for (size_t left = len; left > 0; left--) {
        uint32_t undo = crc_reverse_table[crc >> 24];
        crc = (crc << 8) ^ undo ^ buf[left - 1];
    }
    return crc;
}
// Store the 32-bit word at pos[0..3], least significant byte first.
void put4(uint32_t word, unsigned char *pos) {
    for (int i = 0; i < 4; i++)
        pos[i] = (unsigned char)(word >> (8 * i));
}
#include <stdlib.h> // for random() and srandomdev()
// Fill dat[0..len-1] with random byte values drawn from random(). Unused
// random bits are kept in a pool beneath a marker 1 bit; when fewer than
// eight remain, the pool is shifted up by 31 and a fresh random() value is
// added, so no bits are discarded except a few left over at the end.
void ranfill(unsigned char *dat, size_t len) {
    uint64_t pool = 1;
    for (unsigned char *end = dat + len; dat != end; dat++) {
        if (pool < 0x100)
            pool = (pool << 31) + random();
        *dat = (unsigned char)pool;
        pool >>= 8;
    }
}
#include <stdio.h> // for printf()
#define LEN 1024        // length of the message without the CRC

// Demonstrate a middle CRC: fill a buffer with random data around a four-byte
// slot, compute the value for that slot from a forward CRC of what precedes it
// and a reverse CRC of the slot (zeroed) plus what follows, then print the CRC
// of the whole sequence, which should be zero.
int main(void) {
    const size_t front = LEN/2;         // random bytes ahead of the slot
    const size_t back = (LEN+1)/2;      // random bytes after the slot
    unsigned char dat[LEN+4];
    unsigned char *slot = dat + front;

    crc32_make_tables();
    srandomdev();
    ranfill(dat, front);
    put4(0, slot);                      // zeros where the CRC will go
    ranfill(slot + 4, back);

    uint32_t ahead = crc32(0, dat, front);
    uint32_t behind = crc32_reverse(0, slot, back + 4);
    put4(ahead ^ behind, slot);         // replace the zeros with the CRC
    printf("%08x\n", crc32(0, dat, LEN+4));
    return 0;
}

Fast search/replace of matching single bytes in a 8-bit array, on ARM

I develop image processing algorithms (using GCC, targeting ARMv7 (Raspberry Pi 2B)).
In particular I use a simple algorithm, which changes index in a mask:
// Replaces every byte equal to oldIndex with newIndex in mask[0..size-1].
// Bytes that do not match are left untouched (never written).
void ChangeIndex(uint8_t * mask, size_t size, uint8_t oldIndex, uint8_t newIndex)
{
    uint8_t * const stop = mask + size;
    for (uint8_t * cell = mask; cell != stop; ++cell)
    {
        if (*cell != oldIndex)
            continue;
        *cell = newIndex;
    }
}
Unfortunately it has poor performance for the target platform.
Is there any way to optimize it?
The ARMv7 platform supports SIMD instructions called NEON.
By using them you can make your code faster:
#include <arm_neon.h>
void ChangeIndex(uint8_t * mask, size_t size, uint8_t oldIndex, uint8_t newIndex)
{
size_t alignedSize = size/16*16, i = 0;
uint8x16_t _oldIndex = vdupq_n_u8(oldIndex);
uint8x16_t _newIndex = vdupq_n_u8(newIndex);
for(; i < alignedSize; i += 16)
{
uint8x16_t oldMask = vld1q_u8(mask + i); // loading of 128-bit vector
uint8x16_t condition = vceqq_u8(oldMask, _oldIndex); // compare two 128-bit vectors
uint8x16_t newMask = vbslq_u8(condition, _newIndex, oldMask); // selective copying of 128-bit vector
vst1q_u8(mask + i, newMask); // saving of 128-bit vector
}
for(; i < size; ++i)
{
if(mask[i] == oldIndex)
mask[i] = newIndex;
}
}

Does anyone have an easy solution to parsing Exp-Golomb codes using C++?

I am trying to decode the SDP sprop-parameter-sets values for an H.264 video stream, and have found that accessing some of the values involves parsing Exp-Golomb encoded data. My method holds the base64-decoded sprop-parameter-sets data in a byte array, which I am now walking bit by bit. I have reached the first piece of Exp-Golomb encoded data and am looking for a suitable code extract to parse these values.
Exp.-Golomb codes of what order ??
If you need to parse an H.264 bit stream (I mean the transport layer), you can write simple functions to access specified bits in the endless bit stream. Bits are indexed from left to right.
// Returns bit number `offset` of the stream at `base`, counting from 0 at the
// most significant bit of the first byte.
inline u_dword get_bit(const u_byte * const base, u_dword offset)
{
    const u_byte octet = base[offset >> 0x3];      // byte that holds the bit
    const u_dword shift = 0x7 - (offset & 0x7);    // distance from that byte's low end
    return (octet >> shift) & 0x1;
}
This function implements decoding of Exp-Golomb codes of order zero (used in H.264).
// Decodes one unsigned Exp-Golomb code starting at bit *offset of the stream
// at `base`, and advances *offset past it.
// NOTE(review): there is no length argument, so nothing stops the prefix scan
// on a stream with no 1 bit; the caller has to supply well-formed data.
u_dword DecodeUGolomb(const u_byte * const base, u_dword * const offset)
{
    // Count the zero bits of the prefix; the terminating 1 bit is consumed too.
    u_dword prefixLen = 0;
    for (;;)
    {
        const u_dword bit = get_bit(base, *offset);
        ++*offset;
        if (bit != 0)
            break;
        ++prefixLen;
    }

    // Start from the leading 1, then append prefixLen more bits below it.
    u_dword value = 1 << prefixLen;
    for (u_dword remaining = prefixLen; remaining > 0; --remaining)
    {
        value |= get_bit(base, (*offset)++) << (remaining - 1);
    }
    return (value - 1);
}
u_dword means unsigned 4 bytes integer.
u_byte means unsigned 1 byte integer.
Note that first byte of each NAL Unit is a specified structure with forbidden bit, NAL reference, and NAL type.
Accepted answer is not a correct implementation. It is giving wrong output. Correct implementation as per pseudo code from
"Sec 9.1 Parsing process for Exp-Golomb codes" spec T-REC-H.264-201304
// Returns the bit at position `pos` of `buffer`, where position 0 is the most
// significant bit of buffer[0] and positions increase toward the least
// significant bit and on into the following bytes.
//
// The shift is 7 - pos%8: the first bit of a byte has seven bits below it.
// A shift of 8 - pos%8 would move a byte-aligned bit completely out of the
// byte and always return 0.
int32_t getBitByPos(unsigned char *buffer, int32_t pos) {
    return (buffer[pos / 8] >> (7 - pos % 8)) & 0x01;
}
// Decodes one unsigned Exp-Golomb code from byteStream starting at bit *index
// and stores the position just past the code back into *index.
// A NULL stream or a starting position of 0 prints "Invalid input" and
// returns 0 without touching *index.
// NOTE(review): no length is passed in, so the prefix scan relies on the
// stream containing a 1 bit.
uint32_t decodeGolomb(unsigned char *byteStream, uint32_t *index) {
    uint32_t cursor = *index;
    if (byteStream == NULL || cursor == 0) {
        printf("Invalid input\n");
        return 0;
    }

    // Count leading zero bits; the 1 bit that ends the run is consumed as well.
    uint32_t leadingZeroBits = 0;
    while (getBitByPos(byteStream, cursor++) == 0)
        leadingZeroBits++;

    // Read that many more bits, most significant first.
    uint32_t suffix = 0;
    for (int32_t remaining = leadingZeroBits; remaining > 0; --remaining)
        suffix = suffix | (getBitByPos(byteStream, cursor++) << (remaining - 1));

    *index = cursor;
    return ((1 << leadingZeroBits) - 1 + suffix);
}
I wrote a c++ jpeg-ls compression library that uses golomb codes. I don't know if Exp-Golomb codes is exactly the same. The library is open source can be found at http://charls.codeplex.com. I use a lookup table to decode golomb codes <= 8 bits in length. Let me know if you have problems finding your way around.
Revised with a function to get N bits from the stream; works parsing H.264 NALs
// Returns bit number `offset` of the stream at `base`, counting from 0 at the
// most significant bit of the first byte.
inline uint32_t get_bit(const uint8_t * const base, uint32_t offset)
{
    const uint8_t octet = base[offset >> 0x3];      // byte that holds the bit
    const uint32_t shift = 0x7 - (offset & 0x7);    // distance from that byte's low end
    return (octet >> shift) & 0x1;
}
// Reads `bits` consecutive bits starting at *offset, most significant first,
// advances *offset past them, and returns them as an unsigned value.
inline uint32_t get_bits(const uint8_t * const base, uint32_t * const offset, uint8_t bits)
{
    uint32_t value = 0;
    for (int taken = 0; taken < bits; ++taken)
    {
        const uint32_t bit = get_bit(base, *offset) ? 1 : 0;
        ++*offset;
        value = (value << 1) | bit;
    }
    return value;
}
// Decodes one unsigned Exp-Golomb code of order zero (as used in H.264),
// starting at bit *offset of the stream at `base`, and advances *offset past it.
// NOTE(review): there is no length argument, so nothing stops the prefix scan
// on a stream with no 1 bit; the caller has to supply well-formed data.
uint32_t DecodeUGolomb(const uint8_t * const base, uint32_t * const offset)
{
    // Count the zero bits of the prefix; the terminating 1 bit is consumed too.
    uint32_t prefixLen = 0;
    for (;;)
    {
        const uint32_t bit = get_bit(base, *offset);
        ++*offset;
        if (bit != 0)
            break;
        ++prefixLen;
    }

    // Start from the leading 1, then append prefixLen more bits below it.
    uint32_t value = 1 << prefixLen;
    for (int32_t shift = prefixLen - 1; shift >= 0; --shift)
    {
        value |= get_bit(base, (*offset)++) << shift;
    }
    return (value - 1);
}