GENIE
DeviceBitPackingCodec.cu
Go to the documentation of this file.
2 
3 #include "DeviceCodecTemplatesImpl.hpp"
4 
5 using namespace genie::compression;
6 
7 // Explicit template instances for CPU decoding wrapper function of simple codecs
8 // NOTE: This is intentionally separated into mutliple codec implementation files in order to facilitiate separate
9 // compilation units, as opposed to defining all these templates in one place.
10 template void
11 genie::compression::decodeArrayParallel<DeviceBitPackingCodec>(int, int, uint32_t*, size_t, uint32_t*, size_t, size_t*);
12 template void
13 genie::compression::decodeArrayParallel<DeviceBitPackingPrefixedCodec>(int, int, uint32_t*, size_t, uint32_t*, size_t, size_t*);
14 
15 void
16 genie::compression::DeviceBitPackingCodec::encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
17 {
18  const uint32_t *const initout(out);
19  *out++ = static_cast<uint32_t>(length);
20  uint32_t Bs[HowManyMiniBlocks];
21  uint32_t init = 0;
22  const uint32_t *const final = in + length;
23  for (; in + HowManyMiniBlocks * MiniBlockSize <= final; in += HowManyMiniBlocks * MiniBlockSize) {
24  uint32_t tmpinit = init;
25  for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
26  Bs[i] = DeviceNoDeltaBlockPacker::maxbits(in + i * MiniBlockSize, tmpinit);
27  }
28  *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3];
29  for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
30  DeviceNoDeltaBlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init);
31  out += Bs[i];
32  }
33  }
34  if (in < final) {
35  size_t howmany = ((final - in) + MiniBlockSize -1) / MiniBlockSize;
36  uint32_t zeroedIn[HowManyMiniBlocks * MiniBlockSize];
37  if (!divisibleby(length, BlockSize)) {
38  // We treat the rest of the block as 0
39  assert(final < in + HowManyMiniBlocks * MiniBlockSize);
40  memset(&zeroedIn[0], 0, HowManyMiniBlocks * MiniBlockSize * sizeof(uint32_t));
41  memcpy(&zeroedIn[0], in, (final - in) * sizeof(uint32_t));
42  assert(zeroedIn[HowManyMiniBlocks * MiniBlockSize - 1] == 0);
43  assert(zeroedIn[(final-in)] == 0);
44  in = zeroedIn;
45  }
46  uint32_t tmpinit = init;
47  memset(&Bs[0], 0, HowManyMiniBlocks * sizeof(uint32_t));
48  for (uint32_t i = 0; i < howmany; ++i) {
49  Bs[i] = DeviceNoDeltaBlockPacker::maxbits(in + i * MiniBlockSize, tmpinit);
50  }
51  *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3];
52  for (uint32_t i = 0; i < howmany; ++i) {
53  DeviceNoDeltaBlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init);
54  out += Bs[i];
55  }
56  }
57  nvalue = out - initout;
58  }
59 
60 const uint32_t*
61 genie::compression::DeviceBitPackingCodec::decodeArray(const uint32_t *in, const size_t /*len*/, uint32_t *out, size_t &nvalue)
62 {
63  const uint32_t actuallength = *in++;
64  const uint32_t *const initout(out);
65  uint32_t Bs[HowManyMiniBlocks];
66  uint32_t init = 0;
67  for (; out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) * HowManyMiniBlocks * MiniBlockSize;
68  out += HowManyMiniBlocks * MiniBlockSize) {
69  Bs[0] = static_cast<uint8_t>(in[0] >> 24);
70  Bs[1] = static_cast<uint8_t>(in[0] >> 16);
71  Bs[2] = static_cast<uint8_t>(in[0] >> 8);
72  Bs[3] = static_cast<uint8_t>(in[0]);
73  ++in;
74  for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
75  DeviceNoDeltaBlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init);
76  in += Bs[i];
77  }
78  }
79  if (out < initout + actuallength) {
80  size_t howmany = ((initout + actuallength) - out + MiniBlockSize - 1) / MiniBlockSize;
81  Bs[0] = static_cast<uint8_t>(in[0] >> 24);
82  Bs[1] = static_cast<uint8_t>(in[0] >> 16);
83  Bs[2] = static_cast<uint8_t>(in[0] >> 8);
84  Bs[3] = static_cast<uint8_t>(in[0]);
85  ++in;
86 
87  for (uint32_t i = 0; i < howmany; ++i) {
88  DeviceNoDeltaBlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init);
89  in += Bs[i];
90  }
91  if (divisibleby(actuallength, BlockSize))
92  out += howmany * MiniBlockSize;
93  else
94  out += ((initout + actuallength) - out);
95  }
96  nvalue = out - initout;
97  assert(nvalue == actuallength);
98  return in;
99 }
100 
101 __device__ uint32_t*
103  uint32_t *d_in, size_t /*length*/, uint32_t *d_out, size_t &nvalue)
104 {
105  uint32_t actuallength = *d_in++;
106  uint32_t *initout(d_out);
107  uint32_t Bs[HowManyMiniBlocks];
108  uint32_t init = 0;
109  for (;d_out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) * HowManyMiniBlocks * MiniBlockSize;
110  d_out += HowManyMiniBlocks * MiniBlockSize) {
111  Bs[0] = static_cast<uint8_t>(d_in[0] >> 24);
112  Bs[1] = static_cast<uint8_t>(d_in[0] >> 16);
113  Bs[2] = static_cast<uint8_t>(d_in[0] >> 8);
114  Bs[3] = static_cast<uint8_t>(d_in[0]);
115  ++d_in;
116  for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
117  DeviceNoDeltaBlockPacker::unpackblock(d_in, d_out + i * MiniBlockSize, Bs[i], init);
118  d_in += Bs[i];
119  }
120  }
121  if (d_out < initout + actuallength) {
122  size_t howmany = ((initout + actuallength) - d_out + MiniBlockSize - 1) / MiniBlockSize;
123  Bs[0] = static_cast<uint8_t>(d_in[0] >> 24);
124  Bs[1] = static_cast<uint8_t>(d_in[0] >> 16);
125  Bs[2] = static_cast<uint8_t>(d_in[0] >> 8);
126  Bs[3] = static_cast<uint8_t>(d_in[0]);
127  ++d_in;
128 
129  for (uint32_t i = 0; i < howmany; ++i) {
130  DeviceNoDeltaBlockPacker::unpackblock(d_in, d_out + i * MiniBlockSize, Bs[i], init);
131  d_in += Bs[i];
132  }
133  if (divisibleby(actuallength, BlockSize))
134  d_out += howmany * MiniBlockSize;
135  else
136  d_out += ((initout + actuallength) - d_out);
137  }
138  nvalue = d_out - initout;
139  assert(nvalue == actuallength);
140  return d_in;
141 }
142 
143 __device__ uint32_t*
145  uint32_t *d_in, size_t /* comprLength */, uint32_t *d_out, size_t &capacity)
146 {
147  int idx = threadIdx.x;
148 
149  uint32_t length = d_in[0]; // first uint32_t is an uncompressed length
150  d_in++;
151  assert(length <= decodeArrayParallel_lengthPerBlock()); // one thread can process 4 values
152  assert(length <= capacity); // not enough capacity in the decompressed array!
153  assert(length > 0);
154 
155  uint32_t blocks = (length + GPUGENIE_CODEC_BPP_BLOCK_LENGTH - 1) / GPUGENIE_CODEC_BPP_BLOCK_LENGTH;
156 
157  __shared__ uint32_t s_bitSizes[GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH];
158  __shared__ uint32_t s_bitOffsets[GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH];
159 
160  if (idx == 0) // thread 0 has to do all the bit sizes summing sequentially
161  {
162  uint32_t bitOffsetsAcc = 0;
163  int inIt = 0;
164  for (int b = 0; b < blocks; b+=4)
165  {
166  s_bitSizes[b] = static_cast<uint8_t>(d_in[inIt] >> 24);
167  s_bitSizes[b+1] = static_cast<uint8_t>(d_in[inIt] >> 16);
168  s_bitSizes[b+2] = static_cast<uint8_t>(d_in[inIt] >> 8);
169  s_bitSizes[b+3] = static_cast<uint8_t>(d_in[inIt]);
170 
171  // account for next block of bitSizes
172  bitOffsetsAcc += 1;
173 
174  // exclusive scan
175  s_bitOffsets[b] = bitOffsetsAcc;
176  bitOffsetsAcc += s_bitSizes[b];
177 
178  s_bitOffsets[b+1] = bitOffsetsAcc;
179  bitOffsetsAcc += s_bitSizes[b+1];
180 
181  s_bitOffsets[b+2] = bitOffsetsAcc;
182  bitOffsetsAcc += s_bitSizes[b+2];
183 
184  s_bitOffsets[b+3] = bitOffsetsAcc;
185  bitOffsetsAcc += s_bitSizes[b+3];
186 
187  // printf("Block %d has bitSize %u and bitOffset %u \n", b, s_bitSizes[b], s_bitOffsets[b]);
188  // printf("Block %d has bitSize %u and bitOffset %u \n", b+1, s_bitSizes[b+1], s_bitOffsets[b+1]);
189  // printf("Block %d has bitSize %u and bitOffset %u \n", b+2, s_bitSizes[b+2], s_bitOffsets[b+2]);
190  // printf("Block %d has bitSize %u and bitOffset %u \n", b+3, s_bitSizes[b+3], s_bitOffsets[b+3]);
191 
192  assert(s_bitSizes[b] <= 32); // bit size has to be in [0,32] range
193  assert(s_bitSizes[b+1] <= 32); // bit size has to be in [0,32] range
194  assert(s_bitSizes[b+2] <= 32); // bit size has to be in [0,32] range
195  assert(s_bitSizes[b+3] <= 32); // bit size has to be in [0,32] range
196 
197  // advance the input iterator to another uint32_t with block sizes
198  inIt += 1 + s_bitSizes[b] + s_bitSizes[b+1] + s_bitSizes[b+2] + s_bitSizes[b+3];
199  }
200  }
201  __syncthreads();
202 
203 
204  // we need at most 4 loops of unpacking for the current setup, since we use exactly 256 threads,
205  // but the maximal unpacked capacity is 1024
207  {
208  int idxUnpack = i * GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE + idx;
209 
210  if (idxUnpack >= length)
211  break;
212 
213  // every 32 threads process one block
214  int blockNum = idxUnpack / GPUGENIE_CODEC_BPP_BLOCK_LENGTH;
215 
216  // read the bit size to unpack
217  int bitSize = s_bitSizes[blockNum];
218 
219  // determine the index of the first and last (exclusive) bit that belongs to the packed number
220  int firstBit = bitSize * (idxUnpack % GPUGENIE_CODEC_BPP_BLOCK_LENGTH);
221  int lastBit = firstBit + bitSize - 1;
222  assert(lastBit < bitSize * GPUGENIE_CODEC_BPP_BLOCK_LENGTH); // cannot exceed bit packed size
223 
224  // choose a packed bit range(s)
225  uint32_t packed = d_in[s_bitOffsets[blockNum] + firstBit / GPUGENIE_CODEC_BPP_BLOCK_LENGTH];
226  int firstBitInPacked = firstBit % 32;
227  uint32_t packedOverflow = d_in[s_bitOffsets[blockNum] + lastBit / GPUGENIE_CODEC_BPP_BLOCK_LENGTH];
228 
229  bool isOverflowing = lastBit % 32 < firstBitInPacked;
230  // int lastBitInPacked = isOverflowing ? 31 : lastBit % 32;
231  int lastBitInPackedOverflow = !isOverflowing ? -1 : lastBit % 32;
232 
233  // compute decompressed value
234  uint32_t outFromPacked =
235  ((packed >> firstBitInPacked) & (0xFFFFFFFF >> (32 - (bitSize - lastBitInPackedOverflow - 1))));
236  uint32_t outFromOverflow =
237  (packedOverflow & (0xFFFFFFFF >> (32-lastBitInPackedOverflow-1))) << (bitSize-lastBitInPackedOverflow-1);
238  uint32_t out = outFromPacked | outFromOverflow;
239 
240  d_out[idxUnpack] = out;
241 
242  // printf("Thread %d unpacked idx %d: bitSize: %d, firstBit: %d, lastBit: %d, firstBitInPacked: %d, lastBitInPacked: %d, lastBitInPackedOverflow: %d, bits in packed: %d, bits in overflow: %d, out: %u\n", idx, idxUnpack, bitSize, firstBit, lastBit, firstBitInPacked, lastBitInPacked, lastBitInPackedOverflow, bitSize - lastBitInPackedOverflow - 1, lastBitInPackedOverflow, out);
243  }
244 
245  capacity = length;
246  int lastBlock = blocks - 1;
247  int offsetPastLastBlock = s_bitOffsets[lastBlock] + s_bitSizes[lastBlock];
248  return d_in + offsetPastLastBlock;
249 }
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
__device__ uint32_t * decodeArrayParallel(uint32_t *d_in, size_t, uint32_t *d_out, size_t &capacity)
#define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
__device__ uint32_t * decodeArraySequential(uint32_t *d_in, size_t, uint32_t *d_out, size_t &nvalue)
#define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH
#define GPUGENIE_CODEC_BPP_BLOCK_LENGTH