GENIE
DeviceBitPackingCodec.h
Go to the documentation of this file.
1 #ifndef DEVICE_BIT_PACKING_CODEC_H_
2 #define DEVICE_BIT_PACKING_CODEC_H_
3 
4 #include "DeviceCodecs.h"
6 
7 #include <genie/utility/scan.h>
8 
9 namespace genie
10 {
11 namespace compression
12 {
13 
14 // threadblock size is 256, same for all codecs (can be up to 1024 for compute capability >= 2.0)
15 #define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE (256)
16 
17 // maximum uncompressed length -- read from the first uint32_t of compressed word
18 #define GPUGENIE_CODEC_BPP_MAX_UNCOMPRESSED_LENGTH (4 * GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE)
19 
20 // number of integers encoded in a single BP block, each block uses the same bit size
21 #define GPUGENIE_CODEC_BPP_BLOCK_LENGTH (32)
22 
23 // maximum number of uint8_t values -- read from the next <length> uint8_ts
24 #define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH (GPUGENIE_CODEC_BPP_MAX_UNCOMPRESSED_LENGTH / GPUGENIE_CODEC_BPP_BLOCK_LENGTH)
25 
26 
38 
39 public:
40  static const uint32_t MiniBlockSize = 32;
41  static const uint32_t HowManyMiniBlocks = 4;
42  static const uint32_t BlockSize = MiniBlockSize; // HowManyMiniBlocks * MiniBlockSize;
43 
45 
46  static uint32_t
47  maxbits(const uint32_t *in, uint32_t &initoffset) {
48  uint32_t accumulator = in[0] - initoffset;
49  for (uint32_t k = 1; k < BlockSize; ++k) {
50  accumulator |= in[k] - in[k - 1];
51  }
52  initoffset = in[BlockSize - 1];
53  return DeviceBitPackingHelpers::gccbits(accumulator);
54  }
55 
56  static void inline
57  packblockwithoutmask(const uint32_t *in, uint32_t *out,
58  const uint32_t bit,
59  uint32_t &initoffset) {
61  initoffset = *(in + BlockSize - 1);
62  }
63 
64  __device__ __host__ static void inline
65  unpackblock(const uint32_t *in, uint32_t *out,
66  const uint32_t bit, uint32_t &initoffset) {
67  DeviceBitPackingHelpers::integratedfastunpack(initoffset, in, out, bit);
68  initoffset = *(out + BlockSize - 1);
69  }
70  };
71 
72 
74 
75  static uint32_t
76  maxbits(const uint32_t *in, uint32_t &) {
77  uint32_t accumulator = 0;
78  for (uint32_t k = 0; k < BlockSize; ++k) {
79  accumulator |= in[k];
80  }
81  return DeviceBitPackingHelpers::gccbits(accumulator);
82  }
83 
84  static void inline
85  packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) {
87  }
88 
89  __device__ __host__ static void inline
90  unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) {
92  }
93  };
94 
95 
96  void
97  encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue);
98 
99  const uint32_t*
100  decodeArray(const uint32_t *in, const size_t /*length*/, uint32_t *out, size_t &nvalue);
101 
102  __device__ __host__ static bool
103  divisibleby(size_t a, uint32_t x) {
104  return (a % x == 0);
105  }
106 
107  __device__ uint32_t*
108  decodeArraySequential(uint32_t *d_in, size_t /*length*/, uint32_t *d_out, size_t &nvalue);
109 
110  __device__ uint32_t*
111  decodeArrayParallel(uint32_t *d_in, size_t /* comprLength */, uint32_t *d_out, size_t &capacity);
112 
113  std::string
114  name() const { return "BitPacking32"; }
115 
116  __device__ __host__ int decodeArrayParallel_maxBlocks() { return 1; }
117  __device__ __host__ int decodeArrayParallel_minEffectiveLength() { return 8; }
118  __device__ __host__ int decodeArrayParallel_lengthPerBlock() { return 1024; }
119  __device__ __host__ int decodeArrayParallel_threadsPerBlock() { return 256; }
120  __device__ __host__ int decodeArrayParallel_threadLoad() { return 4; }
121 };
122 
123 
136 
137 public:
138  static const uint32_t MiniBlockSize = 32;
139  static const uint32_t HowManyMiniBlocks = 4;
140  static const uint32_t BlockSize = MiniBlockSize; // HowManyMiniBlocks * MiniBlockSize;
141 
142  static uint32_t
143  maxbits(const uint32_t *in, uint32_t &) {
144  uint32_t accumulator = 0;
145  for (uint32_t k = 0; k < BlockSize; ++k) {
146  accumulator |= in[k];
147  }
148  return DeviceBitPackingHelpers::gccbits(accumulator);
149  }
150 
151  static void inline
152  packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) {
154  }
155 
156  __device__ __host__ static void inline
157  unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &) {
159  }
160 
161  __device__ __host__ static bool inline
162  divisibleby(size_t a, uint32_t x) {
163  return (a % x == 0);
164  }
165 
166  void
167  encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
168  {
169  return;
170  }
171 
172  const uint32_t*
173  decodeArray(const uint32_t *in, const size_t /*length*/, uint32_t *out, size_t &nvalue)
174  {
175  return NULL;
176  }
177 
178  __device__ const uint32_t*
179  decodeArraySequential(const uint32_t *d_in, const size_t /*length*/, uint32_t *d_out, size_t &nvalue)
180  {
181  return NULL;
182  }
183 
184  __device__ const uint32_t*
185  decodeArrayParallel(const uint32_t *d_in, const size_t comprLength, uint32_t *d_out, size_t &capacity) {
186  assert(gridDim.x == 1); // currently only support single block
187 
188  int idx = blockIdx.x * blockDim.x + threadIdx.x;
189 
190  uint32_t length = d_in[0]; // first uint32_t is an uncompressed length
191  d_in++;
192  assert(length <= gridDim.x * blockDim.x * 4); // one thread can process 4 values
193  assert(length <= capacity); // not enough capacity in the decompressed array!
194  assert(length > 0);
195 
196  uint32_t blocks = (length + GPUGENIE_CODEC_BPP_BLOCK_LENGTH - 1) / GPUGENIE_CODEC_BPP_BLOCK_LENGTH;
197 
198  __shared__ uint32_t s_bitSizes[GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH];
199  __shared__ uint32_t s_bitSizesSummed[GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH];
200 
201  if (idx < blocks)
202  {
203  s_bitSizes[idx] = (d_in[idx/4] >> (24 - 8 * (idx % 4))) & 0xFFu;
204  printf("Block %d has bitSize %u\n", idx, s_bitSizes[idx]);
205  assert(s_bitSizes[idx] > 0 && s_bitSizes[idx] <= 32); // bit size has to be in [0,32] range
206  }
207  __syncthreads();
208 
209  if (blocks > 1)
210  {
211  // TODO, if length is short, there is no need to do full scan of lenth
212  // GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH, instead only scan #block bit sizes
214  (uint4 *)s_bitSizes,
215  (uint4 *)s_bitSizesSummed,
218  }
219  __syncthreads();
220 
221  // we need at most 4 loops of unpacking for the current setup, since we use exactly 256 threads,
222  // but the maximal unpacked capacity is 1024
224  {
225  int idxUnpack = i * GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE + idx;
226 
227  if (idxUnpack >= length)
228  break;
229 
230  // every 32 threads process one block
231  const uint32_t *d_myIn = d_in + s_bitSizesSummed[idxUnpack / GPUGENIE_CODEC_BPP_BLOCK_LENGTH];
232 
233  // read the bit size to unpack
234  int bitSize = s_bitSizes[idxUnpack / GPUGENIE_CODEC_BPP_BLOCK_LENGTH];
235 
236  // determine the index of the first and last (exclusive) bit that belongs to the packed number
237  int firstBit = bitSize * (idxUnpack % GPUGENIE_CODEC_BPP_BLOCK_LENGTH);
238  int lastBit = firstBit + bitSize;
239  assert(lastBit <= bitSize * GPUGENIE_CODEC_BPP_BLOCK_LENGTH); // cannot exceed bit packed size
240 
241  //
242  uint32_t packed = d_myIn[firstBit / 32]; // choose a packed source
243  int firstBitInPacked = firstBit % 32;
244  uint32_t packedOverflow = d_myIn[lastBit / GPUGENIE_CODEC_BPP_BLOCK_LENGTH]; // choose a packed source
245  int lastBitInPacked = min(32, lastBit);
246  int lastBitInPackedOverflow = max(0, lastBit - 32) % 32;
247 
248  uint32_t out = ((packed >> firstBitInPacked) % (1U << bitSize)) |
249  (packedOverflow % (1U << lastBitInPackedOverflow)) << (32 - lastBitInPacked);
250 
251  d_out[idxUnpack] = out;
252  }
253 
254  capacity = length;
255  return d_in + length;
256  }
257 
258  std::string
259  name() const { return "BitPacking32Prefixed"; }
260 
261 
262  __device__ __host__ int decodeArrayParallel_maxBlocks() { return 1; }
263  __device__ __host__ int decodeArrayParallel_lengthPerBlock() { return 1024; }
264  __device__ __host__ int decodeArrayParallel_threadsPerBlock() { return 256; }
265  __device__ __host__ int decodeArrayParallel_threadLoad() { return 4; }
266 };
267 
268 } // namespace compression
269 } // namespace genie
270 
271 #endif
static void packblockwithoutmask(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset)
__device__ __host__ int decodeArrayParallel_lengthPerBlock()
__device__ static __host__ void integratedfastunpack(const uint32_t initoffset, const uint32_t *in, uint32_t *out, const uint32_t bit)
__device__ static __host__ void fastunpack(const uint32_t *in, uint32_t *out, const uint32_t bit)
This is the top-level namespace of the project.
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset)
__device__ static __host__ bool divisibleby(size_t a, uint32_t x)
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
__device__ __host__ int decodeArrayParallel_minEffectiveLength()
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ void d_scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
Definition: scan.cu:116
__device__ __host__ int decodeArrayParallel_threadLoad()
static void fastpackwithoutmask(const uint32_t *in, uint32_t *out, const uint32_t bit)
static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ uint32_t * decodeArrayParallel(uint32_t *d_in, size_t, uint32_t *d_out, size_t &capacity)
__device__ __host__ int decodeArrayParallel_threadsPerBlock()
#define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE
static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset)
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
__device__ static __host__ bool divisibleby(size_t a, uint32_t x)
static uint32_t maxbits(const uint32_t *in, uint32_t &)
__device__ const uint32_t * decodeArraySequential(const uint32_t *d_in, const size_t, uint32_t *d_out, size_t &nvalue)
__device__ uint32_t * decodeArraySequential(uint32_t *d_in, size_t, uint32_t *d_out, size_t &nvalue)
__device__ __host__ int decodeArrayParallel_maxBlocks()
__device__ const uint32_t * decodeArrayParallel(const uint32_t *d_in, const size_t comprLength, uint32_t *d_out, size_t &capacity)
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
#define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH
#define GPUGENIE_CODEC_BPP_BLOCK_LENGTH
static void integratedfastpackwithoutmask(const uint32_t initoffset, const uint32_t *in, uint32_t *out, const uint32_t bit)