1 #ifndef DEVICE_BIT_PACKING_CODEC_H_ 2 #define DEVICE_BIT_PACKING_CODEC_H_ 15 #define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE (256) 18 #define GPUGENIE_CODEC_BPP_MAX_UNCOMPRESSED_LENGTH (4 * GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE) 21 #define GPUGENIE_CODEC_BPP_BLOCK_LENGTH (32) 24 #define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH (GPUGENIE_CODEC_BPP_MAX_UNCOMPRESSED_LENGTH / GPUGENIE_CODEC_BPP_BLOCK_LENGTH) 47 maxbits(
const uint32_t *in, uint32_t &initoffset) {
48 uint32_t accumulator = in[0] - initoffset;
49 for (uint32_t k = 1; k <
BlockSize; ++k) {
50 accumulator |= in[k] - in[k - 1];
52 initoffset = in[BlockSize - 1];
59 uint32_t &initoffset) {
61 initoffset = *(in + BlockSize - 1);
64 __device__ __host__
static void inline 66 const uint32_t bit, uint32_t &initoffset) {
68 initoffset = *(out + BlockSize - 1);
77 uint32_t accumulator = 0;
78 for (uint32_t k = 0; k <
BlockSize; ++k) {
89 __device__ __host__
static void inline 90 unpackblock(
const uint32_t *in, uint32_t *out,
const uint32_t bit, uint32_t &) {
97 encodeArray(uint32_t *in,
const size_t length, uint32_t *out,
size_t &nvalue);
100 decodeArray(
const uint32_t *in,
const size_t , uint32_t *out,
size_t &nvalue);
102 __device__ __host__
static bool 114 name()
const {
return "BitPacking32"; }
144 uint32_t accumulator = 0;
145 for (uint32_t k = 0; k <
BlockSize; ++k) {
146 accumulator |= in[k];
156 __device__ __host__
static void inline 157 unpackblock(
const uint32_t *in, uint32_t *out,
const uint32_t bit, uint32_t &) {
161 __device__ __host__
static bool inline 167 encodeArray(uint32_t *in,
const size_t length, uint32_t *out,
size_t &nvalue)
173 decodeArray(
const uint32_t *in,
const size_t , uint32_t *out,
size_t &nvalue)
178 __device__
const uint32_t*
184 __device__
const uint32_t*
186 assert(gridDim.x == 1);
188 int idx = blockIdx.x * blockDim.x + threadIdx.x;
190 uint32_t length = d_in[0];
192 assert(length <= gridDim.x * blockDim.x * 4);
193 assert(length <= capacity);
203 s_bitSizes[idx] = (d_in[idx/4] >> (24 - 8 * (idx % 4))) & 0xFFu;
204 printf(
"Block %d has bitSize %u\n", idx, s_bitSizes[idx]);
205 assert(s_bitSizes[idx] > 0 && s_bitSizes[idx] <= 32);
215 (uint4 *)s_bitSizesSummed,
227 if (idxUnpack >= length)
238 int lastBit = firstBit + bitSize;
242 uint32_t packed = d_myIn[firstBit / 32];
243 int firstBitInPacked = firstBit % 32;
245 int lastBitInPacked = min(32, lastBit);
246 int lastBitInPackedOverflow = max(0, lastBit - 32) % 32;
248 uint32_t out = ((packed >> firstBitInPacked) % (1U << bitSize)) |
249 (packedOverflow % (1U << lastBitInPackedOverflow)) << (32 - lastBitInPacked);
251 d_out[idxUnpack] = out;
255 return d_in + length;
259 name()
const {
return "BitPacking32Prefixed"; }
static uint32_t gccbits(const uint32_t v)
static void packblockwithoutmask(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset)
__device__ __host__ int decodeArrayParallel_lengthPerBlock()
__device__ static __host__ void integratedfastunpack(const uint32_t initoffset, const uint32_t *in, uint32_t *out, const uint32_t bit)
__device__ static __host__ void fastunpack(const uint32_t *in, uint32_t *out, const uint32_t bit)
This is the top-level namespace of the project.
__device__ __host__ int decodeArrayParallel_threadLoad()
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &initoffset)
__device__ static __host__ bool divisibleby(size_t a, uint32_t x)
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
static uint32_t maxbits(const uint32_t *in, uint32_t &)
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
__device__ __host__ int decodeArrayParallel_minEffectiveLength()
__device__ static __host__ void unpackblock(const uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ void d_scanExclusiveShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
static const uint32_t BlockSize
static const uint32_t HowManyMiniBlocks
__device__ __host__ int decodeArrayParallel_threadLoad()
static void fastpackwithoutmask(const uint32_t *in, uint32_t *out, const uint32_t bit)
static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
static void packblockwithoutmask(uint32_t *in, uint32_t *out, const uint32_t bit, uint32_t &)
__device__ uint32_t * decodeArrayParallel(uint32_t *d_in, size_t, uint32_t *d_out, size_t &capacity)
__device__ __host__ int decodeArrayParallel_threadsPerBlock()
#define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE
static uint32_t maxbits(const uint32_t *in, uint32_t &initoffset)
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
__device__ static __host__ bool divisibleby(size_t a, uint32_t x)
static uint32_t maxbits(const uint32_t *in, uint32_t &)
__device__ const uint32_t * decodeArraySequential(const uint32_t *d_in, const size_t, uint32_t *d_out, size_t &nvalue)
__device__ uint32_t * decodeArraySequential(uint32_t *d_in, size_t, uint32_t *d_out, size_t &nvalue)
__device__ __host__ int decodeArrayParallel_lengthPerBlock()
__device__ __host__ int decodeArrayParallel_maxBlocks()
__device__ const uint32_t * decodeArrayParallel(const uint32_t *d_in, const size_t comprLength, uint32_t *d_out, size_t &capacity)
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
#define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH
__device__ __host__ int decodeArrayParallel_threadsPerBlock()
#define GPUGENIE_CODEC_BPP_BLOCK_LENGTH
static void integratedfastpackwithoutmask(const uint32_t initoffset, const uint32_t *in, uint32_t *out, const uint32_t bit)
__device__ __host__ int decodeArrayParallel_maxBlocks()
static const uint32_t MiniBlockSize