3 #include "DeviceCodecTemplatesImpl.hpp" 11 genie::compression::decodeArrayParallel<DeviceBitPackingCodec>(int, int, uint32_t*, size_t, uint32_t*, size_t,
size_t*);
13 genie::compression::decodeArrayParallel<DeviceBitPackingPrefixedCodec>(int, int, uint32_t*, size_t, uint32_t*, size_t,
size_t*);
18 const uint32_t *
const initout(out);
19 *out++ =
static_cast<uint32_t
>(length);
20 uint32_t Bs[HowManyMiniBlocks];
22 const uint32_t *
const final = in + length;
23 for (; in + HowManyMiniBlocks * MiniBlockSize <=
final; in += HowManyMiniBlocks * MiniBlockSize) {
24 uint32_t tmpinit = init;
25 for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
26 Bs[i] = DeviceNoDeltaBlockPacker::maxbits(in + i * MiniBlockSize, tmpinit);
28 *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3];
29 for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
30 DeviceNoDeltaBlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init);
35 size_t howmany = ((
final - in) + MiniBlockSize -1) / MiniBlockSize;
36 uint32_t zeroedIn[HowManyMiniBlocks * MiniBlockSize];
37 if (!divisibleby(length, BlockSize)) {
39 assert(
final < in + HowManyMiniBlocks * MiniBlockSize);
40 memset(&zeroedIn[0], 0, HowManyMiniBlocks * MiniBlockSize *
sizeof(uint32_t));
41 memcpy(&zeroedIn[0], in, (
final - in) *
sizeof(uint32_t));
42 assert(zeroedIn[HowManyMiniBlocks * MiniBlockSize - 1] == 0);
43 assert(zeroedIn[(
final-in)] == 0);
46 uint32_t tmpinit = init;
47 memset(&Bs[0], 0, HowManyMiniBlocks *
sizeof(uint32_t));
48 for (uint32_t i = 0; i < howmany; ++i) {
49 Bs[i] = DeviceNoDeltaBlockPacker::maxbits(in + i * MiniBlockSize, tmpinit);
51 *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) | Bs[3];
52 for (uint32_t i = 0; i < howmany; ++i) {
53 DeviceNoDeltaBlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i], init);
57 nvalue = out - initout;
63 const uint32_t actuallength = *in++;
64 const uint32_t *
const initout(out);
65 uint32_t Bs[HowManyMiniBlocks];
67 for (; out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) * HowManyMiniBlocks * MiniBlockSize;
68 out += HowManyMiniBlocks * MiniBlockSize) {
69 Bs[0] =
static_cast<uint8_t
>(in[0] >> 24);
70 Bs[1] =
static_cast<uint8_t
>(in[0] >> 16);
71 Bs[2] =
static_cast<uint8_t
>(in[0] >> 8);
72 Bs[3] =
static_cast<uint8_t
>(in[0]);
74 for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
75 DeviceNoDeltaBlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init);
79 if (out < initout + actuallength) {
80 size_t howmany = ((initout + actuallength) - out + MiniBlockSize - 1) / MiniBlockSize;
81 Bs[0] =
static_cast<uint8_t
>(in[0] >> 24);
82 Bs[1] =
static_cast<uint8_t
>(in[0] >> 16);
83 Bs[2] =
static_cast<uint8_t
>(in[0] >> 8);
84 Bs[3] =
static_cast<uint8_t
>(in[0]);
87 for (uint32_t i = 0; i < howmany; ++i) {
88 DeviceNoDeltaBlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i], init);
91 if (divisibleby(actuallength, BlockSize))
92 out += howmany * MiniBlockSize;
94 out += ((initout + actuallength) - out);
96 nvalue = out - initout;
97 assert(nvalue == actuallength);
103 uint32_t *d_in,
size_t , uint32_t *d_out,
size_t &nvalue)
105 uint32_t actuallength = *d_in++;
106 uint32_t *initout(d_out);
107 uint32_t Bs[HowManyMiniBlocks];
109 for (;d_out < initout + actuallength / (HowManyMiniBlocks * MiniBlockSize) * HowManyMiniBlocks * MiniBlockSize;
110 d_out += HowManyMiniBlocks * MiniBlockSize) {
111 Bs[0] =
static_cast<uint8_t
>(d_in[0] >> 24);
112 Bs[1] =
static_cast<uint8_t
>(d_in[0] >> 16);
113 Bs[2] =
static_cast<uint8_t
>(d_in[0] >> 8);
114 Bs[3] =
static_cast<uint8_t
>(d_in[0]);
116 for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) {
117 DeviceNoDeltaBlockPacker::unpackblock(d_in, d_out + i * MiniBlockSize, Bs[i], init);
121 if (d_out < initout + actuallength) {
122 size_t howmany = ((initout + actuallength) - d_out + MiniBlockSize - 1) / MiniBlockSize;
123 Bs[0] =
static_cast<uint8_t
>(d_in[0] >> 24);
124 Bs[1] =
static_cast<uint8_t
>(d_in[0] >> 16);
125 Bs[2] =
static_cast<uint8_t
>(d_in[0] >> 8);
126 Bs[3] =
static_cast<uint8_t
>(d_in[0]);
129 for (uint32_t i = 0; i < howmany; ++i) {
130 DeviceNoDeltaBlockPacker::unpackblock(d_in, d_out + i * MiniBlockSize, Bs[i], init);
133 if (divisibleby(actuallength, BlockSize))
134 d_out += howmany * MiniBlockSize;
136 d_out += ((initout + actuallength) - d_out);
138 nvalue = d_out - initout;
139 assert(nvalue == actuallength);
145 uint32_t *d_in,
size_t , uint32_t *d_out,
size_t &capacity)
147 int idx = threadIdx.x;
149 uint32_t length = d_in[0];
151 assert(length <= decodeArrayParallel_lengthPerBlock());
152 assert(length <= capacity);
162 uint32_t bitOffsetsAcc = 0;
164 for (
int b = 0; b < blocks; b+=4)
166 s_bitSizes[b] =
static_cast<uint8_t
>(d_in[inIt] >> 24);
167 s_bitSizes[b+1] =
static_cast<uint8_t
>(d_in[inIt] >> 16);
168 s_bitSizes[b+2] =
static_cast<uint8_t
>(d_in[inIt] >> 8);
169 s_bitSizes[b+3] =
static_cast<uint8_t
>(d_in[inIt]);
175 s_bitOffsets[b] = bitOffsetsAcc;
176 bitOffsetsAcc += s_bitSizes[b];
178 s_bitOffsets[b+1] = bitOffsetsAcc;
179 bitOffsetsAcc += s_bitSizes[b+1];
181 s_bitOffsets[b+2] = bitOffsetsAcc;
182 bitOffsetsAcc += s_bitSizes[b+2];
184 s_bitOffsets[b+3] = bitOffsetsAcc;
185 bitOffsetsAcc += s_bitSizes[b+3];
192 assert(s_bitSizes[b] <= 32);
193 assert(s_bitSizes[b+1] <= 32);
194 assert(s_bitSizes[b+2] <= 32);
195 assert(s_bitSizes[b+3] <= 32);
198 inIt += 1 + s_bitSizes[b] + s_bitSizes[b+1] + s_bitSizes[b+2] + s_bitSizes[b+3];
210 if (idxUnpack >= length)
217 int bitSize = s_bitSizes[blockNum];
221 int lastBit = firstBit + bitSize - 1;
226 int firstBitInPacked = firstBit % 32;
229 bool isOverflowing = lastBit % 32 < firstBitInPacked;
231 int lastBitInPackedOverflow = !isOverflowing ? -1 : lastBit % 32;
234 uint32_t outFromPacked =
235 ((packed >> firstBitInPacked) & (0xFFFFFFFF >> (32 - (bitSize - lastBitInPackedOverflow - 1))));
236 uint32_t outFromOverflow =
237 (packedOverflow & (0xFFFFFFFF >> (32-lastBitInPackedOverflow-1))) << (bitSize-lastBitInPackedOverflow-1);
238 uint32_t out = outFromPacked | outFromOverflow;
240 d_out[idxUnpack] = out;
246 int lastBlock = blocks - 1;
247 int offsetPastLastBlock = s_bitOffsets[lastBlock] + s_bitSizes[lastBlock];
248 return d_in + offsetPastLastBlock;
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
__device__ uint32_t * decodeArrayParallel(uint32_t *d_in, size_t, uint32_t *d_out, size_t &capacity)
#define GPUGENIE_CODEC_BPP_THREADBLOCK_SIZE
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
__device__ uint32_t * decodeArraySequential(uint32_t *d_in, size_t, uint32_t *d_out, size_t &nvalue)
#define GPUGENIE_CODEC_BPP_MAX_BITSIZES_LENGTH
#define GPUGENIE_CODEC_BPP_BLOCK_LENGTH