GENIE
DeviceVarintCodec.cu
Go to the documentation of this file.
1 #include <genie/utility/scan.h>
2 
3 #include "DeviceVarintCodec.h"
4 
5 #include "DeviceCodecTemplatesImpl.hpp"
6 
7 // Explicit template instances for CPU decoding wrapper function of simple codecs
8 // NOTE: This is intentionally separated into mutliple codec implementation files in order to facilitiate separate
9 // compilation units, as opposed to defining all these templates in one place.
10 template void
11 genie::compression::decodeArrayParallel<genie::compression::DeviceVarintCodec>(
12  int, int, uint32_t*, size_t, uint32_t*, size_t, size_t*);
13 
14 void
15 genie::compression::DeviceVarintCodec::encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
16 {
17 
18  uint8_t *bout = reinterpret_cast<uint8_t *>(out);
19  const uint8_t *const initbout = reinterpret_cast<uint8_t *>(out);
20  size_t bytenvalue = nvalue * sizeof(uint32_t);
21  encodeToByteArray(in, length, bout, bytenvalue);
22  bout += bytenvalue;
23  while (needPaddingTo32Bits(bout)) {
24  *bout++ = 0;
25  }
26  const size_t storageinbytes = bout - initbout;
27  assert((storageinbytes % 4) == 0);
28  nvalue = storageinbytes / 4;
29 
30 }
31 
32 void
33 genie::compression::DeviceVarintCodec::encodeToByteArray(uint32_t *in, const size_t length, uint8_t *bout, size_t &nvalue) {
34  const uint8_t *const initbout = bout;
35  for (size_t k = 0; k < length; ++k) {
36  const uint32_t val = in[k];
37 
38  if (val < (1U << 7)) {
39  *bout = static_cast<uint8_t>(val | (1U << 7));
40  ++bout;
41  } else if (val < (1U << 14)) {
42  *bout = extract7bits<0>(val);
43  ++bout;
44  *bout = extract7bitsmaskless<1>(val) | (1U << 7);
45  ++bout;
46  } else if (val < (1U << 21)) {
47  *bout = extract7bits<0>(val);
48  ++bout;
49  *bout = extract7bits<1>(val);
50  ++bout;
51  *bout = extract7bitsmaskless<2>(val) | (1U << 7);
52  ++bout;
53  } else if (val < (1U << 28)) {
54  *bout = extract7bits<0>(val);
55  ++bout;
56  *bout = extract7bits<1>(val);
57  ++bout;
58  *bout = extract7bits<2>(val);
59  ++bout;
60  *bout = extract7bitsmaskless<3>(val) | (1U << 7);
61  ++bout;
62  } else {
63  *bout = extract7bits<0>(val);
64  ++bout;
65  *bout = extract7bits<1>(val);
66  ++bout;
67  *bout = extract7bits<2>(val);
68  ++bout;
69  *bout = extract7bits<3>(val);
70  ++bout;
71  *bout = extract7bitsmaskless<4>(val) | (1U << 7);
72  ++bout;
73  }
74  }
75  nvalue = bout - initbout;
76 }
77 
78 const uint32_t*
79 genie::compression::DeviceVarintCodec::decodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
80 {
81  decodeFromByteArray((const uint8_t *)in, length * sizeof(uint32_t), out, nvalue);
82  return in + length;
83 }
84 
85 
86 
87 const uint8_t*
88 genie::compression::DeviceVarintCodec::decodeFromByteArray(const uint8_t *inbyte, const size_t length, uint32_t *out,
89  size_t &nvalue)
90 {
91  if (length == 0) {
92  nvalue = 0;
93  return inbyte;
94  }
95  const uint8_t *const endbyte = inbyte + length;
96  const uint32_t *const initout(out);
97 
98  while (endbyte > inbyte + 5) {
99 
100  uint8_t c;
101  uint32_t v;
102 
103  c = inbyte[0];
104  v = c & 0x7F;
105  if (c >= 128) {
106  inbyte += 1;
107  *out++ = v;
108  continue;
109  }
110 
111  c = inbyte[1];
112  v |= (c & 0x7F) << 7;
113  if (c >= 128) {
114  inbyte += 2;
115  *out++ = v;
116  continue;
117  }
118 
119  c = inbyte[2];
120  v |= (c & 0x7F) << 14;
121  if (c >= 128) {
122  inbyte += 3;
123  *out++ = v;
124  continue;
125  }
126 
127  c = inbyte[3];
128  v |= (c & 0x7F) << 21;
129  if (c >= 128) {
130  inbyte += 4;
131  *out++ = v;
132  continue;
133  }
134 
135  c = inbyte[4];
136  inbyte += 5;
137  v |= (c & 0x0F) << 28;
138  *out++ = v;
139  }
140  while (endbyte > inbyte) {
141  unsigned int shift = 0;
142  for (uint32_t v = 0; endbyte > inbyte; shift += 7) {
143  uint8_t c = *inbyte++;
144  v += ((c & 127) << shift);
145  if ((c & 128)) {
146  *out++ = v;
147  break;
148  }
149  }
150  }
151  nvalue = out - initout;
152  return inbyte;
153 }
154 
155 
156 __device__ uint32_t*
158  uint32_t *d_in, size_t comprLength, uint32_t *d_out, size_t &nvalue)
159 {
160  return nullptr;
161 }
162 
163 __device__ uint32_t*
165  uint32_t *d_in, size_t comprLength, uint32_t *d_out, size_t &capacity)
166 {
167  int idx = threadIdx.x;
168 
169  assert(comprLength > 0);
170  assert(comprLength <= GPUGENIE_CODEC_VARINT_MAX_UNCOMPR_LENGTH);
171 
172  // each thread stores number of integers that are decoded from the uint32_t processed by the thread
173  // the possible values in this array may be 1..4
174  __shared__ uint32_t s_numInts[GPUGENIE_CODEC_VARINT_MAX_UNCOMPR_LENGTH];
175  __shared__ uint32_t s_numIntsScanned[GPUGENIE_CODEC_VARINT_MAX_UNCOMPR_LENGTH];
176 
177  for (int i = 0; i < (comprLength + GPUGENIE_CODEC_VARINT_THREADBLOCK_SIZE - 1) /
179  {
180  int idxUnpack = i * GPUGENIE_CODEC_VARINT_THREADBLOCK_SIZE + idx;
181 
182  if (idxUnpack < comprLength)
183  s_numInts[idxUnpack] = numIntsStartingHere(d_in, idxUnpack, comprLength);
184  else
185  s_numInts[idxUnpack] = 0;
186  }
187 
188  // do a scan of s_numInts to find d_out position for each thread
189  uint comprLengthPow2 = genie::utility::d_pow2ceil_32(comprLength);
190  uint comprLength4 = (comprLength + 3) / 4;
191  __syncthreads();
192  genie::utility::d_scanExclusivePerBlockShared((uint4 *)s_numIntsScanned, (uint4 *)s_numInts, comprLength4, comprLengthPow2);
193  __syncthreads();
194 
195  int decomprLength = s_numIntsScanned[comprLength-1] + s_numInts[comprLength-1];
196  assert(decomprLength <= capacity);
197 
198  // we need at most 4 loops of unpacking for the current setup, since we use exactly 256 threads,
199  // but the maximal unpacked capacity is 1024
200  for (int i = 0; i < (comprLength + GPUGENIE_CODEC_VARINT_THREADBLOCK_SIZE - 1) /
202  {
203  int idxUnpack = i * GPUGENIE_CODEC_VARINT_THREADBLOCK_SIZE + idx;
204 
205  if (idxUnpack >= comprLength)
206  break;
207 
208  uint8_t* myBytes = reinterpret_cast<uint8_t*>(d_in);
209  int myBytesIdx = idxUnpack * 4;
210  uint8_t myCurrByte = myBytes[myBytesIdx++];
211  uint8_t myPrevByte = idxUnpack > 0 ? (d_in[idxUnpack-1] >> 24) : 0xFF;
212 
213  int myNumInts = (int)s_numInts[idxUnpack];
214  int myOutIdx = (int)s_numIntsScanned[idxUnpack];
215 
216  assert(myNumInts <= 4);
217  assert(myOutIdx < decomprLength || myNumInts == 0);
218 
219  // find first starting position position, such that previous byte has 1 on the highest position (last byte of
220  // it's corresponding int)
221  while (myNumInts && !(myPrevByte & 128))
222  {
223  myPrevByte = myCurrByte;
224  assert(myBytesIdx < comprLength * 4);
225  myCurrByte = myBytes[myBytesIdx++];
226  }
227 
228  for (int j = 0; j < myNumInts; j++)
229  {
230  uint32_t decoded = 0;
231  for (unsigned int shift = 0; ; shift += 7)
232  {
233  decoded += (myCurrByte & 127) << shift;
234  if (myBytesIdx == comprLength * 4) // pointer past the compressed input, must be last int available
235  {
236  d_out[myOutIdx + j] = decoded;
237  assert(j == myNumInts - 1);
238  break;
239  }
240  else if (myCurrByte & 128) // this was last byte of the currently decompressed int
241  {
242  d_out[myOutIdx + j] = decoded;
243  // printf("Thread: %d unpacked idx: %d int number %d out of numInts %d, value: %u, saved into d_out[%d]\n", idx, idxUnpack, j, myNumInts, decoded, myOutIdx + j);
244  assert(myBytesIdx < comprLength * 4);
245  myCurrByte = myBytes[myBytesIdx++];
246  break;
247  }
248  assert(myBytesIdx < comprLength * 4);
249  myCurrByte = myBytes[myBytesIdx++];
250  }
251  }
252  }
253 
254  capacity = decomprLength;
255  return d_in + comprLength;
256 
257 }
258 
259 __device__ int
260 genie::compression::DeviceVarintCodec::numIntsStartingHere(uint32_t *d_in, int idxUnpack, int comprLength)
261 {
262  // This function checks the last byte of the preceding uint32_t and the first 3 bytes of the current uint32_t, i.e.
263  // d_in[idxUnpack]. If such byte value has 1 in the highest bit, then a new int must start in this uint32_t
264  uint8_t* nextBytePtr = reinterpret_cast<uint8_t*>(d_in + idxUnpack);
265  uint8_t prevByte = idxUnpack > 0 ? (d_in[idxUnpack-1] >> 24) : 0xFF;
266  int numIntsStartingHere = 0;
267 
268  for (int i = 0; i < 4; i++)
269  {
270  if (prevByte & 128)
271  numIntsStartingHere++;
272 
273  prevByte = *nextBytePtr;
274  nextBytePtr++;
275  }
276  // the last compressed uint32_t may have leading bits of some bytes set to 0, even though no integer starts there
277  if (idxUnpack == comprLength - 1 && !(prevByte & 128))
278  numIntsStartingHere--;
279  return numIntsStartingHere;
280 }
__device__ void d_scanExclusivePerBlockShared(uint4 *d_Dst, uint4 *d_Src, uint activeThreads, uint pow2size)
Definition: scan.cu:95
const uint32_t * decodeArray(const uint32_t *in, const size_t, uint32_t *out, size_t &nvalue)
__device__ uint32_t * decodeArrayParallel(uint32_t *d_in, size_t, uint32_t *d_out, size_t &capacity)
void encodeArray(uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue)
__device__ uint d_pow2ceil_32(uint x)
Definition: scan.cu:279
#define GPUGENIE_CODEC_VARINT_MAX_UNCOMPR_LENGTH
__device__ uint32_t * decodeArraySequential(uint32_t *d_in, const size_t, uint32_t *d_out, size_t &nvalue)
#define GPUGENIE_CODEC_VARINT_THREADBLOCK_SIZE