|
209 | 209 | #define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ |
210 | 210 | { \ |
211 | 211 | TYPE1 *TENSOR1##_data = NULL; \ |
212 | | - long *TENSOR1##_counter = NULL; \ |
| 212 | + long *TENSOR1##_counter = NULL, *TENSOR1##_dims = NULL, *TENSOR1##_strides = NULL; \ |
213 | 213 | long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \ |
214 | 214 | TYPE2 *TENSOR2##_data = NULL; \ |
215 | | - long *TENSOR2##_counter = NULL; \ |
| 215 | + long *TENSOR2##_counter = NULL, *TENSOR2##_dims = NULL, *TENSOR2##_strides = NULL; \ |
216 | 216 | long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \ |
217 | 217 | int TH_TENSOR_APPLY_hasFinished = 0; \ |
| 218 | + long TH_TENSOR_dim_index = 0; \ |
218 | 219 | \ |
219 | 220 | TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \ |
220 | 221 | for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \ |
|
232 | 233 | else \ |
233 | 234 | { \ |
234 | 235 | TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \ |
235 | | - for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \ |
| 236 | + TENSOR1##_dim = 1; \ |
| 237 | + for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \ |
236 | 238 | { \ |
237 | | - if(TENSOR1->size[TENSOR1##_dim] != 1) \ |
238 | | - break; \ |
| 239 | + if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \ |
| 240 | +TENSOR1##_dim++; \ |
239 | 241 | } \ |
240 | | - TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \ |
241 | | - TENSOR1##_size = 1; \ |
242 | | - for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \ |
243 | | - { \ |
244 | | - if(TENSOR1->size[TENSOR1##_dim] != 1) \ |
245 | | - { \ |
246 | | - if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \ |
247 | | - TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \ |
248 | | - else \ |
249 | | - break; \ |
| 242 | + TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \ |
| 243 | + TENSOR1##_dims = TENSOR1##_counter + TENSOR1##_dim; \ |
| 244 | + TENSOR1##_strides = TENSOR1##_counter + 2*TENSOR1##_dim; \ |
| 245 | + TH_TENSOR_dim_index = TENSOR1##_dim-1; \ |
| 246 | + TENSOR1##_dims[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1->nDimension-1]; \ |
| 247 | + TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1->nDimension-1]; \ |
| 248 | + for(TENSOR1##_i = TENSOR1##_dim-1; TENSOR1##_i >= 0; --TENSOR1##_i) { \ |
| 249 | + TENSOR1##_counter[TENSOR1##_i] = 0; \ |
| 250 | + } \ |
| 251 | + for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; --TENSOR1##_i) { \ |
| 252 | + if (TENSOR1->stride[TENSOR1##_i] == TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) { \ |
| 253 | + TENSOR1##_dims[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_dims[TH_TENSOR_dim_index]; \ |
| 254 | + } else { \ |
| 255 | + --TH_TENSOR_dim_index; \ |
| 256 | +TENSOR1##_dims[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \ |
| 257 | +TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \ |
250 | 258 | } \ |
251 | 259 | } \ |
252 | | - TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \ |
253 | | - for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \ |
254 | | - TENSOR1##_counter[TENSOR1##_i] = 0; \ |
 | 260 | + /* number of elements in the innermost collapsed section; the inner loop runs until TENSOR1##_i reaches this value */ \ |
| 261 | + TENSOR1##_size = TENSOR1##_dims[TENSOR1##_dim-1]; \ |
 | 262 | + /* stride of the innermost collapsed section; used for offset updates while looping through it */ \ |
| 263 | + TENSOR1##_stride = TENSOR1##_strides[TENSOR1##_dim-1]; \ |
255 | 264 | \ |
256 | 265 | TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \ |
257 | | - for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \ |
| 266 | + TENSOR2##_dim = 1; \ |
| 267 | + for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \ |
258 | 268 | { \ |
259 | | - if(TENSOR2->size[TENSOR2##_dim] != 1) \ |
260 | | - break; \ |
| 269 | + if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \ |
| 270 | +TENSOR2##_dim++; \ |
261 | 271 | } \ |
262 | | - TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \ |
263 | | - TENSOR2##_size = 1; \ |
264 | | - for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \ |
265 | | - { \ |
266 | | - if(TENSOR2->size[TENSOR2##_dim] != 1) \ |
267 | | - { \ |
268 | | - if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \ |
269 | | - TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \ |
270 | | - else \ |
271 | | - break; \ |
| 272 | + TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \ |
| 273 | + TENSOR2##_dims = TENSOR2##_counter + TENSOR2##_dim; \ |
| 274 | + TENSOR2##_strides = TENSOR2##_counter + 2*TENSOR2##_dim; \ |
| 275 | + TH_TENSOR_dim_index = TENSOR2##_dim-1; \ |
| 276 | + TENSOR2##_dims[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2->nDimension-1]; \ |
| 277 | + TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2->nDimension-1]; \ |
| 278 | + for(TENSOR2##_i = TENSOR2##_dim-1; TENSOR2##_i >= 0; --TENSOR2##_i) { \ |
| 279 | + TENSOR2##_counter[TENSOR2##_i] = 0; \ |
| 280 | + } \ |
| 281 | + for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; --TENSOR2##_i) { \ |
| 282 | + if (TENSOR2->stride[TENSOR2##_i] == TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) { \ |
| 283 | + TENSOR2##_dims[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_dims[TH_TENSOR_dim_index]; \ |
| 284 | + } else { \ |
| 285 | + --TH_TENSOR_dim_index; \ |
| 286 | +TENSOR2##_dims[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \ |
| 287 | +TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \ |
272 | 288 | } \ |
273 | 289 | } \ |
274 | | - TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \ |
275 | | - for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \ |
276 | | - TENSOR2##_counter[TENSOR2##_i] = 0; \ |
 | 290 | + /* number of elements in the innermost collapsed section; the inner loop runs until TENSOR2##_i reaches this value */ \ |
| 291 | + TENSOR2##_size = TENSOR2##_dims[TENSOR2##_dim-1]; \ |
 | 292 | + /* stride of the innermost collapsed section; used for offset updates while looping through it */ \ |
| 293 | + TENSOR2##_stride = TENSOR2##_strides[TENSOR2##_dim-1]; \ |
277 | 294 | } \ |
278 | 295 | \ |
279 | 296 | TENSOR1##_i = 0; \ |
|
287 | 304 | \ |
288 | 305 | if(TENSOR1##_i == TENSOR1##_size) \ |
289 | 306 | { \ |
290 | | - if(TENSOR1##_dim == -1) \ |
| 307 | + if(TENSOR1##_dim == 1) \ |
291 | 308 | break; \ |
292 | 309 | \ |
293 | 310 | TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \ |
294 | | - for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \ |
| 311 | + for(TENSOR1##_i = TENSOR1##_dim-2; TENSOR1##_i >= 0; TENSOR1##_i--) \ |
295 | 312 | { \ |
296 | 313 | TENSOR1##_counter[TENSOR1##_i]++; \ |
297 | | - TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \ |
| 314 | + TENSOR1##_data += TENSOR1##_strides[TENSOR1##_i]; \ |
298 | 315 | \ |
299 | | - if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1->size[TENSOR1##_i]) \ |
| 316 | + if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1##_dims[TENSOR1##_i]) \ |
300 | 317 | { \ |
301 | 318 | if(TENSOR1##_i == 0) \ |
302 | 319 | { \ |
|
305 | 322 | } \ |
306 | 323 | else \ |
307 | 324 | { \ |
308 | | - TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \ |
| 325 | + TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1##_strides[TENSOR1##_i]; \ |
309 | 326 | TENSOR1##_counter[TENSOR1##_i] = 0; \ |
310 | 327 | } \ |
311 | 328 | } \ |
|
317 | 334 | \ |
318 | 335 | if(TENSOR2##_i == TENSOR2##_size) \ |
319 | 336 | { \ |
320 | | - if(TENSOR2##_dim == -1) \ |
| 337 | + if(TENSOR2##_dim == 1) \ |
321 | 338 | break; \ |
322 | 339 | \ |
323 | 340 | TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \ |
324 | | - for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \ |
| 341 | + for(TENSOR2##_i = TENSOR2##_dim-2; TENSOR2##_i >= 0; TENSOR2##_i--) \ |
325 | 342 | { \ |
326 | 343 | TENSOR2##_counter[TENSOR2##_i]++; \ |
327 | | - TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \ |
| 344 | + TENSOR2##_data += TENSOR2##_strides[TENSOR2##_i]; \ |
328 | 345 | \ |
329 | | - if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2->size[TENSOR2##_i]) \ |
| 346 | + if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2##_dims[TENSOR2##_i]) \ |
330 | 347 | { \ |
331 | 348 | if(TENSOR2##_i == 0) \ |
332 | 349 | { \ |
|
335 | 352 | } \ |
336 | 353 | else \ |
337 | 354 | { \ |
338 | | - TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \ |
| 355 | + TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2##_strides[TENSOR2##_i]; \ |
339 | 356 | TENSOR2##_counter[TENSOR2##_i] = 0; \ |
340 | 357 | } \ |
341 | 358 | } \ |
|
378 | 395 | long *TENSOR##_counter = NULL, *TENSOR##_dims = NULL, *TENSOR##_strides = NULL; \ |
379 | 396 | long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i; \ |
380 | 397 | int TH_TENSOR_APPLY_hasFinished = 0; \ |
| 398 | + long TH_TENSOR_dim_index = 0; \ |
381 | 399 | \ |
382 | 400 | if(TENSOR->nDimension == 0) \ |
383 | 401 | TH_TENSOR_APPLY_hasFinished = 1; \ |
|
400 | 418 | TENSOR##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR##_dim)); \ |
401 | 419 | TENSOR##_dims = TENSOR##_counter + TENSOR##_dim; \ |
402 | 420 | TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ |
403 | | - long dim_index = TENSOR##_dim-1; \ |
404 | | - TENSOR##_dims[dim_index] = TENSOR->size[TENSOR->nDimension-1]; \ |
405 | | - TENSOR##_strides[dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \ |
| 421 | + TH_TENSOR_dim_index = TENSOR##_dim-1; \ |
| 422 | + TENSOR##_dims[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->nDimension-1]; \ |
| 423 | + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \ |
406 | 424 | /* what is the first stride? */ \ |
407 | 425 | /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ |
408 | 426 | /* storage is given by storage_offset + (i * j), where i is the stride */ \ |
|
412 | 430 | } \ |
413 | 431 | for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; --TENSOR##_i) { \ |
414 | 432 | if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1]) { \ |
415 | | - TENSOR##_dims[dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_dims[dim_index]; \ |
| 433 | + TENSOR##_dims[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_dims[TH_TENSOR_dim_index]; \ |
416 | 434 | } else { \ |
417 | | - --dim_index; \ |
418 | | -TENSOR##_dims[dim_index] = TENSOR->size[TENSOR##_i]; \ |
419 | | -TENSOR##_strides[dim_index] = TENSOR->stride[TENSOR##_i]; \ |
| 435 | + --TH_TENSOR_dim_index; \ |
| 436 | +TENSOR##_dims[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ |
| 437 | +TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ |
420 | 438 | } \ |
421 | 439 | } \ |
422 | 440 | /* it will be used for offset updates while looping through the largest contiguous section */ \ |
|
0 commit comments