@@ -3535,7 +3535,7 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
3535
3535
//------------------------------------------------------------------------------
3536
3536
// general utility
3537
3537
3538
- #define NONZERO_APPEND_INDEX { \
3538
+ #define NONZERO_APPEND_INDEX_RELATIVE { \
3539
3539
if (AK_UNLIKELY(count == capacity)) { \
3540
3540
capacity <<= 1; \
3541
3541
indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
@@ -3546,6 +3546,17 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
3546
3546
indices[count++] = p - p_start; \
3547
3547
} \
3548
3548
3549
// Append the absolute element position `i` to the growing `indices` buffer.
//
// Names the expansion site must provide (all are read or written here):
//   indices  : npy_int64* heap buffer obtained from malloc/realloc
//   count    : number of entries already stored; incremented by this macro
//   capacity : number of entries `indices` can hold; doubled when full
//   i        : the position to record
// The enclosing function must return a pointer and must be inside an
// NPY_BEGIN_THREADS / NPY_END_THREADS region, so that the thread-state
// variable declared by NPY_BEGIN_THREADS_DEF is in scope.
//
// On allocation failure the old buffer is freed, the GIL is re-acquired, a
// MemoryError is set, and the enclosing function returns NULL. The realloc
// result goes through a temporary because a failed realloc leaves the original
// block allocated; assigning straight to `indices` would lose (leak) it.
//
// NOTE(review): the visible code only #undefs NONZERO_APPEND_INDEX_RELATIVE
// after AK_nonzero_1d; this macro stays defined -- consider a matching #undef.
#define NONZERO_APPEND_INDEX_ABSOLUTE { \
    if (AK_UNLIKELY(count == capacity)) { \
        capacity <<= 1; \
        npy_int64 *grown_ = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity); \
        if (grown_ == NULL) { \
            free(indices); \
            NPY_END_THREADS; \
            return PyErr_NoMemory(); \
        } \
        indices = grown_; \
    } \
    indices[count++] = i; \
}
3559
+
3549
3560
// Given a Boolean 1D array (C-contiguous or not), return the index positions of its nonzero elements in an int64 array.
3550
3561
// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing fewer optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
3551
3562
static inline PyObject *
@@ -3567,44 +3578,71 @@ AK_nonzero_1d(PyArrayObject* array) {
3567
3578
Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8 ;
3568
3579
npy_int64 * indices = (npy_int64 * )malloc (sizeof (npy_int64 ) * capacity );
3569
3580
3570
- // array is contiguous, 1d, boolean
3571
3581
NPY_BEGIN_THREADS_DEF ;
3572
3582
NPY_BEGIN_THREADS ;
3573
3583
3574
- npy_bool * p_start = (npy_bool * )PyArray_DATA (array );
3575
- npy_bool * p = p_start ;
3576
- npy_bool * p_end = p + count_max ;
3577
- npy_bool * p_end_roll = p_end - size_div .rem ;
3584
+ if (PyArray_IS_C_CONTIGUOUS (array )) {
3585
+ npy_bool * p_start = (npy_bool * )PyArray_DATA (array );
3586
+ npy_bool * p = p_start ;
3587
+ npy_bool * p_end = p + count_max ;
3588
+ npy_bool * p_end_roll = p_end - size_div .rem ;
3578
3589
3579
- while (p < p_end_roll ) {
3580
- if (* (npy_uint64 * )p == 0 ) {
3581
- p += 8 ; // no true within this 8 byte roll region
3582
- continue ;
3590
+ while (p < p_end_roll ) {
3591
+ if (* (npy_uint64 * )p == 0 ) {
3592
+ p += 8 ; // no true within this 8 byte roll region
3593
+ continue ;
3594
+ }
3595
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3596
+ p ++ ;
3597
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3598
+ p ++ ;
3599
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3600
+ p ++ ;
3601
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3602
+ p ++ ;
3603
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3604
+ p ++ ;
3605
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3606
+ p ++ ;
3607
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3608
+ p ++ ;
3609
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3610
+ p ++ ;
3611
+ }
3612
+ while (p < p_end ) {
3613
+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3614
+ p ++ ;
3583
3615
}
3584
- if (* p ) {NONZERO_APPEND_INDEX ;}
3585
- p ++ ;
3586
- if (* p ) {NONZERO_APPEND_INDEX ;}
3587
- p ++ ;
3588
- if (* p ) {NONZERO_APPEND_INDEX ;}
3589
- p ++ ;
3590
- if (* p ) {NONZERO_APPEND_INDEX ;}
3591
- p ++ ;
3592
- if (* p ) {NONZERO_APPEND_INDEX ;}
3593
- p ++ ;
3594
- if (* p ) {NONZERO_APPEND_INDEX ;}
3595
- p ++ ;
3596
- if (* p ) {NONZERO_APPEND_INDEX ;}
3597
- p ++ ;
3598
- if (* p ) {NONZERO_APPEND_INDEX ;}
3599
- p ++ ;
3600
3616
}
3601
- while (p < p_end ) {
3602
- if (* p ) {NONZERO_APPEND_INDEX ;}
3603
- p ++ ;
3617
+ else {
3618
+ npy_intp i = 0 ; // position within Boolean array
3619
+ npy_intp i_end = count_max ;
3620
+ npy_intp i_end_roll = count_max - size_div .rem ;
3621
+ while (i < i_end_roll ) {
3622
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3623
+ i ++ ;
3624
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3625
+ i ++ ;
3626
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3627
+ i ++ ;
3628
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3629
+ i ++ ;
3630
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3631
+ i ++ ;
3632
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3633
+ i ++ ;
3634
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3635
+ i ++ ;
3636
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3637
+ i ++ ;
3638
+ }
3639
+ while (i < i_end ) {
3640
+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3641
+ i ++ ;
3642
+ }
3604
3643
}
3605
3644
NPY_END_THREADS ;
3606
3645
3607
-
3608
3646
npy_intp dims = {count };
3609
3647
final = PyArray_SimpleNewFromData (1 , & dims , NPY_INT64 , (void * )indices );
3610
3648
if (!final ) {
@@ -3616,7 +3654,7 @@ AK_nonzero_1d(PyArrayObject* array) {
3616
3654
PyArray_CLEARFLAGS ((PyArrayObject * )final , NPY_ARRAY_WRITEABLE );
3617
3655
return final ;
3618
3656
}
3619
- #undef NONZERO_APPEND_INDEX
3657
+ #undef NONZERO_APPEND_INDEX_RELATIVE
3620
3658
3621
3659
static PyObject *
3622
3660
nonzero_1d (PyObject * Py_UNUSED (m ), PyObject * a ) {
@@ -3630,10 +3668,6 @@ nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a) {
3630
3668
PyErr_SetString (PyExc_ValueError , "Array must be of type bool" );
3631
3669
return NULL ;
3632
3670
}
3633
- if (!PyArray_IS_C_CONTIGUOUS (array )) {
3634
- PyErr_SetString (PyExc_ValueError , "Array must be contiguous" );
3635
- return NULL ;
3636
- }
3637
3671
return AK_nonzero_1d (array );
3638
3672
}
3639
3673
0 commit comments