Skip to content
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ cdef class Int64Vector(Vector):
cdef Int64VectorData data
cdef ndarray ao

cdef resize(self)
cdef resize(self, Py_ssize_t new_size)
cpdef ndarray to_array(self)
cdef void append(self, int64_t x) noexcept
cdef extend(self, int64_t[:] x)
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from libc.stdlib cimport (
free,
malloc,
)
from libc.string cimport memcpy

import numpy as np

Expand Down
50 changes: 31 additions & 19 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ ctypedef fused vector_data:
Complex64VectorData
StringVectorData

# NOTE(review): this is a diff rendering, so two definitions of needs_resize
# are shown. The first appears to be the removed (pre-change) form, which
# takes a pointer to the vector's data struct and reports "full" only when
# size equals capacity — confirm against the actual file.
cdef bint needs_resize(vector_data *data) noexcept nogil:
return data.size == data.capacity

# Presumably the added (post-change) form: takes the element count and the
# capacity as plain integers and returns true when `nelems` equals or exceeds
# `capacity`. Callers visible in this diff pass either the current size
# (append paths) or the size required after a bulk insert (extend).
cdef bint needs_resize(Py_ssize_t nelems, Py_ssize_t capacity) noexcept nogil:
return nelems >= capacity

# ----------------------------------------------------------------------
# Vector
Expand Down Expand Up @@ -214,8 +215,8 @@ cdef class {{name}}Vector(Vector):
self.ao = np.empty(self.data.capacity, dtype=np.{{dtype}})
self.data.data = <{{c_type}}*>self.ao.data

# NOTE(review): diff rendering — the first header/assignment pair appears to
# be the removed form (capacity always multiplied by 4); the second pair is
# presumably its replacement. Confirm against the actual file.
cdef resize(self):
self.data.capacity = max(self.data.capacity * 4, _INIT_VEC_CAP)
# Post-change form: the caller chooses the target capacity via `new_size`;
# the stored capacity never drops below _INIT_VEC_CAP.
cdef resize(self, Py_ssize_t new_size):
self.data.capacity = max(new_size, _INIT_VEC_CAP)
# Resize the backing ndarray `ao` to the new capacity, then refresh the raw
# data pointer from it (presumably because the resize may move the buffer).
self.ao.resize(self.data.capacity, refcheck=False)
self.data.data = <{{c_type}}*>self.ao.data

Expand All @@ -234,17 +235,28 @@ cdef class {{name}}Vector(Vector):

# Append a single element `x`, growing the backing buffer first when the
# vector is full.
# NOTE(review): this is a diff rendering; where two consecutive lines do the
# same job, the first appears to be the removed line and the second its
# replacement — confirm against the actual file.
cdef void append(self, {{c_type}} x) noexcept:

if needs_resize(&self.data):
if needs_resize(self.data.size, self.data.capacity):
# Growing is refused while `external_view_exists` is set.
# NOTE(review): this method is declared `noexcept`, so the ValueError
# below presumably cannot propagate to the caller — confirm intended.
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()
# Post-change call: request four times the current capacity.
self.resize(self.data.capacity * 4)

append_data_{{dtype}}(&self.data, x)

cdef extend(self, const {{c_type}}[:] x):
    """
    Append every element of ``x`` to the vector.

    Parameters
    ----------
    x : const {{c_type}}[:]
        One-dimensional memoryview of values to append. It may be strided;
        only C-contiguous input takes the bulk-copy path.

    Raises
    ------
    ValueError
        If the backing buffer must grow while ``external_view_exists`` is
        set (contiguous path).
    """
    cdef:
        Py_ssize_t i
        Py_ssize_t x_size = len(x)
        Py_ssize_t needed_size

    if x_size == 0:
        # Nothing to copy; also avoids taking &x[0] on an empty view.
        return

    if not x.is_c_contig():
        # The parameter type accepts strided views (e.g. arr[::2]). memcpy
        # from &x[0] would copy adjacent memory rather than the viewed
        # elements, so copy one element at a time instead.
        for i in range(x_size):
            self.append(x[i])
        return

    needed_size = self.data.size + x_size
    # Strict comparison: an exact fit needs no reallocation, so it must not
    # trip the external-view check either.
    if needed_size > self.data.capacity:
        if self.external_view_exists:
            raise ValueError("external reference but "
                             "Vector.resize() needed")
        self.resize(needed_size)

    # Contiguous source: copy the whole run in one call.
    memcpy(self.data.data + self.data.size, &x[0], x_size * sizeof({{c_type}}))
    self.data.size = needed_size

{{endfor}}

Expand All @@ -260,7 +272,7 @@ cdef class StringVector(Vector):
if self.data.data is NULL:
raise MemoryError()

cdef resize(self):
cdef resize(self, Py_ssize_t new_size):
cdef:
char **orig_data
Py_ssize_t i, orig_capacity
Expand Down Expand Up @@ -297,8 +309,8 @@ cdef class StringVector(Vector):

# Append one C string pointer `x`, growing the backing storage first when the
# vector is full. No external-view check is made on this path.
# NOTE(review): this is a diff rendering; in each pair of consecutive
# near-identical lines the first appears to be the removed line and the second
# its replacement — confirm against the actual file.
cdef void append(self, char *x) noexcept:

if needs_resize(&self.data):
self.resize()
if needs_resize(self.data.size, self.data.capacity):
# Post-change call: request four times the current capacity.
self.resize(self.data.capacity * 4)

append_data_string(&self.data, x)

Expand Down Expand Up @@ -684,18 +696,18 @@ cdef class {{name}}HashTable(HashTable):
continue

seen_na = True
if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.capacity * 4)
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
continue
Expand All @@ -706,19 +718,19 @@ cdef class {{name}}HashTable(HashTable):
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)

if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
if use_result_mask:
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
result_mask.resize(result_mask.data.capacity * 4)
append_data_{{dtype}}(ud, val)
if use_result_mask:
append_data_uint8(rmd, 0)
Expand Down Expand Up @@ -849,9 +861,9 @@ cdef class {{name}}HashTable(HashTable):
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count

if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
uniques.resize()
uniques.resize(uniques.data.capacity * 4)
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
for i in range(n):
kh_put_{{ttype}}(table, labels[i], &ret)
if ret != 0:
if needs_resize(ud):
if needs_resize(ud.size, ud.capacity):
with gil:
idx.resize()
idx.resize(idx.data.capacity * 4)
append_data_{{ttype}}(ud, i)

kh_destroy_{{ttype}}(table)
Expand Down