a custom css dtype
i need custom string dtypes that can indicate a column contains css values. we are using the "extending pandas" instructions to achieve this. we need to do two things:
- register an extension css dtype that is aware of the arrow semantics
- register an extension array
we're going to skip over arrow in this implementation but we need to support that for the best gains. when we find the css type we'll push this content to the css represented in the table.
## creating a css dtype
new dtypes accompany new arrays. the dtype is responsible for dispatching the creation of the new arrays.
from pandas._libs import missing as libmissing
we can only register an extension once, so we need to restart the interpreter before re-running the registration.
class CssDtype(pandas.core.arrays.string_.StringDtype):
name = "css"
_na_value = libmissing.NA
def __eq__(self, other: object) -> bool:
if isinstance(other, str):
if other == "string" or other == self.name: # noqa: PLR1714
return True
other = self.construct_from_string(other)
except (TypeError, ImportError):
return False
if isinstance(other, type(self)):
return self.storage == other.storage and self.na_value is other.na_value
return False
def __reduce__(self):
return CssDtype, (self.storage, self.na_value)
def construct_from_string(cls, string):
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
if string == "css": return cls()
elif string == "str" and using_string_dtype(): return cls(na_value=np.nan)
elif string == "css[python]": return cls(storage="python")
elif string == "css[pyarrow]": return cls(storage="pyarrow")
elif string == "css[pyarrow_numpy]": return cls(storage="pyarrow_numpy")
else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
def construct_array_type(self):
from pandas.core.arrays.string_arrow import (
if self.storage == "python" and self._na_value is libmissing.NA:
return CssArray
elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
return ArrowCssArray
# elif self.storage == "python":
# return CssArrayNumpySemantics
return ArrowCssArrayNumpySemantics
def __from_arrow__(self, array):
return super().__from_arrow__(array,)
## the array part
class CssArray(pandas.core.arrays.string_.StringArray):
    """Python-storage string array whose dtype is ``CssDtype``."""

    def __init__(self, values, copy: bool = False) -> None:
        """Wrap ``values`` and tag the resulting array with a ``CssDtype``.

        Parameters
        ----------
        values
            Array-like of strings; unwrapped with ``extract_array`` first.
        copy
            Forwarded to the parent initializer.
        """
        values = extract_array(values)
        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            # NOTE(review): the body of this guard was missing from the
            # source; validating input that is not already a CssArray is the
            # presumed intent — confirm.
            self._validate()
        # Re-initialise the backing store so the array reports CssDtype.
        # NOTE(review): the call wrapping the CssDtype(...) expression was
        # missing from the source; reconstructed to match the call made in
        # _from_sequence — confirm.
        pandas._libs.arrays.NDArrayBacked.__init__(
            self,
            self._ndarray,
            CssDtype(storage=self._storage, na_value=self._na_value),
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build a ``CssArray`` from a sequence of scalars.

        Parameters
        ----------
        scalars
            Sequence, masked array or pyarrow array of values.
        dtype
            Dtype instance to attach; defaults to ``CssDtype(storage="python")``.
        copy
            Forwarded to ``ensure_string_array``.
        """
        if dtype is None:
            dtype = CssDtype(storage="python")

        from pandas.core.arrays.masked import BaseMaskedArray

        na_value = dtype.na_value
        if isinstance(scalars, BaseMaskedArray):
            # Convert the raw data directly and re-apply the mask afterwards.
            na_values = scalars._mask
            result = scalars._data
            result = pandas._libs.lib.ensure_string_array(
                result, copy=copy, convert_na_value=False
            )
            result[na_values] = na_value
        else:
            if pandas._libs.lib.is_pyarrow_array(scalars):
                scalars = np.array(scalars)
            result = pandas._libs.lib.ensure_string_array(
                scalars, na_value=na_value, copy=copy
            )

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        pandas._libs.arrays.NDArrayBacked.__init__(new_string_array, result, dtype)
        return new_string_array
we need to create multiple classes to support arrow and numpy.
# class CssArrayNumpySemantics(pandas.core.arrays.string_.StringArrayNumpySemantics):
# pass
class ArrowCssArray(pandas.core.arrays.string_arrow.ArrowStringArray):
    """Arrow-backed counterpart of the css string array; adds no behavior yet."""
class ArrowCssArrayNumpySemantics(pandas.core.arrays.string_arrow.ArrowStringArrayNumpySemantics):