Skip to content

a custom css dtype

i need custom string dtypes that can indicate a column contains css values. we are using the extending pandas instructions to achieve this. we need to do two things:

  1. register an extension css dtype that is aware of the arrow semantics
  2. register an extension array

we're going to skip over arrow in this implementation but we need to support that for the best gains. when we find the css type we'll push this content to the css represented in the table.

%%
## creating a css dtype

new dtypes accompany new arrays. the dtype is responsible for dispatching the creation of the new arrays.

    from pandas._libs import missing as  libmissing
we can only register extensions once, so we need to restart the kernel before re-running this cell

    @pandas.core.dtypes.base.register_extension_dtype
    class CssDtype(pandas.core.arrays.string_.StringDtype):
        """String extension dtype, registered under the name ``"css"``.

        Subclasses the pandas ``StringDtype`` and picks the matching css array
        class in :meth:`construct_array_type`.
        """

        name = "css"
        _na_value = libmissing.NA

        def __eq__(self, other: object) -> bool:
            """Compare against another dtype or a dtype string.

            The strings ``"string"`` and ``"css"`` always compare equal. Any
            other string is parsed with :meth:`construct_from_string`; a string
            that cannot be parsed compares unequal. Two ``CssDtype`` instances
            are equal when their storage matches and they share the same
            ``na_value`` object.
            """
            if isinstance(other, str):
                if other == "string" or other == self.name:  # noqa: PLR1714
                    return True
                try:
                    other = self.construct_from_string(other)
                except (TypeError, ImportError):
                    return False
            if isinstance(other, type(self)):
                return self.storage == other.storage and self.na_value is other.na_value
            return False

        def __hash__(self) -> int:
            """Return the parent dtype's hash.

            A class body that defines ``__eq__`` without ``__hash__`` gets
            ``__hash__ = None``, which makes instances unhashable. Defining it
            here keeps the dtype usable in sets and as a dict key.
            """
            return super().__hash__()

        def __reduce__(self):
            """Pickle as ``CssDtype(storage, na_value)``."""
            return CssDtype, (self.storage, self.na_value)

        @classmethod
        def construct_from_string(cls, string):
            """Build a dtype from its string alias.

            Accepts ``"css"``, ``"css[python]"``, ``"css[pyarrow]"`` and
            ``"css[pyarrow_numpy]"``, plus ``"str"`` when
            ``using_string_dtype()`` is true.

            Raises
            ------
            TypeError
                If ``string`` is not a ``str`` or is not a recognised alias.
            """
            if not isinstance(string, str):
                raise TypeError(
                    f"'construct_from_string' expects a string, got {type(string)}"
                )
            if string == "css":
                return cls()
            # NOTE(review): the "str" alias looks copied from the parent string
            # dtype; confirm that a css dtype should really answer to it.
            if string == "str" and using_string_dtype():
                return cls(na_value=np.nan)
            if string == "css[python]":
                return cls(storage="python")
            if string == "css[pyarrow]":
                return cls(storage="pyarrow")
            if string == "css[pyarrow_numpy]":
                return cls(storage="pyarrow_numpy")
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

        def construct_array_type(self):
            """Return the array class matching this dtype's storage and NA value."""
            if self.storage == "python" and self._na_value is libmissing.NA:
                return CssArray
            if self.storage == "pyarrow" and self._na_value is libmissing.NA:
                return ArrowCssArray
            # NOTE(review): python storage with a non-NA missing value also lands
            # here because there is no CssArrayNumpySemantics class yet.
            return ArrowCssArrayNumpySemantics

        def __from_arrow__(self, array):
            """Delegate the arrow-to-pandas conversion to the parent dtype."""
            return super().__from_arrow__(array)

creating a css dtype

new dtypes accompany new arrays. the dtype is responsible for dispatching the creation of the new arrays.

from pandas._libs import missing as  libmissing

we can only register extensions once, so we need to restart the kernel before re-running this cell

@pandas.core.dtypes.base.register_extension_dtype
class CssDtype(pandas.core.arrays.string_.StringDtype):
    """String extension dtype, registered under the name ``"css"``.

    Subclasses the pandas ``StringDtype`` and picks the matching css array
    class in :meth:`construct_array_type`.
    """

    name = "css"
    _na_value = libmissing.NA

    def __eq__(self, other: object) -> bool:
        """Compare against another dtype or a dtype string.

        The strings ``"string"`` and ``"css"`` always compare equal. Any
        other string is parsed with :meth:`construct_from_string`; a string
        that cannot be parsed compares unequal. Two ``CssDtype`` instances
        are equal when their storage matches and they share the same
        ``na_value`` object.
        """
        if isinstance(other, str):
            if other == "string" or other == self.name:  # noqa: PLR1714
                return True
            try:
                other = self.construct_from_string(other)
            except (TypeError, ImportError):
                return False
        if isinstance(other, type(self)):
            return self.storage == other.storage and self.na_value is other.na_value
        return False

    def __hash__(self) -> int:
        """Return the parent dtype's hash.

        A class body that defines ``__eq__`` without ``__hash__`` gets
        ``__hash__ = None``, which makes instances unhashable. Defining it
        here keeps the dtype usable in sets and as a dict key.
        """
        return super().__hash__()

    def __reduce__(self):
        """Pickle as ``CssDtype(storage, na_value)``."""
        return CssDtype, (self.storage, self.na_value)

    @classmethod
    def construct_from_string(cls, string):
        """Build a dtype from its string alias.

        Accepts ``"css"``, ``"css[python]"``, ``"css[pyarrow]"`` and
        ``"css[pyarrow_numpy]"``, plus ``"str"`` when
        ``using_string_dtype()`` is true.

        Raises
        ------
        TypeError
            If ``string`` is not a ``str`` or is not a recognised alias.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "css":
            return cls()
        # NOTE(review): the "str" alias looks copied from the parent string
        # dtype; confirm that a css dtype should really answer to it.
        if string == "str" and using_string_dtype():
            return cls(na_value=np.nan)
        if string == "css[python]":
            return cls(storage="python")
        if string == "css[pyarrow]":
            return cls(storage="pyarrow")
        if string == "css[pyarrow_numpy]":
            return cls(storage="pyarrow_numpy")
        raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    def construct_array_type(self):
        """Return the array class matching this dtype's storage and NA value."""
        if self.storage == "python" and self._na_value is libmissing.NA:
            return CssArray
        if self.storage == "pyarrow" and self._na_value is libmissing.NA:
            return ArrowCssArray
        # NOTE(review): python storage with a non-NA missing value also lands
        # here because there is no CssArrayNumpySemantics class yet.
        return ArrowCssArrayNumpySemantics

    def __from_arrow__(self, array):
        """Delegate the arrow-to-pandas conversion to the parent dtype."""
        return super().__from_arrow__(array)
%%
## the array part 

    class CssArray(pandas.core.arrays.string_.StringArray):
        """Python-storage string array whose dtype is ``CssDtype``."""

        def __init__(self, values, copy: bool = False) -> None:
            """Build the array from ``values`` and attach a ``CssDtype``.

            Parameters
            ----------
            values
                Input data; unwrapped with ``extract_array`` before use.
            copy : bool, default False
                Forwarded to the parent initializer.
            """
            values = pandas.core.construction.extract_array(values)

            super().__init__(values, copy=copy)
            # NOTE(review): the parent initializer may already run this
            # validation; confirm before removing the second pass.
            if not isinstance(values, type(self)):
                self._validate()
            # Re-initialise the backing so the array carries CssDtype instead of
            # the dtype attached by the parent initializer.
            pandas._libs.arrays.NDArrayBacked.__init__(
                self,
                self._ndarray,
                CssDtype(storage=self._storage, na_value=self._na_value),
            )

        @classmethod
        def _from_sequence(cls, scalars, *, dtype=None, copy=False):
            """Build a ``CssArray`` from ``scalars`` without running ``__init__``.

            Parameters
            ----------
            scalars
                Input values. Masked arrays and pyarrow arrays get dedicated
                handling; anything else goes straight to ``ensure_string_array``.
            dtype : optional
                Dtype for the result; defaults to ``CssDtype(storage="python")``.
                Its ``na_value`` is used for missing entries.
            copy : bool, default False
                Forwarded to ``ensure_string_array``.
            """
            if dtype is None:
                # The dtype keyword is ``storage``, matching every other
                # CssDtype construction in this file.
                dtype = CssDtype(storage="python")

            from pandas.core.arrays.masked import BaseMaskedArray

            na_value = dtype.na_value
            if isinstance(scalars, BaseMaskedArray):
                # Stringify the raw data first, then blank out masked slots.
                na_values = scalars._mask
                result = scalars._data
                result = pandas._libs.lib.ensure_string_array(
                    result, copy=copy, convert_na_value=False
                )
                result[na_values] = na_value

            else:
                if pandas._libs.lib.is_pyarrow_array(scalars):
                    scalars = np.array(scalars)
                result = pandas._libs.lib.ensure_string_array(
                    scalars, na_value=na_value, copy=copy
                )

            # Manually creating new array avoids the validation step in the
            # __init__, so is faster. Refactor need for validation?
            new_string_array = cls.__new__(cls)
            pandas._libs.arrays.NDArrayBacked.__init__(new_string_array, result, dtype)

            return new_string_array


we need to create multiple classes to support arrow and numpy.

    # class CssArrayNumpySemantics(pandas.core.arrays.string_.StringArrayNumpySemantics):
    #     pass

    class ArrowCssArray(pandas.core.arrays.string_arrow.ArrowStringArray):
        """Arrow-backed css array; adds nothing to ``ArrowStringArray``."""

    class ArrowCssArrayNumpySemantics(pandas.core.arrays.string_arrow.ArrowStringArrayNumpySemantics):
        pass

the array part

class CssArray(pandas.core.arrays.string_.StringArray):
    """Python-storage string array whose dtype is ``CssDtype``."""

    def __init__(self, values, copy: bool = False) -> None:
        """Build the array from ``values`` and attach a ``CssDtype``.

        Parameters
        ----------
        values
            Input data; unwrapped with ``extract_array`` before use.
        copy : bool, default False
            Forwarded to the parent initializer.
        """
        values = pandas.core.construction.extract_array(values)

        super().__init__(values, copy=copy)
        # NOTE(review): the parent initializer may already run this
        # validation; confirm before removing the second pass.
        if not isinstance(values, type(self)):
            self._validate()
        # Re-initialise the backing so the array carries CssDtype instead of
        # the dtype attached by the parent initializer.
        pandas._libs.arrays.NDArrayBacked.__init__(
            self,
            self._ndarray,
            CssDtype(storage=self._storage, na_value=self._na_value),
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build a ``CssArray`` from ``scalars`` without running ``__init__``.

        Parameters
        ----------
        scalars
            Input values. Masked arrays and pyarrow arrays get dedicated
            handling; anything else goes straight to ``ensure_string_array``.
        dtype : optional
            Dtype for the result; defaults to ``CssDtype(storage="python")``.
            Its ``na_value`` is used for missing entries.
        copy : bool, default False
            Forwarded to ``ensure_string_array``.
        """
        if dtype is None:
            # The dtype keyword is ``storage``, matching every other
            # CssDtype construction in this file.
            dtype = CssDtype(storage="python")

        from pandas.core.arrays.masked import BaseMaskedArray

        na_value = dtype.na_value
        if isinstance(scalars, BaseMaskedArray):
            # Stringify the raw data first, then blank out masked slots.
            na_values = scalars._mask
            result = scalars._data
            result = pandas._libs.lib.ensure_string_array(
                result, copy=copy, convert_na_value=False
            )
            result[na_values] = na_value

        else:
            if pandas._libs.lib.is_pyarrow_array(scalars):
                scalars = np.array(scalars)
            result = pandas._libs.lib.ensure_string_array(
                scalars, na_value=na_value, copy=copy
            )

        # Manually creating new array avoids the validation step in the
        # __init__, so is faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        pandas._libs.arrays.NDArrayBacked.__init__(new_string_array, result, dtype)

        return new_string_array

we need to create multiple classes to support arrow and numpy.

# class CssArrayNumpySemantics(pandas.core.arrays.string_.StringArrayNumpySemantics):
#     pass

class ArrowCssArray(pandas.core.arrays.string_arrow.ArrowStringArray):
    """Arrow-backed css array; adds nothing to ``ArrowStringArray``."""

class ArrowCssArrayNumpySemantics(pandas.core.arrays.string_arrow.ArrowStringArrayNumpySemantics):
    pass
    # Usage example: cast a Series of strings to the registered "css" dtype.
    Series(["grayscale(1)"]).astype("css")
0    grayscale(1)
dtype: css