# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Avoid name clash with `pa.struct` function
import struct as _struct


cdef class Tensor(_Weakrefable):
    """
    An n-dimensional array a.k.a Tensor.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import numpy as np
    >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
    >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
    type: int32
    shape: (2, 3)
    strides: (12, 4)
    """

    def __init__(self):
        raise TypeError("Do not call Tensor's constructor directly, use one "
                        "of the `pyarrow.Tensor.from_*` functions instead.")

    cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
        self.sp_tensor = sp_tensor
        self.tp = sp_tensor.get()
        self.type = pyarrow_wrap_data_type(self.tp.type())
        # Pre-pack shape/strides once; __getbuffer__ hands out pointers into
        # these bytes objects, so they must live as long as the Tensor does.
        self._ssize_t_shape = self._make_shape_or_strides_buffer(self.shape)
        self._ssize_t_strides = self._make_shape_or_strides_buffer(
            self.strides)

    def _make_shape_or_strides_buffer(self, values):
        """
        Make a bytes object holding an array of `values` cast to `Py_ssize_t`.
        """
        # "n" is the struct format code for a native ssize_t.
        return _struct.pack(f"{len(values)}n", *values)

    def __repr__(self):
        return """
type: {0.type}
shape: {0.shape}
strides: {0.strides}""".format(self)

    @staticmethod
    def from_numpy(obj, dim_names=None):
        """
        Create a Tensor from a numpy array.

        Parameters
        ----------
        obj : numpy.ndarray
            The source numpy array
        dim_names : list, optional
            Names of each dimension of the Tensor.

        Returns
        -------
        pyarrow.Tensor

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        type: int32
        shape: (2, 3)
        strides: (12, 4)
        """
        cdef:
            vector[c_string] c_dim_names
            shared_ptr[CTensor] ctensor

        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        check_status(NdarrayToTensor(c_default_memory_pool(), obj,
                                     c_dim_names, &ctensor))
        return pyarrow_wrap_tensor(ctensor)

    def to_numpy(self):
        """
        Convert arrow::Tensor to numpy.ndarray with zero copy

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.to_numpy()
        array([[  2,   2,   4],
               [  4,   5, 100]], dtype=int32)
        """
        cdef PyObject* out

        check_status(TensorToNdarray(self.sp_tensor, self, &out))
        return PyObject_to_object(out)

    def equals(self, Tensor other not None):
        """
        Return true if the tensors contain exactly equal data.

        Parameters
        ----------
        other : Tensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a Tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32)
        >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a","b"])
        >>> tensor.equals(tensor)
        True
        >>> tensor.equals(tensor2)
        False
        """
        # `not None` in the signature: a typed argument otherwise admits
        # None, and `other.tp` would then be read from a non-Tensor object.
        return self.tp.Equals(deref(other.tp))

    def __eq__(self, other):
        if isinstance(other, Tensor):
            return self.equals(other)
        else:
            return NotImplemented

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.dim_name(0)
        'dim1'
        >>> tensor.dim_name(1)
        'dim2'
        """
        return frombytes(self.tp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this tensor dimensions.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.dim_names
        ['dim1', 'dim2']
        """
        return [frombytes(x) for x in tuple(self.tp.dim_names())]

    @property
    def is_mutable(self):
        """
        Is this tensor mutable or immutable.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.is_mutable
        True
        """
        return self.tp.is_mutable()

    @property
    def is_contiguous(self):
        """
        Is this tensor contiguous in memory.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.is_contiguous
        True
        """
        return self.tp.is_contiguous()

    @property
    def ndim(self):
        """
        The dimension (n) of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.ndim
        2
        """
        return self.tp.ndim()

    @property
    def size(self):
        """
        The size of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.size
        6
        """
        return self.tp.size()

    @property
    def shape(self):
        """
        The shape of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.shape
        (2, 3)
        """
        # Cython knows how to convert a vector[T] to a Python list
        return tuple(self.tp.shape())

    @property
    def strides(self):
        """
        Strides of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.strides
        (12, 4)
        """
        return tuple(self.tp.strides())

    def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
        # Buffer-protocol export of the tensor data.
        # Reject unsupported element types before touching the view.
        pep3118_format = self.type.pep3118_format
        if pep3118_format is None:
            raise NotImplementedError("type %s not supported for buffer "
                                      "protocol" % (self.type,))
        # Explicit casts: the data pointer and the packed bytes buffers do
        # not have the pointer types the Py_buffer fields are declared with.
        buffer.buf = <char *> self.tp.data().get().data()
        buffer.format = pep3118_format
        buffer.itemsize = self.type.bit_width // 8
        buffer.internal = NULL
        buffer.len = self.tp.size() * buffer.itemsize
        buffer.ndim = self.tp.ndim()
        buffer.obj = self
        if self.tp.is_mutable():
            buffer.readonly = 0
        else:
            buffer.readonly = 1
        # Point at the Py_ssize_t arrays packed once in init().
        buffer.shape = <Py_ssize_t *> cp.PyBytes_AsString(
            self._ssize_t_shape)
        buffer.strides = <Py_ssize_t *> cp.PyBytes_AsString(
            self._ssize_t_strides)
        buffer.suboffsets = NULL


ctypedef CSparseCOOIndex* _CSparseCOOIndexPtr


cdef class SparseCOOTensor(_Weakrefable):
    """
    A sparse COO tensor.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCOOTensor's constructor directly, "
                        "use one of the `pyarrow.SparseCOOTensor.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor):
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return """
type: {0.type}
shape: {0.shape}""".format(self)

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCOOTensor

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor
        """
        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))

    @staticmethod
    def from_numpy(data, coords, shape, dim_names=None):
        """
        Create arrow::SparseCOOTensor from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the rows.
        coords : numpy.ndarray
            Coordinates of the data. Must be 2-dimensional; it is converted
            to a C-contiguous int64 array before use.
        shape : tuple
            Shape of the tensor.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        ValueError
            If `coords` is not 2-dimensional.
        """
        cdef shared_ptr[CSparseCOOTensor] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce precondition for SparseCOOTensor indices
        coords = np.require(coords, dtype='i8', requirements='C')
        if coords.ndim != 2:
            raise ValueError("Expected 2-dimensional array for "
                             "SparseCOOTensor indices")

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor

        Parameters
        ----------
        obj : scipy.sparse.coo_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.coo_matrix.
        """
        import scipy.sparse
        if not isinstance(obj, scipy.sparse.coo_matrix):
            raise TypeError(
                "Expected scipy.sparse.coo_matrix, got {}".format(type(obj)))

        cdef shared_ptr[CSparseCOOTensor] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in obj.shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        row = obj.row
        col = obj.col

        # When SciPy's coo_matrix has canonical format, its indices matrix is
        # sorted in column-major order.  As Arrow's SparseCOOIndex is sorted
        # in row-major order if it is canonical, we must sort the indices
        # matrix into row-major order here to keep it canonical.
        if obj.has_canonical_format:
            order = np.lexsort((col, row))  # sort in row-major order
            row = row[order]
            col = col[order]
        coords = np.vstack([row, col]).T
        coords = np.require(coords, dtype='i8', requirements='C')

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               obj.data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_pydata_sparse(obj, dim_names=None):
        """
        Convert pydata/sparse.COO to arrow::SparseCOOTensor.

        Parameters
        ----------
        obj : pydata.sparse.COO
            The sparse multidimensional array that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        TypeError
            If `obj` is not a sparse.COO.
        """
        import sparse
        if not isinstance(obj, sparse.COO):
            raise TypeError(
                "Expected sparse.COO, got {}".format(type(obj)))

        cdef shared_ptr[CSparseCOOTensor] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in obj.shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Transposed so that each row of `coords` is one coordinate tuple,
        # the same layout from_scipy builds with vstack(...).T.
        coords = np.require(obj.coords.T, dtype='i8', requirements='C')

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               obj.data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCOOTensor.

        Parameters
        ----------
        obj : Tensor
            The tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        TypeError
            If `obj` is not a pyarrow.Tensor.
        """
        cdef shared_ptr[CSparseCOOTensor] csparse_tensor
        cdef shared_ptr[CTensor] ctensor

        # Validate up front: the pointer is handed to C++ with the GIL
        # released, where a bad input can no longer be reported cleanly.
        if not isinstance(obj, Tensor):
            raise TypeError(
                "Expected pyarrow.Tensor, got {}".format(type(obj)))
        ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCOOTensor(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy.

        Returns
        -------
        tuple
            ``(data, coords)``.
        """
        cdef PyObject* out_data
        cdef PyObject* out_coords

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        return PyObject_to_object(out_data), PyObject_to_object(out_coords)

    def to_scipy(self):
        """
        Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix.

        Returns
        -------
        scipy.sparse.coo_matrix
        """
        from scipy.sparse import coo_matrix
        cdef PyObject* out_data
        cdef PyObject* out_coords

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        data = PyObject_to_object(out_data)
        coords = PyObject_to_object(out_coords)
        row, col = coords[:, 0], coords[:, 1]
        # `data` is indexed as a 2-D array; take its first column as the
        # flat value vector.
        result = coo_matrix((data[:, 0], (row, col)), shape=self.shape)

        # As described in from_scipy above, the indices matrix is sorted in
        # row-major order when SciPy's coo_matrix has canonical format.
        # So we must call sum_duplicates() to give the resulting coo_matrix
        # canonical format as well.
        if self.has_canonical_format:
            result.sum_duplicates()
        return result

    def to_pydata_sparse(self):
        """
        Convert arrow::SparseCOOTensor to pydata/sparse.COO.

        Returns
        -------
        sparse.COO
        """
        from sparse import COO
        cdef PyObject* out_data
        cdef PyObject* out_coords

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        data = PyObject_to_object(out_data)
        coords = PyObject_to_object(out_coords)
        result = COO(data=data[:, 0], coords=coords.T, shape=self.shape)
        return result

    def to_tensor(self):
        """
        Convert arrow::SparseCOOTensor to arrow::Tensor.

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor
        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCOOTensor other not None):
        """
        Return true if sparse tensors contain exactly equal data.

        Parameters
        ----------
        other : SparseCOOTensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCOOTensor.
        """
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if isinstance(other, SparseCOOTensor):
            return self.equals(other)
        else:
            return NotImplemented

    @property
    def is_mutable(self):
        """
        Is this sparse tensor mutable or immutable.
        """
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """
        The number of dimensions of this sparse tensor.
        """
        return self.stp.ndim()

    @property
    def shape(self):
        """
        The shape of this sparse tensor, as a tuple.
        """
        # Cython knows how to convert a vector[T] to a Python list
        return tuple(self.stp.shape())

    @property
    def size(self):
        """
        The size of this sparse tensor.
        """
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this sparse tensor's dimensions, as a tuple of str.
        """
        names_tuple = tuple(self.stp.dim_names())
        return tuple(frombytes(x) for x in names_tuple)

    @property
    def non_zero_length(self):
        """
        The ``non_zero_length()`` of the underlying sparse tensor
        (presumably the number of stored non-zero values).
        """
        return self.stp.non_zero_length()

    @property
    def has_canonical_format(self):
        """
        Whether the COO index reports itself as canonical.

        Returns True when no sparse index pointer is available.
        """
        cdef:
            _CSparseCOOIndexPtr csi

        csi = <_CSparseCOOIndexPtr>(self.stp.sparse_index().get())
        if csi != nullptr:
            return csi.is_canonical()
        return True


cdef class SparseCSRMatrix(_Weakrefable):
    """
    A sparse CSR matrix.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSRMatrix's constructor directly, "
                        "use one of the `pyarrow.SparseCSRMatrix.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor):
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return """
type: {0.type}
shape: {0.shape}""".format(self)

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSRMatrix

        Parameters
        ----------
        obj : numpy.ndarray
            The dense numpy array that should be converted.
        dim_names : list, optional
            The names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix
        """
        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))

    @staticmethod
    def from_numpy(data, indptr, indices, shape, dim_names=None):
        """
        Create arrow::SparseCSRMatrix from numpy.ndarrays.

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse matrix.
        indptr : numpy.ndarray
            Range of the rows,
            The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
        indices : numpy.ndarray
            Column indices of the corresponding non-zero values.
        shape : tuple
            Shape of the matrix.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix

        Raises
        ------
        ValueError
            If `indptr` or `indices` is not 1-dimensional.
        """
        cdef shared_ptr[CSparseCSRMatrix] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce precondition for SparseCSRMatrix indices
        indptr = np.require(indptr, dtype='i8')
        indices = np.require(indices, dtype='i8')
        if indptr.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSRMatrix indptr")
        if indices.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSRMatrix indices")

        check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
                                               data, indptr, indices, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.

        Parameters
        ----------
        obj : scipy.sparse.csr_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.csr_matrix.
        """
        import scipy.sparse
        if not isinstance(obj, scipy.sparse.csr_matrix):
            raise TypeError(
                "Expected scipy.sparse.csr_matrix, got {}".format(type(obj)))

        cdef shared_ptr[CSparseCSRMatrix] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in obj.shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce precondition for CSparseCSRMatrix indices
        indptr = np.require(obj.indptr, dtype='i8')
        indices = np.require(obj.indices, dtype='i8')

        check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
                                               obj.data, indptr, indices,
                                               c_shape, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSRMatrix.

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSRMatrix

        Raises
        ------
        TypeError
            If `obj` is not a pyarrow.Tensor.
        """
        cdef shared_ptr[CSparseCSRMatrix] csparse_tensor
        cdef shared_ptr[CTensor] ctensor

        # Validate up front: the pointer is handed to C++ with the GIL
        # released, where a bad input can no longer be reported cleanly.
        if not isinstance(obj, Tensor):
            raise TypeError(
                "Expected pyarrow.Tensor, got {}".format(type(obj)))
        ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSRMatrix(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy.

        Returns
        -------
        tuple
            ``(data, indptr, indices)``.
        """
        cdef PyObject* out_data
        cdef PyObject* out_indptr
        cdef PyObject* out_indices

        check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_scipy(self):
        """
        Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix.

        Returns
        -------
        scipy.sparse.csr_matrix
        """
        from scipy.sparse import csr_matrix
        cdef PyObject* out_data
        cdef PyObject* out_indptr
        cdef PyObject* out_indices

        check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))

        data = PyObject_to_object(out_data)
        indptr = PyObject_to_object(out_indptr)
        indices = PyObject_to_object(out_indices)
        # `data` is indexed as a 2-D array; take its first column as the
        # flat value vector.
        result = csr_matrix((data[:, 0], indices, indptr), shape=self.shape)
        return result

    def to_tensor(self):
        """
        Convert arrow::SparseCSRMatrix to arrow::Tensor.

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor
        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSRMatrix other not None):
        """
        Return true if sparse tensors contain exactly equal data.

        Parameters
        ----------
        other : SparseCSRMatrix
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSRMatrix.
        """
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if isinstance(other, SparseCSRMatrix):
            return self.equals(other)
        else:
            return NotImplemented

    @property
    def is_mutable(self):
        """
        Is this sparse matrix mutable or immutable.
        """
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """
        The number of dimensions of this sparse matrix.
        """
        return self.stp.ndim()

    @property
    def shape(self):
        """
        The shape of this sparse matrix, as a tuple.
        """
        # Cython knows how to convert a vector[T] to a Python list
        return tuple(self.stp.shape())

    @property
    def size(self):
        """
        The size of this sparse matrix.
        """
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this sparse matrix's dimensions, as a tuple of str.
        """
        names_tuple = tuple(self.stp.dim_names())
        return tuple(frombytes(x) for x in names_tuple)

    @property
    def non_zero_length(self):
        """
        The ``non_zero_length()`` of the underlying sparse matrix
        (presumably the number of stored non-zero values).
        """
        return self.stp.non_zero_length()


cdef class SparseCSCMatrix(_Weakrefable):
    """
    A sparse CSC matrix.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSCMatrix's constructor directly, "
                        "use one of the `pyarrow.SparseCSCMatrix.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor):
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return """
type: {0.type}
shape: {0.shape}""".format(self)

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix
        """
        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))

    @staticmethod
    def from_numpy(data, indptr, indices, shape, dim_names=None):
        """
        Create arrow::SparseCSCMatrix from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse matrix.
        indptr : numpy.ndarray
            Range of the columns,
            The i-th column spans from `indptr[i]` to `indptr[i+1]`
            in the data.
        indices : numpy.ndarray
            Row indices of the corresponding non-zero values.
        shape : tuple
            Shape of the matrix.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix

        Raises
        ------
        ValueError
            If `indptr` or `indices` is not 1-dimensional.
        """
        cdef shared_ptr[CSparseCSCMatrix] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce precondition for SparseCSCMatrix indices
        indptr = np.require(indptr, dtype='i8')
        indices = np.require(indices, dtype='i8')
        if indptr.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSCMatrix indptr")
        if indices.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSCMatrix indices")

        check_status(NdarraysToSparseCSCMatrix(c_default_memory_pool(),
                                               data, indptr, indices, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : scipy.sparse.csc_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.csc_matrix.
        """
        import scipy.sparse
        if not isinstance(obj, scipy.sparse.csc_matrix):
            raise TypeError(
                "Expected scipy.sparse.csc_matrix, got {}".format(type(obj)))

        cdef shared_ptr[CSparseCSCMatrix] csparse_tensor
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in obj.shape:
            c_shape.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce precondition for CSparseCSCMatrix indices
        indptr = np.require(obj.indptr, dtype='i8')
        indices = np.require(obj.indices, dtype='i8')

        check_status(NdarraysToSparseCSCMatrix(c_default_memory_pool(),
                                               obj.data, indptr, indices,
                                               c_shape, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSCMatrix

        Raises
        ------
        TypeError
            If `obj` is not a pyarrow.Tensor.
        """
        cdef shared_ptr[CSparseCSCMatrix] csparse_tensor
        cdef shared_ptr[CTensor] ctensor

        # Validate up front: the pointer is handed to C++ with the GIL
        # released, where a bad input can no longer be reported cleanly.
        if not isinstance(obj, Tensor):
            raise TypeError(
                "Expected pyarrow.Tensor, got {}".format(type(obj)))
        ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSCMatrix(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy

        Returns
        -------
        tuple
            ``(data, indptr, indices)``.
        """
        cdef PyObject* out_data
        cdef PyObject* out_indptr
        cdef PyObject* out_indices

        check_status(SparseCSCMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_scipy(self):
        """
        Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix

        Returns
        -------
        scipy.sparse.csc_matrix
        """
        from scipy.sparse import csc_matrix
        cdef PyObject* out_data
        cdef PyObject* out_indptr
        cdef PyObject* out_indices

        check_status(SparseCSCMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))

        data = PyObject_to_object(out_data)
        indptr = PyObject_to_object(out_indptr)
        indices = PyObject_to_object(out_indices)
        # `data` is indexed as a 2-D array; take its first column as the
        # flat value vector.
        result = csc_matrix((data[:, 0], indices, indptr), shape=self.shape)
        return result

    def to_tensor(self):
        """
        Convert arrow::SparseCSCMatrix to arrow::Tensor

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor
        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSCMatrix other not None):
        """
        Return true if sparse tensors contain exactly equal data

        Parameters
        ----------
        other : SparseCSCMatrix
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSCMatrix.
        """
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if isinstance(other, SparseCSCMatrix):
            return self.equals(other)
        else:
            return NotImplemented

    @property
    def is_mutable(self):
        """
        Is this sparse matrix mutable or immutable.
        """
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """
        The number of dimensions of this sparse matrix.
        """
        return self.stp.ndim()

    @property
    def shape(self):
        """
        The shape of this sparse matrix, as a tuple.
        """
        # Cython knows how to convert a vector[T] to a Python list
        return tuple(self.stp.shape())

    @property
    def size(self):
        """
        The size of this sparse matrix.
        """
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this sparse matrix's dimensions, as a tuple of str.
        """
        names_tuple = tuple(self.stp.dim_names())
        return tuple(frombytes(x) for x in names_tuple)

    @property
    def non_zero_length(self):
        """
        The ``non_zero_length()`` of the underlying sparse matrix
        (presumably the number of stored non-zero values).
        """
        return self.stp.non_zero_length()


cdef class SparseCSFTensor(_Weakrefable):
    """
    A sparse CSF tensor.

    CSF is a generalization of compressed sparse row (CSR) index.

    CSF index recursively compresses each dimension of a tensor into a set
    of prefix trees. Each path from a root to leaf forms one tensor
    non-zero index. CSF is implemented with two arrays of buffers and one
    array of integers.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSFTensor's constructor directly, "
                        "use one of the `pyarrow.SparseCSFTensor.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor):
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return """
type: {0.type}
shape: {0.shape}""".format(self)

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSFTensor

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSFTensor
        """
        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))

    @staticmethod
    def from_numpy(data, indptr, indices, shape, axis_order=None,
                   dim_names=None):
        """
        Create arrow::SparseCSFTensor from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse tensor.
        indptr : list or tuple of 1-dimensional arrays
            The sparsity structure, ``len(shape) - 1`` arrays in total.
            Each two consecutive dimensions in a tensor correspond to a
            buffer in indptr.
            A pair of consecutive values at `indptr[dim][i]`
            `indptr[dim][i + 1]` signify a range of nodes in
            `indices[dim + 1]` who are children of `indices[dim][i]` node.
        indices : list or tuple of 1-dimensional arrays
            Stores values of nodes, ``len(shape)`` arrays in total.
            Each tensor dimension corresponds to a buffer in indices.
        shape : tuple
            Shape of the tensor.
        axis_order : list, optional
            the sequence in which dimensions were traversed to
            produce the prefix tree. Defaults to ``np.argsort(shape)``
            when omitted or empty.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSFTensor

        Raises
        ------
        TypeError
            If `indptr` or `indices` is not a list or tuple.
        ValueError
            If `indptr` / `indices` have the wrong number of arrays, or any
            of those arrays is not 1-dimensional.
        """
        cdef shared_ptr[CSparseCSFTensor] csparse_tensor
        cdef vector[int64_t] c_axis_order
        cdef vector[int64_t] c_shape
        cdef vector[c_string] c_dim_names

        for x in shape:
            c_shape.push_back(x)
        # A multi-element ndarray has no truth value, so `not axis_order`
        # would raise for it; test a plain list instead.
        if isinstance(axis_order, np.ndarray):
            axis_order = axis_order.tolist()
        if not axis_order:
            axis_order = np.argsort(shape)
        for x in axis_order:
            c_axis_order.push_back(x)
        if dim_names is not None:
            for x in dim_names:
                c_dim_names.push_back(tobytes(x))

        # Enforce preconditions for SparseCSFTensor indices
        if not (isinstance(indptr, (list, tuple)) and
                isinstance(indices, (list, tuple))):
            raise TypeError("Expected list or tuple, got {}, {}"
                            .format(type(indptr), type(indices)))
        if len(indptr) != len(shape) - 1:
            # Report the count that is actually required (ndim - 1).
            raise ValueError("Expected list of {ndim} np.arrays for "
                             "SparseCSFTensor.indptr"
                             .format(ndim=len(shape) - 1))
        if len(indices) != len(shape):
            raise ValueError("Expected list of {ndim} np.arrays for "
                             "SparseCSFTensor.indices".format(ndim=len(shape)))

        # Convert before inspecting `.ndim` so that array-likes without an
        # `ndim` attribute (e.g. plain lists) are validated rather than
        # failing with AttributeError.
        indptr = [np.require(arr, dtype='i8') for arr in indptr]
        indices = [np.require(arr, dtype='i8') for arr in indices]
        if any([x.ndim != 1 for x in indptr]):
            raise ValueError("Expected a list of 1-dimensional arrays for "
                             "SparseCSFTensor.indptr")
        if any([x.ndim != 1 for x in indices]):
            raise ValueError("Expected a list of 1-dimensional arrays for "
                             "SparseCSFTensor.indices")

        check_status(NdarraysToSparseCSFTensor(c_default_memory_pool(), data,
                                               indptr, indices, c_shape,
                                               c_axis_order, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csf_tensor(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSFTensor

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSFTensor

        Raises
        ------
        TypeError
            If `obj` is not a pyarrow.Tensor.
        """
        cdef shared_ptr[CSparseCSFTensor] csparse_tensor
        cdef shared_ptr[CTensor] ctensor

        # Validate up front: the pointer is handed to C++ with the GIL
        # released, where a bad input can no longer be reported cleanly.
        if not isinstance(obj, Tensor):
            raise TypeError(
                "Expected pyarrow.Tensor, got {}".format(type(obj)))
        ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSFTensor(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csf_tensor(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy

        Returns
        -------
        tuple
            ``(data, indptr, indices)``.
        """
        cdef PyObject* out_data
        cdef PyObject* out_indptr
        cdef PyObject* out_indices

        check_status(SparseCSFTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_tensor(self):
        """
        Convert arrow::SparseCSFTensor to arrow::Tensor

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor
        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSFTensor other not None):
        """
        Return true if sparse tensors contain exactly equal data

        Parameters
        ----------
        other : SparseCSFTensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSFTensor.
        """
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if isinstance(other, SparseCSFTensor):
            return self.equals(other)
        else:
            return NotImplemented

    @property
    def is_mutable(self):
        """
        Is this sparse tensor mutable or immutable.
        """
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """
        The number of dimensions of this sparse tensor.
        """
        return self.stp.ndim()

    @property
    def shape(self):
        """
        The shape of this sparse tensor, as a tuple.
        """
        # Cython knows how to convert a vector[T] to a Python list
        return tuple(self.stp.shape())

    @property
    def size(self):
        """
        The size of this sparse tensor.
        """
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this sparse tensor's dimensions, as a tuple of str.
        """
        names_tuple = tuple(self.stp.dim_names())
        return tuple(frombytes(x) for x in names_tuple)

    @property
    def non_zero_length(self):
        """
        The ``non_zero_length()`` of the underlying sparse tensor
        (presumably the number of stored non-zero values).
        """
        return self.stp.non_zero_length()