// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include "arrow/compute/function_options.h" #include "arrow/compute/ordering.h" #include "arrow/result.h" #include "arrow/type_fwd.h" namespace arrow { namespace compute { class ExecContext; /// \addtogroup compute-concrete-options /// @{ class ARROW_EXPORT FilterOptions : public FunctionOptions { public: /// Configure the action taken when a slot of the selection mask is null enum NullSelectionBehavior { /// The corresponding filtered value will be removed in the output. DROP, /// The corresponding filtered value will be null in the output. EMIT_NULL, }; explicit FilterOptions(NullSelectionBehavior null_selection = DROP); static constexpr char const kTypeName[] = "FilterOptions"; static FilterOptions Defaults() { return FilterOptions(); } NullSelectionBehavior null_selection_behavior = DROP; }; class ARROW_EXPORT TakeOptions : public FunctionOptions { public: explicit TakeOptions(bool boundscheck = true); static constexpr char const kTypeName[] = "TakeOptions"; static TakeOptions BoundsCheck() { return TakeOptions(true); } static TakeOptions NoBoundsCheck() { return TakeOptions(false); } static TakeOptions Defaults() { return BoundsCheck(); } bool boundscheck = true; }; /// \brief Options for the dictionary encode function class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions { public: /// Configure how null values will be encoded enum NullEncodingBehavior { /// The null value will be added to the dictionary with a proper index. ENCODE, /// The null value will be masked in the indices array. MASK }; explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK); static constexpr char const kTypeName[] = "DictionaryEncodeOptions"; static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } NullEncodingBehavior null_encoding_behavior = MASK; }; /// \brief Options for the run-end encode function class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions { public: explicit RunEndEncodeOptions(std::shared_ptr run_end_type = int32()); static constexpr char const kTypeName[] = "RunEndEncodeOptions"; static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); } std::shared_ptr run_end_type; }; class ARROW_EXPORT ArraySortOptions : public FunctionOptions { public: explicit ArraySortOptions(SortOrder order = SortOrder::Ascending, NullPlacement null_placement = NullPlacement::AtEnd); static constexpr char const kTypeName[] = "ArraySortOptions"; static ArraySortOptions Defaults() { return ArraySortOptions(); } /// Sorting order SortOrder order; /// Whether nulls and NaNs are placed at the start or at the end NullPlacement null_placement; }; class ARROW_EXPORT SortOptions : public FunctionOptions { public: explicit SortOptions(std::vector sort_keys = {}, NullPlacement null_placement = NullPlacement::AtEnd); explicit SortOptions(const Ordering& ordering); static constexpr char const kTypeName[] = "SortOptions"; static SortOptions Defaults() { return SortOptions(); } /// Convenience constructor to create an ordering from SortOptions /// /// Note: Both classes contain the exact same information. However, /// sort_options should only be used in a "function options" context while Ordering /// is used more generally. Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); } Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); } /// Column key(s) to order by and how to order by these sort keys. std::vector sort_keys; /// Whether nulls and NaNs are placed at the start or at the end NullPlacement null_placement; }; /// \brief SelectK options class ARROW_EXPORT SelectKOptions : public FunctionOptions { public: explicit SelectKOptions(int64_t k = -1, std::vector sort_keys = {}); static constexpr char const kTypeName[] = "SelectKOptions"; static SelectKOptions Defaults() { return SelectKOptions(); } static SelectKOptions TopKDefault(int64_t k, std::vector key_names = {}) { std::vector keys; for (const auto& name : key_names) { keys.emplace_back(SortKey(name, SortOrder::Descending)); } if (key_names.empty()) { keys.emplace_back(SortKey("not-used", SortOrder::Descending)); } return SelectKOptions{k, keys}; } static SelectKOptions BottomKDefault(int64_t k, std::vector key_names = {}) { std::vector keys; for (const auto& name : key_names) { keys.emplace_back(SortKey(name, SortOrder::Ascending)); } if (key_names.empty()) { keys.emplace_back(SortKey("not-used", SortOrder::Ascending)); } return SelectKOptions{k, keys}; } /// The number of `k` elements to keep. int64_t k; /// Column key(s) to order by and how to order by these sort keys. std::vector sort_keys; }; /// \brief Rank options class ARROW_EXPORT RankOptions : public FunctionOptions { public: /// Configure how ties between equal values are handled enum Tiebreaker { /// Ties get the smallest possible rank in sorted order. Min, /// Ties get the largest possible rank in sorted order. Max, /// Ranks are assigned in order of when ties appear in the input. /// This ensures the ranks are a stable permutation of the input. First, /// The ranks span a dense [1, M] interval where M is the number /// of distinct values in the input. Dense }; explicit RankOptions(std::vector sort_keys = {}, NullPlacement null_placement = NullPlacement::AtEnd, Tiebreaker tiebreaker = RankOptions::First); /// Convenience constructor for array inputs explicit RankOptions(SortOrder order, NullPlacement null_placement = NullPlacement::AtEnd, Tiebreaker tiebreaker = RankOptions::First) : RankOptions({SortKey("", order)}, null_placement, tiebreaker) {} static constexpr char const kTypeName[] = "RankOptions"; static RankOptions Defaults() { return RankOptions(); } /// Column key(s) to order by and how to order by these sort keys. std::vector sort_keys; /// Whether nulls and NaNs are placed at the start or at the end NullPlacement null_placement; /// Tiebreaker for dealing with equal values in ranks Tiebreaker tiebreaker; }; /// \brief Partitioning options for NthToIndices class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { public: explicit PartitionNthOptions(int64_t pivot, NullPlacement null_placement = NullPlacement::AtEnd); PartitionNthOptions() : PartitionNthOptions(0) {} static constexpr char const kTypeName[] = "PartitionNthOptions"; /// The index into the equivalent sorted array of the partition pivot element. int64_t pivot; /// Whether nulls and NaNs are partitioned at the start or at the end NullPlacement null_placement; }; /// \brief Options for cumulative functions /// \note Also aliased as CumulativeSumOptions for backward compatibility class ARROW_EXPORT CumulativeOptions : public FunctionOptions { public: explicit CumulativeOptions(bool skip_nulls = false); explicit CumulativeOptions(double start, bool skip_nulls = false); explicit CumulativeOptions(std::shared_ptr start, bool skip_nulls = false); static constexpr char const kTypeName[] = "CumulativeOptions"; static CumulativeOptions Defaults() { return CumulativeOptions(); } /// Optional starting value for cumulative operation computation, default depends on the /// operation and input type. /// - sum: 0 /// - prod: 1 /// - min: maximum of the input type /// - max: minimum of the input type /// - mean: start is ignored because it has no meaning for mean std::optional> start; /// If true, nulls in the input are ignored and produce a corresponding null output. /// When false, the first null encountered is propagated through the remaining output. bool skip_nulls = false; }; using CumulativeSumOptions = CumulativeOptions; // For backward compatibility /// \brief Options for pairwise functions class ARROW_EXPORT PairwiseOptions : public FunctionOptions { public: explicit PairwiseOptions(int64_t periods = 1); static constexpr char const kTypeName[] = "PairwiseOptions"; static PairwiseOptions Defaults() { return PairwiseOptions(); } /// Periods to shift for applying the binary operation, accepts negative values. int64_t periods = 1; }; /// @} /// \brief Filter with a boolean selection filter /// /// The output will be populated with values from the input at positions /// where the selection filter is not 0. Nulls in the filter will be handled /// based on options.null_selection_behavior. /// /// For example given values = ["a", "b", "c", null, "e", "f"] and /// filter = [0, 1, 1, 0, null, 1], the output will be /// (null_selection_behavior == DROP) = ["b", "c", "f"] /// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"] /// /// \param[in] values array to filter /// \param[in] filter indicates which values should be filtered out /// \param[in] options configures null_selection_behavior /// \param[in] ctx the function execution context, optional /// \return the resulting datum ARROW_EXPORT Result Filter(const Datum& values, const Datum& filter, const FilterOptions& options = FilterOptions::Defaults(), ExecContext* ctx = NULLPTR); namespace internal { // These internal functions are implemented in kernels/vector_selection.cc /// \brief Return the number of selected indices in the boolean filter /// /// \param filter a plain or run-end encoded boolean array with or without nulls /// \param null_selection how to handle nulls in the filter ARROW_EXPORT int64_t GetFilterOutputSize(const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection); /// \brief Compute uint64 selection indices for use with Take given a boolean /// filter /// /// \param filter a plain or run-end encoded boolean array with or without nulls /// \param null_selection how to handle nulls in the filter ARROW_EXPORT Result> GetTakeIndices( const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, MemoryPool* memory_pool = default_memory_pool()); } // namespace internal /// \brief ReplaceWithMask replaces each value in the array corresponding /// to a true value in the mask with the next element from `replacements`. /// /// \param[in] values Array input to replace /// \param[in] mask Array or Scalar of Boolean mask values /// \param[in] replacements The replacement values to draw from. There must /// be as many replacement values as true values in the mask. /// \param[in] ctx the function execution context, optional /// /// \return the resulting datum /// /// \since 5.0.0 /// \note API not yet finalized ARROW_EXPORT Result ReplaceWithMask(const Datum& values, const Datum& mask, const Datum& replacements, ExecContext* ctx = NULLPTR); /// \brief FillNullForward fill null values in forward direction /// /// The output array will be of the same type as the input values /// array, with replaced null values in forward direction. /// /// For example given values = ["a", "b", "c", null, null, "f"], /// the output will be = ["a", "b", "c", "c", "c", "f"] /// /// \param[in] values datum from which to take /// \param[in] ctx the function execution context, optional /// \return the resulting datum ARROW_EXPORT Result FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR); /// \brief FillNullBackward fill null values in backward direction /// /// The output array will be of the same type as the input values /// array, with replaced null values in backward direction. /// /// For example given values = ["a", "b", "c", null, null, "f"], /// the output will be = ["a", "b", "c", "f", "f", "f"] /// /// \param[in] values datum from which to take /// \param[in] ctx the function execution context, optional /// \return the resulting datum ARROW_EXPORT Result FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR); /// \brief Take from an array of values at indices in another array /// /// The output array will be of the same type as the input values /// array, with elements taken from the values array at the given /// indices. If an index is null then the taken element will be null. /// /// For example given values = ["a", "b", "c", null, "e", "f"] and /// indices = [2, 1, null, 3], the output will be /// = [values[2], values[1], null, values[3]] /// = ["c", "b", null, null] /// /// \param[in] values datum from which to take /// \param[in] indices which values to take /// \param[in] options options /// \param[in] ctx the function execution context, optional /// \return the resulting datum ARROW_EXPORT Result Take(const Datum& values, const Datum& indices, const TakeOptions& options = TakeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Take with Array inputs and output ARROW_EXPORT Result> Take(const Array& values, const Array& indices, const TakeOptions& options = TakeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Drop Null from an array of values /// /// The output array will be of the same type as the input values /// array, with elements taken from the values array without nulls. /// /// For example given values = ["a", "b", "c", null, "e", "f"], /// the output will be = ["a", "b", "c", "e", "f"] /// /// \param[in] values datum from which to take /// \param[in] ctx the function execution context, optional /// \return the resulting datum ARROW_EXPORT Result DropNull(const Datum& values, ExecContext* ctx = NULLPTR); /// \brief DropNull with Array inputs and output ARROW_EXPORT Result> DropNull(const Array& values, ExecContext* ctx = NULLPTR); /// \brief Return indices that partition an array around n-th sorted element. /// /// Find index of n-th(0 based) smallest value and perform indirect /// partition of an array around that element. Output indices[0 ~ n-1] /// holds values no greater than n-th element, and indices[n+1 ~ end] /// holds values no less than n-th element. Elements in each partition /// is not sorted. Nulls will be partitioned to the end of the output. /// Output is not guaranteed to be stable. /// /// \param[in] values array to be partitioned /// \param[in] n pivot array around sorted n-th element /// \param[in] ctx the function execution context, optional /// \return offsets indices that would partition an array ARROW_EXPORT Result> NthToIndices(const Array& values, int64_t n, ExecContext* ctx = NULLPTR); /// \brief Return indices that partition an array around n-th sorted element. /// /// This overload takes a PartitionNthOptions specifying the pivot index /// and the null handling. /// /// \param[in] values array to be partitioned /// \param[in] options options including pivot index and null handling /// \param[in] ctx the function execution context, optional /// \return offsets indices that would partition an array ARROW_EXPORT Result> NthToIndices(const Array& values, const PartitionNthOptions& options, ExecContext* ctx = NULLPTR); /// \brief Return indices that would select the first `k` elements. /// /// Perform an indirect sort of the datum, keeping only the first `k` elements. The output /// array will contain indices such that the item indicated by the k-th index will be in /// the position it would be if the datum were sorted by `options.sort_keys`. However, /// indices of null values will not be part of the output. The sort is not guaranteed to /// be stable. /// /// \param[in] datum datum to be partitioned /// \param[in] options options /// \param[in] ctx the function execution context, optional /// \return a datum with the same schema as the input ARROW_EXPORT Result> SelectKUnstable(const Datum& datum, const SelectKOptions& options, ExecContext* ctx = NULLPTR); /// \brief Return the indices that would sort an array. /// /// Perform an indirect sort of array. The output array will contain /// indices that would sort an array, which would be the same length /// as input. Nulls will be stably partitioned to the end of the output /// regardless of order. /// /// For example given array = [null, 1, 3.3, null, 2, 5.3] and order /// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0, /// 3]. /// /// \param[in] array array to sort /// \param[in] order ascending or descending /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort an array ARROW_EXPORT Result> SortIndices(const Array& array, SortOrder order = SortOrder::Ascending, ExecContext* ctx = NULLPTR); /// \brief Return the indices that would sort an array. /// /// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] array array to sort /// \param[in] options options including sort order and null handling /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort an array ARROW_EXPORT Result> SortIndices(const Array& array, const ArraySortOptions& options, ExecContext* ctx = NULLPTR); /// \brief Return the indices that would sort a chunked array. /// /// Perform an indirect sort of chunked array. The output array will /// contain indices that would sort a chunked array, which would be /// the same length as input. Nulls will be stably partitioned to the /// end of the output regardless of order. /// /// For example given chunked_array = [[null, 1], [3.3], [null, 2, /// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2, /// 4, 1, 0, 3]. /// /// \param[in] chunked_array chunked array to sort /// \param[in] order ascending or descending /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort an array ARROW_EXPORT Result> SortIndices(const ChunkedArray& chunked_array, SortOrder order = SortOrder::Ascending, ExecContext* ctx = NULLPTR); /// \brief Return the indices that would sort a chunked array. /// /// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] chunked_array chunked array to sort /// \param[in] options options including sort order and null handling /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort an array ARROW_EXPORT Result> SortIndices(const ChunkedArray& chunked_array, const ArraySortOptions& options, ExecContext* ctx = NULLPTR); /// \brief Return the indices that would sort an input in the /// specified order. Input is one of array, chunked array record batch /// or table. /// /// Perform an indirect sort of input. The output array will contain /// indices that would sort an input, which would be the same length /// as input. Nulls will be stably partitioned to the start or to the end /// of the output depending on SortOrder::null_placement. /// /// For example given input (table) = { /// "column1": [[null, 1], [ 3, null, 2, 1]], /// "column2": [[ 5], [3, null, null, 5, 5]], /// } and options = { /// {"column1", SortOrder::Ascending}, /// {"column2", SortOrder::Descending}, /// }, the output will be [5, 1, 4, 2, 0, 3]. /// /// \param[in] datum array, chunked array, record batch or table to sort /// \param[in] options options /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort a table ARROW_EXPORT Result> SortIndices(const Datum& datum, const SortOptions& options, ExecContext* ctx = NULLPTR); /// \brief Compute unique elements from an array-like object /// /// Note if a null occurs in the input it will NOT be included in the output. /// /// \param[in] datum array-like input /// \param[in] ctx the function execution context, optional /// \return result as Array /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT Result> Unique(const Datum& datum, ExecContext* ctx = NULLPTR); // Constants for accessing the output of ValueCounts ARROW_EXPORT extern const char kValuesFieldName[]; ARROW_EXPORT extern const char kCountsFieldName[]; ARROW_EXPORT extern const int32_t kValuesFieldIndex; ARROW_EXPORT extern const int32_t kCountsFieldIndex; /// \brief Return counts of unique elements from an array-like object. /// /// Note that the counts do not include counts for nulls in the array. These can be /// obtained separately from metadata. /// /// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values /// which can lead to unexpected results if the input Array has these values. /// /// \param[in] value array-like input /// \param[in] ctx the function execution context, optional /// \return counts An array of structs. /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT Result> ValueCounts(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Dictionary-encode values in an array-like object /// /// Any nulls encountered in the dictionary will be handled according to the /// specified null encoding behavior. /// /// For example, given values ["a", "b", null, "a", null] the output will be /// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null] /// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"] /// /// If the input is already dictionary encoded this function is a no-op unless /// it needs to modify the null_encoding (TODO) /// /// \param[in] data array-like input /// \param[in] ctx the function execution context, optional /// \param[in] options configures null encoding behavior /// \return result with same shape and type as input /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT Result DictionaryEncode( const Datum& data, const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Run-end-encode values in an array-like object /// /// The returned run-end encoded type uses the same value type of the input and /// run-end type defined in the options. /// /// \param[in] value array-like input /// \param[in] options configures encoding behavior /// \param[in] ctx the function execution context, optional /// \return result with same shape but run-end encoded /// /// \since 12.0.0 /// \note API not yet finalized ARROW_EXPORT Result RunEndEncode( const Datum& value, const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Decode a Run-End Encoded array to a plain array /// /// The output data type is the same as the values array type of run-end encoded /// input. /// /// \param[in] value run-end-encoded input /// \param[in] ctx the function execution context, optional /// \return plain array resulting from decoding the run-end encoded input /// /// \since 12.0.0 /// \note API not yet finalized ARROW_EXPORT Result RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Compute the cumulative sum of an array-like object /// /// \param[in] values array-like input /// \param[in] options configures cumulative sum behavior /// \param[in] check_overflow whether to check for overflow, if true, return Invalid /// status on overflow, otherwise wrap around on overflow /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeSum( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), bool check_overflow = false, ExecContext* ctx = NULLPTR); /// \brief Compute the cumulative product of an array-like object /// /// \param[in] values array-like input /// \param[in] options configures cumulative prod behavior /// \param[in] check_overflow whether to check for overflow, if true, return Invalid /// status on overflow, otherwise wrap around on overflow /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeProd( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), bool check_overflow = false, ExecContext* ctx = NULLPTR); /// \brief Compute the cumulative max of an array-like object /// /// \param[in] values array-like input /// \param[in] options configures cumulative max behavior /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeMax( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Compute the cumulative min of an array-like object /// /// \param[in] values array-like input /// \param[in] options configures cumulative min behavior /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeMin( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Compute the cumulative mean of an array-like object /// /// \param[in] values array-like input /// \param[in] options configures cumulative mean behavior, `start` is ignored /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeMean( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), ExecContext* ctx = NULLPTR); /// \brief Return the first order difference of an array. /// /// Computes the first order difference of an array, i.e. /// output[i] = input[i] - input[i - p] if i >= p /// output[i] = null otherwise /// where p is the period. For example, with p = 1, /// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. /// With p = 2, /// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6] /// p can also be negative, in which case the diff is computed in /// the opposite direction. /// \param[in] array array input /// \param[in] options options, specifying overflow behavior and period /// \param[in] check_overflow whether to return error on overflow /// \param[in] ctx the function execution context, optional /// \return result as array ARROW_EXPORT Result> PairwiseDiff(const Array& array, const PairwiseOptions& options, bool check_overflow = false, ExecContext* ctx = NULLPTR); } // namespace compute } // namespace arrow