From 29af899f7423b519572e1465d5474e5e8d7f9fe6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 21 Oct 2023 23:56:26 +0200
Subject: [PATCH] add sortrows and sortcols to unstack

---
 NEWS.md                          |  2 ++
 src/abstractdataframe/reshape.jl | 48 +++++++++++++++++---------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index f6d51b1f3..c2d91838e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -13,6 +13,8 @@
   columns only to a subset of the columns specified by the `cols`
   keyword argument
   ([#3386](https://github.com/JuliaData/DataFrames.jl/pull/3386))
+* Add `sortrows` and `sortcols` keyword arguments to `unstack`
+  ([#3395](https://github.com/JuliaData/DataFrames.jl/pull/3395))
 
 ## Bug fixes
 
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
index 2effb6f2f..26fca495f 100644
--- a/src/abstractdataframe/reshape.jl
+++ b/src/abstractdataframe/reshape.jl
@@ -215,18 +215,19 @@ end
 """
     unstack(df::AbstractDataFrame, rowkeys, colkey, value;
             renamecols::Function=identity, allowmissing::Bool=false,
-            combine=only, fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true,
+            sortrows=false, sortcols=false)
     unstack(df::AbstractDataFrame, colkey, value;
             renamecols::Function=identity, allowmissing::Bool=false,
-            combine=only, fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true,
+            sortrows=false, sortcols=false)
     unstack(df::AbstractDataFrame;
             renamecols::Function=identity, allowmissing::Bool=false,
-            combine=only, fill=missing, threads::Bool=true)
+            combine=only, fill=missing, threads::Bool=true,
+            sortrows=false, sortcols=false)
 
 Unstack data frame `df`, i.e. convert it from long to wide format.
 
-Row and column keys are ordered in the order of their first appearance.
-
 # Positional arguments
 - `df` : the AbstractDataFrame to be unstacked
 - `rowkeys` : the columns with a unique key for each row, if not given, find a
@@ -259,6 +260,14 @@ Row and column keys are ordered in the order of their first appearance.
   time). Whether or not tasks are actually spawned and their number are
   determined automatically. Set to `false` if `combine` requires serial
   execution or is not thread-safe.
+- `sortrows`: the order of rows in the output table; all values accepted by
+  `sort` keyword argument in `groupby` passed the `rowkeys` for grouping are supported;
+  `false` by default (rows are ordered following the first appereance order).
+- `sortcols`: the order of columns in the output table; all values accepted by
+  `sort` keyword argument in `groupby` passed `colkey` for grouping are supported;
+  `false` by default (columns are ordered following the first appereance order).
+  Note that the ordering is done on the source data (not on column final column names
+  that can be potentially changed by the function passed in the `renamecols` keyword argument).
 
 Metadata: table-level `:note`-style metadata and column-level `:note`-style
 metadata for row keys columns are preserved.
@@ -420,7 +429,8 @@ julia> unstack(df, :cols, :values, combine=sum)
 function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
                  values::ColumnIndex; renamecols::Function=identity,
                  allowmissing::Bool=false,  allowduplicates::Bool=false,
-                 combine=only, fill=missing, threads::Bool=true)
+                 combine=only, fill=missing, threads::Bool=true,
+                 sortrows=false, sortcols=false)
     if allowduplicates
         Base.depwarn("allowduplicates keyword argument is deprecated. " *
                      "Pass `combine=last` instead of `allowduplicates=true`.", :unstack)
@@ -472,8 +482,9 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
         noduplicates = false
     end
 
-    g_rowkey = groupby(df_op, rowkeys)
-    g_colkey = groupby(df_op, colkey)
+    # if sorting is set to false we use fast aggregation, as we later fix the order
+    g_rowkey = groupby(df_op, rowkeys, sort=sortrows)
+    g_colkey = groupby(df_op, colkey, sort=sortcols)
     valuecol = df_op[!, values_out]
     return _unstack(df_op, index(df_op)[rowkeys], index(df_op)[colkey], g_colkey,
                     valuecol, g_rowkey, renamecols, allowmissing, noduplicates, fill)
@@ -481,8 +492,8 @@ end
 
 function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex;
                  renamecols::Function=identity, allowmissing::Bool=false,
-                  allowduplicates::Bool=false, combine=only, fill=missing,
-                  threads::Bool=true)
+                 allowduplicates::Bool=false, combine=only, fill=missing,
+                 threads::Bool=true, sortrows=false, sortcols=false)
     if allowduplicates
         Base.depwarn("allowduplicates keyword argument is deprecated. " *
                      "Pass `combine=last` instead of allowduplicates=true.", :unstack)
@@ -492,20 +503,21 @@ function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex
     value_int = index(df)[values]
     return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
             renamecols=renamecols, allowmissing=allowmissing,
-            combine=combine,
-            fill=fill, threads=threads)
+            combine=combine, fill=fill, threads=threads,
+            sortrows=sortrows, sortcols=sortcols)
 end
 
 function unstack(df::AbstractDataFrame; renamecols::Function=identity,
                  allowmissing::Bool=false, allowduplicates::Bool=false,
-                 combine=only, fill=missing, threads::Bool=true)
+                 combine=only, fill=missing, threads::Bool=true,
+                 sortrows=false, sortcols=false)
     if allowduplicates
         Base.depwarn("allowduplicates keyword argument is deprecated. " *
                      "Pass `combine=last` instead of allowduplicates=true.", :unstack)
         combine = last
     end
     unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
-            combine=combine, fill=fill, threads=threads)
+            combine=combine, fill=fill, threads=threads, sortrows=sortrows, sortcols=sortcols)
 end
 
 # we take into account the fact that idx, starts and ends are computed lazily
@@ -590,10 +602,6 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
                     copycols=false)
 
     @assert length(col_group_row_idxs) == ncol(df2)
-    # avoid reordering when col_group_row_idxs was already ordered
-    if !issorted(col_group_row_idxs)
-        df2 = df2[!, sortperm(col_group_row_idxs)]
-    end
 
     if !isempty(intersect(_names(df1), _names(df2)))
         throw(ArgumentError("Non-unique column names produced. " *
@@ -604,10 +612,6 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
     res_df = hcat(df1, df2, copycols=false)
 
     @assert length(row_group_row_idxs) == nrow(res_df)
-    # avoid reordering when row_group_row_idxs was already ordered
-    if !issorted(row_group_row_idxs)
-        res_df = res_df[sortperm(row_group_row_idxs), :]
-    end
 
     # only table-level :note-style metadata needs to be copied
     # as column-level :note-style metadata is already correctly set