From 7ba84649914b436c24e60fd87f23bb632dbb8977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 05:36:31 +0200 Subject: [PATCH 1/9] Add copy from dplyr's generics.R --- R/reconstruct.R | 255 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 R/reconstruct.R diff --git a/R/reconstruct.R b/R/reconstruct.R new file mode 100644 index 000000000..fd2f58820 --- /dev/null +++ b/R/reconstruct.R @@ -0,0 +1,255 @@ +#' Extending dplyr with new data frame subclasses +#' +#' @description +#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")} +#' +#' These three functions, along with `names<-` and 1d numeric `[` +#' (i.e. `x[loc]`) methods, provide a minimal interface for extending dplyr +#' to work with new data frame subclasses. This means that for simple cases +#' you should only need to provide a couple of methods, rather than a method +#' for every dplyr verb. +#' +#' These functions are a stop-gap measure until we figure out how to solve +#' the problem more generally, but it's likely that any code you write to +#' implement them will find a home in what comes next. +#' +#' # Basic advice +#' +#' This section gives you basic advice if you want to extend dplyr to work with +#' your custom data frame subclass, and you want the dplyr methods to behave +#' in basically the same way. +#' +#' * If you have data frame attributes that don't depend on the rows or columns +#' (and should unconditionally be preserved), you don't need to do anything. +#' +#' * If you have __scalar__ attributes that depend on __rows__, implement a +#' `dplyr_reconstruct()` method. Your method should recompute the attribute +#' depending on rows now present. +#' +#' * If you have __scalar__ attributes that depend on __columns__, implement a +#' `dplyr_reconstruct()` method and a 1d `[` method. For example, if your +#' class requires that certain columns be present, your method should return +#' a data.frame or tibble when those columns are removed. +#' +#' * If your attributes are __vectorised__ over __rows__, implement a +#' `dplyr_row_slice()` method. This gives you access to `i` so you can +#' modify the row attribute accordingly. You'll also need to think carefully +#' about how to recompute the attribute in `dplyr_reconstruct()`, and +#' you will need to carefully verify the behaviour of each verb, and provide +#' additional methods as needed. +#' +#' * If your attributes that are __vectorised__ over __columns__, implement +#' `dplyr_col_modify()`, 1d `[`, and `names<-` methods. All of these methods +#' know which columns are being modified, so you can update the column +#' attribute according. You'll also need to think carefully about how to +#' recompute the attribute in `dplyr_reconstruct()`, and you will need to +#' carefully verify the behaviour of each verb, and provide additional +#' methods as needed. +#' +#' # Current usage +#' +#' * `arrange()`, `filter()`, `slice()`, `semi_join()`, and `anti_join()` +#' work by generating a vector of row indices, and then subsetting +#' with `dplyr_row_slice()`. +#' +#' * `mutate()` generates a list of new column value (using `NULL` to indicate +#' when columns should be deleted), then passes that to `dplyr_col_modify()`. +#' `transmute()` does the same then uses 1d `[` to select the columns. +#' +#' * `summarise()` works similarly to `mutate()` but the data modified by +#' `dplyr_col_modify()` comes from `group_data()`. +#' +#' * `select()` uses 1d `[` to select columns, then `names<-` to rename them. +#' `rename()` just uses `names<-`. `relocate()` just uses 1d `[`. +#' +#' * `inner_join()`, `left_join()`, `right_join()`, and `full_join()` +#' coerces `x` to a tibble, modify the rows, then uses `dplyr_reconstruct()` +#' to convert back to the same type as `x`. +#' +#' * `nest_join()` uses `dplyr_col_modify()` to cast the key variables to +#' common type and add the nested-df that `y` becomes. +#' +#' * `distinct()` does a `mutate()` if any expressions are present, then +#' uses 1d `[` to select variables to keep, then `dplyr_row_slice()` to +#' select distinct rows. +#' +#' Note that `group_by()` and `ungroup()` don't use any these generics and +#' you'll need to provide methods directly. +#' +#' @keywords internal +#' @param data A tibble. We use tibbles because they avoid some inconsistent +#' subset-assignment use cases +#' @name dplyr_extending +NULL + +#' @export +#' @rdname dplyr_extending +#' @param i A numeric or logical vector that indexes the rows of `.data`. +dplyr_row_slice <- function(data, i, ...) { + if (!is.numeric(i) && !is.logical(i)) { + abort("`i` must be an numeric or logical vector.") + } + + UseMethod("dplyr_row_slice") +} + +#' @export +dplyr_row_slice.data.frame <- function(data, i, ...) { + dplyr_reconstruct(vec_slice(data, i), data) +} + +#' @export +dplyr_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { + out <- vec_slice(as.data.frame(data), i) + + # Index into group_indices, then use that to restore the grouping structure + groups <- group_data(data) + new_id <- vec_slice(group_indices(data), i) + new_grps <- vec_group_loc(new_id) + + rows <- rep(list_of(integer()), length.out = nrow(groups)) + rows[new_grps$key] <- new_grps$loc + groups$.rows <- rows + if (!preserve && isTRUE(attr(groups, ".drop"))) { + groups <- group_data_trim(groups) + } + + new_grouped_df(out, groups) +} + +#' @export +dplyr_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) { + out <- vec_slice(data, i) + group_data <- vec_slice(group_keys(data), i) + new_rowwise_df(out, group_data) +} + +#' @export +#' @rdname dplyr_extending +#' @param cols A named list used modify columns. A `NULL` value should remove +#' an existing column. +dplyr_col_modify <- function(data, cols) { + UseMethod("dplyr_col_modify") +} + +#' @export +dplyr_col_modify.data.frame <- function(data, cols) { + # Must be implemented from first principles to avoiding edge cases in + # [.data.frame and [.tibble (2.1.3 and earlier). + + # Apply tidyverse recycling rules + cols <- vec_recycle_common(!!!cols, .size = nrow(data)) + + # Transform to list to avoid stripping inner names with `[[<-` + out <- as.list(dplyr_vec_data(data)) + + nms <- as_utf8_character(names2(cols)) + names(out) <- as_utf8_character(names2(out)) + + for (i in seq_along(cols)) { + nm <- nms[[i]] + out[[nm]] <- cols[[i]] + } + + # Transform back to data frame before reconstruction + row_names <- .row_names_info(data, type = 0L) + out <- new_data_frame(out, n = nrow(data), row.names = row_names) + + dplyr_reconstruct(out, data) +} + +#' @export +dplyr_col_modify.grouped_df <- function(data, cols) { + out <- dplyr_col_modify(as_tibble(data), cols) + + if (any(names(cols) %in% group_vars(data))) { + # regroup + grouped_df(out, group_vars(data), drop = group_by_drop_default(data)) + } else { + new_grouped_df(out, group_data(data)) + } +} + +#' @export +dplyr_col_modify.rowwise_df <- function(data, cols) { + out <- dplyr_col_modify(as_tibble(data), cols) + rowwise_df(out, group_vars(data)) +} + +#' @param template Template to use for restoring attributes +#' @export +#' @rdname dplyr_extending +dplyr_reconstruct <- function(data, template) { + # Strip attributes before dispatch to make it easier to implement + # methods and prevent unexpected leaking of irrelevant attributes. + data <- dplyr_new_data_frame(data) + return(dplyr_reconstruct_dispatch(data, template)) + UseMethod("dplyr_reconstruct", template) +} +dplyr_reconstruct_dispatch <- function(data, template) { + UseMethod("dplyr_reconstruct", template) +} + +#' @export +dplyr_reconstruct.data.frame <- function(data, template) { + attrs <- attributes(template) + attrs$names <- names(data) + attrs$row.names <- .row_names_info(data, type = 0L) + + attributes(data) <- attrs + data +} + +#' @export +dplyr_reconstruct.grouped_df <- function(data, template) { + group_vars <- group_intersect(template, data) + grouped_df(data, group_vars, drop = group_by_drop_default(template)) +} + +#' @export +dplyr_reconstruct.rowwise_df <- function(data, template) { + group_vars <- group_intersect(template, data) + rowwise_df(data, group_vars) +} + +dplyr_col_select <- function(.data, loc, names = NULL) { + loc <- vec_as_location(loc, n = ncol(.data), names = names(.data)) + + out <- .data[loc] + if (!inherits(out, "data.frame")) { + abort(c( + "Can't reconstruct data frame.", + x = glue("The `[` method for class <{classes_data}> must return a data frame.", + classes_data = glue_collapse(class(.data), sep = "/") + ), + i = glue("It returned a <{classes_out}>.", + classes_out = glue_collapse(class(out), sep = "/") + ) + )) + } + if (length(out) != length(loc)) { + abort(c( + "Can't reconstruct data frame.", + x = glue("The `[` method for class <{classes_data}> must return a data frame with {length(loc)} column{s}.", + classes_data = glue_collapse(class(.data), sep = "/"), + s = if(length(loc) == 1) "" else "s" + ), + i = glue("It returned a <{classes_out}> of {length(out)} column{s}.", + classes_out = glue_collapse(class(out), sep = "/"), + s = if(length(out) == 1) "" else "s" + ) + )) + } + + # Patch base data frames to restore extra attributes that `[.data.frame` drops. + # We require `[` methods to keep extra attributes for all data frame subclasses. + if (identical(class(.data), "data.frame")) { + out <- dplyr_reconstruct(out, .data) + } + + if (!is.null(names)) { + names(out) <- names + } + + out +} From 85b27996deae40451de5a023fd921930b3f32912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 05:37:11 +0200 Subject: [PATCH 2/9] dplyr -> tibble --- R/reconstruct.R | 94 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/R/reconstruct.R b/R/reconstruct.R index fd2f58820..b326a122b 100644 --- a/R/reconstruct.R +++ b/R/reconstruct.R @@ -1,13 +1,13 @@ -#' Extending dplyr with new data frame subclasses +#' Extending tibble with new data frame subclasses #' #' @description #' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")} #' #' These three functions, along with `names<-` and 1d numeric `[` -#' (i.e. `x[loc]`) methods, provide a minimal interface for extending dplyr +#' (i.e. `x[loc]`) methods, provide a minimal interface for extending tibble #' to work with new data frame subclasses. This means that for simple cases #' you should only need to provide a couple of methods, rather than a method -#' for every dplyr verb. +#' for every tibble verb. #' #' These functions are a stop-gap measure until we figure out how to solve #' the problem more generally, but it's likely that any code you write to @@ -15,34 +15,34 @@ #' #' # Basic advice #' -#' This section gives you basic advice if you want to extend dplyr to work with -#' your custom data frame subclass, and you want the dplyr methods to behave +#' This section gives you basic advice if you want to extend tibble to work with +#' your custom data frame subclass, and you want the tibble methods to behave #' in basically the same way. #' #' * If you have data frame attributes that don't depend on the rows or columns #' (and should unconditionally be preserved), you don't need to do anything. #' #' * If you have __scalar__ attributes that depend on __rows__, implement a -#' `dplyr_reconstruct()` method. Your method should recompute the attribute +#' `tibble_reconstruct()` method. Your method should recompute the attribute #' depending on rows now present. #' #' * If you have __scalar__ attributes that depend on __columns__, implement a -#' `dplyr_reconstruct()` method and a 1d `[` method. For example, if your +#' `tibble_reconstruct()` method and a 1d `[` method. For example, if your #' class requires that certain columns be present, your method should return #' a data.frame or tibble when those columns are removed. #' #' * If your attributes are __vectorised__ over __rows__, implement a -#' `dplyr_row_slice()` method. This gives you access to `i` so you can +#' `tibble_row_slice()` method. This gives you access to `i` so you can #' modify the row attribute accordingly. You'll also need to think carefully -#' about how to recompute the attribute in `dplyr_reconstruct()`, and +#' about how to recompute the attribute in `tibble_reconstruct()`, and #' you will need to carefully verify the behaviour of each verb, and provide #' additional methods as needed. #' #' * If your attributes that are __vectorised__ over __columns__, implement -#' `dplyr_col_modify()`, 1d `[`, and `names<-` methods. All of these methods +#' `tibble_col_modify()`, 1d `[`, and `names<-` methods. All of these methods #' know which columns are being modified, so you can update the column #' attribute according. You'll also need to think carefully about how to -#' recompute the attribute in `dplyr_reconstruct()`, and you will need to +#' recompute the attribute in `tibble_reconstruct()`, and you will need to #' carefully verify the behaviour of each verb, and provide additional #' methods as needed. #' @@ -50,27 +50,27 @@ #' #' * `arrange()`, `filter()`, `slice()`, `semi_join()`, and `anti_join()` #' work by generating a vector of row indices, and then subsetting -#' with `dplyr_row_slice()`. +#' with `tibble_row_slice()`. #' #' * `mutate()` generates a list of new column value (using `NULL` to indicate -#' when columns should be deleted), then passes that to `dplyr_col_modify()`. +#' when columns should be deleted), then passes that to `tibble_col_modify()`. #' `transmute()` does the same then uses 1d `[` to select the columns. #' #' * `summarise()` works similarly to `mutate()` but the data modified by -#' `dplyr_col_modify()` comes from `group_data()`. +#' `tibble_col_modify()` comes from `group_data()`. #' #' * `select()` uses 1d `[` to select columns, then `names<-` to rename them. #' `rename()` just uses `names<-`. `relocate()` just uses 1d `[`. #' #' * `inner_join()`, `left_join()`, `right_join()`, and `full_join()` -#' coerces `x` to a tibble, modify the rows, then uses `dplyr_reconstruct()` +#' coerces `x` to a tibble, modify the rows, then uses `tibble_reconstruct()` #' to convert back to the same type as `x`. #' -#' * `nest_join()` uses `dplyr_col_modify()` to cast the key variables to +#' * `nest_join()` uses `tibble_col_modify()` to cast the key variables to #' common type and add the nested-df that `y` becomes. #' #' * `distinct()` does a `mutate()` if any expressions are present, then -#' uses 1d `[` to select variables to keep, then `dplyr_row_slice()` to +#' uses 1d `[` to select variables to keep, then `tibble_row_slice()` to #' select distinct rows. #' #' Note that `group_by()` and `ungroup()` don't use any these generics and @@ -79,27 +79,27 @@ #' @keywords internal #' @param data A tibble. We use tibbles because they avoid some inconsistent #' subset-assignment use cases -#' @name dplyr_extending +#' @name tibble_extending NULL #' @export -#' @rdname dplyr_extending +#' @rdname tibble_extending #' @param i A numeric or logical vector that indexes the rows of `.data`. -dplyr_row_slice <- function(data, i, ...) { +tibble_row_slice <- function(data, i, ...) { if (!is.numeric(i) && !is.logical(i)) { abort("`i` must be an numeric or logical vector.") } - UseMethod("dplyr_row_slice") + UseMethod("tibble_row_slice") } #' @export -dplyr_row_slice.data.frame <- function(data, i, ...) { - dplyr_reconstruct(vec_slice(data, i), data) +tibble_row_slice.data.frame <- function(data, i, ...) { + tibble_reconstruct(vec_slice(data, i), data) } #' @export -dplyr_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { +tibble_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { out <- vec_slice(as.data.frame(data), i) # Index into group_indices, then use that to restore the grouping structure @@ -118,22 +118,22 @@ dplyr_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { } #' @export -dplyr_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) { +tibble_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) { out <- vec_slice(data, i) group_data <- vec_slice(group_keys(data), i) new_rowwise_df(out, group_data) } #' @export -#' @rdname dplyr_extending +#' @rdname tibble_extending #' @param cols A named list used modify columns. A `NULL` value should remove #' an existing column. -dplyr_col_modify <- function(data, cols) { - UseMethod("dplyr_col_modify") +tibble_col_modify <- function(data, cols) { + UseMethod("tibble_col_modify") } #' @export -dplyr_col_modify.data.frame <- function(data, cols) { +tibble_col_modify.data.frame <- function(data, cols) { # Must be implemented from first principles to avoiding edge cases in # [.data.frame and [.tibble (2.1.3 and earlier). @@ -141,7 +141,7 @@ dplyr_col_modify.data.frame <- function(data, cols) { cols <- vec_recycle_common(!!!cols, .size = nrow(data)) # Transform to list to avoid stripping inner names with `[[<-` - out <- as.list(dplyr_vec_data(data)) + out <- as.list(tibble_vec_data(data)) nms <- as_utf8_character(names2(cols)) names(out) <- as_utf8_character(names2(out)) @@ -155,12 +155,12 @@ dplyr_col_modify.data.frame <- function(data, cols) { row_names <- .row_names_info(data, type = 0L) out <- new_data_frame(out, n = nrow(data), row.names = row_names) - dplyr_reconstruct(out, data) + tibble_reconstruct(out, data) } #' @export -dplyr_col_modify.grouped_df <- function(data, cols) { - out <- dplyr_col_modify(as_tibble(data), cols) +tibble_col_modify.grouped_df <- function(data, cols) { + out <- tibble_col_modify(as_tibble(data), cols) if (any(names(cols) %in% group_vars(data))) { # regroup @@ -171,27 +171,27 @@ dplyr_col_modify.grouped_df <- function(data, cols) { } #' @export -dplyr_col_modify.rowwise_df <- function(data, cols) { - out <- dplyr_col_modify(as_tibble(data), cols) +tibble_col_modify.rowwise_df <- function(data, cols) { + out <- tibble_col_modify(as_tibble(data), cols) rowwise_df(out, group_vars(data)) } #' @param template Template to use for restoring attributes #' @export -#' @rdname dplyr_extending -dplyr_reconstruct <- function(data, template) { +#' @rdname tibble_extending +tibble_reconstruct <- function(data, template) { # Strip attributes before dispatch to make it easier to implement # methods and prevent unexpected leaking of irrelevant attributes. - data <- dplyr_new_data_frame(data) - return(dplyr_reconstruct_dispatch(data, template)) - UseMethod("dplyr_reconstruct", template) + data <- tibble_new_data_frame(data) + return(tibble_reconstruct_dispatch(data, template)) + UseMethod("tibble_reconstruct", template) } -dplyr_reconstruct_dispatch <- function(data, template) { - UseMethod("dplyr_reconstruct", template) +tibble_reconstruct_dispatch <- function(data, template) { + UseMethod("tibble_reconstruct", template) } #' @export -dplyr_reconstruct.data.frame <- function(data, template) { +tibble_reconstruct.data.frame <- function(data, template) { attrs <- attributes(template) attrs$names <- names(data) attrs$row.names <- .row_names_info(data, type = 0L) @@ -201,18 +201,18 @@ dplyr_reconstruct.data.frame <- function(data, template) { } #' @export -dplyr_reconstruct.grouped_df <- function(data, template) { +tibble_reconstruct.grouped_df <- function(data, template) { group_vars <- group_intersect(template, data) grouped_df(data, group_vars, drop = group_by_drop_default(template)) } #' @export -dplyr_reconstruct.rowwise_df <- function(data, template) { +tibble_reconstruct.rowwise_df <- function(data, template) { group_vars <- group_intersect(template, data) rowwise_df(data, group_vars) } -dplyr_col_select <- function(.data, loc, names = NULL) { +tibble_col_select <- function(.data, loc, names = NULL) { loc <- vec_as_location(loc, n = ncol(.data), names = names(.data)) out <- .data[loc] @@ -244,7 +244,7 @@ dplyr_col_select <- function(.data, loc, names = NULL) { # Patch base data frames to restore extra attributes that `[.data.frame` drops. # We require `[` methods to keep extra attributes for all data frame subclasses. if (identical(class(.data), "data.frame")) { - out <- dplyr_reconstruct(out, .data) + out <- tibble_reconstruct(out, .data) } if (!is.null(names)) { From 1857d3523a989a61c6ae75e5c55ed6dae3591613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 05:45:23 +0200 Subject: [PATCH 3/9] Strip --- NAMESPACE | 2 + R/reconstruct.R | 236 ++++++--------------------------------------- R/tibble-package.R | 1 + 3 files changed, 32 insertions(+), 207 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 51aec1283..07144a8da 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -97,12 +97,14 @@ importFrom(vctrs,vec_as_names_legacy) importFrom(vctrs,vec_as_subscript2) importFrom(vctrs,vec_assign) importFrom(vctrs,vec_c) +importFrom(vctrs,vec_data) importFrom(vctrs,vec_is) importFrom(vctrs,vec_names) importFrom(vctrs,vec_names2) importFrom(vctrs,vec_ptype_abbr) importFrom(vctrs,vec_rbind) importFrom(vctrs,vec_recycle) +importFrom(vctrs,vec_recycle_common) importFrom(vctrs,vec_set_names) importFrom(vctrs,vec_size) importFrom(vctrs,vec_slice) diff --git a/R/reconstruct.R b/R/reconstruct.R index b326a122b..ca1116f7c 100644 --- a/R/reconstruct.R +++ b/R/reconstruct.R @@ -1,139 +1,15 @@ -#' Extending tibble with new data frame subclasses -#' -#' @description -#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")} -#' -#' These three functions, along with `names<-` and 1d numeric `[` -#' (i.e. `x[loc]`) methods, provide a minimal interface for extending tibble -#' to work with new data frame subclasses. This means that for simple cases -#' you should only need to provide a couple of methods, rather than a method -#' for every tibble verb. -#' -#' These functions are a stop-gap measure until we figure out how to solve -#' the problem more generally, but it's likely that any code you write to -#' implement them will find a home in what comes next. -#' -#' # Basic advice -#' -#' This section gives you basic advice if you want to extend tibble to work with -#' your custom data frame subclass, and you want the tibble methods to behave -#' in basically the same way. -#' -#' * If you have data frame attributes that don't depend on the rows or columns -#' (and should unconditionally be preserved), you don't need to do anything. -#' -#' * If you have __scalar__ attributes that depend on __rows__, implement a -#' `tibble_reconstruct()` method. Your method should recompute the attribute -#' depending on rows now present. -#' -#' * If you have __scalar__ attributes that depend on __columns__, implement a -#' `tibble_reconstruct()` method and a 1d `[` method. For example, if your -#' class requires that certain columns be present, your method should return -#' a data.frame or tibble when those columns are removed. -#' -#' * If your attributes are __vectorised__ over __rows__, implement a -#' `tibble_row_slice()` method. This gives you access to `i` so you can -#' modify the row attribute accordingly. You'll also need to think carefully -#' about how to recompute the attribute in `tibble_reconstruct()`, and -#' you will need to carefully verify the behaviour of each verb, and provide -#' additional methods as needed. -#' -#' * If your attributes that are __vectorised__ over __columns__, implement -#' `tibble_col_modify()`, 1d `[`, and `names<-` methods. All of these methods -#' know which columns are being modified, so you can update the column -#' attribute according. You'll also need to think carefully about how to -#' recompute the attribute in `tibble_reconstruct()`, and you will need to -#' carefully verify the behaviour of each verb, and provide additional -#' methods as needed. -#' -#' # Current usage -#' -#' * `arrange()`, `filter()`, `slice()`, `semi_join()`, and `anti_join()` -#' work by generating a vector of row indices, and then subsetting -#' with `tibble_row_slice()`. -#' -#' * `mutate()` generates a list of new column value (using `NULL` to indicate -#' when columns should be deleted), then passes that to `tibble_col_modify()`. -#' `transmute()` does the same then uses 1d `[` to select the columns. -#' -#' * `summarise()` works similarly to `mutate()` but the data modified by -#' `tibble_col_modify()` comes from `group_data()`. -#' -#' * `select()` uses 1d `[` to select columns, then `names<-` to rename them. -#' `rename()` just uses `names<-`. `relocate()` just uses 1d `[`. -#' -#' * `inner_join()`, `left_join()`, `right_join()`, and `full_join()` -#' coerces `x` to a tibble, modify the rows, then uses `tibble_reconstruct()` -#' to convert back to the same type as `x`. -#' -#' * `nest_join()` uses `tibble_col_modify()` to cast the key variables to -#' common type and add the nested-df that `y` becomes. -#' -#' * `distinct()` does a `mutate()` if any expressions are present, then -#' uses 1d `[` to select variables to keep, then `tibble_row_slice()` to -#' select distinct rows. -#' -#' Note that `group_by()` and `ungroup()` don't use any these generics and -#' you'll need to provide methods directly. -#' -#' @keywords internal -#' @param data A tibble. We use tibbles because they avoid some inconsistent -#' subset-assignment use cases -#' @name tibble_extending -NULL - -#' @export -#' @rdname tibble_extending -#' @param i A numeric or logical vector that indexes the rows of `.data`. +# Keep in sync with generics.R in dplyr +# Imported from 3de24a738243a3d07c87b3f4e4afa5f6b02ff561 + tibble_row_slice <- function(data, i, ...) { if (!is.numeric(i) && !is.logical(i)) { abort("`i` must be an numeric or logical vector.") } - UseMethod("tibble_row_slice") -} - -#' @export -tibble_row_slice.data.frame <- function(data, i, ...) { tibble_reconstruct(vec_slice(data, i), data) } -#' @export -tibble_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) { - out <- vec_slice(as.data.frame(data), i) - - # Index into group_indices, then use that to restore the grouping structure - groups <- group_data(data) - new_id <- vec_slice(group_indices(data), i) - new_grps <- vec_group_loc(new_id) - - rows <- rep(list_of(integer()), length.out = nrow(groups)) - rows[new_grps$key] <- new_grps$loc - groups$.rows <- rows - if (!preserve && isTRUE(attr(groups, ".drop"))) { - groups <- group_data_trim(groups) - } - - new_grouped_df(out, groups) -} - -#' @export -tibble_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) { - out <- vec_slice(data, i) - group_data <- vec_slice(group_keys(data), i) - new_rowwise_df(out, group_data) -} - -#' @export -#' @rdname tibble_extending -#' @param cols A named list used modify columns. A `NULL` value should remove -#' an existing column. tibble_col_modify <- function(data, cols) { - UseMethod("tibble_col_modify") -} - -#' @export -tibble_col_modify.data.frame <- function(data, cols) { # Must be implemented from first principles to avoiding edge cases in # [.data.frame and [.tibble (2.1.3 and earlier). @@ -141,7 +17,7 @@ tibble_col_modify.data.frame <- function(data, cols) { cols <- vec_recycle_common(!!!cols, .size = nrow(data)) # Transform to list to avoid stripping inner names with `[[<-` - out <- as.list(tibble_vec_data(data)) + out <- as.list(dplyr_vec_data(data)) nms <- as_utf8_character(names2(cols)) names(out) <- as_utf8_character(names2(out)) @@ -158,40 +34,11 @@ tibble_col_modify.data.frame <- function(data, cols) { tibble_reconstruct(out, data) } -#' @export -tibble_col_modify.grouped_df <- function(data, cols) { - out <- tibble_col_modify(as_tibble(data), cols) - - if (any(names(cols) %in% group_vars(data))) { - # regroup - grouped_df(out, group_vars(data), drop = group_by_drop_default(data)) - } else { - new_grouped_df(out, group_data(data)) - } -} - -#' @export -tibble_col_modify.rowwise_df <- function(data, cols) { - out <- tibble_col_modify(as_tibble(data), cols) - rowwise_df(out, group_vars(data)) -} - -#' @param template Template to use for restoring attributes -#' @export -#' @rdname tibble_extending tibble_reconstruct <- function(data, template) { # Strip attributes before dispatch to make it easier to implement # methods and prevent unexpected leaking of irrelevant attributes. - data <- tibble_new_data_frame(data) - return(tibble_reconstruct_dispatch(data, template)) - UseMethod("tibble_reconstruct", template) -} -tibble_reconstruct_dispatch <- function(data, template) { - UseMethod("tibble_reconstruct", template) -} + data <- dplyr_new_data_frame(data) -#' @export -tibble_reconstruct.data.frame <- function(data, template) { attrs <- attributes(template) attrs$names <- names(data) attrs$row.names <- .row_names_info(data, type = 0L) @@ -200,56 +47,31 @@ tibble_reconstruct.data.frame <- function(data, template) { data } -#' @export -tibble_reconstruct.grouped_df <- function(data, template) { - group_vars <- group_intersect(template, data) - grouped_df(data, group_vars, drop = group_by_drop_default(template)) -} - -#' @export -tibble_reconstruct.rowwise_df <- function(data, template) { - group_vars <- group_intersect(template, data) - rowwise_df(data, group_vars) -} - -tibble_col_select <- function(.data, loc, names = NULL) { - loc <- vec_as_location(loc, n = ncol(.data), names = names(.data)) - - out <- .data[loc] - if (!inherits(out, "data.frame")) { - abort(c( - "Can't reconstruct data frame.", - x = glue("The `[` method for class <{classes_data}> must return a data frame.", - classes_data = glue_collapse(class(.data), sep = "/") - ), - i = glue("It returned a <{classes_out}>.", - classes_out = glue_collapse(class(out), sep = "/") - ) - )) - } - if (length(out) != length(loc)) { - abort(c( - "Can't reconstruct data frame.", - x = glue("The `[` method for class <{classes_data}> must return a data frame with {length(loc)} column{s}.", - classes_data = glue_collapse(class(.data), sep = "/"), - s = if(length(loc) == 1) "" else "s" - ), - i = glue("It returned a <{classes_out}> of {length(out)} column{s}.", - classes_out = glue_collapse(class(out), sep = "/"), - s = if(length(out) == 1) "" else "s" - ) - )) - } +# Until fixed upstream. `vec_data()` should not return lists from data +# frames. +dplyr_vec_data <- function(x) { + out <- vec_data(x) - # Patch base data frames to restore extra attributes that `[.data.frame` drops. - # We require `[` methods to keep extra attributes for all data frame subclasses. - if (identical(class(.data), "data.frame")) { - out <- tibble_reconstruct(out, .data) - } - - if (!is.null(names)) { - names(out) <- names + if (is.data.frame(x)) { + new_data_frame(out, n = nrow(x)) + } else { + out } +} - out +# Until vctrs::new_data_frame() forwards row names automatically +dplyr_new_data_frame <- function(x = data.frame(), + n = NULL, + ..., + row.names = NULL, + class = NULL) { + row.names <- row.names %||% .row_names_info(x, type = 0L) + + new_data_frame( + x, + n = n, + ..., + row.names = row.names, + class = class + ) } diff --git a/R/tibble-package.R b/R/tibble-package.R index c076a8854..0ac249bf8 100644 --- a/R/tibble-package.R +++ b/R/tibble-package.R @@ -10,6 +10,7 @@ #' @importFrom vctrs vec_names vec_names2 vec_set_names #' @importFrom vctrs new_rcrd #' @importFrom vctrs new_data_frame +#' @importFrom vctrs vec_recycle_common vec_data #' @aliases NULL tibble-package #' @details #' `r lifecycle::badge("stable")` From 298063bca8cdc35b48fbed5bd346c2125fc72006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 17:31:49 +0200 Subject: [PATCH 4/9] Fix tibble_row_slice() --- R/reconstruct.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/reconstruct.R b/R/reconstruct.R index ca1116f7c..d427610be 100644 --- a/R/reconstruct.R +++ b/R/reconstruct.R @@ -3,10 +3,10 @@ tibble_row_slice <- function(data, i, ...) { if (!is.numeric(i) && !is.logical(i)) { - abort("`i` must be an numeric or logical vector.") + abort("`i` must be a numeric or logical vector.") } - tibble_reconstruct(vec_slice(data, i), data) + tibble_reconstruct(vec_slice(remove_rownames(data), i), data) } tibble_col_modify <- function(data, cols) { From 8c146f44c5e47022870492d275924baf5a6c19fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 06:54:51 +0200 Subject: [PATCH 5/9] Use tibble_reconstruct() --- R/subsetting.R | 2 +- src/attributes.c | 18 ------------------ src/init.c | 1 - src/tibble.h | 2 -- 4 files changed, 1 insertion(+), 22 deletions(-) delete mode 100644 src/attributes.c diff --git a/R/subsetting.R b/R/subsetting.R index f6a4b2d56..5712da08c 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -757,7 +757,7 @@ set_tibble_class <- function(x, nrow) { # External ---------------------------------------------------------------- vectbl_restore <- function(xo, x) { - .Call(`tibble_restore_impl`, xo, x) + tibble_reconstruct(xo, x) } # Errors ------------------------------------------------------------------ diff --git a/src/attributes.c b/src/attributes.c deleted file mode 100644 index 7770b41d9..000000000 --- a/src/attributes.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "tibble.h" - -SEXP tibble_restore_impl(SEXP xo, SEXP x) { - xo = PROTECT(Rf_shallow_duplicate(xo)); - - // copy over all attributes except `names` and `row.names` - SEXP attr_x = ATTRIB(x); - while(attr_x != R_NilValue) { - SEXP tag = TAG(attr_x); - if (tag != R_NamesSymbol && tag != R_RowNamesSymbol) { - Rf_setAttrib(xo, tag, CAR(attr_x)); - } - attr_x = CDR(attr_x); - } - - UNPROTECT(1); - return xo; -} diff --git a/src/init.c b/src/init.c index 4588296a2..34a41b670 100644 --- a/src/init.c +++ b/src/init.c @@ -7,7 +7,6 @@ static const R_CallMethodDef CallEntries[] = { {"tibble_matrixToDataFrame", (DL_FUNC) &tibble_matrixToDataFrame, 1}, {"tibble_string_to_indices", (DL_FUNC) &tibble_string_to_indices, 1}, - {"tibble_restore_impl", (DL_FUNC) &tibble_restore_impl, 2}, {"tibble_need_coerce", (DL_FUNC) &tibble_need_coerce, 1}, {NULL, NULL, 0} diff --git a/src/tibble.h b/src/tibble.h index b1c762fb2..6902a0b9f 100644 --- a/src/tibble.h +++ b/src/tibble.h @@ -7,7 +7,5 @@ SEXP tibble_matrixToDataFrame(SEXP xSEXP); SEXP tibble_string_to_indices(SEXP x); SEXP tibble_need_coerce(SEXP x); -SEXP tibble_update_attrs(SEXP x, SEXP dots); -SEXP tibble_restore_impl(SEXP xo, SEXP x); #endif /* TIBBLE_H */ From 5e390dee7a700b9d9a4f360fb91024a21912aabe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 06:55:36 +0200 Subject: [PATCH 6/9] Inline --- R/add.R | 4 ++-- R/subsetting.R | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/R/add.R b/R/add.R index 369842471..ae1a69476 100644 --- a/R/add.R +++ b/R/add.R @@ -63,7 +63,7 @@ add_row <- function(.data, ..., .before = NULL, .after = NULL) { pos <- pos_from_before_after(.before, .after, nrow(.data)) out <- rbind_at(.data, df, pos) - vectbl_restore(out, .data) + tibble_reconstruct(out, .data) } #' @export @@ -166,7 +166,7 @@ add_column <- function(.data, ..., .before = NULL, .after = NULL, out <- new_data[indexes] out <- set_repaired_names(out, repair_hint = TRUE, .name_repair) - vectbl_restore(out, .data) + tibble_reconstruct(out, .data) } diff --git a/R/subsetting.R b/R/subsetting.R index 5712da08c..4a22407f3 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -256,7 +256,7 @@ NULL if (drop && length(xo) == 1L) { tbl_subset2(xo, 1L, j_arg) } else { - vectbl_restore(xo, x) + tibble_reconstruct(xo, x) } } @@ -463,7 +463,7 @@ tbl_subassign <- function(x, i, j, value, i_arg, j_arg, value_arg) { } } - vectbl_restore(xo, x) + tibble_reconstruct(xo, x) } vectbl_as_new_row_index <- function(i, x, i_arg) { @@ -754,12 +754,6 @@ set_tibble_class <- function(x, nrow) { x } -# External ---------------------------------------------------------------- - -vectbl_restore <- function(xo, x) { - tibble_reconstruct(xo, x) -} - # Errors ------------------------------------------------------------------ error_need_rhs_vector <- function(value_arg) { From a31caf0bdb6c8393b4d5a6224b2f92c09976c7fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 07:14:55 +0200 Subject: [PATCH 7/9] Mark entry points --- R/add.R | 2 ++ R/subsetting.R | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/R/add.R b/R/add.R index ae1a69476..d34bebfb0 100644 --- a/R/add.R +++ b/R/add.R @@ -95,6 +95,8 @@ rbind_at <- function(old, new, pos) { seq2(pos + 1L, nrow(old)) ) vec_slice(out, idx) + + # tibble_reconstruct } #' Add columns to a data frame diff --git a/R/subsetting.R b/R/subsetting.R index 4a22407f3..9c46bdcfb 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -410,6 +410,7 @@ tbl_subset2 <- function(x, j, j_arg) { tbl_subset_row <- function(x, i, i_arg) { if (is.null(i)) return(x) i <- vectbl_as_row_index(i, x, i_arg) + # tibble_row_slice xo <- lapply(unclass(x), vec_slice, i = i) set_tibble_class(xo, nrow = length(i)) } @@ -602,6 +603,8 @@ is_tight_sequence_at_end <- function(i_new, n) { } tbl_subassign_col <- function(x, j, value) { + # tibble_col_modify + is_data <- !vapply(value, is.null, NA) nrow <- fast_nrow(x) @@ -640,6 +643,7 @@ tbl_expand_to_nrow <- function(x, i) { if (new_nrow != nrow) { # FIXME: vec_expand()? i_expand <- c(seq_len(nrow), rep(NA_integer_, new_nrow - nrow)) + # tibble_row_slice x <- vec_slice(x, i_expand) } From 4108aeb30bd15d926895130fe0f7b86fe8dfa450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 08:56:00 +0200 Subject: [PATCH 8/9] Use tibble_row_slice() --- R/subsetting.R | 7 ++----- tests/testthat/test-subsetting.R | 9 +++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/R/subsetting.R b/R/subsetting.R index 9c46bdcfb..d11756ebf 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -410,9 +410,7 @@ tbl_subset2 <- function(x, j, j_arg) { tbl_subset_row <- function(x, i, i_arg) { if (is.null(i)) return(x) i <- vectbl_as_row_index(i, x, i_arg) - # tibble_row_slice - xo <- lapply(unclass(x), vec_slice, i = i) - set_tibble_class(xo, nrow = length(i)) + tibble_row_slice(x, i) } tbl_subassign <- function(x, i, j, value, i_arg, j_arg, value_arg) { @@ -643,8 +641,7 @@ tbl_expand_to_nrow <- function(x, i) { if (new_nrow != nrow) { # FIXME: vec_expand()? i_expand <- c(seq_len(nrow), rep(NA_integer_, new_nrow - nrow)) - # tibble_row_slice - x <- vec_slice(x, i_expand) + x <- tibble_row_slice(x, i_expand) } x diff --git a/tests/testthat/test-subsetting.R b/tests/testthat/test-subsetting.R index ffe5fef33..5d5c87062 100644 --- a/tests/testthat/test-subsetting.R +++ b/tests/testthat/test-subsetting.R @@ -16,6 +16,15 @@ test_that("[ retains class", { expect_identical(class(mtcars2), class(mtcars2[1:5, 1:5])) }) +test_that("[ removes row names", { + tbl <- tibble(a = 1:3) + expect_warning(rownames(tbl) <- letters[1:3], "deprecated") + + expect_equal(rownames(tbl), letters[1:3]) + expect_equal(rownames(tbl[1, ]), "1") + expect_equal(rownames(tbl["a"]), as.character(1:3)) +}) + test_that("[ and as_tibble commute", { mtcars2 <- as_tibble(mtcars) expect_identical(mtcars2, as_tibble(mtcars)) From 5836a30e27ca3d7395026d80fb9298544e896183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sat, 31 Jul 2021 11:41:52 +0200 Subject: [PATCH 9/9] Use tibble_col_modify() --- R/subsetting.R | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/R/subsetting.R b/R/subsetting.R index d11756ebf..5266868a6 100644 --- a/R/subsetting.R +++ b/R/subsetting.R @@ -601,32 +601,42 @@ is_tight_sequence_at_end <- function(i_new, n) { } tbl_subassign_col <- function(x, j, value) { + # Fix order + order_j <- order(j) + value <- value[order_j] + j <- j[order_j] + # tibble_col_modify - is_data <- !vapply(value, is.null, NA) - nrow <- fast_nrow(x) + # Adapt to interface + names(value) <- names(j) - x <- unclass(x) + # New names + tweak_names <- (j > length(x)) + need_tweak_names <- any(tweak_names) - # Grow, assign new names - new <- which(j > length(x)) - if (has_length(new)) { - length(x) <- max(j[new]) - names(x)[ j[new] ] <- names2(j)[new] - } + if (need_tweak_names) { + new_names <- names(x) + new_names[ j[tweak_names] ] <- names(j)[tweak_names] + + # New names ("" means appending at end) + names(value)[tweak_names] <- "" - # Update - for (jj in which(is_data)) { - ji <- j[[jj]] - x[[ji]] <- value[[jj]] + # Removed names, use vapply() for speed + col_is_null <- vapply(value, is.null, NA) + if (any(col_is_null)) { + new_names <- new_names[ -j[col_is_null] ] + } } - # Remove - j_remove <- j[!is_data & !is.na(j)] - if (has_length(j_remove)) x <- x[-j_remove] + out <- tibble_col_modify(x, value) - # Restore - set_tibble_class(x, nrow) + # This calls `names<-()` for the tibble class + if (need_tweak_names) { + names(out) <- new_names + } + + return(out) } tbl_expand_to_nrow <- function(x, i) {