across
In [1]:
Copied!
%run nb_helpers.py
from datar.data import iris
from datar.all import *
nb_header(across, if_any, if_all, c_across)
%run nb_helpers.py
from datar.data import iris
from datar.all import *
nb_header(across, if_any, if_all, c_across)
Try this notebook on binder.
★ across¶
Apply the same transformation to multiple columns
The original API:
https://dplyr.tidyverse.org/reference/across.html
Examples:
#
>>> iris >> mutate(across(c(f.Sepal_Length, f.Sepal_Width), round))
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
<float64> <float64> <float64> <float64> <object>
0 5.0 4.0 1.4 0.2 setosa
1 5.0 3.0 1.4 0.2 setosa
.. ... ... ... ... ...
>>> iris >> group_by(f.Species) >> summarise(
>>> across(starts_with("Sepal"), mean)
>>> )
Species Sepal_Length Sepal_Width
<object> <float64> <float64>
0 setosa 5.006 3.428
1 versicolor 5.936 2.770
2 virginica 6.588 2.974
Args:
_data: The dataframe.
*args: If given, the first 2 elements should be the columns and the functions
to apply to each of the selected columns. The rest of them will be
passed as arguments to the functions.
_names: A glue specification that describes how to name
the output columns. This can use `{_col}` to stand for the
selected column name, and `{_fn}` to stand for the name of
the function being applied.
The default (None) is equivalent to `{_col}` for the
single function case and `{_col}_{_fn}` for the case where
a list is used for _fns. In such a case, `{_fn}` is the 0-based index
of the function in the list. To use a 1-based index, use `{_fn1}`.
_fn_context: Defines the context used to evaluate the arguments for the
functions if they are plain functions.
Note that registered functions will use their own context.
**kwargs: Keyword arguments for the functions
Returns:
A dataframe with one column for each combination of selected column and function.
★ if_any¶
★ if_all¶
★ c_across¶
In [2]:
Copied!
# round not changing dtypes (Series.round)
iris >> mutate(across(c(f.Sepal_Length, f.Sepal_Width), round))
# round not changing dtypes (Series.round)
iris >> mutate(across(c(f.Sepal_Length, f.Sepal_Width), round))
Out[2]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
1 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
2 | 5.0 | 3.0 | 1.3 | 0.2 | setosa |
3 | 5.0 | 3.0 | 1.5 | 0.2 | setosa |
4 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 7.0 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.0 | 2.0 | 5.0 | 1.9 | virginica |
147 | 6.0 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.0 | 3.0 | 5.4 | 2.3 | virginica |
149 | 6.0 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [3]:
Copied!
iris >> mutate(across(c(0, 1), round))
iris >> mutate(across(c(0, 1), round))
Out[3]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
1 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
2 | 5.0 | 3.0 | 1.3 | 0.2 | setosa |
3 | 5.0 | 3.0 | 1.5 | 0.2 | setosa |
4 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 7.0 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.0 | 2.0 | 5.0 | 1.9 | virginica |
147 | 6.0 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.0 | 3.0 | 5.4 | 2.3 | virginica |
149 | 6.0 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [4]:
Copied!
# use slice with column names
iris >> mutate(across(c[:f.Sepal_Width], round))
# use slice with column names
iris >> mutate(across(c[:f.Sepal_Width], round))
Out[4]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.0 | 3.5 | 1.4 | 0.2 | setosa |
1 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
2 | 5.0 | 3.2 | 1.3 | 0.2 | setosa |
3 | 5.0 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 7.0 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.0 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.0 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.0 | 3.4 | 5.4 | 2.3 | virginica |
149 | 6.0 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [5]:
Copied!
# to include stop of slice
iris >> mutate(across(c[:f.Sepal_Width:1], round))
# to include stop of slice
iris >> mutate(across(c[:f.Sepal_Width:1], round))
Out[5]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
1 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
2 | 5.0 | 3.0 | 1.3 | 0.2 | setosa |
3 | 5.0 | 3.0 | 1.5 | 0.2 | setosa |
4 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 7.0 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.0 | 2.0 | 5.0 | 1.9 | virginica |
147 | 6.0 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.0 | 3.0 | 5.4 | 2.3 | virginica |
149 | 6.0 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [6]:
Copied!
iris >> mutate(across(where(is_double) & ~c(f.Petal_Length, f.Petal_Width), round))
iris >> mutate(across(where(is_double) & ~c(f.Petal_Length, f.Petal_Width), round))
Out[6]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
1 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
2 | 5.0 | 3.0 | 1.3 | 0.2 | setosa |
3 | 5.0 | 3.0 | 1.5 | 0.2 | setosa |
4 | 5.0 | 4.0 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 7.0 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.0 | 2.0 | 5.0 | 1.9 | virginica |
147 | 6.0 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.0 | 3.0 | 5.4 | 2.3 | virginica |
149 | 6.0 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [7]:
Copied!
iris = iris >> mutate(Species=as_factor(f.Species))
iris
iris = iris >> mutate(Species=as_factor(f.Species))
iris
Out[7]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <category> | |
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [8]:
Copied!
iris = iris >> mutate(across(where(is_factor), as_character))
iris
iris = iris >> mutate(across(where(is_factor), as_character))
iris
Out[8]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
In [10]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), mean)
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), mean)
)
Out[10]:
Species | Sepal_Length | Sepal_Width | |
---|---|---|---|
<object> | <float64> | <float64> | |
0 | setosa | 5.006 | 3.428 |
1 | versicolor | 5.936 | 2.770 |
2 | virginica | 6.588 | 2.974 |
In [11]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), dict(mean=mean, sd=sd))
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), dict(mean=mean, sd=sd))
)
Out[11]:
Species | Sepal_Length_mean | Sepal_Length_sd | Sepal_Width_mean | Sepal_Width_sd | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.006 | 0.352490 | 3.428 | 0.379064 |
1 | versicolor | 5.936 | 0.516171 | 2.770 | 0.313798 |
2 | virginica | 6.588 | 0.635880 | 2.974 | 0.322497 |
In [12]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), mean, _names = "mean_{_col}")
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), mean, _names = "mean_{_col}")
)
Out[12]:
Species | mean_Sepal_Length | mean_Sepal_Width | |
---|---|---|---|
<object> | <float64> | <float64> | |
0 | setosa | 5.006 | 3.428 |
1 | versicolor | 5.936 | 2.770 |
2 | virginica | 6.588 | 2.974 |
In [13]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), dict(mean=mean, sd=sd), _names = "{_col}.{_fn}")
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), dict(mean=mean, sd=sd), _names = "{_col}.{_fn}")
)
Out[13]:
Species | Sepal_Length.mean | Sepal_Length.sd | Sepal_Width.mean | Sepal_Width.sd | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.006 | 0.352490 | 3.428 | 0.379064 |
1 | versicolor | 5.936 | 0.516171 | 2.770 | 0.313798 |
2 | virginica | 6.588 | 0.635880 | 2.974 | 0.322497 |
In [14]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), [mean, sd], _names = "{_col}.fn{_fn}")
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), [mean, sd], _names = "{_col}.fn{_fn}")
)
Out[14]:
Species | Sepal_Length.fn0 | Sepal_Length.fn1 | Sepal_Width.fn0 | Sepal_Width.fn1 | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.006 | 0.352490 | 3.428 | 0.379064 |
1 | versicolor | 5.936 | 0.516171 | 2.770 | 0.313798 |
2 | virginica | 6.588 | 0.635880 | 2.974 | 0.322497 |
In [15]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(
starts_with("Sepal"),
[mean, sd],
_names="{_col}.fn{_fn}",
)
)
# or use _fn1 for 1-based function indexes
# iris >> group_by(f.Species) >> summarise(
# across(
# starts_with("Sepal"),
# [mean, sd],
# _names="{_col}.fn{_fn1}", # _fn1 for 1-based
# )
# )
iris >> group_by(f.Species) >> summarise(
across(
starts_with("Sepal"),
[mean, sd],
_names="{_col}.fn{_fn}",
)
)
# or use _fn1 for 1-based function indexes
# iris >> group_by(f.Species) >> summarise(
# across(
# starts_with("Sepal"),
# [mean, sd],
# _names="{_col}.fn{_fn1}", # _fn1 for 1-based
# )
# )
Out[15]:
Species | Sepal_Length.fn0 | Sepal_Length.fn1 | Sepal_Width.fn0 | Sepal_Width.fn1 | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.006 | 0.352490 | 3.428 | 0.379064 |
1 | versicolor | 5.936 | 0.516171 | 2.770 | 0.313798 |
2 | virginica | 6.588 | 0.635880 | 2.974 | 0.322497 |
In [16]:
Copied!
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), [mean, sd], _names = "{_col}.fn{_fn1}")
)
iris >> group_by(f.Species) >> summarise(
across(starts_with("Sepal"), [mean, sd], _names = "{_col}.fn{_fn1}")
)
Out[16]:
Species | Sepal_Length.fn1 | Sepal_Length.fn2 | Sepal_Width.fn1 | Sepal_Width.fn2 | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.006 | 0.352490 | 3.428 | 0.379064 |
1 | versicolor | 5.936 | 0.516171 | 2.770 | 0.313798 |
2 | virginica | 6.588 | 0.635880 | 2.974 | 0.322497 |
In [17]:
Copied!
iris >> filter(if_any(ends_with("Width"), lambda x: x > 4))
iris >> filter(if_any(ends_with("Width"), lambda x: x > 4))
Out[17]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
32 | 5.2 | 4.1 | 1.5 | 0.1 | setosa |
33 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |
In [18]:
Copied!
iris >> filter(if_all(ends_with("Width"), lambda x: x > 2))
iris >> filter(if_all(ends_with("Width"), lambda x: x > 2))
Out[18]:
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | <object> | |
100 | 6.3 | 3.3 | 6.0 | 2.5 | virginica |
102 | 7.1 | 3.0 | 5.9 | 2.1 | virginica |
104 | 6.5 | 3.0 | 5.8 | 2.2 | virginica |
105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
109 | 7.2 | 3.6 | 6.1 | 2.5 | virginica |
112 | 6.8 | 3.0 | 5.5 | 2.1 | virginica |
114 | 5.8 | 2.8 | 5.1 | 2.4 | virginica |
115 | 6.4 | 3.2 | 5.3 | 2.3 | virginica |
117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
120 | 6.9 | 3.2 | 5.7 | 2.3 | virginica |
124 | 6.7 | 3.3 | 5.7 | 2.1 | virginica |
128 | 6.4 | 2.8 | 5.6 | 2.1 | virginica |
132 | 6.4 | 2.8 | 5.6 | 2.2 | virginica |
135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
136 | 6.3 | 3.4 | 5.6 | 2.4 | virginica |
139 | 6.9 | 3.1 | 5.4 | 2.1 | virginica |
140 | 6.7 | 3.1 | 5.6 | 2.4 | virginica |
141 | 6.9 | 3.1 | 5.1 | 2.3 | virginica |
143 | 6.8 | 3.2 | 5.9 | 2.3 | virginica |
144 | 6.7 | 3.3 | 5.7 | 2.5 | virginica |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
In [19]:
Copied!
df = tibble(
id=[1, 2, 3, 4],
w=runif(4),
x=runif(4),
y=runif(4),
z=runif(4)
)
df >> rowwise() >> mutate(
sum = sum(c_across(c[f.w:f.z])),
sd = sd(c_across(c[f.w:f.z]))
)
df = tibble(
id=[1, 2, 3, 4],
w=runif(4),
x=runif(4),
y=runif(4),
z=runif(4)
)
df >> rowwise() >> mutate(
sum = sum(c_across(c[f.w:f.z])),
sd = sd(c_across(c[f.w:f.z]))
)
Out[19]:
id | w | x | y | z | sum | sd | |
---|---|---|---|---|---|---|---|
<int64> | <float64> | <float64> | <float64> | <float64> | <float64> | <float64> | |
0 | 1 | 0.909293 | 0.880456 | 0.174213 | 0.382593 | 1.963962 | 0.416324 |
1 | 2 | 0.102912 | 0.952811 | 0.632536 | 0.845920 | 1.688258 | 0.429225 |
2 | 3 | 0.425592 | 0.320275 | 0.803515 | 0.831533 | 1.549382 | 0.254112 |
3 | 4 | 0.218472 | 0.849190 | 0.637853 | 0.887980 | 1.705514 | 0.321026 |
TibbleRowwise: (n=4)
In [ ]:
Copied!