mutate
# https://dplyr.tidyverse.org/reference/mutate.html
%run nb_helpers.py
from datar.data import starwars
from datar.all import *
nb_header(mutate, transmute)
★ mutate¶
Add new columns to a data frame.¶
The original API:
https://dplyr.tidyverse.org/reference/mutate.html
Args:¶
_data
: A data frame
_keep
: allows you to control which columns from _data are retained
in the output:
- "all", the default, retains all variables.
- "used" keeps any variables used to make new variables;
it's useful for checking your work as it displays inputs and
outputs side-by-side.
- "unused" keeps only existing variables not used to make new variables.
- "none", only keeps grouping keys (like transmute()).
_before
: A list of column names to put the new columns before.
_after
: A list of column names to put the new columns after.
*args
: and
**kwargs
: Name-value pairs. The name gives the name of the column
in the output. The value can be:
- A vector of length 1, which will be recycled to the correct length.
- A vector the same length as the current group (or the whole data frame if ungrouped).
- None to remove the column
Returns:¶
An object of the same type as _data. The output has the following
properties:
- Rows are not affected.
- Existing columns will be preserved according to the _keep
argument. New columns will be placed according to the
_before and _after arguments. If _keep = "none"
(as in transmute()), the output order is determined only
by ..., not the order of existing columns.
- Columns given value None will be removed - Groups will be recomputed if a grouping variable is mutated. - Data frame attributes are preserved.
★ transmute¶
Add new columns to a data frame and remove existing columns¶
using mutate with _keep="none"
.
The original API:
https://dplyr.tidyverse.org/reference/mutate.html
Args:¶
_data
: A data frame
_before
: A list of column names to put the new columns before.
_after
: A list of column names to put the new columns after.
*args
: and
**kwargs
: Name-value pairs. The name gives the name of the column
in the output. The value can be:
- A vector of length 1, which will be recycled to the correct length.
- A vector the same length as the current group (or the whole data frame if ungrouped).
- None to remove the column
Returns:¶
An object of the same type as _data. The output has the following
properties:
- Rows are not affected.
- Existing columns will be preserved according to the _keep
argument. New columns will be placed according to the
_before and _after arguments. If _keep = "none"
(as in transmute()), the output order is determined only
by ..., not the order of existing columns.
- Columns given value None will be removed - Groups will be recomputed if a grouping variable is mutated. - Data frame attributes are preserved.
starwars >> \
select(f.name, f.mass) >> \
mutate(
mass2 = f.mass * 2,
mass2_squared = f.mass2 * f.mass2
)
name | mass | mass2 | mass2_squared | |
---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | |
0 | Luke Skywalker | 77.0 | 154.0 | 23716.0 |
1 | C-3PO | 75.0 | 150.0 | 22500.0 |
2 | R2-D2 | 32.0 | 64.0 | 4096.0 |
3 | Darth Vader | 136.0 | 272.0 | 73984.0 |
... | ... | ... | ... | ... |
4 | Leia Organa | 49.0 | 98.0 | 9604.0 |
82 | Rey | NaN | NaN | NaN |
83 | Poe Dameron | NaN | NaN | NaN |
84 | BB8 | NaN | NaN | NaN |
85 | Captain Phasma | NaN | NaN | NaN |
86 | Padmé Amidala | 45.0 | 90.0 | 8100.0 |
87 rows × 4 columns
starwars >> \
select(f.name, f.height, f.mass, f.homeworld) >> \
mutate(
mass = NULL,
height = f.height * 0.0328084 # convert to feet
)
name | height | homeworld | |
---|---|---|---|
<object> | <float64> | <object> | |
0 | Luke Skywalker | 5.643045 | Tatooine |
1 | C-3PO | 5.479003 | Tatooine |
2 | R2-D2 | 3.149606 | Naboo |
3 | Darth Vader | 6.627297 | Tatooine |
... | ... | ... | ... |
4 | Leia Organa | 4.921260 | Alderaan |
82 | Rey | NaN | NaN |
83 | Poe Dameron | NaN | NaN |
84 | BB8 | NaN | NaN |
85 | Captain Phasma | NaN | NaN |
86 | Padmé Amidala | 5.413386 | Naboo |
87 rows × 3 columns
x = starwars >> \
select(f.name, f.homeworld, f.species) >> \
mutate(across(~f.name, as_factor))
x.dtypes
x
name object homeworld category species category dtype: object
name | homeworld | species | |
---|---|---|---|
<object> | <category> | <category> | |
0 | Luke Skywalker | Tatooine | Human |
1 | C-3PO | Tatooine | Droid |
2 | R2-D2 | Naboo | Droid |
3 | Darth Vader | Tatooine | Human |
... | ... | ... | ... |
4 | Leia Organa | Alderaan | Human |
82 | Rey | NaN | Human |
83 | Poe Dameron | NaN | Human |
84 | BB8 | NaN | Droid |
85 | Captain Phasma | NaN | NaN |
86 | Padmé Amidala | Naboo | Human |
87 rows × 3 columns
starwars >> \
select(f.name, f.mass, f.homeworld) >> \
group_by(f.homeworld) >> \
mutate(rank=min_rank(desc(f.mass)))
name | mass | homeworld | rank | |
---|---|---|---|---|
<object> | <float64> | <object> | <float64> | |
0 | Luke Skywalker | 77.0 | Tatooine | 5.0 |
1 | C-3PO | 75.0 | Tatooine | 6.0 |
2 | R2-D2 | 32.0 | Naboo | 6.0 |
3 | Darth Vader | 136.0 | Tatooine | 1.0 |
... | ... | ... | ... | ... |
4 | Leia Organa | 49.0 | Alderaan | 2.0 |
82 | Rey | NaN | NaN | NaN |
83 | Poe Dameron | NaN | NaN | NaN |
84 | BB8 | NaN | NaN | NaN |
85 | Captain Phasma | NaN | NaN | NaN |
86 | Padmé Amidala | 45.0 | Naboo | 5.0 |
87 rows × 4 columns
TibbleGrouped: homeworld (n=49)
df = tibble(x=1, y=2)
df >> mutate(z=f.x+f.y)
x | y | z | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 1 | 2 | 3 |
df >> mutate(z=f.x+f.y, _before=1)
x | z | y | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 1 | 3 | 2 |
df >> mutate(z=f.x+f.y, _after=f.x)
x | z | y | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 1 | 3 | 2 |
# use a temporary column
df >> mutate(_z=f.x+f.y, z=f._z*2, _after=f.x)
x | z | y | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 1 | 6 | 2 |
df = tibble(x=1, y=2, a="a", b="b")
df >> mutate(z=f.x+f.y, _keep='all')
x | y | a | b | z | |
---|---|---|---|---|---|
<int64> | <int64> | <object> | <object> | <int64> | |
0 | 1 | 2 | a | b | 3 |
df >> mutate(z=f.x+f.y, _keep='used')
x | y | z | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 1 | 2 | 3 |
df >> mutate(z=f.x+f.y, _keep='unused')
a | b | z | |
---|---|---|---|
<object> | <object> | <int64> | |
0 | a | b | 3 |
df >> mutate(z=f.x+f.y, _keep='none')
z | |
---|---|
<int64> | |
0 | 3 |
starwars >> \
select(f.name, f.mass, f.species) >> \
mutate(mass_norm=f.mass/mean(f.mass))
name | mass | species | mass_norm | |
---|---|---|---|---|
<object> | <float64> | <object> | <float64> | |
0 | Luke Skywalker | 77.0 | Human | 0.791270 |
1 | C-3PO | 75.0 | Droid | 0.770718 |
2 | R2-D2 | 32.0 | Droid | 0.328840 |
3 | Darth Vader | 136.0 | Human | 1.397569 |
... | ... | ... | ... | ... |
4 | Leia Organa | 49.0 | Human | 0.503536 |
82 | Rey | NaN | Human | NaN |
83 | Poe Dameron | NaN | Human | NaN |
84 | BB8 | NaN | Droid | NaN |
85 | Captain Phasma | NaN | NaN | NaN |
86 | Padmé Amidala | 45.0 | Human | 0.462431 |
87 rows × 4 columns
starwars >> \
select(f.name, f.mass, f.species) >> \
group_by(f.species) >> \
mutate(mass_norm=f.mass / mean(f.mass)) >> \
ungroup()
name | mass | species | mass_norm | |
---|---|---|---|---|
<object> | <float64> | <object> | <float64> | |
0 | Luke Skywalker | 77.0 | Human | 0.930156 |
1 | C-3PO | 75.0 | Droid | 1.075269 |
2 | R2-D2 | 32.0 | Droid | 0.458781 |
3 | Darth Vader | 136.0 | Human | 1.642873 |
... | ... | ... | ... | ... |
4 | Leia Organa | 49.0 | Human | 0.591917 |
82 | Rey | NaN | Human | NaN |
83 | Poe Dameron | NaN | Human | NaN |
84 | BB8 | NaN | Droid | NaN |
85 | Captain Phasma | NaN | NaN | NaN |
86 | Padmé Amidala | 45.0 | Human | 0.543598 |
87 rows × 4 columns
vars = ["mass", "height"]
starwars >> mutate(starwars, prod=f[vars[0]] * f[vars[1]])
name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | prod | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<object> | <float64> | <float64> | <object> | <object> | <object> | <float64> | <object> | <object> | <object> | <object> | <float64> | |
0 | Luke Skywalker | 172.0 | 77.0 | blond | fair | blue | 19.0 | male | masculine | Tatooine | Human | 13244.0 |
1 | C-3PO | 167.0 | 75.0 | NaN | gold | yellow | 112.0 | none | masculine | Tatooine | Droid | 12525.0 |
2 | R2-D2 | 96.0 | 32.0 | NaN | white, blue | red | 33.0 | none | masculine | Naboo | Droid | 3072.0 |
3 | Darth Vader | 202.0 | 136.0 | none | white | yellow | 41.9 | male | masculine | Tatooine | Human | 27472.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4 | Leia Organa | 150.0 | 49.0 | brown | light | brown | 19.0 | female | feminine | Alderaan | Human | 7350.0 |
82 | Rey | NaN | NaN | brown | light | hazel | NaN | female | feminine | NaN | Human | NaN |
83 | Poe Dameron | NaN | NaN | brown | light | brown | NaN | male | masculine | NaN | Human | NaN |
84 | BB8 | NaN | NaN | none | none | black | NaN | none | masculine | NaN | Droid | NaN |
85 | Captain Phasma | NaN | NaN | unknown | unknown | unknown | NaN | NaN | NaN | NaN | NaN | NaN |
86 | Padmé Amidala | 165.0 | 45.0 | brown | light | brown | 46.0 | female | feminine | Naboo | Human | 7425.0 |
87 rows × 12 columns