pack
%run nb_helpers.py
from datar.data import iris
from datar.all import *
nb_header(pack, unpack)
★ pack¶
Makes df narrow by collapsing a set of columns into a single df-column.¶
Args:¶
_data
: A data frame
**cols
: Columns to pack
_names_sep
: If None
, the default, the names will be left as is.
Inner names will come from the former outer names
If a string, the inner and outer names will be used together.
The new inner names will have the outer names plus
_names_sep
automatically stripped.
★ unpack¶
Makes df wider by expanding df-columns back out into individual columns.¶
Empty columns are kept as is instead of being removed.
Args:¶
data
: A data frame
cols
: Columns to unpack
names_sep
: If None
, the default, the names will be left as is.
The new outer names will come from the inner names.
If a string, the inner and outer names will be used together.
The names of the new outer columns will be formed by pasting
together the outer and the inner column names, separated by
names_sep
.
name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
Data frame with given columns unpacked.
df = tibble(x1 = c[1:3], x2 = c[4:6], x3 = c[7:9], y = c[1:3])
df
x1 | x2 | x3 | y | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
0 | 1 | 4 | 7 | 1 |
1 | 2 | 5 | 8 | 2 |
df >> pack(x=starts_with('x'))
y | x$x1 | x$x2 | x$x3 | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
0 | 1 | 1 | 4 | 7 |
1 | 2 | 2 | 5 | 8 |
df >> pack(x=c(f.x1, f.x2, f.x3), y=f.y)
x$x1 | x$x2 | x$x3 | y$y | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
0 | 1 | 4 | 7 | 1 |
1 | 2 | 5 | 8 | 2 |
iris >> pack(
Sepal=starts_with("Sepal"),
Petal=starts_with("Petal"),
_names_sep="_"
)
Species | Sepal$Length | Sepal$Width | Petal$Length | Petal$Width | |
---|---|---|---|---|---|
<object> | <float64> | <float64> | <float64> | <float64> | |
0 | setosa | 5.1 | 3.5 | 1.4 | 0.2 |
1 | setosa | 4.9 | 3.0 | 1.4 | 0.2 |
2 | setosa | 4.7 | 3.2 | 1.3 | 0.2 |
3 | setosa | 4.6 | 3.1 | 1.5 | 0.2 |
4 | setosa | 5.0 | 3.6 | 1.4 | 0.2 |
... | ... | ... | ... | ... | ... |
145 | virginica | 6.7 | 3.0 | 5.2 | 2.3 |
146 | virginica | 6.3 | 2.5 | 5.0 | 1.9 |
147 | virginica | 6.5 | 3.0 | 5.2 | 2.0 |
148 | virginica | 6.2 | 3.4 | 5.4 | 2.3 |
149 | virginica | 5.9 | 3.0 | 5.1 | 1.8 |
150 rows × 5 columns
# Unpacking ===========================================================
df = tibble(
x = c[1:4],
y = tibble(a = c[1:4], b = c[4:1]),
z = tibble(X = c("a", "b", "c"), Y = runif(3), Z = c(TRUE, FALSE, NA))
)
df
x | y$a | y$b | z$X | z$Y | z$Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |
df >> unpack(f.y)
x | a | b | z$X | z$Y | z$Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |
df >> unpack(c(f.y, f.z))
x | a | b | X | Y | Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |
df >> unpack(c(f.y, f.z), names_sep="_")
x | y_a | y_b | z_X | z_Y | z_Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |
with try_catch():
# positional indexes count the columns of the inner data frames as well
df >> unpack(c(2,3))
x | a | b | X | Y | Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |
df >> unpack(c(2,4))
x | a | b | X | Y | Z | |
---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <object> | <float64> | <object> | |
0 | 1 | 1 | 4 | a | 0.286761 | True |
1 | 2 | 2 | 3 | b | 0.532775 | False |
2 | 3 | 3 | 2 | c | 0.497844 | NaN |