nest
%run nb_helpers.py
from datar.data import iris, fish_encounters, mtcars
from datar.all import *
nb_header(nest, unnest)
★ nest¶
Nesting creates a list-column of data frames¶
Args:¶
_data
: A data frame
**cols
: Columns to nest
_names_sep
: If `None`, the default, the names will be left as is.
Inner names will come from the former outer names.
If a string, the inner and outer names will be used together.
The names of the new outer columns will be formed by pasting
together the outer and the inner column names, separated by
`_names_sep`.
Returns:¶
Nested data frame.
★ unnest¶
Flattens list-column of data frames back out into regular columns.¶
Args:¶
data
: A data frame to flatten.
*cols
: Columns to unnest.
keep_empty
: By default, you get one row of output for each element
of the list you're unchopping/unnesting.
This means that if there's a size-0 element
(like NULL or an empty data frame), that entire row will be
dropped from the output.
If you want to preserve all rows, use `keep_empty=True` to
replace size-0 elements with a single row of missing values.
dtypes
: Providing the dtypes for the output columns.
Could be a single dtype, which will be applied to all columns, or
a dictionary of dtypes with keys for the columns and values the
dtypes.
names_sep
: If `None`, the default, the names will be left as is.
Inner names will come from the former outer names.
If a string, the inner and outer names will be used together.
The names of the new outer columns will be formed by pasting
together the outer and the inner column names, separated by
`names_sep`.
names_repair
: Treatment of problematic column names:
- "minimal": no name repair or checks, beyond basic existence;
- "unique": make sure names are unique and not empty;
- "check_unique": (default value) no name repair, but check they are unique;
- "universal": make the names unique and syntactic;
- a function: apply custom name repair.
Returns:¶
Data frame with selected columns unnested.
# Small example frame: x has repeated values, y and z are the columns to nest.
df = tibble(x = c(1, 1, 1, 2, 2, 3), y = c[1:7], z = c[7:1])
# Pack y and z into a list-column named "data"; one output row per distinct x.
df >> nest(data=c(f.y, f.z))
x | data | |
---|---|---|
<int64> | <object> | |
0 | 1 | <DF 3x2> |
1 | 2 | <DF 2x2> |
2 | 3 | <DF 1x2> |
# For comparison: chop() keeps y and z as separate list-columns of values
# rather than combining them into one nested data frame per row.
df >> chop(c(f.y, f.z))
x | y | z | |
---|---|---|---|
<int64> | <object> | <object> | |
0 | 1 | [1, 2, 3] | [7, 6, 5] |
1 | 2 | [4, 5] | [4, 3] |
2 | 3 | [6] | [2] |
# Same nesting as above, with the columns chosen through the any_of() selector.
df >> nest(data=any_of(c(f.y, f.z)))
x | data | |
---|---|---|
<int64> | <object> | |
0 | 1 | <DF 3x2> |
1 | 2 | <DF 2x2> |
2 | 3 | <DF 1x2> |
# Nest every column except Species (the nested frames below have no Species column).
out = iris >> nest(data=~f.Species)
# Inspect the first nested data frame.
out.data[0]
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | |
---|---|---|---|---|
<float64> | <float64> | <float64> | <float64> | |
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
5 | 5.4 | 3.9 | 1.7 | 0.4 |
6 | 4.6 | 3.4 | 1.4 | 0.3 |
7 | 5.0 | 3.4 | 1.5 | 0.2 |
8 | 4.4 | 2.9 | 1.4 | 0.2 |
9 | 4.9 | 3.1 | 1.5 | 0.1 |
10 | 5.4 | 3.7 | 1.5 | 0.2 |
11 | 4.8 | 3.4 | 1.6 | 0.2 |
12 | 4.8 | 3.0 | 1.4 | 0.1 |
13 | 4.3 | 3.0 | 1.1 | 0.1 |
14 | 5.8 | 4.0 | 1.2 | 0.2 |
15 | 5.7 | 4.4 | 1.5 | 0.4 |
16 | 5.4 | 3.9 | 1.3 | 0.4 |
17 | 5.1 | 3.5 | 1.4 | 0.3 |
18 | 5.7 | 3.8 | 1.7 | 0.3 |
19 | 5.1 | 3.8 | 1.5 | 0.3 |
20 | 5.4 | 3.4 | 1.7 | 0.2 |
21 | 5.1 | 3.7 | 1.5 | 0.4 |
22 | 4.6 | 3.6 | 1.0 | 0.2 |
23 | 5.1 | 3.3 | 1.7 | 0.5 |
24 | 4.8 | 3.4 | 1.9 | 0.2 |
25 | 5.0 | 3.0 | 1.6 | 0.2 |
26 | 5.0 | 3.4 | 1.6 | 0.4 |
27 | 5.2 | 3.5 | 1.5 | 0.2 |
28 | 5.2 | 3.4 | 1.4 | 0.2 |
29 | 4.7 | 3.2 | 1.6 | 0.2 |
30 | 4.8 | 3.1 | 1.6 | 0.2 |
31 | 5.4 | 3.4 | 1.5 | 0.4 |
32 | 5.2 | 4.1 | 1.5 | 0.1 |
33 | 5.5 | 4.2 | 1.4 | 0.2 |
34 | 4.9 | 3.1 | 1.5 | 0.2 |
35 | 5.0 | 3.2 | 1.2 | 0.2 |
36 | 5.5 | 3.5 | 1.3 | 0.2 |
37 | 4.9 | 3.6 | 1.4 | 0.1 |
38 | 4.4 | 3.0 | 1.3 | 0.2 |
39 | 5.1 | 3.4 | 1.5 | 0.2 |
40 | 5.0 | 3.5 | 1.3 | 0.3 |
41 | 4.5 | 2.3 | 1.3 | 0.3 |
42 | 4.4 | 3.2 | 1.3 | 0.2 |
43 | 5.0 | 3.5 | 1.6 | 0.6 |
44 | 5.1 | 3.8 | 1.9 | 0.4 |
45 | 4.8 | 3.0 | 1.4 | 0.3 |
46 | 5.1 | 3.8 | 1.6 | 0.2 |
47 | 4.6 | 3.2 | 1.4 | 0.2 |
48 | 5.3 | 3.7 | 1.5 | 0.2 |
49 | 5.0 | 3.3 | 1.4 | 0.2 |
# Take the first four column names of iris and nest exactly those columns,
# leaving Species as the outer column.
nest_vars = colnames(iris)[:4]
iris >> nest(data = any_of(nest_vars))
Species | data | |
---|---|---|
<object> | <object> | |
0 | setosa | <DF 50x4> |
1 | versicolor | <DF 50x4> |
2 | virginica | <DF 50x4> |
# Several keyword arguments create several list-columns at once,
# here one for the Petal* columns and one for the Sepal* columns.
iris >> nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
Species | petal | sepal | |
---|---|---|---|
<object> | <object> | <object> | |
0 | setosa | <DF 50x2> | <DF 50x2> |
1 | versicolor | <DF 50x2> | <DF 50x2> |
2 | virginica | <DF 50x2> | <DF 50x2> |
# Same idea, grouping the measurements by kind (width vs. length) instead.
iris >> nest(width = contains("Width"), length = contains("Length"))
Species | width | length | |
---|---|---|---|
<object> | <object> | <object> | |
0 | setosa | <DF 50x2> | <DF 50x2> |
1 | versicolor | <DF 50x2> | <DF 50x2> |
2 | virginica | <DF 50x2> | <DF 50x2> |
# On grouped data, nest() with no arguments produces a "data" list-column
# with one nested frame per group; the result stays grouped by fish.
fish_encounters >> group_by(f.fish) >> nest()
fish | data | |
---|---|---|
<int64> | <object> | |
0 | 4842 | <DF 11x2> |
1 | 4843 | <DF 11x2> |
2 | 4844 | <DF 11x2> |
3 | 4845 | <DF 5x2> |
4 | 4847 | <DF 3x2> |
5 | 4848 | <DF 4x2> |
6 | 4849 | <DF 2x2> |
7 | 4850 | <DF 6x2> |
8 | 4851 | <DF 2x2> |
9 | 4854 | <DF 2x2> |
10 | 4855 | <DF 5x2> |
11 | 4857 | <DF 9x2> |
12 | 4858 | <DF 11x2> |
13 | 4859 | <DF 5x2> |
14 | 4861 | <DF 11x2> |
15 | 4862 | <DF 9x2> |
16 | 4863 | <DF 2x2> |
17 | 4864 | <DF 2x2> |
18 | 4865 | <DF 3x2> |
TibbleGrouped: fish (n=19)
from pipda import register_func
@register_func()
def get_models(dfs):
    """Label every nested data frame in ``dfs`` as ``<df ROWSxCOLS>``.

    Placeholder for real per-group work (e.g. fitting a model): it is
    called inside ``mutate()`` with the nested ``data`` column and returns
    whatever ``dfs.transform`` produces for the labelling helper.
    """

    def _shape_label(df):
        # The nested frame is the first value of the item handed over by
        # transform(); its shape supplies the row and column counts.
        dims = df.values[0].shape
        return f"<df {dims[0]}x{dims[1]}>"

    return dfs.transform(_shape_label)
# Nest mtcars per cyl group, then add a "models" column computed from each
# nested data frame by get_models().
mtcars >> group_by(f.cyl) >> nest() >> mutate(
models=get_models(f.data)
)
cyl | data | models | |
---|---|---|---|
<int64> | <object> | <object> | |
0 | 6 | <DF 7x10> | <df 7x10> |
1 | 4 | <DF 11x10> | <df 11x10> |
2 | 8 | <DF 14x10> | <df 14x10> |
TibbleGrouped: cyl (n=3)
# y is a list-column holding NULL, a one-row tibble and a multi-row tibble.
df = tibble(
x = c[1:4],
y = [
NULL,
tibble(a = 1, b = 2),
tibble(a = c[1:4], b = c[4:1])
]
)
# The NULL element is size-0, so its row (x == 1) is dropped from the output;
# dtypes=int sets the dtype of the unnested columns.
df >> unnest(f.y, dtypes=int)
x | a | b | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 2 | 1 | 2 |
1 | 3 | 1 | 4 |
2 | 3 | 2 | 3 |
3 | 3 | 3 | 2 |
# keep_empty=True preserves the row whose y is NULL, filling a and b with NaN
# (which is why both columns come out as float64).
df >> unnest(f.y, keep_empty=True)
x | a | b | |
---|---|---|---|
<int64> | <float64> | <float64> | |
0 | 1 | NaN | NaN |
1 | 2 | 1.0 | 2.0 |
2 | 3 | 1.0 | 4.0 |
3 | 3 | 2.0 | 3.0 |
4 | 3 | 3.0 | 2.0 |
# a and b are list-columns whose elements have matching lengths in each row.
df = tibble(
a = [c("a", "b"), "c"],
b = [[1,2], 3],
c = c(11, 22)
)
# Unnesting a and b in one call pairs their elements position by position.
df >> unnest(c(f.a, f.b))
a | b | c | |
---|---|---|---|
<object> | <int64> | <int64> | |
0 | a | 1 | 11 |
1 | b | 2 | 11 |
2 | c | 3 | 22 |
# Unnesting one column at a time instead yields every a/b combination
# within each original row.
df >> unnest(f.a) >> unnest(f.b)
a | b | c | |
---|---|---|---|
<object> | <int64> | <int64> | |
0 | a | 1 | 11 |
1 | a | 2 | 11 |
2 | b | 1 | 11 |
3 | b | 2 | 11 |
4 | c | 3 | 22 |