expand
# https://tidyr.tidyverse.org/reference/expand.html
# IPython magic: run the helper script in this notebook's namespace.
# NOTE(review): `nb_header` is presumably defined in nb_helpers.py -- confirm.
%run nb_helpers.py
# Star import brings the datar verbs used below (tibble, expand, f, c, ...) into scope.
from datar.all import *
# NOTE(review): appears to render the API docs of the listed functions
# (the "expand", "nesting" and "crossing" sections that follow) -- confirm.
nb_header(expand, nesting, crossing)
★ expand¶
Generates all combinations of variables found in a dataset.¶
Args:¶
data
: A data frame
*args, **kwargs
: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z)
.
- To find only the combinations that occur in the data, use
nesting
: expand(df, nesting(x, y, z))
.
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date)
would
produce a row for each present school-student combination
for all possible dates.
_name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with all combinations of variables.
★ nesting¶
A helper that only finds combinations already present in the data.¶
Args:¶
*args, **kwargs
: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z)
.
- To find only the combinations that occur in the data, use
nesting
: expand(df, nesting(x, y, z))
.
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date)
would
produce a row for each present school-student combination
for all possible dates.
_name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with all combinations in data.
★ crossing¶
A wrapper around expand_grid()
that de-duplicates and sorts its inputs¶
When values are not specified by a literal list, they will be sorted.
Args:¶
*args, **kwargs
: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z)
.
- To find only the combinations that occur in the data, use
nesting
: expand(df, nesting(x, y, z))
.
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date)
would
produce a row for each present school-student combination
for all possible dates.
_name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with values deduplicated and sorted.
# Example dataset used by every cell below: six observations of fruit
# type, year and size, plus a random weight per row.
fruits = tibble(
    type=c("apple", "orange", "apple", "orange", "orange", "orange"),
    year=c(2010, 2010, 2012, 2010, 2010, 2012),
    # `size` is a factor whose levels include "L", which never occurs
    # in the values themselves.
    size=factor(
        c("XS", "S", "M", "S", "S", "M"),
        levels=c("XS", "S", "M", "L"),
    ),
    # Random draws, so the printed weights differ from run to run.
    weights=rnorm(6),
)
fruits
type | year | size | weights | |
---|---|---|---|---|
<object> | <int64> | <category> | <float64> | |
0 | apple | 2010 | XS | 2.522472 |
1 | orange | 2010 | S | 0.206341 |
2 | apple | 2012 | M | -0.667409 |
3 | orange | 2010 | S | 0.887561 |
4 | orange | 2010 | S | -1.317738 |
5 | orange | 2012 | M | -0.228718 |
# One column: expand() returns each distinct value of `type` once.
fruits >> expand(f.type)
type | |
---|---|
<object> | |
0 | apple |
1 | orange |
# Two columns: every type x size pairing, including the factor level "L"
# that is declared in `levels` but never observed in the data.
fruits >> expand(f.type, f.size)
type | size | |
---|---|---|
<object> | <category> | |
0 | apple | XS |
1 | apple | S |
2 | apple | M |
3 | apple | L |
4 | orange | XS |
5 | orange | S |
6 | orange | M |
7 | orange | L |
# Three columns: full cross of type, size and year. `year` is not a factor,
# so only the observed values (2010 and 2012) take part.
fruits >> expand(f.type, f.size, f.year)
type | size | year | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | apple | XS | 2012 |
2 | apple | S | 2010 |
3 | apple | S | 2012 |
4 | apple | M | 2010 |
5 | apple | M | 2012 |
6 | apple | L | 2010 |
7 | apple | L | 2012 |
8 | orange | XS | 2010 |
9 | orange | XS | 2012 |
10 | orange | S | 2010 |
11 | orange | S | 2012 |
12 | orange | M | 2010 |
13 | orange | M | 2012 |
14 | orange | L | 2010 |
15 | orange | L | 2012 |
# nesting() restricts the result to values that occur in the data.
fruits >> expand(nesting(f.type))
type | |
---|---|
<object> | |
0 | apple |
1 | orange |
# Only the type/size pairs actually present in `fruits` (no "L" rows).
fruits >> expand(nesting(f.type, f.size))
type | size | |
---|---|---|
<object> | <category> | |
0 | apple | XS |
1 | orange | S |
2 | apple | M |
3 | orange | M |
# Only the type/size/year triples actually present in `fruits`.
fruits >> expand(nesting(f.type, f.size, f.year))
type | size | year | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | orange | S | 2010 |
2 | apple | M | 2012 |
3 | orange | M | 2012 |
# full_seq(f.year, 1) fills the gap in the observed years, so 2011 appears too.
# The unnamed expression gets an auto-generated column name (`_VAR_2`).
fruits >> expand(f.type, f.size, full_seq(f.year, 1))
type | size | _VAR_2 | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | apple | XS | 2011 |
2 | apple | XS | 2012 |
3 | apple | S | 2010 |
4 | apple | S | 2011 |
5 | apple | S | 2012 |
6 | apple | M | 2010 |
7 | apple | M | 2011 |
8 | apple | M | 2012 |
9 | apple | L | 2010 |
10 | apple | L | 2011 |
11 | apple | L | 2012 |
12 | orange | XS | 2010 |
13 | orange | XS | 2011 |
14 | orange | XS | 2012 |
15 | orange | S | 2010 |
16 | orange | S | 2011 |
17 | orange | S | 2012 |
18 | orange | M | 2010 |
19 | orange | M | 2011 |
20 | orange | M | 2012 |
21 | orange | L | 2010 |
22 | orange | L | 2011 |
23 | orange | L | 2012 |
# Same grid, with the years given explicitly; the column is again auto-named.
fruits >> expand(f.type, f.size, seq(2010, 2012))
type | size | _VAR_2 | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | apple | XS | 2011 |
2 | apple | XS | 2012 |
3 | apple | S | 2010 |
4 | apple | S | 2011 |
5 | apple | S | 2012 |
6 | apple | M | 2010 |
7 | apple | M | 2011 |
8 | apple | M | 2012 |
9 | apple | L | 2010 |
10 | apple | L | 2011 |
11 | apple | L | 2012 |
12 | orange | XS | 2010 |
13 | orange | XS | 2011 |
14 | orange | XS | 2012 |
15 | orange | S | 2010 |
16 | orange | S | 2011 |
17 | orange | S | 2012 |
18 | orange | M | 2010 |
19 | orange | M | 2011 |
20 | orange | M | 2012 |
21 | orange | L | 2010 |
22 | orange | L | 2011 |
23 | orange | L | 2012 |
# Passing the sequence as a keyword argument names the resulting column `year`.
fruits >> expand(f.type, f.size, year=seq(2010, 2012))
type | size | year | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | apple | XS | 2011 |
2 | apple | XS | 2012 |
3 | apple | S | 2010 |
4 | apple | S | 2011 |
5 | apple | S | 2012 |
6 | apple | M | 2010 |
7 | apple | M | 2011 |
8 | apple | M | 2012 |
9 | apple | L | 2010 |
10 | apple | L | 2011 |
11 | apple | L | 2012 |
12 | orange | XS | 2010 |
13 | orange | XS | 2011 |
14 | orange | XS | 2012 |
15 | orange | S | 2010 |
16 | orange | S | 2011 |
17 | orange | S | 2012 |
18 | orange | M | 2010 |
19 | orange | M | 2011 |
20 | orange | M | 2012 |
21 | orange | L | 2010 |
22 | orange | L | 2011 |
23 | orange | L | 2012 |
# Keep the full type x size x year grid for the joins in the following cells.
# NOTE(review): the name `all` shadows Python's builtin `all`; it is left
# unchanged here because the later cells reference it by this name.
all = fruits >> expand(f.type, f.size, f.year)
all
type | size | year | |
---|---|---|---|
<object> | <category> | <int64> | |
0 | apple | XS | 2010 |
1 | apple | XS | 2012 |
2 | apple | S | 2010 |
3 | apple | S | 2012 |
4 | apple | M | 2010 |
5 | apple | M | 2012 |
6 | apple | L | 2010 |
7 | apple | L | 2012 |
8 | orange | XS | 2010 |
9 | orange | XS | 2012 |
10 | orange | S | 2010 |
11 | orange | S | 2012 |
12 | orange | M | 2010 |
13 | orange | M | 2012 |
14 | orange | L | 2010 |
15 | orange | L | 2012 |
# Rows of the full grid with no match in `fruits`, i.e. the
# type/size/year combinations that are missing from the data.
all >> anti_join(fruits)
type | size | year | |
---|---|---|---|
<object> | <category> | <int64> | |
1 | apple | XS | 2012 |
2 | apple | S | 2010 |
3 | apple | S | 2012 |
4 | apple | M | 2010 |
6 | apple | L | 2010 |
7 | apple | L | 2012 |
8 | orange | XS | 2010 |
9 | orange | XS | 2012 |
13 | orange | S | 2012 |
14 | orange | M | 2010 |
16 | orange | L | 2010 |
17 | orange | L | 2012 |
# Right join onto the full grid: every combination is kept, and `weights`
# is NaN wherever `fruits` has no observation for that combination.
fruits >> right_join(all)
type | year | size | weights | |
---|---|---|---|---|
<object> | <int64> | <category> | <float64> | |
0 | apple | 2010 | XS | 2.522472 |
1 | apple | 2012 | XS | NaN |
2 | apple | 2010 | S | NaN |
3 | apple | 2012 | S | NaN |
4 | apple | 2010 | M | NaN |
5 | apple | 2012 | M | -0.667409 |
6 | apple | 2010 | L | NaN |
7 | apple | 2012 | L | NaN |
8 | orange | 2010 | XS | NaN |
9 | orange | 2012 | XS | NaN |
10 | orange | 2010 | S | 0.206341 |
11 | orange | 2010 | S | 0.887561 |
12 | orange | 2010 | S | -1.317738 |
13 | orange | 2012 | S | NaN |
14 | orange | 2010 | M | NaN |
15 | orange | 2012 | M | -0.228718 |
16 | orange | 2010 | L | NaN |
17 | orange | 2012 | L | NaN |