expand
# https://tidyr.tidyverse.org/reference/expand.html
%run nb_helpers.py
from datar.all import *
# Appears to render the API docs for the three verbs demonstrated in this
# notebook; nb_header presumably comes from nb_helpers.py -- confirm.
nb_header(expand, nesting, crossing)
★ expand¶
Generates all combinations of variables found in a dataset.¶
Args:¶
data: A data frame
*args, **kwargs: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z).
- To find only the combinations that occur in the data, use
nesting: expand(df, nesting(x, y, z)).
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date) would
produce a row for each present school-student combination
for all possible dates.
_name_repair: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with all combinations of variables.
★ nesting¶
A helper that only finds combinations already present in the data.¶
Args:¶
*args, **kwargs: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z).
- To find only the combinations that occur in the data, use
nesting: expand(df, nesting(x, y, z)).
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date) would
produce a row for each present school-student combination
for all possible dates.
_name_repair: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with all combinations in data.
★ crossing¶
A wrapper around expand_grid() that de-duplicates and sorts its inputs¶
Values are sorted unless they are given as a literal list.
Args:¶
*args, **kwargs: columns to expand. Columns can be atomic lists.
- To find all unique combinations of x, y and z, including
those not present in the data, supply each variable as a
separate argument: expand(df, x, y, z).
- To find only the combinations that occur in the data, use
nesting: expand(df, nesting(x, y, z)).
- You can combine the two forms. For example,
expand(df, nesting(school_id, student_id), date) would
produce a row for each present school-student combination
for all possible dates.
_name_repair: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A data frame with values deduplicated and sorted.
# Example data: six fruit observations. Rows 1, 3 and 4 share the same
# type/year/size, so there are only four distinct combinations.
fruits = tibble(
type = c("apple", "orange", "apple", "orange", "orange", "orange"),
year = c(2010, 2010, 2012, 2010, 2010, 2012),
size = factor(
c("XS", "S", "M", "S", "S", "M"),
# "L" is declared as a level but never occurs in the values above.
levels = c("XS", "S", "M", "L")
),
# Presumably random normal draws; no seed is set in this cell, so the
# weights shown in the output will differ between runs.
weights = rnorm(6)
)
# Display the frame.
fruits
| type | year | size | weights | |
|---|---|---|---|---|
| <object> | <int64> | <category> | <float64> | |
| 0 | apple | 2010 | XS | 2.522472 |
| 1 | orange | 2010 | S | 0.206341 |
| 2 | apple | 2012 | M | -0.667409 |
| 3 | orange | 2010 | S | 0.887561 |
| 4 | orange | 2010 | S | -1.317738 |
| 5 | orange | 2012 | M | -0.228718 |
# Single column: the distinct values of `type`.
fruits >> expand(f.type)
| type | |
|---|---|
| <object> | |
| 0 | apple |
| 1 | orange |
# Every type x size combination. The factor column contributes all of its
# declared levels, so the unobserved "L" appears in the result.
fruits >> expand(f.type, f.size)
| type | size | |
|---|---|---|
| <object> | <category> | |
| 0 | apple | XS |
| 1 | apple | S |
| 2 | apple | M |
| 3 | apple | L |
| 4 | orange | XS |
| 5 | orange | S |
| 6 | orange | M |
| 7 | orange | L |
# Three-way cross: 2 types x 4 size levels x 2 years = 16 rows, including
# combinations that never occur in `fruits`.
fruits >> expand(f.type, f.size, f.year)
| type | size | year | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | apple | XS | 2012 |
| 2 | apple | S | 2010 |
| 3 | apple | S | 2012 |
| 4 | apple | M | 2010 |
| 5 | apple | M | 2012 |
| 6 | apple | L | 2010 |
| 7 | apple | L | 2012 |
| 8 | orange | XS | 2010 |
| 9 | orange | XS | 2012 |
| 10 | orange | S | 2010 |
| 11 | orange | S | 2012 |
| 12 | orange | M | 2010 |
| 13 | orange | M | 2012 |
| 14 | orange | L | 2010 |
| 15 | orange | L | 2012 |
# With a single column, nesting() gives the same distinct values as
# expand(f.type) does here.
fruits >> expand(nesting(f.type))
| type | |
|---|---|
| <object> | |
| 0 | apple |
| 1 | orange |
# nesting() keeps only the type/size pairs that occur in `fruits`
# (4 rows, versus 8 for the full cross).
fruits >> expand(nesting(f.type, f.size))
| type | size | |
|---|---|---|
| <object> | <category> | |
| 0 | apple | XS |
| 1 | orange | S |
| 2 | apple | M |
| 3 | orange | M |
# Same idea with three columns: only the observed type/size/year triples.
fruits >> expand(nesting(f.type, f.size, f.year))
| type | size | year | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | orange | S | 2010 |
| 2 | apple | M | 2012 |
| 3 | orange | M | 2012 |
# full_seq(f.year, 1) fills in the missing year 2011. The expression is
# passed positionally without a name, so the resulting column is
# auto-named `_VAR_2` in the output.
fruits >> expand(f.type, f.size, full_seq(f.year, 1))
| type | size | _VAR_2 | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | apple | XS | 2011 |
| 2 | apple | XS | 2012 |
| 3 | apple | S | 2010 |
| 4 | apple | S | 2011 |
| 5 | apple | S | 2012 |
| 6 | apple | M | 2010 |
| 7 | apple | M | 2011 |
| 8 | apple | M | 2012 |
| 9 | apple | L | 2010 |
| 10 | apple | L | 2011 |
| 11 | apple | L | 2012 |
| 12 | orange | XS | 2010 |
| 13 | orange | XS | 2011 |
| 14 | orange | XS | 2012 |
| 15 | orange | S | 2010 |
| 16 | orange | S | 2011 |
| 17 | orange | S | 2012 |
| 18 | orange | M | 2010 |
| 19 | orange | M | 2011 |
| 20 | orange | M | 2012 |
| 21 | orange | L | 2010 |
| 22 | orange | L | 2011 |
| 23 | orange | L | 2012 |
# An explicit seq(2010, 2012) gives the same years; it is still unnamed,
# hence the `_VAR_2` column again.
fruits >> expand(f.type, f.size, seq(2010, 2012))
| type | size | _VAR_2 | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | apple | XS | 2011 |
| 2 | apple | XS | 2012 |
| 3 | apple | S | 2010 |
| 4 | apple | S | 2011 |
| 5 | apple | S | 2012 |
| 6 | apple | M | 2010 |
| 7 | apple | M | 2011 |
| 8 | apple | M | 2012 |
| 9 | apple | L | 2010 |
| 10 | apple | L | 2011 |
| 11 | apple | L | 2012 |
| 12 | orange | XS | 2010 |
| 13 | orange | XS | 2011 |
| 14 | orange | XS | 2012 |
| 15 | orange | S | 2010 |
| 16 | orange | S | 2011 |
| 17 | orange | S | 2012 |
| 18 | orange | M | 2010 |
| 19 | orange | M | 2011 |
| 20 | orange | M | 2012 |
| 21 | orange | L | 2010 |
| 22 | orange | L | 2011 |
| 23 | orange | L | 2012 |
# Passing the sequence as a keyword argument names the resulting column
# `year` instead of `_VAR_2`.
fruits >> expand(f.type, f.size, year=seq(2010, 2012))
| type | size | year | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | apple | XS | 2011 |
| 2 | apple | XS | 2012 |
| 3 | apple | S | 2010 |
| 4 | apple | S | 2011 |
| 5 | apple | S | 2012 |
| 6 | apple | M | 2010 |
| 7 | apple | M | 2011 |
| 8 | apple | M | 2012 |
| 9 | apple | L | 2010 |
| 10 | apple | L | 2011 |
| 11 | apple | L | 2012 |
| 12 | orange | XS | 2010 |
| 13 | orange | XS | 2011 |
| 14 | orange | XS | 2012 |
| 15 | orange | S | 2010 |
| 16 | orange | S | 2011 |
| 17 | orange | S | 2012 |
| 18 | orange | M | 2010 |
| 19 | orange | M | 2011 |
| 20 | orange | M | 2012 |
| 21 | orange | L | 2010 |
| 22 | orange | L | 2011 |
| 23 | orange | L | 2012 |
# Keep the full type x size x year grid for the join examples that follow.
# NOTE(review): this rebinds `all`, hiding Python's built-in all() for the
# rest of the session. A name such as `all_combos` would avoid that, but the
# later cells refer to `all`, so any rename must be done across all of them.
all = fruits >> expand(f.type, f.size, f.year)
all
| type | size | year | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 0 | apple | XS | 2010 |
| 1 | apple | XS | 2012 |
| 2 | apple | S | 2010 |
| 3 | apple | S | 2012 |
| 4 | apple | M | 2010 |
| 5 | apple | M | 2012 |
| 6 | apple | L | 2010 |
| 7 | apple | L | 2012 |
| 8 | orange | XS | 2010 |
| 9 | orange | XS | 2012 |
| 10 | orange | S | 2010 |
| 11 | orange | S | 2012 |
| 12 | orange | M | 2010 |
| 13 | orange | M | 2012 |
| 14 | orange | L | 2010 |
| 15 | orange | L | 2012 |
# Grid rows that have no matching row in `fruits`, i.e. the combinations
# missing from the data. No `by` is given; presumably the join uses the
# columns the two frames share -- confirm.
all >> anti_join(fruits)
| type | size | year | |
|---|---|---|---|
| <object> | <category> | <int64> | |
| 1 | apple | XS | 2012 |
| 2 | apple | S | 2010 |
| 3 | apple | S | 2012 |
| 4 | apple | M | 2010 |
| 6 | apple | L | 2010 |
| 7 | apple | L | 2012 |
| 8 | orange | XS | 2010 |
| 9 | orange | XS | 2012 |
| 13 | orange | S | 2012 |
| 14 | orange | M | 2010 |
| 16 | orange | L | 2010 |
| 17 | orange | L | 2012 |
# Right join onto the grid keeps every combination; `weights` is NaN for
# combinations with no observation in `fruits`. Duplicate observations
# (orange / 2010 / S) each get their own row.
fruits >> right_join(all)
| type | year | size | weights | |
|---|---|---|---|---|
| <object> | <int64> | <category> | <float64> | |
| 0 | apple | 2010 | XS | 2.522472 |
| 1 | apple | 2012 | XS | NaN |
| 2 | apple | 2010 | S | NaN |
| 3 | apple | 2012 | S | NaN |
| 4 | apple | 2010 | M | NaN |
| 5 | apple | 2012 | M | -0.667409 |
| 6 | apple | 2010 | L | NaN |
| 7 | apple | 2012 | L | NaN |
| 8 | orange | 2010 | XS | NaN |
| 9 | orange | 2012 | XS | NaN |
| 10 | orange | 2010 | S | 0.206341 |
| 11 | orange | 2010 | S | 0.887561 |
| 12 | orange | 2010 | S | -1.317738 |
| 13 | orange | 2012 | S | NaN |
| 14 | orange | 2010 | M | NaN |
| 15 | orange | 2012 | M | -0.228718 |
| 16 | orange | 2010 | L | NaN |
| 17 | orange | 2012 | L | NaN |