tibble
# https://tibble.tidyverse.org/reference/tibble.html
# https://tibble.tidyverse.org/reference/tribble.html
%run nb_helpers.py
from datar import f
from datar.tibble import tibble, tibble_row, tribble
from datar.base import diag, runif
from datar.dplyr import mutate
nb_header(tibble, tibble_row, tribble)
★ tibble¶
Constructs a data frame¶
Args:¶
*args
: and
**kwargs
: A set of name-value pairs.
_name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
_rows
: Number of rows of a 0-col dataframe when args and kwargs are
not provided. When args or kwargs are provided, this is ignored.
_dtypes
: The dtypes to convert each column to.
_drop_index
: Whether to drop the index of the final data frame
_index
: The new index of the output frame
Returns:¶
A constructed tibble
★ tibble_row¶
Constructs a data frame that is guaranteed to occupy one row.¶
Scalar values will be wrapped with []
Args:¶
*args
: and
**kwargs
: A set of name-value pairs.
_name_repair
: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,
- "unique": Make sure names are unique and not empty,
- "check_unique": (default value), no name repair, but check they are unique,
- "universal": Make the names unique and syntactic
- a function: apply custom name repair
Returns:¶
A constructed dataframe
★ tribble¶
Create dataframe using an easier to read row-by-row layout¶
Unlike the original R API, which uses a formula (~col) to indicate the column
names, we use f.col to indicate them.
Args:¶
*dummies
: Arguments specifying the structure of a dataframe
Variable names should be specified with f.name
_dtypes
: The dtypes to convert each column to.
Returns:¶
A dataframe
a = range(5)
tibble(a=a, b=f.a*2)
a | b | |
---|---|---|
<int64> | <int64> | |
0 | 0 | 0 |
1 | 1 | 2 |
2 | 2 | 4 |
3 | 3 | 6 |
4 | 4 | 8 |
tibble(a=a, b=f.a * 2, c=1)
a | b | c | |
---|---|---|---|
<int64> | <int64> | <int64> | |
0 | 0 | 0 | 1 |
1 | 1 | 2 | 1 |
2 | 2 | 4 | 1 |
3 | 3 | 6 | 1 |
4 | 4 | 8 | 1 |
tibble(x=runif(10), y=f.x*2)
x | y | |
---|---|---|
<float64> | <float64> | |
0 | 0.639511 | 1.279022 |
1 | 0.573888 | 1.147776 |
2 | 0.123471 | 0.246943 |
3 | 0.807206 | 1.614412 |
4 | 0.159120 | 0.318241 |
5 | 0.893697 | 1.787394 |
6 | 0.897584 | 1.795168 |
7 | 0.159780 | 0.319559 |
8 | 0.919717 | 1.839433 |
9 | 0.304561 | 0.609122 |
x = 1
with try_catch():
tibble(x, x)
[NameNonUniqueError] Names must be unique: 1
tibble(x, x, _name_repair="unique")
[2022-12-02 14:48:49][datar][WARNING] New names: [2022-12-02 14:48:49][datar][WARNING] * '1' -> '1__0' [2022-12-02 14:48:49][datar][WARNING] * '1' -> '1__1'
1__0 | 1__1 | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
tibble(x, x, _name_repair="minimal") # duplicated columns allowed
1 | 1 | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
a = 1
tibble(a * 1, a * 2, _name_repair="universal")
[2022-12-02 14:49:00][datar][WARNING] New names: [2022-12-02 14:49:00][datar][WARNING] * '1' -> '_1' [2022-12-02 14:49:00][datar][WARNING] * '2' -> '_2'
_1 | _2 | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 2 |
from typing import Iterable
# Annotate the parameter to tell the function receives all the names at once,
# not just a single name
def make_unique(names: Iterable[str]):
    """Return a copy of ``names`` in which every entry is unique.

    The first occurrence of a name is kept unchanged; each later repeat is
    renamed to ``<name>_<k>``, where ``k`` is the smallest positive integer
    that yields a name not produced so far.

    The previous implementation counted occurrences in the *output* list,
    so a third duplicate received the same suffix as the second
    (``['a', 'a', 'a']`` became ``['a', 'a_1', 'a_1']``).

    Args:
        names: All column names, in order. The ``Iterable`` annotation is
            deliberate: it marks this function as taking the whole
            collection of names rather than one name at a time.

    Returns:
        A list with one unique name per input name, in the same order.
    """
    taken = set()        # every name emitted so far
    last_suffix = {}     # base name -> highest suffix already tried for it
    unique_names = []
    for name in names:
        candidate = name
        suffix = last_suffix.get(name, 0)
        # Advance the suffix until the candidate collides with nothing,
        # including names that were already present in the input.
        while candidate in taken:
            suffix += 1
            candidate = f'{name}_{suffix}'
        last_suffix[name] = suffix
        taken.add(candidate)
        unique_names.append(candidate)
    return unique_names
tibble(a, a, _name_repair=make_unique)
1 | 1_1 | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
# If no annotation is specified,
# the function is assumed to take a single name
def fix_names(name):
    """Replace each run of whitespace in ``name`` with a single underscore.

    The parameter is intentionally left without a type annotation: this
    function is meant to be applied to one name at a time.
    """
    import re

    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('_', name)
tibble(a + 1, a + 2, _name_repair = fix_names)
2 | 3 | |
---|---|---|
<int64> | <int64> | |
0 | 2 | 3 |
tibble(x, x, _name_repair=["a", "b"])
a | b | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
tibble(
tibble(
b = [4,5,6],
c = [7,8,9]
),
a = range(3),
d = f.b
)
b | c | a | d | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
0 | 4 | 7 | 0 | 4 |
1 | 5 | 8 | 1 | 5 |
2 | 6 | 9 | 2 | 6 |
s = tibble(diag(1, 4))
t = tibble(s.iloc[:, :2], _name_repair=['x', 'y'])
tibble(
a=range(4),
b=s,
c=t
)
a | b$0 | b$1 | b$2 | b$3 | c$x$0 | c$x$1 | |
---|---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | |
0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
2 | 2 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 3 | 0 | 0 | 0 | 1 | 0 | 0 |
with try_catch():
tibble(a=range(3), b=range(4))
[ValueError] `b` must be size [1 3], not 4.
tibble(_dotted = 3, _name_repair=lambda x: x.replace('_', '.'))
.dotted | |
---|---|
<int64> | |
0 | 3 |
x = 3
tibble(x=1, y=f.x)
x | y | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
tibble(x=1, y=x)
x | y | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 3 |
tribble(
f.colA, f.colB,
"a", 1,
"b", 2,
"c", 3
)
colA | colB | |
---|---|---|
<object> | <int64> | |
0 | a | 1 |
1 | b | 2 |
2 | c | 3 |
tribble(
f.x, f.y,
"a", [1,2,3],
"b", [4,5,6]
)
x | y | |
---|---|---|
<object> | <object> | |
0 | a | [1, 2, 3] |
1 | b | [4, 5, 6] |
tibble_row(a=1, b=[[2,3]])
a | b | |
---|---|---|
<int64> | <object> | |
0 | 1 | [2, 3] |
# inside a verb
tibble(x=1) >> mutate(y=tibble(y=f.x))
x | y$y | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |