distinct
In [1]:
Copied!
# https://dplyr.tidyverse.org/reference/distinct.html
# Run the local helper script; nb_header (called below) is presumably defined there -- TODO confirm.
%run nb_helpers.py
# starwars is the example dataset used in a later cell.
from datar.data import starwars
# Star import: presumably the source of tibble, sample, nrow, distinct, f, etc. used below -- TODO confirm.
from datar.all import *
# Presumably renders the binder link and the API docs for distinct/n_distinct shown right after this cell.
nb_header(distinct, n_distinct, book='distinct')
# https://dplyr.tidyverse.org/reference/distinct.html
# Run the local helper script; nb_header (called below) is presumably defined there -- TODO confirm.
%run nb_helpers.py
# starwars is the example dataset used in a later cell.
from datar.data import starwars
# Star import: presumably the source of tibble, sample, nrow, distinct, f, etc. used below -- TODO confirm.
from datar.all import *
# Presumably renders the binder link and the API docs for distinct/n_distinct shown right after this cell.
nb_header(distinct, n_distinct, book='distinct')
Try this notebook on binder.
★ distinct¶
Select only unique/distinct rows from a data frame¶
The original API:
https://dplyr.tidyverse.org/reference/distinct.html
Args:¶
_data
: A data frame
*args
: Variables (or named expressions) used to determine uniqueness.
_keep_all
: If True
, keep all columns of _data; when several rows share the same values of the selected variables, the first such row is kept.
_preserve
: If True
, keep grouping variables even if they are not used.
Returns:¶
A data frame containing only the distinct rows
★ n_distinct¶
Count the number of distinct values¶
The original API:
https://dplyr.tidyverse.org/reference/distinct.html
Args:¶
_data
: A data frame
na_rm
: If True
, remove missing values before counting.
Returns:¶
The number of distinct values
In [2]:
Copied!
# Build a two-column frame (x, y); each column holds 100 values sampled
# with replacement from range(10), so duplicate rows are expected.
# NOTE(review): no seed is set in the visible cells, so the outputs shown
# below presumably differ between runs -- confirm.
df = tibble(
x=sample(range(10), 100, replace=True),
y=sample(range(10), 100, replace=True)
)
# Row count before de-duplication.
nrow(df)
# Build a two-column frame (x, y); each column holds 100 values sampled
# with replacement from range(10), so duplicate rows are expected.
# NOTE(review): no seed is set in the visible cells, so the outputs shown
# below presumably differ between runs -- confirm.
df = tibble(
x=sample(range(10), 100, replace=True),
y=sample(range(10), 100, replace=True)
)
# Row count before de-duplication.
nrow(df)
Out[2]:
100
In [3]:
Copied!
# Direct (non-piped) call: count the rows left after distinct() with no columns given.
nrow(distinct(df))
# Direct (non-piped) call: count the rows left after distinct() with no columns given.
nrow(distinct(df))
Out[3]:
59
In [4]:
Copied!
# Piped form (>>) with both columns named explicitly, then counted.
df >> distinct(f.x, f.y) >> nrow()
# Piped form (>>) with both columns named explicitly, then counted.
df >> distinct(f.x, f.y) >> nrow()
Out[4]:
59
In [5]:
Copied!
# Distinct values of x only; the displayed result contains just the x column.
df >> distinct(f.x)
# Distinct values of x only; the displayed result contains just the x column.
df >> distinct(f.x)
Out[5]:
x | |
---|---|
<int64> | |
0 | 4 |
1 | 6 |
2 | 1 |
3 | 8 |
4 | 5 |
6 | 9 |
14 | 2 |
19 | 7 |
22 | 0 |
38 | 3 |
In [6]:
Copied!
# Distinct values of y only; the displayed result contains just the y column.
df >> distinct(f.y)
# Distinct values of y only; the displayed result contains just the y column.
df >> distinct(f.y)
Out[6]:
y | |
---|---|
<int64> | |
0 | 4 |
1 | 3 |
2 | 1 |
3 | 7 |
4 | 8 |
6 | 6 |
10 | 9 |
13 | 0 |
16 | 5 |
18 | 2 |
In [7]:
Copied!
# De-duplicate on x, but with _keep_all=True the y column is retained in the result.
df >> distinct(f.x, _keep_all=True)
# De-duplicate on x, but with _keep_all=True the y column is retained in the result.
df >> distinct(f.x, _keep_all=True)
Out[7]:
x | y | |
---|---|---|
<int64> | <int64> | |
0 | 4 | 4 |
1 | 6 | 3 |
2 | 1 | 1 |
3 | 8 | 7 |
4 | 5 | 8 |
6 | 9 | 6 |
14 | 2 | 0 |
19 | 7 | 8 |
22 | 0 | 6 |
38 | 3 | 0 |
In [8]:
Copied!
# De-duplicate on y, but with _keep_all=True the x column is retained in the result.
df >> distinct(f.y, _keep_all=True)
# De-duplicate on y, but with _keep_all=True the x column is retained in the result.
df >> distinct(f.y, _keep_all=True)
Out[8]:
x | y | |
---|---|---|
<int64> | <int64> | |
0 | 4 | 4 |
1 | 6 | 3 |
2 | 1 | 1 |
3 | 8 | 7 |
4 | 5 | 8 |
6 | 9 | 6 |
10 | 1 | 9 |
13 | 6 | 0 |
16 | 4 | 5 |
18 | 6 | 2 |
In [9]:
Copied!
# Distinct values of a computed column: the absolute difference of x and y, named "diff".
df >> distinct(diff=abs(f.x-f.y))
# Distinct values of a computed column: the absolute difference of x and y, named "diff".
df >> distinct(diff=abs(f.x-f.y))
Out[9]:
diff | |
---|---|
<int64> | |
0 | 0 |
1 | 3 |
3 | 1 |
8 | 5 |
10 | 8 |
13 | 6 |
14 | 2 |
18 | 4 |
36 | 7 |
89 | 9 |
In [10]:
Copied!
# Distinct combinations over the starwars columns whose names contain "color"
# (hair_color, skin_color, eye_color in the output).
starwars >> distinct(across(contains("color")))
# Distinct combinations over the starwars columns whose names contain "color"
# (hair_color, skin_color, eye_color in the output).
starwars >> distinct(across(contains("color")))
Out[10]:
hair_color | skin_color | eye_color | |
---|---|---|---|
<object> | <object> | <object> | |
0 | blond | fair | blue |
1 | NaN | gold | yellow |
2 | NaN | white, blue | red |
3 | none | white | yellow |
4 | brown | light | brown |
... | ... | ... | ... |
79 | none | pale | white |
81 | black | dark | dark |
82 | brown | light | hazel |
84 | none | none | black |
85 | unknown | unknown | unknown |
67 rows × 3 columns
In [11]:
Copied!
# Grouped example: rebinds df to a small frame grouped by g.
df = tibble(
g=[1, 1, 2, 2],
x=[1, 1, 2, 1]
) >> group_by(f.g)
# Only x is requested, yet the grouping column g also appears in the displayed result.
df >> distinct(f.x)
# Grouped example: rebinds df to a small frame grouped by g.
df = tibble(
g=[1, 1, 2, 2],
x=[1, 1, 2, 1]
) >> group_by(f.g)
# Only x is requested, yet the grouping column g also appears in the displayed result.
df >> distinct(f.x)
Out[11]:
g | x | |
---|---|---|
<int64> | <int64> | |
0 | 1 | 1 |
2 | 2 | 2 |
TibbleGrouped: g (n=2)