distinct
In [1]:
Copied!
# Setup cell for the distinct / n_distinct page.
# Upstream reference this page mirrors:
# https://dplyr.tidyverse.org/reference/distinct.html
# nb_helpers.py presumably defines nb_header used below — TODO confirm.
%run nb_helpers.py
from datar.data import starwars
# The star import is presumably what brings f, tibble, sample, nrow, distinct,
# across, contains and group_by into scope for the later cells.
from datar.all import *
nb_header(distinct, n_distinct, book='distinct')
# Setup cell for the distinct / n_distinct page.
# Upstream reference this page mirrors:
# https://dplyr.tidyverse.org/reference/distinct.html
# nb_helpers.py presumably defines nb_header used below — TODO confirm.
%run nb_helpers.py
from datar.data import starwars
# The star import is presumably what brings f, tibble, sample, nrow, distinct,
# across, contains and group_by into scope for the later cells.
from datar.all import *
nb_header(distinct, n_distinct, book='distinct')
Try this notebook on binder.
★ distinct¶
Select only distinct (unique) rows from a data frame¶
The original API:
https://dplyr.tidyverse.org/reference/distinct.html
Args:¶
_data: A data frame
*args: Optional variables to use when determining uniqueness. If omitted, all columns are used.
_keep_all: If True, keep all other columns in the output, not only the variables used to determine uniqueness.
_preserve: If True, keep grouping variables even if they are not used.
Returns:¶
A data frame containing only the distinct rows
★ n_distinct¶
Count the number of distinct values¶
The original API:
https://dplyr.tidyverse.org/reference/distinct.html
Args:¶
_data: A data frame
na_rm: If True, remove missing values before counting.
Returns:¶
The number of distinct values
In [2]:
Copied!
# Build a 100-row frame with two columns, each sampled with replacement
# from 0-9, so repeated (x, y) pairs are likely.
# NOTE(review): no seed is set in the visible cells, so the outputs below
# presumably differ between runs — confirm nb_helpers.py does not seed.
df = tibble(
x=sample(range(10), 100, replace=True),
y=sample(range(10), 100, replace=True)
)
nrow(df)
# Build a 100-row frame with two columns, each sampled with replacement
# from 0-9, so repeated (x, y) pairs are likely.
# NOTE(review): no seed is set in the visible cells, so the outputs below
# presumably differ between runs — confirm nb_helpers.py does not seed.
df = tibble(
x=sample(range(10), 100, replace=True),
y=sample(range(10), 100, replace=True)
)
nrow(df)
Out[2]:
100
In [3]:
Copied!
# Count the distinct rows when no columns are specified (direct-call form).
nrow(distinct(df))
# Count the distinct rows when no columns are specified (direct-call form).
nrow(distinct(df))
Out[3]:
59
In [4]:
Copied!
# Same count using the pipe form, naming both columns explicitly.
df >> distinct(f.x, f.y) >> nrow()
# Same count using the pipe form, naming both columns explicitly.
df >> distinct(f.x, f.y) >> nrow()
Out[4]:
59
In [5]:
Copied!
# Distinct values of x only; y does not appear in the result.
df >> distinct(f.x)
# Distinct values of x only; y does not appear in the result.
df >> distinct(f.x)
Out[5]:
| x | |
|---|---|
| <int64> | |
| 0 | 4 |
| 1 | 6 |
| 2 | 1 |
| 3 | 8 |
| 4 | 5 |
| 6 | 9 |
| 14 | 2 |
| 19 | 7 |
| 22 | 0 |
| 38 | 3 |
In [6]:
Copied!
# Distinct values of y only; x does not appear in the result.
df >> distinct(f.y)
# Distinct values of y only; x does not appear in the result.
df >> distinct(f.y)
Out[6]:
| y | |
|---|---|
| <int64> | |
| 0 | 4 |
| 1 | 3 |
| 2 | 1 |
| 3 | 7 |
| 4 | 8 |
| 6 | 6 |
| 10 | 9 |
| 13 | 0 |
| 16 | 5 |
| 18 | 2 |
In [7]:
Copied!
# _keep_all=True keeps the remaining column (y) alongside each distinct x.
df >> distinct(f.x, _keep_all=True)
# _keep_all=True keeps the remaining column (y) alongside each distinct x.
df >> distinct(f.x, _keep_all=True)
Out[7]:
| x | y | |
|---|---|---|
| <int64> | <int64> | |
| 0 | 4 | 4 |
| 1 | 6 | 3 |
| 2 | 1 | 1 |
| 3 | 8 | 7 |
| 4 | 5 | 8 |
| 6 | 9 | 6 |
| 14 | 2 | 0 |
| 19 | 7 | 8 |
| 22 | 0 | 6 |
| 38 | 3 | 0 |
In [8]:
Copied!
# _keep_all=True keeps the remaining column (x) alongside each distinct y.
df >> distinct(f.y, _keep_all=True)
# _keep_all=True keeps the remaining column (x) alongside each distinct y.
df >> distinct(f.y, _keep_all=True)
Out[8]:
| x | y | |
|---|---|---|
| <int64> | <int64> | |
| 0 | 4 | 4 |
| 1 | 6 | 3 |
| 2 | 1 | 1 |
| 3 | 8 | 7 |
| 4 | 5 | 8 |
| 6 | 9 | 6 |
| 10 | 1 | 9 |
| 13 | 6 | 0 |
| 16 | 4 | 5 |
| 18 | 6 | 2 |
In [9]:
Copied!
# Distinct values of a computed expression; the keyword (diff) names the
# resulting column.
df >> distinct(diff=abs(f.x-f.y))
# Distinct values of a computed expression; the keyword (diff) names the
# resulting column.
df >> distinct(diff=abs(f.x-f.y))
Out[9]:
| diff | |
|---|---|
| <int64> | |
| 0 | 0 |
| 1 | 3 |
| 3 | 1 |
| 8 | 5 |
| 10 | 8 |
| 13 | 6 |
| 14 | 2 |
| 18 | 4 |
| 36 | 7 |
| 89 | 9 |
In [10]:
Copied!
# Distinct combinations over every starwars column whose name contains
# "color" (hair_color, skin_color, eye_color in the output below).
starwars >> distinct(across(contains("color")))
# Distinct combinations over every starwars column whose name contains
# "color" (hair_color, skin_color, eye_color in the output below).
starwars >> distinct(across(contains("color")))
Out[10]:
| hair_color | skin_color | eye_color | |
|---|---|---|---|
| <object> | <object> | <object> | |
| 0 | blond | fair | blue |
| 1 | NaN | gold | yellow |
| 2 | NaN | white, blue | red |
| 3 | none | white | yellow |
| ... | ... | ... | ... |
| 4 | brown | light | brown |
| 79 | none | pale | white |
| 81 | black | dark | dark |
| 82 | brown | light | hazel |
| 84 | none | none | black |
| 85 | unknown | unknown | unknown |
67 rows × 3 columns
In [11]:
Copied!
# Grouped input: distinct(f.x) keeps the grouping column g in the result,
# and the result is still grouped (see the TibbleGrouped footer below).
# NOTE: this rebinds df, replacing the sampled frame used by earlier cells.
df = tibble(
g=[1, 1, 2, 2],
x=[1, 1, 2, 1]
) >> group_by(f.g)
df >> distinct(f.x)
# Grouped input: distinct(f.x) keeps the grouping column g in the result,
# and the result is still grouped (see the TibbleGrouped footer below).
# NOTE: this rebinds df, replacing the sampled frame used by earlier cells.
df = tibble(
g=[1, 1, 2, 2],
x=[1, 1, 2, 1]
) >> group_by(f.g)
df >> distinct(f.x)
Out[11]:
| g | x | |
|---|---|---|
| <int64> | <int64> | |
| 0 | 1 | 1 |
| 2 | 2 | 2 |
TibbleGrouped: g (n=2)