distinct

In [1]:

Copied!





# https://dplyr.tidyverse.org/reference/distinct.html
# Run the notebook helper script (presumably defines nb_header used below — confirm).
%run nb_helpers.py
from datar.data import starwars
# NOTE(review): tibble, sample, nrow, distinct, f, across, contains and group_by
# are never imported by name, so they presumably all come from this star import.
from datar.all import *

# Judging by the rendered output, this displays the API docs of the listed verbs.
nb_header(distinct, n_distinct, book='distinct')
# https://dplyr.tidyverse.org/reference/distinct.html
# Run the notebook helper script (presumably defines nb_header used below — confirm).
%run nb_helpers.py
from datar.data import starwars
# NOTE(review): tibble, sample, nrow, distinct, f, across, contains and group_by
# are never imported by name, so they presumably all come from this star import.
from datar.all import *

# Judging by the rendered output, this displays the API docs of the listed verbs.
nb_header(distinct, n_distinct, book='distinct')

Try this notebook on binder.

★ distinct
¶

Select only unique/distinct rows from a data frame¶

The original API:
https://dplyr.tidyverse.org/reference/distinct.html

Args:¶

_data: A data frame
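*args: Optional variables (columns or expressions) used to determine uniqueness. If omitted, all columns are used.
_keep_all: If True, keep all columns of _data, not only those used to determine uniqueness. When several rows share the same values, the first of them is kept.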
_preserve: If True, keep grouping variables even if they are not used.

Returns:¶

A data frame containing only the distinct rows of _data

★ n_distinct
¶

Count the number of distinct values¶

The original API:
https://dplyr.tidyverse.org/reference/distinct.html

Args:¶

_data: A data frame
na_rm: If True, remove missing values before counting.

Returns:¶

The number of distinct values

In [2]:

Copied!





# Example data: two columns, each built from 100 values sampled (with
# replacement) from 0..9.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# `sample` presumably yields different values on every run and the outputs
# recorded below will not reproduce exactly — confirm / consider seeding.
df = tibble(
  x=sample(range(10), 100, replace=True),
  y=sample(range(10), 100, replace=True)
)
# Row count before any de-duplication.
nrow(df)
# Example data: two columns, each built from 100 values sampled (with
# replacement) from 0..9.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# `sample` presumably yields different values on every run and the outputs
# recorded below will not reproduce exactly — confirm / consider seeding.
df = tibble(
  x=sample(range(10), 100, replace=True),
  y=sample(range(10), 100, replace=True)
)
# Row count before any de-duplication.
nrow(df)

Out[2]:

In [3]:

Copied!

# Function-call form: row count after distinct() with no columns named
# (presumably all columns are compared — confirm).
nrow(distinct(df))
# Function-call form: row count after distinct() with no columns named
# (presumably all columns are compared — confirm).
nrow(distinct(df))

Out[3]:

In [4]:

Copied!

# Pipe form, naming both columns of df (x and y) explicitly.
df >> distinct(f.x, f.y) >> nrow()
# Pipe form, naming both columns of df (x and y) explicitly.
df >> distinct(f.x, f.y) >> nrow()

Out[4]:

In [5]:

Copied!

# Distinct values of x; the result contains only the x column.
df >> distinct(f.x)
# Distinct values of x; the result contains only the x column.
df >> distinct(f.x)

Out[5]:

	x
	<int64>
0	4
1	6
2	1
3	8
4	5
6	9
14	2
19	7
22	0
38	3

In [6]:

Copied!

# Distinct values of y; the result contains only the y column.
df >> distinct(f.y)
# Distinct values of y; the result contains only the y column.
df >> distinct(f.y)

Out[6]:

	y
	<int64>
0	4
1	3
2	1
3	7
4	8
6	6
10	9
13	0
16	5
18	2

In [7]:

Copied!

# Uniqueness is still decided on x, but _keep_all=True keeps the y column
# in the result as well.
df >> distinct(f.x, _keep_all=True)
# Uniqueness is still decided on x, but _keep_all=True keeps the y column
# in the result as well.
df >> distinct(f.x, _keep_all=True)

Out[7]:

	x	y
	<int64>	<int64>
0	4	4
1	6	3
2	1	1
3	8	7
4	5	8
6	9	6
14	2	0
19	7	8
22	0	6
38	3	0

In [8]:

Copied!

# Uniqueness is still decided on y, but _keep_all=True keeps the x column
# in the result as well.
df >> distinct(f.y, _keep_all=True)
# Uniqueness is still decided on y, but _keep_all=True keeps the x column
# in the result as well.
df >> distinct(f.y, _keep_all=True)

Out[8]:

	x	y
	<int64>	<int64>
0	4	4
1	6	3
2	1	1
3	8	7
4	5	8
6	9	6
10	1	9
13	6	0
16	4	5
18	6	2

In [9]:

Copied!

# Uniqueness can be decided on a computed expression; the keyword name
# (diff) becomes the name of the resulting column.
df >> distinct(diff=abs(f.x-f.y))
# Uniqueness can be decided on a computed expression; the keyword name
# (diff) becomes the name of the resulting column.
df >> distinct(diff=abs(f.x-f.y))

Out[9]:

	diff
	<int64>
0	0
1	3
3	1
8	5
10	8
13	6
14	2
18	4
36	7
89	9

In [10]:

Copied!

# Pick the columns by name pattern: the result keeps hair_color, skin_color
# and eye_color, i.e. the columns whose names contain "color".
starwars >> distinct(across(contains("color")))
# Pick the columns by name pattern: the result keeps hair_color, skin_color
# and eye_color, i.e. the columns whose names contain "color".
starwars >> distinct(across(contains("color")))

Out[10]:

	hair_color	skin_color	eye_color
	<object>	<object>	<object>
0	blond	fair	blue
1	NaN	gold	yellow
2	NaN	white, blue	red
3	none	white	yellow
...	...	...	...
4	brown	light	brown
79	none	pale	white
81	black	dark	dark
82	brown	light	hazel
84	none	none	black
85	unknown	unknown	unknown

67 rows × 3 columns

In [11]:

Copied!





# Grouped example. A dedicated name is used so the ungrouped `df` that the
# earlier cells rely on is not overwritten (those cells stay re-runnable).
df_grouped = tibble(
  g=[1, 1, 2, 2],
  x=[1, 1, 2, 1]
) >> group_by(f.g)

# On grouped data the grouping column (g) is kept in the result next to x.
df_grouped >> distinct(f.x)
# Grouped example. A dedicated name is used so the ungrouped `df` that the
# earlier cells rely on is not overwritten (those cells stay re-runnable).
df_grouped = tibble(
  g=[1, 1, 2, 2],
  x=[1, 1, 2, 1]
) >> group_by(f.g)

# On grouped data the grouping column (g) is kept in the result next to x.
df_grouped >> distinct(f.x)

Out[11]:

	g	x
	<int64>	<int64>
0	1	1
2	2	2

TibbleGrouped: g (n=2)

distinct

★ distinct¶

Select only unique/distinct rows from a data frame¶

Args:¶

Returns:¶

★ n_distinct¶

Count the number of distinct values¶

Args:¶

Returns:¶

★ distinct
¶

★ n_distinct
¶