forcats_misc
In [1]:
Copied!
%run nb_helpers.py
import numpy
from datar.all import *
from datar.data import gss_cat
nb_header(
as_factor,
fct_count,
fct_match,
fct_unique,
lvls_reorder,
lvls_revalue,
lvls_expand,
lvls_union,
book="forcat_lvl_addrm",
)
%run nb_helpers.py
import numpy
from datar.all import *
from datar.data import gss_cat
nb_header(
as_factor,
fct_count,
fct_match,
fct_unique,
lvls_reorder,
lvls_revalue,
lvls_expand,
lvls_union,
book="forcat_lvl_addrm",
)
Try this notebook on binder.
★ as_factor¶
★ fct_count¶
★ fct_match¶
★ fct_unique¶
★ lvls_reorder¶
Leaves the values of a factor as they are, but changes the order of its levels by the given indices¶
Args:¶
f
: A factor (or character vector).
idx
: An integer index (0-based), with one integer for each existing level.
new_levels
: A character vector of new levels.
ordered
: A logical which determines the "ordered" status of the output factor. `None` preserves the existing status of the factor.
Returns:¶
The factor with levels reordered
★ lvls_revalue¶
★ lvls_expand¶
★ lvls_union¶
as_factor¶
In [2]:
Copied!
x = c("a", "z", "g")
as_factor(x)
x = c("a", "z", "g")
as_factor(x)
Out[2]:
['a', 'z', 'g'] Categories (3, object): ['a', 'g', 'z']
In [3]:
Copied!
y = c("1.1", "11", "2.2", "22")
as_factor(y)
y = c("1.1", "11", "2.2", "22")
as_factor(y)
Out[3]:
['1.1', '11', '2.2', '22'] Categories (4, object): ['1.1', '11', '2.2', '22']
In [4]:
Copied!
z = as_numeric(y)
as_factor(z)
z = as_numeric(y)
as_factor(z)
FutureWarning: Index.ravel returning ndarray is deprecated; in a future version this will return a view on self.
Out[4]:
[1.1, 11.0, 2.2, 22.0] Categories (4, float64): [1.1, 2.2, 11.0, 22.0]
fct_count¶
In [5]:
Copied!
fct = factor(sample(letters)[rpois(1000, 10)])
table(fct)
fct = factor(sample(letters)[rpois(1000, 10)])
table(fct)
Out[5]:
b | c | d | e | i | k | l | m | n | o | ... | q | r | s | t | u | v | w | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | ... | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | |
count | 8 | 88 | 37 | 1 | 45 | 67 | 2 | 14 | 105 | 4 | ... | 5 | 22 | 1 | 87 | 49 | 17 | 134 | 128 | 72 | 112 |
1 rows × 21 columns
In [6]:
Copied!
fct_count(fct)
fct_count(fct)
Out[6]:
f | n | |
---|---|---|
<category> | <int64> | |
0 | b | 8 |
1 | c | 88 |
2 | d | 37 |
3 | e | 1 |
4 | i | 45 |
5 | k | 67 |
6 | l | 2 |
7 | m | 14 |
8 | n | 105 |
9 | o | 4 |
10 | p | 2 |
11 | q | 5 |
12 | r | 22 |
13 | s | 1 |
14 | t | 87 |
15 | u | 49 |
16 | v | 17 |
17 | w | 134 |
18 | x | 128 |
19 | y | 72 |
20 | z | 112 |
In [7]:
Copied!
fct_count(fct, sort = TRUE)
fct_count(fct, sort = TRUE)
Out[7]:
f | n | |
---|---|---|
<category> | <int64> | |
17 | w | 134 |
18 | x | 128 |
20 | z | 112 |
8 | n | 105 |
1 | c | 88 |
14 | t | 87 |
19 | y | 72 |
5 | k | 67 |
15 | u | 49 |
4 | i | 45 |
2 | d | 37 |
12 | r | 22 |
16 | v | 17 |
7 | m | 14 |
0 | b | 8 |
11 | q | 5 |
9 | o | 4 |
6 | l | 2 |
10 | p | 2 |
3 | e | 1 |
13 | s | 1 |
In [8]:
Copied!
fct_count(fct, sort = TRUE, prop = TRUE)
fct_count(fct, sort = TRUE, prop = TRUE)
Out[8]:
f | n | p | |
---|---|---|---|
<category> | <int64> | <float64> | |
17 | w | 134 | 0.134 |
18 | x | 128 | 0.128 |
20 | z | 112 | 0.112 |
8 | n | 105 | 0.105 |
1 | c | 88 | 0.088 |
14 | t | 87 | 0.087 |
19 | y | 72 | 0.072 |
5 | k | 67 | 0.067 |
15 | u | 49 | 0.049 |
4 | i | 45 | 0.045 |
2 | d | 37 | 0.037 |
12 | r | 22 | 0.022 |
16 | v | 17 | 0.017 |
7 | m | 14 | 0.014 |
0 | b | 8 | 0.008 |
11 | q | 5 | 0.005 |
9 | o | 4 | 0.004 |
6 | l | 2 | 0.002 |
10 | p | 2 | 0.002 |
3 | e | 1 | 0.001 |
13 | s | 1 | 0.001 |
fct_match¶
In [9]:
Copied!
table(fct_match(gss_cat.marital, c("Married", "Divorced")))
table(fct_match(gss_cat.marital, c("Married", "Divorced")))
Out[9]:
False | True | |
---|---|---|
<int64> | <int64> | |
count | 7983 | 13500 |
In [10]:
Copied!
table(numpy.isin(gss_cat.marital, c("Maried", "Davorced")))
table(numpy.isin(gss_cat.marital, c("Maried", "Davorced")))
Out[10]:
False | |
---|---|
<int64> | |
count | 21483 |
In [11]:
Copied!
with try_catch():
table(fct_match(gss_cat.marital, c("Maried", "Davorced")))
with try_catch():
table(fct_match(gss_cat.marital, c("Maried", "Davorced")))
[ValueError] Levels not present in factor: ['Maried' 'Davorced'].
fct_unique¶
In [12]:
Copied!
fct = factor(letters[rpois(100, 10)-1])
unique(fct)
fct = factor(letters[rpois(100, 10)-1])
unique(fct)
Out[12]:
array(['p', 'k', 'i', 'j', 'e', 'r', 'm', 'g', 'n', 'f', 'o', 'h', 'l', 'd', 'c'], dtype=object)
In [13]:
Copied!
fct_unique(fct)
fct_unique(fct)
Out[13]:
['c', 'd', 'e', 'f', 'g', ..., 'm', 'n', 'o', 'p', 'r'] Length: 15 Categories (15, object): ['c', 'd', 'e', 'f', ..., 'n', 'o', 'p', 'r']
lvls_reorder, lvls_revalue and lvls_expand¶
In [14]:
Copied!
fct = factor(c("a", "b", "c"))
lvls_reorder(fct, [2,1,0])
fct = factor(c("a", "b", "c"))
lvls_reorder(fct, [2,1,0])
Out[14]:
['a', 'b', 'c'] Categories (3, object): ['c', 'b', 'a']
In [15]:
Copied!
lvls_revalue(fct, c("apple", "banana", "carrot"))
lvls_revalue(fct, c("apple", "banana", "carrot"))
Out[15]:
['apple', 'banana', 'carrot'] Categories (3, object): ['apple', 'banana', 'carrot']
In [16]:
Copied!
lvls_expand(fct, c("a", "b", "c", "d"))
lvls_expand(fct, c("a", "b", "c", "d"))
Out[16]:
['a', 'b', 'c'] Categories (4, object): ['a', 'b', 'c', 'd']
lvls_union¶
In [17]:
Copied!
fs = [factor("a"), factor("b"), factor(c("a", "b"))]
lvls_union(fs)
fs = [factor("a"), factor("b"), factor(c("a", "b"))]
lvls_union(fs)
Out[17]:
array(['a', 'b'], dtype=object)