forcats_lvl_value
%run nb_helpers.py
from datar.all import *
from datar.data import gss_cat
gss_cat >>= mutate(rincome=as_factor(f.rincome))
nb_header(
fct_anon,
fct_collapse,
fct_lump,
fct_lump_min,
fct_lump_prop,
fct_lump_n,
fct_lump_lowfreq,
fct_other,
fct_recode,
fct_relabel,
book="forcat_lvl_value",
)
★ fct_anon¶
★ fct_collapse¶
Collapse factor levels into manually defined groups¶
Args:¶
_f
: A factor
**kwargs
: The levels to collapse.
Like name=[old_level, old_level1, ...]
. The old levels will
be replaced with name
other_level
: Replace all levels not named in kwargs
.
If not given, don't collapse them.
Returns:¶
The factor with levels collapsed.
★ fct_lump¶
Lump together factor levels into "other"¶
Args:¶
f
: A factor
n
: Positive n
preserves the most common n
values.
Negative n
preserves the least common -n
values.
If there are ties, you will get at least abs(n)
values.
prop
: Positive prop
lumps values which do not appear at least
prop
of the time. Negative prop
lumps values that
do not appear at most -prop
of the time.
w
: An optional numeric vector giving weights for frequency of
each value (not level) in f.
other_level
: Value of level used for "other" values. Always
placed at end of levels.
ties_method: A character string specifying how ties are treated.
One of: average
, first
, dense
, max
, and min
.
Returns:¶
The factor with levels lumped.
★ fct_lump_min¶
Lumps levels that appear fewer than min_
times.¶
Args:¶
_f
: A factor
min_
: Preserve levels that appear at least min_
number of times.
w
: An optional numeric vector giving weights for frequency of
each value (not level) in f.
other_level
: Value of level used for "other" values. Always
placed at end of levels.
Returns:¶
The factor with levels lumped.
★ fct_lump_prop¶
Lumps levels that appear fewer than prop * n
times.¶
Args:¶
_f
: A factor
prop
: Positive prop
lumps values which do not appear at least
prop
of the time. Negative prop
lumps values that
do not appear at most -prop
of the time.
w
: An optional numeric vector giving weights for frequency of
each value (not level) in f.
other_level
: Value of level used for "other" values. Always
placed at end of levels.
Returns:¶
The factor with levels lumped.
★ fct_lump_n¶
Lumps all levels except for the n
most frequent.¶
Args:¶
f
: A factor
n
: Positive n
preserves the most common n
values.
Negative n
preserves the least common -n
values.
If there are ties, you will get at least abs(n)
values.
w
: An optional numeric vector giving weights for frequency of
each value (not level) in f.
other_level
: Value of level used for "other" values. Always
placed at end of levels.
ties_method: A character string specifying how ties are treated.
One of: average
, first
, dense
, max
, and min
.
Returns:¶
The factor with levels lumped.
★ fct_lump_lowfreq¶
★ fct_other¶
Replace levels with "other"¶
Args:¶
_f
: A factor
keep
: and
drop
: Pick one of keep
and drop
:
- keep
will preserve listed levels, replacing all others with
other_level
.
- drop
will replace listed levels with other_level
, keeping all others
as is.
other_level
: Value of level used for "other" values. Always
placed at end of levels.
Returns:¶
The factor with levels replaced.
★ fct_recode¶
Change factor levels by hand¶
Args:¶
_f
: A factor
*args
: and
**kwargs
: A sequence of named character vectors where the name
gives the new level, and the value gives the old level.
Levels not otherwise mentioned will be left as is. Levels can
be removed by naming them NULL
.
As NULL/None
cannot be a name of keyword arguments, replacement
has to be specified as a dict
(i.e. fct_recode(x, {NULL: "apple"})
)
If you want to replace multiple old values with the same new value,
use a set
/list
/numpy.ndarray
(i.e. fct_recode(x, fruit=["apple", "banana"])
).
This is a safe way, since set
/list
/numpy.ndarray
is
not hashable to be a level of a factor.
Do NOT use a tuple
, as it's hashable!
Note that the order of the name-value is in the reverse way as
dplyr.recode()
and dplyr.recode_factor()
Returns:¶
The factor recoded with given recodings
★ fct_relabel¶
Automatically relabel factor levels, collapse as necessary¶
Args:¶
_f
: A factor
_fun
: A function to be applied to each level. Must accept the old
levels and return a character vector of the same length
as its input.
*args
: and
**kwargs
: Additional arguments to _fun
Returns:¶
The factor with levels relabeled
fct_anon¶
gss_cat.relig >> fct_count()
f | n | |
---|---|---|
<category> | <int64> | |
0 | Buddhism | 147 |
1 | Catholic | 5124 |
2 | Christian | 689 |
3 | Don't know | 15 |
4 | Hinduism | 71 |
5 | Inter-nondenominational | 109 |
6 | Jewish | 388 |
7 | Moslem/islam | 104 |
8 | Native american | 23 |
9 | No answer | 93 |
10 | None | 3523 |
11 | Orthodox-christian | 95 |
12 | Other | 224 |
13 | Other eastern | 32 |
14 | Protestant | 10846 |
gss_cat.relig >> fct_anon() >> fct_count()
f | n | |
---|---|---|
<category> | <int64> | |
0 | 00 | 147 |
1 | 01 | 104 |
2 | 02 | 93 |
3 | 03 | 109 |
4 | 04 | 5124 |
5 | 05 | 689 |
6 | 06 | 32 |
7 | 07 | 15 |
8 | 08 | 224 |
9 | 09 | 71 |
10 | 10 | 3523 |
11 | 11 | 10846 |
12 | 12 | 23 |
13 | 13 | 388 |
14 | 14 | 95 |
gss_cat.relig >> fct_anon("X") >> fct_count()
f | n | |
---|---|---|
<category> | <int64> | |
0 | X00 | 388 |
1 | X01 | 689 |
2 | X02 | 224 |
3 | X03 | 3523 |
4 | X04 | 5124 |
5 | X05 | 147 |
6 | X06 | 10846 |
7 | X07 | 109 |
8 | X08 | 95 |
9 | X09 | 23 |
10 | X10 | 15 |
11 | X11 | 71 |
12 | X12 | 104 |
13 | X13 | 32 |
14 | X14 | 93 |
fct_collapse¶
fct_count(gss_cat.partyid)
f | n | |
---|---|---|
<category> | <int64> | |
0 | Don't know | 1 |
1 | Ind,near dem | 2499 |
2 | Ind,near rep | 1791 |
3 | Independent | 4119 |
4 | No answer | 154 |
5 | Not str democrat | 3690 |
6 | Not str republican | 3032 |
7 | Other party | 393 |
8 | Strong democrat | 3490 |
9 | Strong republican | 2314 |
partyid2 = fct_collapse(
gss_cat.partyid,
missing = c("No answer", "Don't know"),
other = "Other party",
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)
fct_count(partyid2)
f | n | |
---|---|---|
<category> | <int64> | |
0 | missing | 155 |
1 | ind | 8409 |
2 | dem | 7180 |
3 | rep | 5346 |
4 | other | 393 |
fct_recode¶
x = factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit=["apple", "banana"])
['fruit', 'bear', 'fruit', 'dear'] Categories (3, object): ['fruit', 'bear', 'dear']
# If you make a mistake you'll get a warning
fct_recode(x, fruit=["apple", "bananana"])
[2022-12-02 14:01:25][datar][WARNING] [fct_recode] Unknown levels in `_f`: {'bananana'}
['fruit', 'bear', 'banana', 'dear'] Categories (4, object): ['fruit', 'banana', 'bear', 'dear']
fct_recode(x, {NULL: "apple"}, fruit = "banana")
[NaN, 'fruit', 'bear', 'dear'] Categories (3, object): ['fruit', 'bear', 'dear']
# Anything that cannot be a keyword directly must be passed via a dict
fct_recode(x, {"an apple": "apple", "a bear": "bear"})
['an apple', 'a bear', 'banana', 'dear'] Categories (4, object): ['an apple', 'banana', 'a bear', 'dear']
fct_lump, fct_lump_min, fct_lump_prop, fct_lump_n, and fct_lump_lowfreq¶
x = factor(rep(LETTERS[:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
table(x)
A | B | C | D | E | F | G | H | I | |
---|---|---|---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | |
count | 40 | 10 | 5 | 27 | 1 | 1 | 1 | 1 | 1 |
x >> fct_lump_n(3)
table(_)
['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other'] Length: 87 Categories (4, object): ['A', 'B', 'D', 'Other']
A | B | D | Other | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
count | 40 | 10 | 27 | 10 |
x >> fct_lump_prop(0.10)
table(_)
['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other'] Length: 87 Categories (4, object): ['A', 'B', 'D', 'Other']
A | B | D | Other | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
count | 40 | 10 | 27 | 10 |
x >> fct_lump_min(5)
table(_)
['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other'] Length: 87 Categories (5, object): ['A', 'B', 'C', 'D', 'Other']
A | B | C | D | Other | |
---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | |
count | 40 | 10 | 5 | 27 | 5 |
x >> fct_lump_lowfreq()
table(_)
['A', 'A', 'A', 'A', 'A', ..., 'Other', 'Other', 'Other', 'Other', 'Other'] Length: 87 Categories (3, object): ['A', 'D', 'Other']
A | D | Other | |
---|---|---|---|
<int64> | <int64> | <int64> | |
count | 40 | 27 | 20 |
x = factor(LETTERS[rpois(100, 5)])
x
['D', 'E', 'D', 'I', 'E', ..., 'D', 'E', 'L', 'D', 'E'] Length: 100 Categories (12, object): ['B', 'C', 'D', 'E', ..., 'J', 'K', 'L', 'M']
table(x)
B | C | D | E | F | G | H | I | J | K | L | M | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | |
count | 1 | 9 | 17 | 18 | 18 | 13 | 8 | 8 | 4 | 1 | 1 | 2 |
table(fct_lump_lowfreq(x))
B | C | D | E | F | G | H | I | J | K | L | M | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | <int64> | |
count | 1 | 9 | 17 | 18 | 18 | 13 | 8 | 8 | 4 | 1 | 1 | 2 |
fct_lump_n(x, n = 3)
['D', 'E', 'D', 'Other', 'E', ..., 'D', 'E', 'Other', 'D', 'E'] Length: 100 Categories (4, object): ['D', 'E', 'F', 'Other']
fct_lump_prop(x, prop = 0.1)
['D', 'E', 'D', 'Other', 'E', ..., 'D', 'E', 'Other', 'D', 'E'] Length: 100 Categories (5, object): ['D', 'E', 'F', 'G', 'Other']
# Use negative values to collapse the most common
fct_lump_n(x, n = -3)
['Other', 'Other', 'Other', 'Other', 'Other', ..., 'Other', 'Other', 'L', 'Other', 'Other'] Length: 100 Categories (4, object): ['B', 'K', 'L', 'Other']
fct_lump_prop(x, prop = -0.1)
['Other', 'Other', 'Other', 'I', 'Other', ..., 'Other', 'Other', 'L', 'Other', 'Other'] Length: 100 Categories (9, object): ['B', 'C', 'H', 'I', ..., 'K', 'L', 'M', 'Other']
w = c(rep(2, 50), rep(1, 50))
fct_lump_n(x, n = 5, w = w)
['D', 'E', 'D', 'Other', 'E', ..., 'D', 'E', 'Other', 'D', 'E'] Length: 100 Categories (6, object): ['B', 'C', 'D', 'E', 'H', 'Other']
fct_lump_n(x, n = 6)
['D', 'E', 'D', 'I', 'E', ..., 'D', 'E', 'Other', 'D', 'E'] Length: 100 Categories (8, object): ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'Other']
fct_lump_n(x, n = 6, ties_method = "max")
['D', 'E', 'D', 'Other', 'E', ..., 'D', 'E', 'Other', 'D', 'E'] Length: 100 Categories (6, object): ['C', 'D', 'E', 'F', 'G', 'Other']
# Use fct_lump_min() to lump together all levels with fewer than `n` values
table(fct_lump_min(x, min = 10))
D | E | F | G | Other | |
---|---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | <int64> | |
count | 17 | 18 | 18 | 13 | 34 |
table(fct_lump_min(x, min = 15))
D | E | F | Other | |
---|---|---|---|---|
<int64> | <int64> | <int64> | <int64> | |
count | 17 | 18 | 18 | 47 |
fct_other¶
fct_other(x, keep = c("A", "B"))
['Other', 'Other', 'Other', 'Other', 'Other', ..., 'Other', 'Other', 'Other', 'Other', 'Other'] Length: 100 Categories (2, object): ['B', 'Other']
fct_other(x, drop = c("A", "B"))
['D', 'E', 'D', 'I', 'E', ..., 'D', 'E', 'L', 'D', 'E'] Length: 100 Categories (12, object): ['C', 'D', 'E', 'F', ..., 'K', 'L', 'M', 'Other']
fct_recode¶
x = factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit = ["apple", "banana"])
['fruit', 'bear', 'fruit', 'dear'] Categories (3, object): ['fruit', 'bear', 'dear']
# If you make a mistake you'll get a warning
fct_recode(x, fruit = ["apple", "bananana"])
[2022-12-02 14:01:52][datar][WARNING] [fct_recode] Unknown levels in `_f`: {'bananana'}
['fruit', 'bear', 'banana', 'dear'] Categories (4, object): ['fruit', 'banana', 'bear', 'dear']
# If you name the level NULL it will be removed
fct_recode(x, {NULL: "apple"}, fruit = "banana")
[NaN, 'fruit', 'bear', 'dear'] Categories (3, object): ['fruit', 'bear', 'dear']
fct_recode(x, {"an apple": "apple", "a bear": "bear"})
['an apple', 'a bear', 'banana', 'dear'] Categories (4, object): ['an apple', 'banana', 'a bear', 'dear']
fct_relabel¶
gss_cat.partyid >> fct_count()
f | n | |
---|---|---|
<category> | <int64> | |
0 | Don't know | 1 |
1 | Ind,near dem | 2499 |
2 | Ind,near rep | 1791 |
3 | Independent | 4119 |
4 | No answer | 154 |
5 | Not str democrat | 3690 |
6 | Not str republican | 3032 |
7 | Other party | 393 |
8 | Strong democrat | 3490 |
9 | Strong republican | 2314 |
gss_cat.partyid >> fct_relabel(lambda old: gsub(",", ", ", old)) >> fct_count()
f | n | |
---|---|---|
<category> | <int64> | |
0 | Don't know | 1 |
1 | Ind, near dem | 2499 |
2 | Ind, near rep | 1791 |
3 | Independent | 4119 |
4 | No answer | 154 |
5 | Not str democrat | 3690 |
6 | Not str republican | 3032 |
7 | Other party | 393 |
8 | Strong democrat | 3490 |
9 | Strong republican | 2314 |
fct_count(gss_cat.rincome)
f | n | |
---|---|---|
<category> | <int64> | |
0 | $1000 to 2999 | 395 |
1 | $10000 - 14999 | 1168 |
2 | $15000 - 19999 | 1048 |
3 | $20000 - 24999 | 1283 |
4 | $25000 or more | 7363 |
5 | $3000 to 3999 | 276 |
6 | $4000 to 4999 | 226 |
7 | $5000 to 5999 | 227 |
8 | $6000 to 6999 | 215 |
9 | $7000 to 7999 | 188 |
10 | $8000 to 9999 | 340 |
11 | Don't know | 267 |
12 | Lt $1000 | 286 |
13 | No answer | 183 |
14 | Not applicable | 7043 |
15 | Refused | 975 |
def convert_income(income):
    """Rewrite dollar-amount income labels as ``"Gt $<amount>"`` buckets.

    Labels that match an optional ``"Lt "`` prefix followed by ``$`` and
    digits have their leading amount truncated to a multiple of 5000 and are
    replaced with ``"Gt $"`` plus that number. Labels that do not match
    (for example ``"No answer"``) are left untouched.

    Args:
        income: Array of income labels supporting boolean-mask indexing.
            It is modified in place.

    Returns:
        The same ``income`` object, after relabeling.
    """
    pattern = r"^(?:Lt |)[$]([0-9]+).*$"
    has_amount = grepl(pattern, income)
    # Capture group 1 is the first run of digits after the dollar sign.
    leading_amount = gsub(pattern, r"\1", income[has_amount])
    bucket_floor = trunc(as_numeric(leading_amount) / 5000) * 5000
    income[has_amount] = paste0("Gt $", bucket_floor)
    return income
convert_income(levels(gss_cat.rincome))
array(['Gt $0.0', 'Gt $10000.0', 'Gt $15000.0', 'Gt $20000.0', 'Gt $25000.0', 'Gt $0.0', 'Gt $0.0', 'Gt $5000.0', 'Gt $5000.0', 'Gt $5000.0', 'Gt $5000.0', "Don't know", 'Gt $0.0', 'No answer', 'Not applicable', 'Refused'], dtype=object)
rincome2 = fct_relabel(gss_cat.rincome, convert_income)
fct_count(rincome2)
f | n | |
---|---|---|
<category> | <int64> | |
0 | Gt $0.0 | 1183 |
1 | Gt $10000.0 | 1168 |
2 | Gt $15000.0 | 1048 |
3 | Gt $20000.0 | 1283 |
4 | Gt $25000.0 | 7363 |
5 | Gt $5000.0 | 970 |
6 | Don't know | 267 |
7 | No answer | 183 |
8 | Not applicable | 7043 |
9 | Refused | 975 |