pack

In [1]:

Copied!

# Execute the shared helper script in this notebook's namespace.
# NOTE(review): nb_header and try_catch are not defined in the visible cells;
# presumably they come from nb_helpers.py - confirm.
%run nb_helpers.py

# iris is the dataset used in the _names_sep example further down.
from datar.data import iris
# Star import of the datar API; presumably the source of tibble, c, f, pack,
# unpack, starts_with, runif, TRUE, FALSE and NA used below - confirm.
from datar.all import *

# Emit the notebook header for the two verbs documented on this page.
nb_header(pack, unpack)
# Execute the shared helper script in this notebook's namespace.
# NOTE(review): nb_header and try_catch are not defined in the visible cells;
# presumably they come from nb_helpers.py - confirm.
%run nb_helpers.py

# iris is the dataset used in the _names_sep example further down.
from datar.data import iris
# Star import of the datar API; presumably the source of tibble, c, f, pack,
# unpack, starts_with, runif, TRUE, FALSE and NA used below - confirm.
from datar.all import *

# Emit the notebook header for the two verbs documented on this page.
nb_header(pack, unpack)

Try this notebook on binder.

★ pack
¶

Makes df narrow by collapsing a set of columns into a single df-column.¶

Args:¶

_data: A data frame
**cols: Columns to pack
_names_sep: If None, the default, the names will be left as is:
the inner names will come from the former outer names.
If a string, the inner and outer names will be used together:
the new inner names will have the outer name and _names_sep
stripped from the front, which makes _names_sep roughly symmetric
between packing and unpacking.

★ unpack
¶

Makes df wider by expanding df-columns back out into individual columns.¶

For empty columns, the column is kept as is, instead of being removed.

Args:¶

data: A data frame
cols: Columns to unpack
names_sep: If None, the default, the names will be left as is:
the new outer names will come from the inner names.
If a string, the inner and outer names will be used together.
The names of the new outer columns will be formed by pasting
together the outer and the inner column names, separated by
names_sep.

name_repair: treatment of problematic column names:
- "minimal": No name repair or checks, beyond basic existence,

- "unique": Make sure names are unique and not empty,

- "check_unique": (default value), no name repair, but check they are unique,

- "universal": Make the names unique and syntactic

- a function: apply custom name repair

Returns:¶

Data frame with given columns unpacked.

In [2]:

Copied!

# Example frame with three x* columns and one y column.
# The displayed output has two rows, so each slice such as c[1:3]
# excludes its stop value.
df = tibble(x1 = c[1:3], x2 = c[4:6], x3 = c[7:9], y = c[1:3])
df
# Example frame with three x* columns and one y column.
# The displayed output has two rows, so each slice such as c[1:3]
# excludes its stop value.
df = tibble(x1 = c[1:3], x2 = c[4:6], x3 = c[7:9], y = c[1:3])
df

Out[2]:

	x1	x2	x3	y
	<int64>	<int64>	<int64>	<int64>
0	1	4	7	1
1	2	5	8	2

In [3]:

Copied!

# Pack every column whose name starts with 'x' into one df-column named x.
# In the output the untouched y column comes first, followed by x$x1..x$x3.
df >> pack(x=starts_with('x'))
# Pack every column whose name starts with 'x' into one df-column named x.
# In the output the untouched y column comes first, followed by x$x1..x$x3.
df >> pack(x=starts_with('x'))

Out[3]:

	y	x$x1	x$x2	x$x3
	<int64>	<int64>	<int64>	<int64>
0	1	1	4	7
1	2	2	5	8

In [4]:

Copied!

# Pack x1..x3 into df-column x and y alone into its own df-column y
# (displayed as y$y), so no plain column is left over.
df >> pack(x=c(f.x1, f.x2, f.x3), y=f.y)
# Pack x1..x3 into df-column x and y alone into its own df-column y
# (displayed as y$y), so no plain column is left over.
df >> pack(x=c(f.x1, f.x2, f.x3), y=f.y)

Out[4]:

	x$x1	x$x2	x$x3	y$y
	<int64>	<int64>	<int64>	<int64>
0	1	4	7	1
1	2	5	8	2

In [5]:

Copied!





# Pack the Sepal* and Petal* measurements into two df-columns.
# With _names_sep="_" the output shows inner names Length/Width
# (e.g. Sepal$Length), i.e. presumably the outer name plus separator
# is stripped from the original column names - confirm against datar docs.
iris >> pack(
    Sepal=starts_with("Sepal"),
    Petal=starts_with("Petal"),
    _names_sep="_"
)
# Pack the Sepal* and Petal* measurements into two df-columns.
# With _names_sep="_" the output shows inner names Length/Width
# (e.g. Sepal$Length), i.e. presumably the outer name plus separator
# is stripped from the original column names - confirm against datar docs.
iris >> pack(
    Sepal=starts_with("Sepal"),
    Petal=starts_with("Petal"),
    _names_sep="_"
)

Out[5]:

	Species	Sepal$Length	Sepal$Width	Petal$Length	Petal$Width
	<object>	<float64>	<float64>	<float64>	<float64>
0	setosa	5.1	3.5	1.4	0.2
1	setosa	4.9	3.0	1.4	0.2
2	setosa	4.7	3.2	1.3	0.2
3	setosa	4.6	3.1	1.5	0.2
...	...	...	...	...	...
4	setosa	5.0	3.6	1.4	0.2
145	virginica	6.7	3.0	5.2	2.3
146	virginica	6.3	2.5	5.0	1.9
147	virginica	6.5	3.0	5.2	2.0
148	virginica	6.2	3.4	5.4	2.3
149	virginica	5.9	3.0	5.1	1.8

150 rows × 5 columns

In [6]:

Copied!





# Unpacking ===========================================================

# Build a frame whose y and z columns are themselves data frames
# (df-columns); the output shows them flattened as y$a, y$b, z$X, z$Y, z$Z.
# NOTE(review): no seed is set in the visible cells, so the runif(3) values
# in z$Y will presumably differ from the displayed output on a re-run.
df = tibble(
  x = c[1:4],
  y = tibble(a = c[1:4], b = c[4:1]),
  z = tibble(X = c("a", "b", "c"), Y = runif(3), Z = c(TRUE, FALSE, NA))
)
df
# Unpacking ===========================================================

# Build a frame whose y and z columns are themselves data frames
# (df-columns); the output shows them flattened as y$a, y$b, z$X, z$Y, z$Z.
# NOTE(review): no seed is set in the visible cells, so the runif(3) values
# in z$Y will presumably differ from the displayed output on a re-run.
df = tibble(
  x = c[1:4],
  y = tibble(a = c[1:4], b = c[4:1]),
  z = tibble(X = c("a", "b", "c"), Y = runif(3), Z = c(TRUE, FALSE, NA))
)
df

Out[6]:

	x	y$a	y$b	z$X	z$Y	z$Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

In [7]:

Copied!

# Unpack only y: its inner columns a and b become top-level columns,
# while z stays packed (z$X, z$Y, z$Z).
df >> unpack(f.y)
# Unpack only y: its inner columns a and b become top-level columns,
# while z stays packed (z$X, z$Y, z$Z).
df >> unpack(f.y)

Out[7]:

	x	a	b	z$X	z$Y	z$Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

In [8]:

Copied!

# Unpack both df-columns; without names_sep the inner names
# (a, b, X, Y, Z) are used unchanged as the new column names.
df >> unpack(c(f.y, f.z))
# Unpack both df-columns; without names_sep the inner names
# (a, b, X, Y, Z) are used unchanged as the new column names.
df >> unpack(c(f.y, f.z))

Out[8]:

	x	a	b	X	Y	Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

In [9]:

Copied!

# Unpack both df-columns, naming each new column as
# <outer name> + "_" + <inner name> (y_a, y_b, z_X, z_Y, z_Z).
df >> unpack(c(f.y, f.z), names_sep="_")
# Unpack both df-columns, naming each new column as
# <outer name> + "_" + <inner name> (y_a, y_b, z_X, z_Y, z_Z).
df >> unpack(c(f.y, f.z), names_sep="_")

Out[9]:

	x	y_a	y_b	z_X	z_Y	z_Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

In [10]:

Copied!

# NOTE(review): try_catch is not defined in the visible cells; presumably it
# comes from nb_helpers.py and reports an exception instead of raising - confirm.
with try_catch():
    # Column positions also count the inner columns of df-columns
    # (x, y$a, y$b, z$X, ...), not just the outer columns.
    df >> unpack(c(2,3))
# NOTE(review): try_catch is not defined in the visible cells; presumably it
# comes from nb_helpers.py and reports an exception instead of raising - confirm.
with try_catch():
    # Column positions also count the inner columns of df-columns
    # (x, y$a, y$b, z$X, ...), not just the outer columns.
    df >> unpack(c(2,3))

Out[10]:

	x	a	b	X	Y	Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

In [11]:

Copied!

# Select the df-columns to unpack by position; the output shows both
# y and z unpacked, so positions 2 and 4 fall inside y and z respectively.
df >> unpack(c(2,4))
# Select the df-columns to unpack by position; the output shows both
# y and z unpacked, so positions 2 and 4 fall inside y and z respectively.
df >> unpack(c(2,4))

Out[11]:

	x	a	b	X	Y	Z
	<int64>	<int64>	<int64>	<object>	<float64>	<object>
0	1	1	4	a	0.286761	True
1	2	2	3	b	0.532775	False
2	3	3	2	c	0.497844	NaN

pack

★ pack¶

Makes df narrow by collapsing a set of columns into a single df-column.¶

Args:¶

★ unpack¶

Makes df wider by expanding df-columns back out into individual columns.¶

Args:¶

Returns:¶

★ pack
¶

★ unpack
¶