# Source code of datar.apis.tidyr

from __future__ import annotations as _
from typing import Any, Callable as _Callable, Mapping as _Mapping

from pipda import (
    register_verb as _register_verb,
    register_func as _register_func,
)

from ..core.utils import (
    NotImplementedByCurrentBackendError as _NotImplementedByCurrentBackendError,
)
from .base import expand_grid  # noqa: F401


@_register_func(pipeable=True, dispatchable=True)
def full_seq(x, period, tol=1e-6) -> Any:
    """Create the full sequence of values in a vector

    Args:
        x: A numeric vector.
        period: Gap between each observation. The existing data will be
            checked to ensure that it is actually of this periodicity.
        tol: Numerical tolerance for checking periodicity.

    Returns:
        The full sequence

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("full_seq", x)


@_register_verb()
def chop(
    data,
    cols=None,
) -> Any:
    """Makes data frame shorter by converting rows within each group
    into list-columns.

    Args:
        data: A data frame
        cols: Columns to chop

    Returns:
        Data frame with selected columns chopped

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("chop", data)


@_register_verb()
def unchop(
    data,
    cols=None,
    keep_empty: bool = False,
    dtypes=None,
) -> Any:
    """Makes df longer by expanding list-columns so that each element
    of the list-column gets its own row in the output.

    See https://tidyr.tidyverse.org/reference/chop.html

    Recycling size-1 elements might be different from `tidyr`
        >>> df = tibble(x=[1, [2,3]], y=[[2,3], 1])
        >>> df >> unchop([f.x, f.y])
        >>> # tibble(x=[1,2,3], y=[2,3,1])
        >>> # instead of following in tidyr
        >>> # tibble(x=[1,1,2,3], y=[2,3,1,1])

    Args:
        data: A data frame.
        cols: Columns to unchop.
        keep_empty: By default, you get one row of output for each element
            of the list you're unchopping/unnesting.
            This means that if there's a size-0 element
            (like NULL or an empty data frame), that entire row will be
            dropped from the output.
            If you want to preserve all rows, use `keep_empty` = `True` to
            replace size-0 elements with a single row of missing values.
        dtypes: Providing the dtypes for the output columns.
            Could be a single dtype, which will be applied to all columns, or
            a dictionary of dtypes with keys for the columns and values the
            dtypes.
            For nested data frames, we need to specify `col$a` as key. If `col`
            is used as key, all columns of the nested data frames will be casted
            into that dtype.

    Returns:
        A data frame with selected columns unchopped.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("unchop", data)


@_register_verb()
def nest(
    _data,
    _names_sep: str | None = None,
    **cols: str | int,
) -> Any:
    """Nesting creates a list-column of data frames

    Args:
        _data: A data frame
        _names_sep: If `None`, the default, the names will be left as is.
            Inner names will come from the former outer names
            If a string, the inner and outer names will be used together.
            The names of the new outer columns will be formed by pasting
            together the outer and the inner column names, separated by
            `_names_sep`.
        **cols: Columns to nest

    Returns:
        Nested data frame.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("nest", _data)


@_register_verb()
def unnest(
    data,
    *cols: str | int,
    keep_empty: bool = False,
    dtypes=None,
    names_sep: str | None = None,
    names_repair: str | _Callable = "check_unique",
) -> Any:
    """Flattens list-column of data frames back out into regular columns.

    Args:
        data: A data frame to flatten.
        *cols: Columns to unnest.
        keep_empty: By default, you get one row of output for each element
            of the list you're unchopping/unnesting.
            This means that if there's a size-0 element
            (like NULL or an empty data frame), that entire row will be
            dropped from the output.
            If you want to preserve all rows, use `keep_empty` = `True` to
            replace size-0 elements with a single row of missing values.
        dtypes: Providing the dtypes for the output columns.
            Could be a single dtype, which will be applied to all columns, or
            a dictionary of dtypes with keys for the columns and values the
            dtypes.
        names_sep: If `None`, the default, the names will be left as is.
            Inner names will come from the former outer names
            If a string, the inner and outer names will be used together.
            The names of the new outer columns will be formed by pasting
            together the outer and the inner column names, separated by
            `names_sep`.
        names_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair

    Returns:
        Data frame with selected columns unnested.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("unnest", data)


@_register_verb()
def pack(
    _data,
    _names_sep: str | None = None,
    **cols: str | int,
) -> Any:
    """Makes df narrow by collapsing a set of columns into a single df-column.

    Args:
        _data: A data frame
        _names_sep: If `None`, the default, the names will be left as is.
            Inner names will come from the former outer names
            If a string, the inner and outer names will be used together.
            The names of the new outer columns will be formed by pasting
            together the outer and the inner column names, separated by
            `_names_sep`.
        **cols: Columns to pack

    Returns:
        Data frame with the selected columns packed.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("pack", _data)


@_register_verb()
def unpack(
    data,
    cols,
    names_sep: str | None = None,
    names_repair: str | _Callable = "check_unique",
) -> Any:
    """Makes df wider by expanding df-columns back out into individual columns.

    For empty columns, the column is kept asis, instead of removing it.

    Args:
        data: A data frame
        cols: Columns to unpack
        names_sep: If `None`, the default, the names will be left as is.
            Inner names will come from the former outer names
            If a string, the inner and outer names will be used together.
            The names of the new outer columns will be formed by pasting
            together the outer and the inner column names, separated by
            `names_sep`.
        names_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair

    Returns:
        Data frame with given columns unpacked.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("unpack", data)


@_register_verb()
def expand(
    data,
    *args,
    _name_repair: str | _Callable = "check_unique",
    **kwargs,
) -> Any:
    """Generates all combinations of variables found in a dataset.

    Args:
        data: A data frame
        *args: and,
        **kwargs: columns to expand. Columns can be atomic lists.
            - To find all unique combinations of x, y and z, including
              those not present in the data, supply each variable as a
              separate argument: `expand(df, x, y, z)`.
            - To find only the combinations that occur in the data, use
              nesting: `expand(df, nesting(x, y, z))`.
            - You can combine the two forms. For example,
              `expand(df, nesting(school_id, student_id), date)` would
              produce a row for each present school-student combination
              for all possible dates.
        _name_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair

    Returns:
        A data frame with all combinations of variables.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("expand", data)


@_register_func(dispatchable=True)
def nesting(
    *args,
    _name_repair: str | _Callable = "check_unique",
    **kwargs,
) -> Any:
    """A helper that only finds combinations already present in the data.

    Args:
        *args: and,
        **kwargs: columns to expand. Columns can be atomic lists.
            - To find all unique combinations of x, y and z, including
              those not present in the data, supply each variable as a
              separate argument: `expand(df, x, y, z)`.
            - To find only the combinations that occur in the data, use
              nesting: `expand(df, nesting(x, y, z))`.
            - You can combine the two forms. For example,
              `expand(df, nesting(school_id, student_id), date)` would
              produce a row for each present school-student combination
              for all possible dates.
        _name_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair

    Returns:
        A data frame with all combinations in data.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    # No data argument exists here, so only the function name is reported.
    raise _NotImplementedByCurrentBackendError("nesting")


@_register_func(dispatchable=True)
def crossing(
    *args,
    _name_repair: str | _Callable = "check_unique",
    **kwargs,
) -> Any:
    """A wrapper around `expand_grid()` that de-duplicates and sorts its inputs

    When values are not specified by literal `list`, they will be sorted.

    Args:
        *args: and,
        **kwargs: columns to expand. Columns can be atomic lists.
            - To find all unique combinations of x, y and z, including
              those not present in the data, supply each variable as a
              separate argument: `expand(df, x, y, z)`.
            - To find only the combinations that occur in the data, use
              nesting: `expand(df, nesting(x, y, z))`.
            - You can combine the two forms. For example,
              `expand(df, nesting(school_id, student_id), date)` would
              produce a row for each present school-student combination
              for all possible dates.
        _name_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair

    Returns:
        A data frame with values deduplicated and sorted.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    # No data argument exists here, so only the function name is reported.
    raise _NotImplementedByCurrentBackendError("crossing")


@_register_verb()
def complete(
    data,
    *args,
    fill=None,
    explict: bool = True,
) -> Any:
    """Turns implicit missing values into explicit missing values.

    Args:
        data: A data frame
        *args: columns to expand. Columns can be atomic lists.
            - To find all unique combinations of x, y and z, including
              those not present in the data, supply each variable as a
              separate argument: `expand(df, x, y, z)`.
            - To find only the combinations that occur in the data, use
              nesting: `expand(df, nesting(x, y, z))`.
            - You can combine the two forms. For example,
              `expand(df, nesting(school_id, student_id), date)` would
              produce a row for each present school-student combination
              for all possible dates.
        fill: A named list that for each variable supplies a single value
            to use instead of NA for missing combinations.
        explict: Should both implicit (newly created) and explicit
            (pre-existing) missing values be filled by fill? By default,
            this is TRUE, but if set to FALSE this will limit the fill to only
            implicit missing values.
            (Note that the parameter name is spelled `explict` in this
            signature; the spelling is kept so existing callers still work.)

    Returns:
        Data frame with missing values completed

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("complete", data)


@_register_verb()
def drop_na(
    _data,
    *columns: str,
    _how: str = "any",
) -> Any:
    """Drop rows containing missing values

    See https://tidyr.tidyverse.org/reference/drop_na.html

    Args:
        _data: A data frame.
        *columns: Columns to inspect for missing values.
        _how: How to select the rows to drop
            - all: All columns of `columns` to be `NA`s
            - any: Any columns of `columns` to be `NA`s
            (tidyr doesn't support this argument)

    Returns:
        Dataframe with rows with NAs dropped and indexes dropped

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("drop_na", _data)


@_register_verb()
def extract(
    data,
    col: str | int,
    into,
    regex: str = r"(\w+)",
    remove: bool = True,
    convert=False,
) -> Any:
    """Given a regular expression with capturing groups, extract() turns each
    group into a new column. If the groups don't match, or the input is NA,
    the output will be NA.

    See https://tidyr.tidyverse.org/reference/extract.html

    Args:
        data: The dataframe
        col: Column name or position.
        into: Names of new variables to create as character vector.
            Use None to omit the variable in the output.
        regex: a regular expression used to extract the desired values.
            There should be one group (defined by ()) for each element of into.
        remove: If TRUE, remove input column from output data frame.
        convert: The universal type for the extracted columns or a dict for
            individual ones

    Returns:
        Dataframe with extracted columns.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("extract", data)


@_register_verb()
def fill(
    _data,
    *columns: str | int,
    _direction: str = "down",
) -> Any:
    """Fills missing values in selected columns using the next or
    previous entry.

    See https://tidyr.tidyverse.org/reference/fill.html

    Args:
        _data: A dataframe
        *columns: Columns to fill
        _direction: Direction in which to fill missing values.
            Currently either "down" (the default), "up",
            "downup" (i.e. first down and then up) or
            "updown" (first up and then down).

    Returns:
        The dataframe with NAs being replaced.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("fill", _data)


@_register_verb()
def pivot_longer(
    _data,
    cols,
    names_to="name",
    names_prefix: str | None = None,
    names_sep: str | None = None,
    names_pattern: str | None = None,
    names_dtypes=None,
    names_transform: _Callable | _Mapping[str, _Callable] | None = None,
    names_repair="check_unique",
    values_to: str = "value",
    values_drop_na: bool = False,
    values_dtypes=None,
    values_transform: _Callable | _Mapping[str, _Callable] | None = None,
) -> Any:
    """ "lengthens" data, increasing the number of rows and
    decreasing the number of columns.

    The row order is a bit different from `tidyr` and `pandas.DataFrame.melt`.
        >>> df = tibble(x=c[1:2], y=c[3:4])
        >>> pivot_longer(df, f[f.x:f.y])
        >>> #    name   value
        >>> # 0  x      1
        >>> # 1  x      2
        >>> # 2  y      3
        >>> # 3  y      4
    But with `tidyr::pivot_longer`, the output will be:
        >>> # # A tibble: 4 x 2
        >>> # name  value
        >>> # <chr> <int>
        >>> # 1 x   1
        >>> # 2 y   3
        >>> # 3 x   2
        >>> # 4 y   4

    Args:
        _data: A data frame to pivot.
        cols: Columns to pivot into longer format.
        names_to: A string specifying the name of the column to create from
            the data stored in the column names of data.
            Can be a character vector, creating multiple columns, if names_sep
            or names_pattern is provided. In this case, there are two special
            values you can take advantage of:
            - `None`/`NA`/`NULL` will discard that component of the name.
            - `.value`/`_value` indicates that component of the name defines
                the name of the column containing the cell values,
                overriding values_to.
            - Different as `tidyr`: With `.value`/`_value`, if there are other
              parts of the names to distinguish the groups, they must be
              captured. For example, use `r'(\\w)_(\\d)'` to match `'a_1'` and
              `['.value', NA]` to discard the suffix, instead of use
              `r'(\\w)_\\d'` to match.
        names_prefix: A regular expression used to remove matching text from
            the start of each variable name.
        names_sep: and
        names_pattern: If names_to contains multiple values,
            these arguments control how the column name is broken up.
            names_sep takes the same specification as separate(), and
            can either be a numeric vector (specifying positions to break on),
            or a single string (specifying a regular expression to split on).
            names_pattern takes the same specification as extract(),
            a regular expression containing matching groups (()).
        names_dtypes: and
        values_dtypes: A list of column name-prototype pairs.
            A prototype (or dtypes for short) is a zero-length vector
            (like integer() or numeric()) that defines the type, class, and
            attributes of a vector. Use these arguments if you want to confirm
            that the created columns are the types that you expect.
            Note that if you want to change (instead of confirm) the types
            of specific columns, you should use names_transform or
            values_transform instead.
        names_transform: and
        values_transform: A list of column name-function pairs.
            Use these arguments if you need to change the types of
            specific columns. For example,
            names_transform = dict(week = as.integer) would convert a
            character variable called week to an integer.
            If not specified, the type of the columns generated from names_to
            will be character, and the type of the variables generated from
            values_to will be the common type of the input columns used to
            generate them.
        names_repair: treatment of problematic column names:
            - "minimal": No name repair or checks, beyond basic existence,
            - "unique": Make sure names are unique and not empty,
            - "check_unique": (default value), no name repair,
                but check they are unique,
            - "universal": Make the names unique and syntactic
            - a function: apply custom name repair
            NOTE(review): this argument was previously also described as
            "Not supported yet"; confirm against the backend in use.
        values_to: A string specifying the name of the column to create from
            the data stored in cell values. If names_to is a character
            containing the special `.value`/`_value` sentinel, this value
            will be ignored, and the name of the value column will be derived
            from part of the existing column names.
        values_drop_na: If TRUE, will drop rows that contain only NAs in
            the value_to column. This effectively converts explicit missing
            values to implicit missing values, and should generally be used
            only when missing values in data were created by its structure.

    Returns:
        The pivoted dataframe.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("pivot_longer", _data)


@_register_verb()
def pivot_wider(
    _data,
    id_cols=None,
    names_from="name",
    names_prefix: str = "",
    names_sep: str = "_",
    names_glue: str | None = None,
    names_sort: bool = False,
    # names_repair: str = "check_unique", # todo
    values_from="value",
    values_fill=None,
    values_fn: _Callable | _Mapping[str, _Callable] | None = None,
) -> Any:
    """ "widens" data, increasing the number of columns and decreasing
    the number of rows.

    Note that `names_repair` is not accepted by this signature yet (todo).

    Args:
        _data: A data frame to pivot.
        id_cols: A set of columns that uniquely identifies each observation.
            Defaults to all columns in data except for the columns specified
            in names_from and values_from.
        names_from: and
        values_from: A pair of arguments describing which column
            (or columns) to get the name of the output column (names_from),
            and which column (or columns) to get the cell values from
            (values_from).
        names_prefix: String added to the start of every variable name.
        names_sep: If names_from or values_from contains multiple variables,
            this will be used to join their values together into a single
            string to use as a column name.
        names_glue: Instead of names_sep and names_prefix, you can supply
            a glue specification that uses the names_from columns
            (and special _value) to create custom column names.
        names_sort: Should the column names be sorted? If FALSE, the default,
            column names are ordered by first appearance.
        values_fill: Optionally, a (scalar) value that specifies what
            each value should be filled in with when missing.
        values_fn: Optionally, a function applied to the value in each cell
            in the output. You will typically use this when the combination
            of `id_cols` and value column does not uniquely identify
            an observation.
            This can be a dict you want to apply different aggregations to
            different value columns.
            If not specified, will be `numpy.mean`

    Returns:
        The pivoted dataframe.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("pivot_wider", _data)


@_register_verb()
def separate(
    data,
    col: int | str,
    into,
    sep: int | str = r"[^0-9A-Za-z]+",
    remove: bool = True,
    convert=False,
    extra: str = "warn",
    fill: str = "warn",
) -> Any:
    """Given either a regular expression or a vector of character positions,
    turns a single character column into multiple columns.

    Args:
        data: The dataframe
        col: Column name or position.
        into: Names of new variables to create as character vector.
            Use `None`/`NA`/`NULL` to omit the variable in the output.
        sep: Separator between columns.
            If str, `sep` is interpreted as a regular expression.
            The default value is a regular expression that matches
            any sequence of non-alphanumeric values.
            If int, `sep` is interpreted as character positions to split at.
        remove: If TRUE, remove input column from output data frame.
        convert: The universal type for the extracted columns or a dict for
            individual ones
            Note that when given `TRUE`, `DataFrame.convert_dtypes()` is called,
            but it will not convert `str` to other types
            (For example, `'1'` to `1`). You have to specify the dtype yourself.
        extra: If sep is a character vector, this controls what happens when
            there are too many pieces. There are three valid options:
            - "warn" (the default): emit a warning and drop extra values.
            - "drop": drop any extra values without a warning.
            - "merge": only splits at most length(into) times
        fill: If sep is a character vector, this controls what happens when
            there are not enough pieces. There are three valid options:
            - "warn" (the default): emit a warning and fill from the right
            - "right": fill with missing values on the right
            - "left": fill with missing values on the left

    Returns:
        Dataframe with separated columns.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("separate", data)


@_register_verb()
def separate_rows(
    data,
    *columns: str,
    sep: str = r"[^0-9A-Za-z]+",
    convert=False,
) -> Any:
    """Separates the values and places each one in its own row.

    Args:
        data: The dataframe
        *columns: The columns to separate on
        sep: Separator between columns.
        convert: The universal type for the extracted columns or a dict for
            individual ones

    Returns:
        Dataframe with rows separated and repeated.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("separate_rows", data)


@_register_verb()
def uncount(
    data,
    weights,
    _remove: bool = True,
    _id: str | None = None,
) -> Any:
    """Duplicating rows according to a weighting variable

    Args:
        data: A data frame
        weights: A vector of weights. Evaluated in the context of data
        _remove: If TRUE, and weights is the name of a column in data,
            then this column is removed.
        _id: Supply a string to create a new variable which gives a
            unique identifier for each created row (0-based).

    Returns:
        dataframe with rows repeated.

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("uncount", data)


@_register_verb()
def unite(
    data,
    col: str,
    *columns: str | int,
    sep: str = "_",
    remove: bool = True,
    na_rm: bool = True,
) -> Any:
    """Unite multiple columns into one by pasting strings together

    Args:
        data: A data frame.
        col: The name of the new column, as a string or symbol.
        *columns: Columns to unite
        sep: Separator to use between values.
        remove: If True, remove input columns from output data frame.
        na_rm: If True, missing values will be removed prior to uniting
            each value.

    Returns:
        The dataframe with selected columns united

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("unite", data)


@_register_verb()
def replace_na(
    data,
    data_or_replace=None,
    replace=None,
) -> Any:
    """Replace NA with a value

    This function can be also used not as a verb. As a function called as
    an argument in a verb, data is passed implicitly. Then one could
    pass data_or_replace as the data to replace.

    Args:
        data: The data piped in
        data_or_replace: When called as argument of a verb, this is the
            data to replace. Otherwise this is the replacement.
        replace: The value to replace with
            Can only be a scalar or dict for data frame.
            So replace NA with a list is not supported yet.

    Returns:
        Corresponding data with NAs replaced

    Raises:
        NotImplementedByCurrentBackendError: Always, in this default
            definition; the actual implementation is expected to be
            registered by a backend.
    """
    raise _NotImplementedByCurrentBackendError("replace_na", data)