# Source code for `regexr.string`

r"""Regular expressions for humans

Predefined patterns:

    START = Raw("^", entire=True)
    START_OF_STRING = Raw(r"\A", entire=True)
    END = Raw("$", entire=True)
    END_OF_STRING = Raw(r"\Z", entire=True)
    NUMBER = DIGIT = Raw(r"\d", entire=True)
    NUMBERS = DIGITS = Raw(r"\d+", entire=True)
    MAYBE_NUMBERS = MAYBE_DIGITS = Raw(r"\d*", entire=True)
    NON_NUMBER = NON_DIGIT = Raw(r"\D", entire=True)
    WORD = Raw(r"\w", entire=True)
    WORDS = Raw(r"\w+", entire=True)
    MAYBE_WORDS = Raw(r"\w*", entire=True)
    NON_WORD = Raw(r"\W", entire=True)
    WORD_BOUNDARY = Raw(r"\b", entire=True)
    NON_WORD_BOUNDARY = Raw(r"\B", entire=True)
    WHITESPACE = Raw(r"\s", entire=True)
    WHITESPACES = Raw(r"\s+", entire=True)
    MAYBE_WHITESPACES = Raw(r"\s*", entire=True)
    NON_WHITESPACE = Raw(r"\S", entire=True)
    SPACE = Raw(" ", entire=True)
    SPACES = Raw(" +", entire=True)
    MAYBE_SPACES = Raw(" *", entire=True)
    TAB = Raw(r"\t", entire=True)
    DOT = Raw(r"\.", entire=True)
    ANYCHAR = Raw(".", entire=True)
    ANYCHARS = Raw(".+", entire=True)
    MAYBE_ANYCHARS = Raw(".*", entire=True)
    LETTER = Raw("[a-zA-Z]", entire=True)
    LETTERS = Raw("[a-zA-Z]+", entire=True)
    MAYBE_LETTERS = Raw("[a-zA-Z]*", entire=True)
    LOWERCASE = Raw("[a-z]", entire=True)
    LOWERCASES = Raw("[a-z]+", entire=True)
    MAYBE_LOWERCASES = Raw("[a-z]*", entire=True)
    UPPERCASE = Raw("[A-Z]", entire=True)
    UPPERCASES = Raw("[A-Z]+", entire=True)
    MAYBE_UPPERCASES = Raw("[A-Z]*", entire=True)
    ALNUM = Raw("[a-zA-Z0-9]", entire=True)
    ALNUMS = Raw("[a-zA-Z0-9]+", entire=True)
    MAYBE_ALNUMS = Raw("[a-zA-Z0-9]*", entire=True)
"""

from __future__ import annotations

import re
from abc import ABC, abstractmethod, abstractproperty
from typing import Sequence, Union

# A piece of a pattern: either a `Segment` instance or a plain string.
SegmentType = Union["Segment", str]


def _flags_to_str(flags: int | str | Sequence[int | str]) -> str:
    """Convert a set of flags to a string

    >>> _flags_to_str(re.IGNORECASE)  # 'i'
    >>> _flags_to_str(re.IGNORECASE | re.MULTILINE)  # 'im'
    """
    if flags is None:
        return ""

    if isinstance(flags, Sequence) and not isinstance(flags, str):
        return "".join(_flags_to_str(flag) for flag in flags)

    if isinstance(flags, int):
        out = []
        if flags & re.A:
            out.append("a")
        if flags & re.I:
            out.append("i")
        if flags & re.L:
            out.append("L")
        if flags & re.M:
            out.append("m")
        if flags & re.S:
            out.append("s")
        if flags & re.U:
            out.append("u")
        if flags & re.X:
            out.append("x")
        return "".join(out)

    # str
    if any(c not in "aiLmsux" for c in flags):
        raise ValueError(
            f"Invalid flag: {flags}, must be a subset of 'aiLmsux'"
        )
    return flags


class Segment(ABC):
    """Segments of a regular expression

    ClassVars:
        NONCAPTURING_WRAPPING: Whether we should wrap the segment with brackets
            when `capture` is `False`.
            In some cases, for example, `(abc)+` is already an entire group, it
            won't confuse the parser when it comes with other segments, such as
            `(abc)+d`. We don't need extra brackets to separate it from other
            segments. However, we need brackets for other segments, such as
            `a|b|c`, because `a|b|cd` will confuse the parser. In such a case,
            we need `(?:a|b|c)d` if we don't need to capture the segment.

    Args:
        args: Other segments (or plain strings) to be wrapped by this one.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If `capture` is a string that is not a valid identifier,
            or if the same flag appears in both `flags` and `deflags`.
    """

    __slots__ = ("args", "capture", "flags")

    NONCAPTURING_WRAPPING = True

    def __init__(
        self,
        *args: SegmentType,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        """Constructor"""
        if isinstance(capture, str) and not capture.isidentifier():
            raise ValueError(f"Invalid capture name: {capture}")

        flags = _flags_to_str(flags)
        deflags = _flags_to_str(deflags)
        for flag in flags:
            if flag in deflags:
                raise ValueError(
                    f"Flag `{flag}` turned on and off"
                )

        if deflags:
            deflags = f"-{deflags}"

        self.args = args
        self.capture = capture
        # Stored already in inline form, e.g. "i", "-m" or "i-m".
        self.flags = f"{flags}{deflags}"

    def _str_raw(self) -> str:
        """Stringify this segment, without capturing/non-capturing brackets

        Nested segments are stringified as they are; plain strings are
        escaped with `re.escape`.

        Returns:
            str: The stringified segment.
        """
        return "".join(
            str(part) if isinstance(part, Segment) else re.escape(part)
            for part in self.args
        )

    def _pretty_raw(self, indent: str) -> str:
        """Pretty string representation of this segment, without
        capturing/non-capturing brackets

        Each argument is rendered on its own line.
        """
        out = []
        for arg in self.args:
            if isinstance(arg, Segment):
                out.append(arg.pretty(indent, 0))
            else:  # str
                out.append(re.escape(arg))

        return "\n".join(out)

    def pretty(self, indent: str, level: int) -> str:
        """Pretty print this segment, depending on `capture`

        Args:
            indent: The indent string.
            level: The indent level.

        Returns:
            The segment rendered with every line prefixed by `indent * level`;
            when the content spans several lines, it is placed between the
            opening and closing brackets and indented one level deeper.
        """
        arg = self._pretty_raw(indent)
        prefix = indent * level
        if (
            not self.capture
            and not self.NONCAPTURING_WRAPPING
            and not self.flags
        ):
            # Nothing to wrap with: just indent the content.
            return "\n".join(f"{prefix}{line}" for line in arg.splitlines())

        flags_open = f"(?{self.flags}:" if self.flags else ""
        flags_close = ")" if self.flags else ""

        if self.capture:
            capture_open = (
                "(" if self.capture is True else f"(?P<{self.capture}>"
            )
            opening = f"{capture_open}{flags_open}"
            closing = f"{flags_close})"
        elif self.flags:
            # The inline-flag group already acts as a non-capturing group.
            opening, closing = flags_open, flags_close
        else:  # NONCAPTURING_WRAPPING is on and there are no flags
            opening, closing = "(?:", ")"

        if "\n" not in arg:
            # Single line: the indent goes in front of the whole expression
            # only, never between the content and the closing bracket.
            return f"{prefix}{opening}{arg}{closing}"

        return "\n".join(
            (
                f"{prefix}{opening}",
                *(f"{prefix}{indent}{line}" for line in arg.splitlines()),
                f"{prefix}{closing}",
            )
        )

    def __str__(self) -> str:
        """String representation of this segment, depending on
        `capture`

        Returns:
            The final string representation of this segment.
        """
        arg = self._str_raw()
        if self.flags:
            arg = f"(?{self.flags}:{arg})"
        if self.capture is True:
            return f"({arg})"
        if self.capture:
            return f"(?P<{self.capture}>{arg})"
        if self.__class__.NONCAPTURING_WRAPPING and not self.flags:
            return f"(?:{arg})"
        return arg


# Character classes
class CharClass(Segment, ABC):
    """Used to indicate a set of characters wrapped by `[]`"""

    NONCAPTURING_WRAPPING = False

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form; the same as the compact form for character classes"""
        return self._str_raw()

    def _str_raw(self) -> str:
        """Stringify this segment, without capturing/non-capturing brackets

        Unlike `Segment._str_raw`, plain strings are NOT escaped here: every
        argument is converted with `str()` and concatenated as is.

        Returns:
            str: The stringified segment.
        """
        return "".join(str(part) for part in self.args)


class OneOfChars(CharClass):
    """Positive character set `[...]`"""

    def _str_raw(self) -> str:
        """Wrap the concatenated members with `[` and `]`"""
        return f"[{super()._str_raw()}]"


class NoneOfChars(CharClass):
    """Negative character set `[^...]`"""

    def _str_raw(self) -> str:
        """Wrap the concatenated members with `[^` and `]`"""
        return f"[^{super()._str_raw()}]"


# Look ahead/behind
class Look(Segment, ABC):
    """Look ahead or behind

    ClassVars:
        PREFIX: The marker written right after the opening bracket; subclasses
            override it to select the kind of lookaround.
    """

    NONCAPTURING_WRAPPING = False
    PREFIX = "?="

    def _str_raw(self) -> str:
        """Compact form: `(<PREFIX>...)`"""
        return f"({self.__class__.PREFIX}{super()._str_raw()})"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form: on one line when the content is a single line,
        otherwise the content is indented between the brackets
        """
        prettied = super()._pretty_raw(indent)
        if "\n" not in prettied:
            return f"({self.__class__.PREFIX}{prettied})"

        return "\n".join([
            f"({self.__class__.PREFIX}",
            *(f"{indent}{line}" for line in prettied.splitlines()),
            ")",
        ])


class LookAhead(Look):
    """Look ahead `(?=...)`; uses the default `PREFIX` of `Look`"""


class LookBehind(Look):
    """Look behind `(?<=...)`"""

    PREFIX = "?<="


class LookAheadNot(Look):
    """Look ahead not `(?!...)`"""

    PREFIX = "?!"


class LookBehindNot(Look):
    """Look behind not `(?<!...)`"""

    PREFIX = "?<!"


# Quantifiers


class Quantifier(Segment, ABC):
    """Quantifier `+`, `*`, `?`, `{m}` or `{m,n}`

    Args:
        args: The segments (or plain strings) to be quantified.
        lazy: Whether to append `?` to make the quantifier non-greedy.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If no positional argument is given.
    """

    __slots__ = ("lazy",)
    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        *args: SegmentType,
        lazy: bool = False,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        if not args:
            # Fail early with a clear message instead of an IndexError when
            # the segment is stringified later.
            raise ValueError(
                f"`{type(self).__name__}` must have at least one "
                "positional argument"
            )
        super().__init__(*args, capture=capture, flags=flags, deflags=deflags)
        self.lazy = lazy

    @property
    @abstractmethod
    def _quantifier(self) -> str:
        """The quantifier to quantify the pattern"""

    def _needs_grouping(self) -> bool:
        """Tell whether the quantified content must be wrapped with `(?:...)`
        so that the quantifier applies to all of it
        """
        if len(self.args) > 1:
            return True

        target = self.args[0]
        if isinstance(target, str):
            # A single character can be quantified directly.
            return len(target) > 1
        if not isinstance(target, Segment):
            return False
        if target.capture:
            # A capturing segment stringifies with its own brackets.
            return False
        if isinstance(target, Raw):
            return not target.entire
        if isinstance(target, (Capture, Captured, CharClass)):
            return False
        # Segments that wrap themselves with `(?:...)` need nothing more.
        return not target.NONCAPTURING_WRAPPING

    def _str_raw(self) -> str:
        """Compact form: the (possibly grouped) content followed by the
        quantifier and, when `lazy` is set, by `?`
        """
        qmark = "?" if self.lazy else ""
        body = super()._str_raw()
        if self._needs_grouping():
            body = f"(?:{body})"
        return f"{body}{self._quantifier}{qmark}"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form; quantified content is kept on one line"""
        return self._str_raw()


class ZeroOrMore(Quantifier):
    """`*` zero or more times"""

    _quantifier = "*"


class OneOrMore(Quantifier):
    """`+` one or more times"""

    _quantifier = "+"


class Maybe(Quantifier):
    """`?` zero or one times"""

    _quantifier = "?"


class Repeat(Quantifier):
    """Match from `m` to `n` repetitions `{m,n}` or `{m,}`

    Args:
        args: The segments (or plain strings) to be quantified.
        m: The minimum number of repetitions; must not be negative.
        n: The maximum number of repetitions; `None` for no upper bound.
        lazy: Whether to make the quantifier non-greedy.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If `m` is negative or `n` is smaller than `m`.
    """

    __slots__ = ("m", "n")

    def __init__(
        self,
        *args: SegmentType,
        m: int,
        n: int = None,
        lazy: bool = False,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        super().__init__(
            *args,
            capture=capture,
            flags=flags,
            deflags=deflags,
            lazy=lazy,
        )
        if m < 0:
            # Zero is accepted, so the requirement is "non-negative".
            raise ValueError("`m` must be non-negative for `Repeat`")
        if n is not None and n < m:
            raise ValueError(
                "`n` must be greater than or equal to `m` for `Repeat`"
            )
        self.m = m
        self.n = n

    @property
    def _quantifier(self) -> str:
        """`{m,n}`, or `{m,}` when there is no upper bound"""
        n = "" if self.n is None else self.n
        return f"{{{self.m},{n}}}"


class RepeatExact(Quantifier):
    """Match exact `m` repetitions `{m}`

    Args:
        args: The segments (or plain strings) to be quantified.
        m: The number of repetitions; must be greater than 0.
        lazy: Whether to append `?` after the quantifier.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If `m` is not greater than 0.
    """

    __slots__ = ("m",)

    def __init__(
        self,
        *args: SegmentType,
        m: int,
        lazy: bool = False,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        super().__init__(
            *args,
            capture=capture,
            flags=flags,
            deflags=deflags,
            lazy=lazy,
        )
        if m <= 0:
            raise ValueError("`m` must be greater than 0 for `RepeatExact`")
        self.m = m

    @property
    def _quantifier(self) -> str:
        """`{m}`"""
        return f"{{{self.m}}}"


class Lazy(Segment):
    """Non-greedy modifier `+?`, `*?`, `??`, `{m,}?` or `{m,n}?`

    Args:
        args: Exactly one item: a `Quantifier`, a `Raw`, or a string ending
            with a quantifier (`+`, `*`, `?`, `{m}`, `{m,}` or `{m,n}`).
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If the number of positional arguments is not one, or the
            argument is not recognized as a quantifier.
    """

    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        *args: SegmentType,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        if len(args) != 1:
            raise ValueError("`Lazy` must have exactly one positional argument")

        if not self._is_quantified(args[0]):
            raise ValueError("`Lazy` must be applied to a quantifier")

        super().__init__(*args, capture=capture, flags=flags, deflags=deflags)

    @staticmethod
    def _is_quantified(target: SegmentType) -> bool:
        """Tell whether `target` can take the lazy `?` modifier"""
        if isinstance(target, (Quantifier, Raw)):
            return True
        if isinstance(target, str):
            # The quantifier has to sit at the END of the string; an empty
            # string simply does not match instead of raising IndexError.
            # NOTE(review): plain strings are escaped by `Segment._str_raw`,
            # so a trailing quantifier in a string ends up escaped in the
            # output - confirm whether strings should be accepted here.
            return bool(
                re.search(r"(?:[+*?]|\{\d+(?:,\d*)?\})\Z", target)
            )
        return False

    def _str_raw(self) -> str:
        """Compact form: the quantified content followed by `?`"""
        return f"{super()._str_raw()}?"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form: the prettied content followed by `?`"""
        return f"{super()._pretty_raw(indent)}?"


class Flag(Segment):
    """Flag `(?aiLmsux)`

    Args:
        args: The flags, as integer flag values from `re` or strings of
            flag letters.

    Raises:
        ValueError: If no flag is given, since `(?)` is not a valid pattern.
    """
    NONCAPTURING_WRAPPING = False

    def __init__(self, *args: str | int) -> None:
        letters = _flags_to_str(args)
        if not letters:
            raise ValueError("`Flag` must have at least one flag")
        # The letters are stored as the single argument of the segment.
        super().__init__(letters, capture=False)

    def _str_raw(self) -> str:
        """Compact form: `(?<letters>)`"""
        return f"(?{super()._str_raw()})"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form; the same as the compact form"""
        return self._str_raw()


class InlineFlag(Segment):
    """Inline flag `(?aiLmsux-imsx:...)`

    Args:
        args: The segments (or plain strings) the flags apply to.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for the wrapped segments.
        deflags: The flags to be turned off for the wrapped segments.

    Raises:
        ValueError: If both `flags` and `deflags` are `None`.
    """

    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        *args: SegmentType,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        if flags is None and deflags is None:
            raise ValueError("`InlineFlag` must have `flags` or `deflags`")

        super().__init__(
            *args, capture=capture, flags=flags, deflags=deflags,
        )


class Raw(Segment):
    """Raw strings without escaping

    Args:
        args: The strings to be used as they are.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        entire: Whether the raw pattern should be treated as a single unit;
            `Quantifier` skips the `(?:...)` wrapping for such segments.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If any positional argument is not a string.
    """
    __slots__ = ("entire",)
    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        *args: SegmentType,
        capture: bool | str = False,
        entire: bool = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        if not all(isinstance(arg, str) for arg in args):
            raise ValueError("`Raw` must be applied to strings.")
        super().__init__(*args, capture=capture, flags=flags, deflags=deflags)
        self.entire = entire

    def _str_raw(self) -> str:
        """Concatenate the strings without escaping them"""
        # The constructor guarantees every argument is a string.
        return "".join(self.args)

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form; the same as the compact form"""
        return self._str_raw()


# Other segments
class Or(Segment):
    """`|` connected segments

    Plain-string alternatives are escaped with `re.escape`, the same way
    `Segment` treats strings; use `Raw` for alternatives that must be kept
    as they are.
    """

    @staticmethod
    def _escape(part: SegmentType) -> str:
        """Escape a plain string; stringify anything else"""
        return re.escape(part) if isinstance(part, str) else str(part)

    def _str_raw(self) -> str:
        """Compact form: the alternatives joined with `|`"""
        return "|".join(
            str(part) if isinstance(part, Segment) else self._escape(part)
            for part in self.args
        )

    def _pretty_raw(self, indent: str) -> str:
        """Pretty string representation of this segment, without
        capturing/non-capturing brackets

        The alternatives stay on one line unless one of them spans several
        lines; in that case each alternative starts on its own line.
        """
        out = [
            arg.pretty(indent, 0)
            if isinstance(arg, Segment)
            else self._escape(arg)
            for arg in self.args
        ]

        if any("\n" in pretty_str for pretty_str in out):
            return "\n|".join(out)
        return "|".join(out)


# Capture/non-capture
class Capture(Segment):
    """Capture a match `(...)`

    Args:
        args: The segments (or plain strings) to be captured.
        name: The name of the capture; used only when `capture` is not given.
        capture: The name of the capture or True to capture without name.
            When neither `name` nor `capture` is given, the group is
            captured without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If capturing ends up disabled (`False`), or the name is
            not a valid identifier.
    """

    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        *args: SegmentType,
        name: bool | str = None,
        capture: bool | str = None,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        """Constructor"""
        if capture is None:
            capture = True if name is None else name

        # Raised explicitly (not asserted) so the check survives `python -O`.
        if capture is False:
            raise ValueError("`Capture` cannot be used with capturing disabled")
        super().__init__(*args, capture=capture, flags=flags, deflags=deflags)


class NonCapture(Segment):
    """Non-capturing grouping `(?:...)`

    Args:
        args: The segments (or plain strings) to be grouped.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.
    """

    NONCAPTURING_WRAPPING = True

    def __init__(
        self,
        *args: SegmentType,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        """Constructor"""
        super().__init__(*args, capture=False, flags=flags, deflags=deflags)


class Concat(Segment):
    """Concatenate segments, without adding non-capturing brackets"""

    NONCAPTURING_WRAPPING = False


class Conditional(Segment):
    """`(?(...)yes|no)` conditional pattern

    Args:
        id_or_name: The group to test: a `Captured` segment, a group name
            or a group number.
        yes: The pattern to use when the group exists; plain strings are
            escaped.
        no: The optional pattern to use otherwise; plain strings are escaped.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If `capture` or a string `id_or_name` is not a valid
            identifier.
    """

    __slots__ = ("id_or_name", "yes", "no")
    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        id_or_name: Captured | str | int,
        yes: SegmentType,
        no: SegmentType = None,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        """Constructor"""
        if isinstance(capture, str) and not capture.isidentifier():
            raise ValueError(f"Invalid capture name: {capture}")
        if isinstance(id_or_name, str) and not id_or_name.isidentifier():
            raise ValueError(f"Invalid id or name: {id_or_name}")

        self.id_or_name = id_or_name
        self.yes = re.escape(yes) if isinstance(yes, str) else yes
        self.no = re.escape(no) if isinstance(no, str) else no
        super().__init__(capture=capture, flags=flags, deflags=deflags)

    def _group_ref(self) -> str | int:
        """The name or number of the tested group"""
        if isinstance(self.id_or_name, Captured):
            return self.id_or_name.id_or_name
        return self.id_or_name

    def _str_raw(self) -> str:
        """Compact form: `(?(group)yes|no)`, or `(?(group)yes)` without `no`"""
        no = f"|{self.no}" if self.no else ""

        return f"(?({self._group_ref()}){self.yes}{no})"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form: on one line when both branches are single lines,
        otherwise each branch is indented between the brackets
        """
        id_or_name = self._group_ref()
        yes = (
            self.yes.pretty(indent, 0)
            if isinstance(self.yes, Segment)
            else str(self.yes)
        )
        if not self.no:
            no = ""
        elif isinstance(self.no, Segment):
            no = f"|{self.no.pretty(indent, 0)}"
        else:
            no = f"|{self.no}"

        if "\n" not in yes and "\n" not in no:
            return f"(?({id_or_name}){yes}{no})"

        lines = [f"(?({id_or_name})"]
        lines.extend(f"{indent}{line}" for line in yes.split("\n"))
        if no:
            # Without a `no` branch there is nothing to print; do not emit
            # a line that holds only the indent.
            lines.extend(f"{indent}{line}" for line in no.split("\n"))
        lines.append(")")
        return "\n".join(lines)


class Captured(Segment):
    """`(?P=name)` captured group or \\1, \\2, ...

    Args:
        id_or_name: The name (string) or number (integer) of the group to
            refer to.
        capture: The name of the capture, False to disable capturing and
            True to capture without name.
        flags: The flags to be turned on for this segment.
        deflags: The flags to be turned off for this segment.

    Raises:
        ValueError: If `capture` or a string `id_or_name` is not a valid
            identifier.
    """

    __slots__ = ("id_or_name",)
    NONCAPTURING_WRAPPING = False

    def __init__(
        self,
        id_or_name: str | int,
        capture: bool | str = False,
        flags: int | str | Sequence[int | str] = None,
        deflags: int | str | Sequence[int | str] = None,
    ) -> None:
        """Constructor"""
        if isinstance(capture, str) and not capture.isidentifier():
            raise ValueError(f"Invalid capture name: {capture}")
        if isinstance(id_or_name, str) and not id_or_name.isidentifier():
            raise ValueError(f"Invalid id or name: {id_or_name}")

        self.id_or_name = id_or_name
        super().__init__(capture=capture, flags=flags, deflags=deflags)

    def _str_raw(self) -> str:
        """Compact form: `(?P=name)` for a name, a backslash followed by the
        number otherwise
        """
        if isinstance(self.id_or_name, str):
            return f"(?P={self.id_or_name})"
        return f"\\{self.id_or_name}"

    def _pretty_raw(self, indent: str) -> str:
        """Pretty form; the same as the compact form"""
        return self._str_raw()


# Predefined patterns
def _entire(pattern: str) -> Raw:
    """Build a `Raw` segment marked as an entire unit"""
    return Raw(pattern, entire=True)


# Anchors
START = _entire("^")
START_OF_STRING = _entire(r"\A")
END = _entire("$")
END_OF_STRING = _entire(r"\Z")

# Digits (each NUMBER* name and its DIGIT* alias share one object)
NUMBER = _entire(r"\d")
DIGIT = NUMBER
NUMBERS = _entire(r"\d+")
DIGITS = NUMBERS
MAYBE_NUMBERS = _entire(r"\d*")
MAYBE_DIGITS = MAYBE_NUMBERS
NON_NUMBER = _entire(r"\D")
NON_DIGIT = NON_NUMBER

# Word characters and boundaries
WORD = _entire(r"\w")
WORDS = _entire(r"\w+")
MAYBE_WORDS = _entire(r"\w*")
NON_WORD = _entire(r"\W")
WORD_BOUNDARY = _entire(r"\b")
NON_WORD_BOUNDARY = _entire(r"\B")

# Whitespace
WHITESPACE = _entire(r"\s")
WHITESPACES = _entire(r"\s+")
MAYBE_WHITESPACES = _entire(r"\s*")
NON_WHITESPACE = _entire(r"\S")
SPACE = _entire(" ")
SPACES = _entire(" +")
MAYBE_SPACES = _entire(" *")
TAB = _entire(r"\t")

# Dot and any character
DOT = _entire(r"\.")
ANYCHAR = _entire(".")
ANYCHARS = _entire(".+")
MAYBE_ANYCHARS = _entire(".*")

# ASCII letters and alphanumerics
LETTER = _entire("[a-zA-Z]")
LETTERS = _entire("[a-zA-Z]+")
MAYBE_LETTERS = _entire("[a-zA-Z]*")
LOWERCASE = _entire("[a-z]")
LOWERCASES = _entire("[a-z]+")
MAYBE_LOWERCASES = _entire("[a-z]*")
UPPERCASE = _entire("[A-Z]")
UPPERCASES = _entire("[A-Z]+")
MAYBE_UPPERCASES = _entire("[A-Z]*")
ALNUM = _entire("[a-zA-Z0-9]")
ALNUMS = _entire("[a-zA-Z0-9]+")
MAYBE_ALNUMS = _entire("[a-zA-Z0-9]*")

# The factory is only needed while the constants are built.
del _entire


# Main class
class Regexr(str):
    """The entrance of the package to compose a regular expression

    It is actually a subclass of `str`, but with an extra method `compile`,
    which compiles the regular expression and returns a `re.Pattern` object.

    Args:
        *segments: The segments of the regular expression.
            When composing the regular expression, the segments are
            concatenated; plain strings are escaped.

    Raises:
        ValueError: If a `Flag` segment comes after a non-`Flag` segment.
    """

    __slots__ = ("_segments",)

    def __new__(cls, *segments: SegmentType) -> Regexr:
        # Global flags are only valid as a leading run: reject a `Flag`
        # that shows up once any other segment has been seen.
        seen_other = False
        for part in segments:
            if not isinstance(part, Flag):
                seen_other = True
            elif seen_other:
                raise ValueError("Flags must be the first segment.")

        regexr = str.__new__(
            cls,
            "".join(
                re.escape(part) if isinstance(part, str) else str(part)
                for part in segments
            )
        )
        # Kept so that `pretty()` can render the segments individually.
        regexr._segments = segments  # type: ignore
        return regexr

    def compile(self, flags: int = 0) -> re.Pattern:
        """Compile the regular expression and return a `re.Pattern` object

        See also `re.compile()`

        Args:
            flags: The flags to be used when compiling the regular expression.
        """
        return re.compile(self, flags=flags)

    def pretty(self, indent: str = "  ") -> str:
        """Pretty print the regular expression

        Args:
            indent: The indent string used for nested segments.

        Returns:
            The segments rendered one after another, separated by newlines;
            plain strings are escaped.
        """
        return "\n".join(
            part.pretty(indent, level=0)
            if isinstance(part, Segment)
            else re.escape(part)
            for part in self._segments  # type: ignore
        )

    def __repr__(self) -> str:
        """String representation of this regular expression"""
        return f"<Regexr: r'{self}'>"