Source code for dataiter.regex

# -*- coding: utf-8 -*-

# Copyright (c) 2025 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import numpy as np
import re

from dataiter import dtypes
from dataiter import util
from dataiter import Vector
from numpy.dtypes import StringDType

def _prep(string, dtype, default):
    assert isinstance(string, np.ndarray)
    assert isinstance(string.dtype, StringDType)
    out = np.full_like(string, default, dtype)
    na = string == dtypes.string.na_object
    return out, na


[docs]
def findall(pattern, string, flags=0):
    """
    Return a list of matches of `pattern` in `string`.

    https://docs.python.org/3/library/re.html#re.findall

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.findall(r"[a-z]", x)
    """
    if util.is_scalar(string):
        return re.findall(pattern, string, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.findall(pattern, string[i], flags=flags)
    return Vector.fast(out, object)



[docs]
def fullmatch(pattern, string, flags=0):
    """
    Return a ``re.Match`` object or ``None``.

    https://docs.python.org/3/library/re.html#re.fullmatch

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.fullmatch(r"[a-z]+", x)
    """
    if util.is_scalar(string):
        return re.fullmatch(pattern, string, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.fullmatch(pattern, string[i], flags=flags)
    return Vector.fast(out, object)



[docs]
def match(pattern, string, flags=0):
    """
    Return a ``re.Match`` object or ``None``.

    https://docs.python.org/3/library/re.html#re.match

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.match(r"[a-z]", x)
    """
    if util.is_scalar(string):
        return re.match(pattern, string, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.match(pattern, string[i], flags=flags)
    return Vector.fast(out, object)



[docs]
def search(pattern, string, flags=0):
    """
    Return a ``re.Match`` object or ``None``.

    https://docs.python.org/3/library/re.html#re.search

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.search(r"[a-z]", x)
    """
    if util.is_scalar(string):
        return re.search(pattern, string, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.search(pattern, string[i], flags=flags)
    return Vector.fast(out, object)



[docs]
def split(pattern, string, maxsplit=0, flags=0):
    """
    Return a list of `string` split by `pattern`.

    https://docs.python.org/3/library/re.html#re.split

    >>> x = di.Vector(["one two three", "four"])
    >>> regex.split(r" +", x)
    """
    if util.is_scalar(string):
        return re.split(pattern, string, maxsplit=maxsplit, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.split(pattern, string[i], maxsplit=maxsplit, flags=flags)
    return Vector.fast(out, object)



[docs]
def sub(pattern, repl, string, count=0, flags=0):
    """
    Return `string` with instances of `pattern` replaced with `repl`.

    https://docs.python.org/3/library/re.html#re.sub

    >>> x = di.Vector(["great", "fantastic"])
    >>> regex.sub(r"$", r"!", x)
    """
    if util.is_scalar(string):
        return re.sub(pattern, repl, string, count=count, flags=flags)
    out, na = _prep(string, dtypes.string, dtypes.string.na_object)
    for i in np.flatnonzero(~na):
        out[i] = re.sub(pattern, repl, string[i], count=count, flags=flags)
    return Vector.fast(out, str)



[docs]
def subn(pattern, repl, string, count=0, flags=0):
    """
    Return `string`, count of instances of `pattern` replaced with `repl`.

    https://docs.python.org/3/library/re.html#re.subn

    >>> x = di.Vector(["great", "fantastic"])
    >>> regex.subn(r"$", r"!", x)
    """
    if util.is_scalar(string):
        return re.subn(pattern, repl, string, count=count, flags=flags)
    out, na = _prep(string, object, None)
    for i in np.flatnonzero(~na):
        out[i] = re.subn(pattern, repl, string[i], count=count, flags=flags)
    return Vector.fast(out, object)