Source code for dataiter.regex

# -*- coding: utf-8 -*-

# Copyright (c) 2025 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import numpy as np
import re

from dataiter import dtypes
from dataiter import util
from dataiter import Vector
from numpy.dtypes import StringDType

def _prep(string, dtype, default):
    """
    Return an output array and a missing-value mask for `string`.

    The output array is created with ``np.full_like`` from `string`, using
    `dtype` and filled with `default`. The mask is the result of comparing
    `string` against ``dtypes.string.na_object``.
    """
    # Internal invariants: callers hand over a NumPy array of StringDType.
    assert isinstance(string, np.ndarray)
    assert isinstance(string.dtype, StringDType)
    return (
        np.full_like(string, fill_value=default, dtype=dtype),
        string == dtypes.string.na_object,
    )

def findall(pattern, string, flags=0):
    """
    Return a list of all matches of `pattern` in `string`.

    A scalar `string` is passed straight to ``re.findall``. Otherwise
    ``re.findall`` is applied to each non-missing element and the lists
    are returned as an object vector; missing elements are ``None``.

    https://docs.python.org/3/library/re.html#re.findall

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.findall(r"[a-z]", x)
    """
    def find(text):
        return re.findall(pattern, text, flags=flags)

    if util.is_scalar(string):
        return find(string)
    result, missing = _prep(string, object, None)
    # Only visit elements that are not missing; the rest keep the default.
    for idx in np.flatnonzero(~missing):
        result[idx] = find(string[idx])
    return Vector.fast(result, object)
def fullmatch(pattern, string, flags=0):
    """
    Match `pattern` against the whole of `string`.

    A scalar `string` is passed straight to ``re.fullmatch``, giving a
    ``re.Match`` object or ``None``. Otherwise ``re.fullmatch`` is applied
    to each non-missing element and the results are returned as an object
    vector; missing elements are ``None``.

    https://docs.python.org/3/library/re.html#re.fullmatch

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.fullmatch(r"[a-z]+", x)
    """
    def match_all(text):
        return re.fullmatch(pattern, text, flags=flags)

    if util.is_scalar(string):
        return match_all(string)
    result, missing = _prep(string, object, None)
    # Only visit elements that are not missing; the rest keep the default.
    for idx in np.flatnonzero(~missing):
        result[idx] = match_all(string[idx])
    return Vector.fast(result, object)
def match(pattern, string, flags=0):
    """
    Match `pattern` at the beginning of `string`.

    A scalar `string` is passed straight to ``re.match``, giving a
    ``re.Match`` object or ``None``. Otherwise ``re.match`` is applied to
    each non-missing element and the results are returned as an object
    vector; missing elements are ``None``.

    https://docs.python.org/3/library/re.html#re.match

    >>> x = di.Vector(["asdf", "1234"])
    >>> regex.match(r"[a-z]", x)
    """
    def match_start(text):
        return re.match(pattern, text, flags=flags)

    if util.is_scalar(string):
        return match_start(string)
    result, missing = _prep(string, object, None)
    # Only visit elements that are not missing; the rest keep the default.
    for idx in np.flatnonzero(~missing):
        result[idx] = match_start(string[idx])
    return Vector.fast(result, object)
def split(pattern, string, maxsplit=0, flags=0):
    """
    Split `string` by occurrences of `pattern`.

    A scalar `string` is passed straight to ``re.split``, giving a list.
    Otherwise ``re.split`` is applied to each non-missing element and the
    lists are returned as an object vector; missing elements are ``None``.

    https://docs.python.org/3/library/re.html#re.split

    >>> x = di.Vector(["one two three", "four"])
    >>> regex.split(r" +", x)
    """
    def cut(text):
        return re.split(pattern, text, maxsplit=maxsplit, flags=flags)

    if util.is_scalar(string):
        return cut(string)
    result, missing = _prep(string, object, None)
    # Only visit elements that are not missing; the rest keep the default.
    for idx in np.flatnonzero(~missing):
        result[idx] = cut(string[idx])
    return Vector.fast(result, object)
def sub(pattern, repl, string, count=0, flags=0):
    """
    Replace occurrences of `pattern` in `string` with `repl`.

    A scalar `string` is passed straight to ``re.sub``. Otherwise
    ``re.sub`` is applied to each non-missing element and the results are
    returned as a string vector; missing elements stay missing.

    https://docs.python.org/3/library/re.html#re.sub

    >>> x = di.Vector(["great", "fantastic"])
    >>> regex.sub(r"$", r"!", x)
    """
    def replace(text):
        return re.sub(pattern, repl, text, count=count, flags=flags)

    if util.is_scalar(string):
        return replace(string)
    # Output starts out all-missing; only non-missing inputs are overwritten.
    result, missing = _prep(string, dtypes.string, dtypes.string.na_object)
    for idx in np.flatnonzero(~missing):
        result[idx] = replace(string[idx])
    return Vector.fast(result, str)
def subn(pattern, repl, string, count=0, flags=0):
    """
    Replace occurrences of `pattern` in `string` and count replacements.

    A scalar `string` is passed straight to ``re.subn``, giving a tuple of
    the new string and the number of substitutions made. Otherwise
    ``re.subn`` is applied to each non-missing element and the tuples are
    returned as an object vector; missing elements are ``None``.

    https://docs.python.org/3/library/re.html#re.subn

    >>> x = di.Vector(["great", "fantastic"])
    >>> regex.subn(r"$", r"!", x)
    """
    def replace_counted(text):
        return re.subn(pattern, repl, text, count=count, flags=flags)

    if util.is_scalar(string):
        return replace_counted(string)
    result, missing = _prep(string, object, None)
    # Only visit elements that are not missing; the rest keep the default.
    for idx in np.flatnonzero(~missing):
        result[idx] = replace_counted(string[idx])
    return Vector.fast(result, object)