Source code for dataiter.vector

# -*- coding: utf-8 -*-

# Copyright (c) 2020 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import dataiter
import datetime
import numpy as np
import sys

from dataiter import util
from math import inf

TYPE_CONVERSIONS = {
    datetime.date: "datetime64[D]",
    datetime.datetime: "datetime64[us]",
}

class Vector(np.ndarray):

    """
    A one-dimensional array.

    Vector is a subclass of NumPy ``ndarray``. Note that not all ``ndarray``
    methods have been overridden and thus by careless use of baseclass
    in-place methods you might manage to twist the data into
    multi-dimensional or other non-vector form, causing unexpected results.

    https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html
    """

    def __new__(cls, object, dtype=None):
        # If given a NumPy array, we can do a fast initialization.
        if isinstance(object, np.ndarray):
            dtype = dtype or object.dtype
            return np.array(object, dtype).view(cls)
        # If given a Python list, or something else generic, we need
        # to convert certain types and special values. This is really
        # slow, see Vector.fast for faster initialization.
        if (hasattr(object, "__iter__") and
            not isinstance(object, (list, tuple))):
            # Evaluate generator/iterator.
            object = list(object)
        return cls._std_to_np(object, dtype).view(cls)

    def __init__(self, object, dtype=None):
        """
        Return a new vector.

        `object` can be any one-dimensional sequence, such as a NumPy array,
        Python list or tuple. Creating a vector from a NumPy array will be
        fast, from other types slower as data types and special values will
        need to be converted.

        `dtype` is the NumPy-compatible data type for the vector. Providing
        `dtype` will make creating the vector faster, otherwise the
        appropriate data type will be guessed by introspecting the elements
        of `object`, which is potentially slow, especially for large objects.

        Raises ``ValueError`` if the resulting array is not one-dimensional.

        >>> di.Vector([1, 2, 3], int)
        """
        self._check_dimensions()

    def __array_wrap__(self, array, context=None, return_scalar=False):
        # Avoid returning 0-dimensional arrays.
        # https://github.com/numpy/numpy/issues/7403
        # `return_scalar` is accepted because NumPy 2.0 passes it as a
        # third argument; 0-dimensional results are unwrapped regardless
        # of its value, same as before the parameter existed.
        return array[()] if array.shape == () else array

    def __repr__(self):
        return self.to_string()

    def __str__(self):
        return self.to_string()

    def as_boolean(self):
        """
        Return vector converted to boolean data type.

        >>> vector = di.Vector([0, 1])
        >>> vector.as_boolean()
        """
        if self.is_string():
            # NumPy does bool(int(str)), which is weird.
            # https://github.com/numpy/numpy/issues/20898
            # https://github.com/numpy/numpy/pull/21024
            return self.map(bool)
        return self.astype(bool)

    def as_bytes(self):
        """
        Return vector converted to bytes data type.

        String vectors are encoded as UTF-8.

        >>> vector = di.Vector(["a", "b"])
        >>> vector.as_bytes()
        """
        if self.is_string():
            array = np.char.encode(self, "utf-8")
            return array.view(self.__class__)
        return self.astype(bytes)

    def as_date(self):
        """
        Return vector converted to date data type.

        >>> vector = di.Vector(["2020-01-01"])
        >>> vector.as_date()
        """
        return self.astype(np.dtype("datetime64[D]"))

    def as_datetime(self, precision="us"):
        """
        Return vector converted to datetime data type.

        `precision` is the NumPy datetime unit code used for the resulting
        ``datetime64`` data type, by default ``"us"`` (microseconds).

        >>> vector = di.Vector(["2020-01-01T12:00:00"])
        >>> vector.as_datetime()
        """
        return self.astype(np.dtype(f"datetime64[{precision}]"))

    def as_float(self):
        """
        Return vector converted to float data type.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_float()
        """
        return self.astype(float)

    def as_integer(self):
        """
        Return vector converted to integer data type.

        >>> vector = di.Vector([1.0, 2.0, 3.0])
        >>> vector.as_integer()
        """
        return self.astype(int)

    def as_object(self):
        """
        Return vector converted to object data type.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_object()
        """
        return self.__class__(self.tolist(), object)

    def as_string(self, length=None):
        """
        Return vector converted to string data type.

        If `length` is given, the result uses the fixed-width data type
        ``U{length}``, otherwise NumPy picks the width.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_string()
        >>> vector.as_string(64)
        """
        return self.astype(f"U{length}" if length else str)

    def _check_dimensions(self):
        # Raise ValueError unless the array is exactly one-dimensional.
        if self.ndim == 1: return
        raise ValueError(f"Bad dimensions: {self.ndim!r}")

    def concat(self, *others):
        """
        Return vector with elements from `others` appended.

        >>> a = di.Vector([1, 2, 3])
        >>> b = di.Vector([4, 5, 6])
        >>> c = di.Vector([7, 8, 9])
        >>> a.concat(b, c)
        """
        vectors = [self] + list(others)
        new = np.concatenate(vectors)
        return self.__class__(new)

    def drop_na(self):
        """
        Return vector without missing values.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.drop_na()
        """
        return self[~self.is_na()].copy()

    def equal(self, other):
        """
        Return whether vectors are equal.

        Equality is tested with ``==``. As an exception, corresponding
        missing values are considered equal as well.

        >>> a = di.Vector([1, 2, 3, None])
        >>> b = di.Vector([1, 2, 3, None])
        >>> a
        >>> b
        >>> a.equal(b)
        """
        if not (isinstance(other, Vector) and
                self.length == other.length and
                str(self.na_value) == str(other.na_value)):
            return False
        ii = self.is_na()
        jj = other.is_na()
        return (np.all(ii == jj) and
                np.all(self[~ii] == other[~jj]))

    @classmethod
    def fast(cls, object, dtype=None):
        """
        Return a new vector.

        Unlike :meth:`__init__`, this will **not** convert special values in
        `object`. Use this only if you know `object` doesn't contain special
        values or if you know they are already of the correct type.
        """
        if (hasattr(object, "__iter__") and
            not isinstance(object, (np.ndarray, list, tuple))):
            # Evaluate generator/iterator.
            object = list(object)
        return np.array(object, dtype).view(cls)

    def get_memory_use(self):
        """
        Return memory use in bytes.

        For object vectors this is the sum of ``sys.getsizeof`` over the
        elements, otherwise the size of the array buffer.

        >>> vector = di.Vector(range(100))
        >>> vector.get_memory_use()
        """
        if self.is_object():
            return sum(sys.getsizeof(x) for x in self)
        return self.nbytes

    def head(self, n=None):
        """
        Return the first `n` elements.

        >>> vector = di.Vector(range(100))
        >>> vector.head(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        return self[np.arange(n)].copy()

    def is_boolean(self):
        """
        Return whether vector data type is boolean.
        """
        return np.issubdtype(self.dtype, np.bool_)

    def is_bytes(self):
        """
        Return whether vector data type is bytes.
        """
        return np.issubdtype(self.dtype, np.bytes_)

    def is_datetime(self):
        """
        Return whether vector data type is datetime.

        Dates are considered datetimes as well.
        """
        return np.issubdtype(self.dtype, np.datetime64)

    def is_float(self):
        """
        Return whether vector data type is float.
        """
        return np.issubdtype(self.dtype, np.floating)

    def is_integer(self):
        """
        Return whether vector data type is integer.
        """
        return np.issubdtype(self.dtype, np.integer)

    def is_na(self):
        """
        Return a boolean vector indicating missing data elements.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector
        >>> vector.is_na()
        """
        if self.is_datetime():
            return np.isnat(self)
        if self.is_timedelta():
            return np.isnat(self)
        if self.is_float():
            return np.isnan(self)
        if self.is_string():
            return self == ""
        return np.isin(self, [None])

    def is_number(self):
        """
        Return whether vector data type is number.
        """
        return np.issubdtype(self.dtype, np.number)

    def is_object(self):
        """
        Return whether vector data type is object.
        """
        return np.issubdtype(self.dtype, np.object_)

    def is_string(self):
        """
        Return whether vector data type is string.
        """
        # np.str_ rather than np.unicode_: the latter alias
        # was removed in NumPy 2.0, np.str_ exists in both.
        return np.issubdtype(self.dtype, np.str_)

    def is_timedelta(self):
        """
        Return whether vector data type is timedelta.
        """
        return np.issubdtype(self.dtype, np.timedelta64)

    @property
    def length(self):
        """
        Return the amount of elements.

        >>> vector = di.Vector(range(100))
        >>> vector.length
        """
        self._check_dimensions()
        return self.size

    def map(self, function, *args, dtype=None, **kwargs):
        """
        Apply `function` element-wise and return a new vector.

        `args` and `kwargs` are passed on to `function` after the element.
        `dtype` is the data type of the resulting vector, guessed if not
        given.

        >>> import math
        >>> vector = di.Vector(range(10))
        >>> vector.map(math.pow, 2)
        """
        return self.__class__((function(x, *args, **kwargs) for x in self), dtype)

    @property
    def na_dtype(self):
        """
        Return the corresponding data type that can handle missing data.

        You might need this for upcasting when missing data is first
        introduced.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector
        >>> vector.put([2], vector.na_value)
        >>> vector = vector.astype(vector.na_dtype)
        >>> vector
        >>> vector.put([2], vector.na_value)
        >>> vector
        """
        # Timedelta is checked before integer on purpose,
        # same order as in is_na and na_value.
        if self.is_datetime():
            return self.dtype
        if self.is_timedelta():
            return self.dtype
        if self.is_float():
            return self.dtype
        if self.is_integer():
            return float
        if self.is_string():
            return self.dtype
        return object

    @property
    def na_value(self):
        """
        Return the corresponding value to use to represent missing data.

        Dataiter is built on top of NumPy. NumPy doesn't support a proper
        missing value ("NA"), only data type specific values: ``np.nan``,
        ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")``. Dataiter
        recommends the following values be used and internally supports them
        to an extent.

        ========= ========================
        datetime  ``np.datetime64("NaT")``
        float     ``np.nan``
        integer   ``np.nan``
        string    ``""``
        timedelta ``np.timedelta64("NaT")``
        other     ``None``
        ========= ========================

        Note that actually using these might require upcasting the vector.
        Integer will need to be upcast to float to contain ``np.nan``. Other,
        such as boolean, will need to be upcast to object to contain
        ``None``.

        If you need to avoid object columns, you can also consider converting
        booleans to float using :meth:`as_float`, which will give you 0.0 for
        false and 1.0 for true. Depending on how you use the data, that might
        work as well as an object vector of ``True``, ``False`` and ``None``.
        """
        if self.is_datetime():
            return np.datetime64("NaT")
        if self.is_timedelta():
            return np.timedelta64("NaT")
        if self.is_float():
            return np.nan
        if self.is_integer():
            return np.nan
        if self.is_string():
            return ""
        # Note that using None, e.g. for a boolean vector,
        # might not work directly as it requires upcasting to object.
        return None

    def range(self):
        """
        Return the minimum and maximum values as a two-element vector.

        >>> vector = di.Vector(range(100))
        >>> vector.range()
        """
        rng = [np.nanmin(self), np.nanmax(self)]
        return self.__class__(rng, self.dtype)

    def rank(self, *, method="average"):
        """
        Return the order of elements in a sorted vector.

        `method` determines how ties are resolved. **'min'** assigns each of
        equal values the same rank, the minimum of the set (also called
        "competition ranking"). **'max'** is the same, but assigning the
        maximum of the set. **'average'** is the mean of 'min' and 'max'.
        **'ordinal'** gives each element a distinct rank with equal values
        ranked by their order in input.

        Ranks begin at 1. Missing values are ranked last.

        Raises ``ValueError`` if `method` is not one of the above.

        **References**

        * https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html
        * https://www.rdocumentation.org/packages/base/topics/rank

        >>> vector = di.Vector([3, 1, 1, 1, 2, 2])
        >>> vector.rank(method="min")
        >>> vector.rank(method="max")
        >>> vector.rank(method="average")
        >>> vector.rank(method="ordinal")
        """
        if self.length == 0:
            return self.__class__([], int)
        if method not in ["min", "max", "average", "ordinal"]:
            raise ValueError(f"Unexpected method: {method!r}")
        na = self.is_na()
        if na.all():
            # Avoid trying to evaluate min/max/mean of all NA.
            x = self.__class__(np.repeat(1, self.length))
            return x.rank(method=method)
        if method == "average":
            rank_min = self.rank(method="min")
            rank_max = self.rank(method="max")
            rank = np.mean([rank_min, rank_max], axis=0)
            return self.__class__(rank)
        # Below, arank holds zero-based ranks of the non-missing
        # elements and zrank zero-based rank(s) of the missing ones.
        if method == "min":
            # https://stackoverflow.com/a/14672797/16369038
            inv = np.unique(self[~na], return_inverse=True)[1]
            arank = np.concatenate(([0], np.bincount(inv))).cumsum()[inv]
            # All missing values tie for the first rank after *all*
            # non-missing elements. Using arank.max() + 1 here would
            # be off whenever the largest non-missing value is tied.
            zrank = len(arank)
        if method == "max":
            # https://stackoverflow.com/a/14672797/16369038
            inv = np.unique(self[~na], return_inverse=True)[1]
            arank = np.bincount(inv).cumsum()[inv] - 1
            zrank = len(self) - 1
        if method == "ordinal":
            # https://stackoverflow.com/a/5284703/16369038
            # A stable sort is required for equal values
            # to be ranked by their order in input.
            indices = self[~na].argsort(kind="stable")
            arank = np.empty_like(indices)
            arank[indices] = np.arange(len(indices))
            zrank = len(arank) + np.arange(na.sum())
        out = np.zeros_like(self, int)
        out[~na] = arank + 1
        out[na] = zrank + 1
        return self.__class__(out)

    def replace_na(self, value):
        """
        Return vector with missing values replaced with `value`.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.replace_na(0)
        """
        vector = self.copy()
        vector[vector.is_na()] = value
        return vector

    def sample(self, n=None):
        """
        Return randomly chosen `n` elements.

        Elements are chosen without replacement and returned in their
        original order.

        >>> vector = di.Vector(range(100))
        >>> vector.sample(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        indices = np.random.choice(self.length, n, replace=False)
        return self[np.sort(indices)].copy()

    def sort(self, *, dir=1):
        """
        Return elements in sorted order.

        `dir` is ``1`` for ascending sort, ``-1`` for descending.

        Missing values are sorted last, regardless of `dir`.

        Note that unlike ``ndarray.sort``, this does not sort in place,
        but returns a new vector.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.sort(dir=1)
        >>> vector.sort(dir=-1)
        """
        if self.is_object():
            # It's not really clear how objects should be sorted.
            # Let's use strings, since (1) object vectors are often
            # used to hold strings and (2) most types probably
            # implement __str__, so this is actually doable.
            lst = sorted(self, key=str, reverse=dir < 0)
            new = self.fast(lst, object)
            na = new.is_na()
            new = new[~na].concat(new[na])
            return self.fast(new, object)
        new = self.copy()
        # Call the baseclass method explicitly to sort the copy in place.
        np.ndarray.sort(new)
        if dir < 0:
            new = new[::-1]
        na = new.is_na()
        new = new[~na].concat(new[na])
        return self.fast(new, self.dtype)

    @classmethod
    def _std_to_np(cls, seq, dtype=None):
        # Convert missing values in seq to NumPy equivalents.
        # Can be empty if all of seq are missing values.
        types = util.unique_types(seq)
        if dtype is not None:
            na = Vector.fast([], dtype).na_value
        elif len(types) == 1 and next(iter(types)).__module__ == "numpy":
            # If we have a regular Python list of NumPy scalars,
            # infer type. This should be rare, but can happen.
            dtype = next(iter(types))().dtype
            na = Vector.fast([], dtype).na_value
        else:
            # Guess the missing value based on types in seq.
            na = cls._std_to_np_na_value(types)
        seq = [na if x is None or (isinstance(x, float) and np.isnan(x))
               else x for x in seq]
        if dtype is not None:
            if np.issubdtype(dtype, np.integer) and np.nan in seq:
                # Upcast from integer to float as required.
                dtype = float
            return np.array(seq, dtype)
        # NaT values bring in np.datetime64 to types.
        types.discard(np.datetime64)
        for fm, to in TYPE_CONVERSIONS.items():
            if types and all(x == fm for x in types):
                return np.array(seq, to)
        # Let NumPy guess the appropriate dtype.
        return np.array(seq, dtype)

    @classmethod
    def _std_to_np_na_value(cls, types):
        # Return the missing value to use for a sequence
        # whose elements are of the given set of types.
        if not types:
            return None
        if str in types:
            return ""
        if all(x in [float, int] or
               np.issubdtype(x, np.floating) or
               np.issubdtype(x, np.integer)
               for x in types):
            return np.nan
        datetimes = [datetime.date, datetime.datetime, np.datetime64]
        if all(x in datetimes for x in types):
            return np.datetime64("NaT")
        # Usually causes dtype to be object!
        return None

    def tail(self, n=None):
        """
        Return the last `n` elements.

        >>> vector = di.Vector(range(100))
        >>> vector.tail(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        return self[np.arange(self.length - n, self.length)].copy()

    def to_string(self, *, max_elements=None):
        """
        Return vector as a string formatted for display.

        At most `max_elements` elements are included, followed by ``...``
        if any were left out.

        >>> vector = di.Vector([1/2, 1/3, 1/4])
        >>> vector.to_string()
        """
        print_width = util.get_print_width()
        def add_string_element(string, rows):
            # Append string to the last row if it fits,
            # otherwise start a new row.
            if len(rows[-1]) <= 1:
                return rows[-1].append(string)
            row = " ".join(rows[-1] + [string])
            if util.ulen(row) < print_width:
                return rows[-1].append(string)
            # Start a new row with padding and string.
            return rows.append([" ", string])
        if max_elements is None:
            max_elements = dataiter.PRINT_MAX_ELEMENTS
        rows = [["["]]
        for string in self[:max_elements].to_strings(pad=True):
            add_string_element(string, rows)
        if max_elements < self.length:
            add_string_element("...", rows)
        add_string_element(f"] {self.dtype}", rows)
        if len(rows) == 1:
            # Drop padding for single-line output.
            rows[0] = [x.strip() for x in rows[0]]
        return "\n".join(" ".join(x) for x in rows)

    def to_strings(self, *, ksep=None, quote=True, pad=False, truncate_width=inf):
        """
        Return vector as strings formatted for display.

        `ksep` is the thousand separator for numbers. `quote` is whether to
        quote elements of string vectors. `pad` is whether to pad the
        strings to equal width. Object and string elements wider than
        `truncate_width`, or spanning multiple lines if `truncate_width` is
        finite, are cut to their first line and at most `truncate_width`
        and suffixed with an ellipsis.

        >>> vector = di.Vector([1/2, 1/3, 1/4])
        >>> vector.to_strings()
        """
        if self.length == 0:
            return self.__class__.fast([], str)
        identity = lambda x, *args, **kwargs: x
        if ksep is None:
            ksep = dataiter.PRINT_THOUSAND_SEPARATOR
        quote = util.quote if quote else identity
        pad = util.upad if pad else identity
        if self.is_float():
            strings = util.format_floats(self, ksep=ksep)
            return self.__class__.fast(pad(strings), str)
        if self.is_integer() and not self.is_timedelta():
            strings = ["{:,d}".format(x).replace(",", ksep) for x in self]
            return self.__class__.fast(pad(strings), str)
        if self.is_object():
            strings = [str(x) for x in self]
            strings = self._truncate_strings(strings, truncate_width)
            return self.__class__.fast(pad(strings), str)
        if self.is_string():
            strings = [quote(x) for x in self]
            strings = self._truncate_strings(strings, truncate_width)
            return self.__class__.fast(pad(strings), str)
        strings = [str(x) for x in self]
        return self.__class__.fast(pad(strings), str)

    @staticmethod
    def _truncate_strings(strings, truncate_width):
        # Return strings with too wide or multiline
        # elements cut short and suffixed with an ellipsis.
        out = []
        for string in strings:
            lines = string.splitlines()
            too_wide = util.ulen(string) > truncate_width
            multiline = len(lines) > 1 and truncate_width < inf
            if too_wide or multiline:
                # splitlines gives an empty list for an empty string.
                first = lines[0] if lines else ""
                string = util.utruncate(first, truncate_width - 1) + "…"
            out.append(string)
        return out

    def tolist(self):
        """
        Return vector as a list with elements of matching Python builtin type.

        Missing values are replaced with ``None``.
        """
        return np.where(self.is_na(), None, self).tolist()

    def unique(self):
        """
        Return unique elements.

        Elements are returned in the order of their first occurrence.

        >>> vector = di.Vector([1, 1, 1, 2, 2, 3])
        >>> vector.unique()
        """
        u, indices = np.unique(self, return_index=True)
        # Sort a baseclass view explicitly: ndarray.sort works in place
        # and returns None, while the sort method of this class returns
        # a new vector, so calling indices.sort() would only work if
        # indices happened to be of this class.
        keep = np.sort(np.asarray(indices))
        return self[keep].copy()