# -*- coding: utf-8 -*-
# Copyright (c) 2020 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import dataiter
import datetime
import numpy as np
import sys
from dataiter import util
from math import inf
# Standard-library date/time types mapped to the NumPy dtype string that
# is used when every element of a sequence is of exactly that type.
TYPE_CONVERSIONS = {
    datetime.date: "datetime64[D]",
    datetime.datetime: "datetime64[us]",
}
class Vector(np.ndarray):

    """
    A one-dimensional array.

    Vector is a subclass of NumPy ``ndarray``. Note that not all ``ndarray``
    methods have been overridden and thus by careless use of baseclass in-place
    methods you might manage to twist the data into multi-dimensional or other
    non-vector form, causing unexpected results.

    https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html
    """

    def __new__(cls, object, dtype=None):
        # If given a NumPy array, we can do a fast initialization.
        if isinstance(object, np.ndarray):
            dtype = dtype or object.dtype
            return np.array(object, dtype).view(cls)
        # If given a Python list, or something else generic, we need
        # to convert certain types and special values. This is really
        # slow, see Vector.fast for faster initialization.
        if (hasattr(object, "__iter__") and
            not isinstance(object, (list, tuple))):
            # Evaluate generator/iterator.
            object = list(object)
        return cls._std_to_np(object, dtype).view(cls)

    def __init__(self, object, dtype=None):
        """
        Return a new vector.

        `object` can be any one-dimensional sequence, such as a NumPy array,
        Python list or tuple. Creating a vector from a NumPy array will be
        fast, from other types slower as data types and special values will
        need to be converted.

        `dtype` is the NumPy-compatible data type for the vector. Providing
        `dtype` will make creating the vector faster, otherwise the appropriate
        data type will be guessed by introspecting the elements of `object`,
        which is potentially slow, especially for large objects.

        Raises ``ValueError`` if the resulting array is not one-dimensional.

        >>> di.Vector([1, 2, 3], int)
        """
        self._check_dimensions()

    def __array_wrap__(self, array, context=None, return_scalar=False):
        # Avoid returning 0-dimensional arrays.
        # https://github.com/numpy/numpy/issues/7403
        # NumPy 2 passes 'return_scalar' as a third argument; it is accepted
        # for compatibility, the shape check below decides the outcome.
        return array[()] if array.shape == () else array

    def __repr__(self):
        return self.to_string()

    def __str__(self):
        return self.to_string()

    def as_boolean(self):
        """
        Return vector converted to boolean data type.

        >>> vector = di.Vector([0, 1])
        >>> vector.as_boolean()
        """
        if self.is_string():
            # NumPy does bool(int(str)), which is weird.
            # https://github.com/numpy/numpy/issues/20898
            # https://github.com/numpy/numpy/pull/21024
            return self.map(bool)
        return self.astype(bool)

    def as_bytes(self):
        """
        Return vector converted to bytes data type.

        String vectors are encoded as UTF-8.

        >>> vector = di.Vector(["a", "b"])
        >>> vector.as_bytes()
        """
        if self.is_string():
            array = np.char.encode(self, "utf-8")
            return array.view(self.__class__)
        return self.astype(bytes)

    def as_date(self):
        """
        Return vector converted to date data type.

        >>> vector = di.Vector(["2020-01-01"])
        >>> vector.as_date()
        """
        return self.astype(np.dtype("datetime64[D]"))

    def as_datetime(self, precision="us"):
        """
        Return vector converted to datetime data type.

        `precision` is the NumPy datetime unit code, e.g. ``"us"``.

        >>> vector = di.Vector(["2020-01-01T12:00:00"])
        >>> vector.as_datetime()
        """
        return self.astype(np.dtype(f"datetime64[{precision}]"))

    def as_float(self):
        """
        Return vector converted to float data type.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_float()
        """
        return self.astype(float)

    def as_integer(self):
        """
        Return vector converted to integer data type.

        >>> vector = di.Vector([1.0, 2.0, 3.0])
        >>> vector.as_integer()
        """
        return self.astype(int)

    def as_object(self):
        """
        Return vector converted to object data type.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_object()
        """
        return self.__class__(self.tolist(), object)

    def as_string(self, length=None):
        """
        Return vector converted to string data type.

        `length` is the fixed string width; if not given, NumPy chooses it.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector.as_string()
        >>> vector.as_string(64)
        """
        return self.astype(f"U{length}" if length else str)

    def _check_dimensions(self):
        # Raise ValueError unless the data is exactly one-dimensional.
        if self.ndim == 1:
            return
        raise ValueError(f"Bad dimensions: {self.ndim!r}")

    def concat(self, *others):
        """
        Return vector with elements from `others` appended.

        >>> a = di.Vector([1, 2, 3])
        >>> b = di.Vector([4, 5, 6])
        >>> c = di.Vector([7, 8, 9])
        >>> a.concat(b, c)
        """
        vectors = [self] + list(others)
        new = np.concatenate(vectors)
        return self.__class__(new)

    def drop_na(self):
        """
        Return vector without missing values.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.drop_na()
        """
        return self[~self.is_na()].copy()

    def equal(self, other):
        """
        Return whether vectors are equal.

        Equality is tested with ``==``. As an exception, corresponding missing
        values are considered equal as well.

        >>> a = di.Vector([1, 2, 3, None])
        >>> b = di.Vector([1, 2, 3, None])
        >>> a
        >>> b
        >>> a.equal(b)
        """
        if not (isinstance(other, Vector) and
                self.length == other.length and
                str(self.na_value) == str(other.na_value)):
            return False
        ii = self.is_na()
        jj = other.is_na()
        # Missing values must be at the same positions and
        # the remaining elements must compare equal.
        return (np.all(ii == jj) and
                np.all(self[~ii] == other[~jj]))

    @classmethod
    def fast(cls, object, dtype=None):
        """
        Return a new vector.

        Unlike :meth:`__init__`, this will **not** convert special values in
        `object`. Use this only if you know `object` doesn't contain special
        values or if you know they are already of the correct type.
        """
        if (hasattr(object, "__iter__") and
            not isinstance(object, (np.ndarray, list, tuple))):
            # Evaluate generator/iterator.
            object = list(object)
        return np.array(object, dtype).view(cls)

    def get_memory_use(self):
        """
        Return memory use in bytes.

        For object vectors this is the sum of ``sys.getsizeof`` over the
        elements, otherwise the size of the underlying buffer.

        >>> vector = di.Vector(range(100))
        >>> vector.get_memory_use()
        """
        if self.is_object():
            return sum(sys.getsizeof(x) for x in self)
        return self.nbytes

    def head(self, n=None):
        """
        Return the first `n` elements.

        >>> vector = di.Vector(range(100))
        >>> vector.head(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        return self[np.arange(n)].copy()

    def is_boolean(self):
        """
        Return whether vector data type is boolean.
        """
        return np.issubdtype(self.dtype, np.bool_)

    def is_bytes(self):
        """
        Return whether vector data type is bytes.
        """
        return np.issubdtype(self.dtype, np.bytes_)

    def is_datetime(self):
        """
        Return whether vector data type is datetime.

        Dates are considered datetimes as well.
        """
        return np.issubdtype(self.dtype, np.datetime64)

    def is_float(self):
        """
        Return whether vector data type is float.
        """
        return np.issubdtype(self.dtype, np.floating)

    def is_integer(self):
        """
        Return whether vector data type is integer.
        """
        return np.issubdtype(self.dtype, np.integer)

    def is_na(self):
        """
        Return a boolean vector indicating missing data elements.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector
        >>> vector.is_na()
        """
        if self.is_datetime():
            return np.isnat(self)
        if self.is_timedelta():
            return np.isnat(self)
        if self.is_float():
            return np.isnan(self)
        if self.is_string():
            return self == ""
        return np.isin(self, [None])

    def is_number(self):
        """
        Return whether vector data type is number.
        """
        return np.issubdtype(self.dtype, np.number)

    def is_object(self):
        """
        Return whether vector data type is object.
        """
        return np.issubdtype(self.dtype, np.object_)

    def is_string(self):
        """
        Return whether vector data type is string.
        """
        # np.str_ instead of np.unicode_, which was removed in NumPy 2.0.
        return np.issubdtype(self.dtype, np.str_)

    def is_timedelta(self):
        """
        Return whether vector data type is timedelta.
        """
        return np.issubdtype(self.dtype, np.timedelta64)

    @property
    def length(self):
        """
        Return the amount of elements.

        Raises ``ValueError`` if the data is no longer one-dimensional.

        >>> vector = di.Vector(range(100))
        >>> vector.length
        """
        self._check_dimensions()
        return self.size

    def map(self, function, *args, dtype=None, **kwargs):
        """
        Apply `function` element-wise and return a new vector.

        `args` and `kwargs` are passed on to `function` after the element.

        >>> import math
        >>> vector = di.Vector(range(10))
        >>> vector.map(math.pow, 2)
        """
        return self.__class__((function(x, *args, **kwargs) for x in self), dtype)

    @property
    def na_dtype(self):
        """
        Return the corresponding data type that can handle missing data.

        You might need this for upcasting when missing data is first introduced.

        >>> vector = di.Vector([1, 2, 3])
        >>> vector
        >>> vector.put([2], vector.na_value)
        >>> vector = vector.astype(vector.na_dtype)
        >>> vector
        >>> vector.put([2], vector.na_value)
        >>> vector
        """
        # Timedelta needs to be checked before integer as
        # np.timedelta64 is a subtype of np.integer.
        if self.is_datetime():
            return self.dtype
        if self.is_timedelta():
            return self.dtype
        if self.is_float():
            return self.dtype
        if self.is_integer():
            return float
        if self.is_string():
            return self.dtype
        return object

    @property
    def na_value(self):
        """
        Return the corresponding value to use to represent missing data.

        Dataiter is built on top of NumPy. NumPy doesn't support a proper
        missing value ("NA"), only data type specific values: ``np.nan``,
        ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")``. Dataiter
        recommends the following values be used and internally supports them to
        an extent.

        ========= =========================
        datetime  ``np.datetime64("NaT")``
        float     ``np.nan``
        integer   ``np.nan``
        string    ``""``
        timedelta ``np.timedelta64("NaT")``
        other     ``None``
        ========= =========================

        Note that actually using these might require upcasting the vector.
        Integer will need to be upcast to float to contain ``np.nan``. Other,
        such as boolean, will need to be upcast to object to contain ``None``.
        If you need to avoid object columns, you can also consider converting
        booleans to float using :meth:`as_float`, which will give you 0.0 for
        false and 1.0 for true. Depending on how you use the data, that might
        work as well as an object vector of ``True``, ``False`` and ``None``.
        """
        # Timedelta needs to be checked before integer as
        # np.timedelta64 is a subtype of np.integer.
        if self.is_datetime():
            return np.datetime64("NaT")
        if self.is_timedelta():
            return np.timedelta64("NaT")
        if self.is_float():
            return np.nan
        if self.is_integer():
            return np.nan
        if self.is_string():
            return ""
        # Note that using None, e.g. for a boolean vector,
        # might not work directly as it requires upcasting to object.
        return None

    def range(self):
        """
        Return the minimum and maximum values as a two-element vector.

        >>> vector = di.Vector(range(100))
        >>> vector.range()
        """
        rng = [np.nanmin(self), np.nanmax(self)]
        return self.__class__(rng, self.dtype)

    def rank(self, *, method="average"):
        """
        Return the order of elements in a sorted vector.

        `method` determines how ties are resolved. **'min'** assigns each of
        equal values the same rank, the minimum of the set (also called
        "competition ranking"). **'max'** is the same, but assigning the
        maximum of the set. **'average'** is the mean of 'min' and 'max'.
        **'ordinal'** gives each element a distinct rank with equal values
        ranked by their order in input.

        Ranks begin at 1. Missing values are ranked last.

        Raises ``ValueError`` if `method` is not one of the above.

        **References**

        * https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html
        * https://www.rdocumentation.org/packages/base/topics/rank

        >>> vector = di.Vector([3, 1, 1, 1, 2, 2])
        >>> vector.rank(method="min")
        >>> vector.rank(method="max")
        >>> vector.rank(method="average")
        >>> vector.rank(method="ordinal")
        """
        if self.length == 0:
            return self.__class__([], int)
        if method not in ["min", "max", "average", "ordinal"]:
            raise ValueError(f"Unexpected method: {method!r}")
        na = self.is_na()
        if na.all():
            # Avoid trying to evaluate min/max/mean of all NA.
            x = self.__class__(np.repeat(1, self.length))
            return x.rank(method=method)
        if method == "average":
            rank_min = self.rank(method="min")
            rank_max = self.rank(method="max")
            rank = np.mean([rank_min, rank_max], axis=0)
            return self.__class__(rank)
        # Below 'arank' holds zero-based ranks of non-missing elements
        # and 'zrank' zero-based rank(s) of missing elements.
        if method == "min":
            # https://stackoverflow.com/a/14672797/16369038
            inv = np.unique(self[~na], return_inverse=True)[1]
            arank = np.concatenate(([0], np.bincount(inv))).cumsum()[inv]
            # All non-missing elements precede the missing ones, so the
            # minimum rank of the missing set is the non-missing count.
            # Using arank.max() + 1 would be too low whenever
            # the largest non-missing value is tied.
            zrank = arank.size
        if method == "max":
            # https://stackoverflow.com/a/14672797/16369038
            inv = np.unique(self[~na], return_inverse=True)[1]
            arank = np.bincount(inv).cumsum()[inv] - 1
            zrank = len(self) - 1
        if method == "ordinal":
            # https://stackoverflow.com/a/5284703/16369038
            # A stable sort is required for equal values
            # to be ranked by their order in input.
            indices = self[~na].argsort(kind="stable")
            arank = np.empty_like(indices)
            arank[indices] = np.arange(len(indices))
            zrank = arank.max() + 1 + np.arange(na.sum())
        out = np.zeros_like(self, int)
        out[~na] = arank + 1
        out[na] = zrank + 1
        return self.__class__(out)

    def replace_na(self, value):
        """
        Return vector with missing values replaced with `value`.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.replace_na(0)
        """
        vector = self.copy()
        vector[vector.is_na()] = value
        return vector

    def sample(self, n=None):
        """
        Return randomly chosen `n` elements.

        Elements are returned in their original order.

        >>> vector = di.Vector(range(100))
        >>> vector.sample(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        indices = np.random.choice(self.length, n, replace=False)
        return self[np.sort(indices)].copy()

    def sort(self, *, dir=1):
        """
        Return elements in sorted order.

        `dir` is ``1`` for ascending sort, ``-1`` for descending.

        Missing values are sorted last, regardless of `dir`.

        Note that unlike ``ndarray.sort``, this does not sort in place.

        >>> vector = di.Vector([1, 2, 3, None])
        >>> vector.sort(dir=1)
        >>> vector.sort(dir=-1)
        """
        if self.is_object():
            # It's not really clear how objects should be sorted.
            # Let's use strings, since (1) object vectors are often
            # used to hold strings and (2) most types probably
            # implement __str__, so this is actually doable.
            lst = sorted(self, key=str, reverse=dir<0)
            new = self.fast(lst, object)
            na = new.is_na()
            new = new[~na].concat(new[na])
            return self.fast(new, object)
        new = self.copy()
        # Call the baseclass in-place sort explicitly
        # as this method overrides it.
        np.ndarray.sort(new)
        if dir < 0:
            new = new[::-1]
        na = new.is_na()
        new = new[~na].concat(new[na])
        return self.fast(new, self.dtype)

    @classmethod
    def _std_to_np(cls, seq, dtype=None):
        # Convert missing values in seq to NumPy equivalents.
        # Can be empty if all of seq are missing values.
        types = util.unique_types(seq)
        if dtype is not None:
            na = Vector.fast([], dtype).na_value
        elif len(types) == 1 and types.copy().pop().__module__ == "numpy":
            # If we have a regular Python list of NumPy scalars,
            # infer type. This should be rare, but can happen.
            dtype = types.copy().pop()().dtype
            na = Vector.fast([], dtype).na_value
        else:
            # Guess the missing value based on types in seq.
            na = cls._std_to_np_na_value(types)
        seq = [na if
               x is None or
               (isinstance(x, float) and np.isnan(x))
               else x for x in seq]
        if dtype is not None:
            if np.issubdtype(dtype, np.integer) and np.nan in seq:
                # Upcast from integer to float as required.
                dtype = float
            return np.array(seq, dtype)
        # NaT values bring in np.datetime64 to types.
        types.discard(np.datetime64)
        for fm, to in TYPE_CONVERSIONS.items():
            if types and all(x == fm for x in types):
                return np.array(seq, to)
        # Let NumPy guess the appropriate dtype.
        return np.array(seq, dtype)

    @classmethod
    def _std_to_np_na_value(cls, types):
        # Return the missing value to use for a sequence
        # whose elements are of the given set of types.
        if not types:
            return None
        if str in types:
            return ""
        if all(x in [float, int] or
               np.issubdtype(x, np.floating) or
               np.issubdtype(x, np.integer)
               for x in types):
            return np.nan
        datetimes = [datetime.date, datetime.datetime, np.datetime64]
        if all(x in datetimes for x in types):
            return np.datetime64("NaT")
        # Usually causes dtype to be object!
        return None

    def tail(self, n=None):
        """
        Return the last `n` elements.

        >>> vector = di.Vector(range(100))
        >>> vector.tail(10)
        """
        if n is None:
            n = dataiter.DEFAULT_PEEK_ELEMENTS
        n = min(self.length, n)
        return self[np.arange(self.length - n, self.length)].copy()

    def to_string(self, *, max_elements=None):
        """
        Return vector as a string formatted for display.

        `max_elements` is the maximum amount of elements to include,
        the rest being replaced with an ellipsis.

        >>> vector = di.Vector([1/2, 1/3, 1/4])
        >>> vector.to_string()
        """
        print_width = util.get_print_width()
        def add_string_element(string, rows):
            # Append string to the last row if it fits,
            # otherwise start a new row.
            if len(rows[-1]) <= 1:
                return rows[-1].append(string)
            row = " ".join(rows[-1] + [string])
            if util.ulen(row) < print_width:
                return rows[-1].append(string)
            # Start a new row with padding and string.
            return rows.append([" ", string])
        if max_elements is None:
            max_elements = dataiter.PRINT_MAX_ELEMENTS
        rows = [["["]]
        for string in self[:max_elements].to_strings(pad=True):
            add_string_element(string, rows)
        if max_elements < self.length:
            add_string_element("...", rows)
        add_string_element(f"] {self.dtype}", rows)
        if len(rows) == 1:
            # Drop padding for single-line output.
            rows[0] = [x.strip() for x in rows[0]]
        return "\n".join(" ".join(x) for x in rows)

    def to_strings(self, *, ksep=None, quote=True, pad=False, truncate_width=inf):
        """
        Return vector as strings formatted for display.

        `ksep` is the thousand separator for numbers. `quote` is whether to
        quote strings. `pad` is whether to pad the strings to equal width.
        Object and string elements that are wider than `truncate_width` are
        cut and marked with an ellipsis, likewise multiline elements if
        `truncate_width` is finite.

        >>> vector = di.Vector([1/2, 1/3, 1/4])
        >>> vector.to_strings()
        """
        if self.length == 0:
            return self.__class__.fast([], str)
        identity = lambda x, *args, **kwargs: x
        if ksep is None:
            ksep = dataiter.PRINT_THOUSAND_SEPARATOR
        quote = util.quote if quote else identity
        pad = util.upad if pad else identity
        def truncate(strings):
            # Cut too wide or multiline elements to
            # their first line, truncated to width.
            for i, string in enumerate(strings):
                lines = string.splitlines()
                if (util.ulen(string) > truncate_width or
                    (len(lines) > 1 and truncate_width < inf)):
                    strings[i] = util.utruncate(lines[0], truncate_width-1) + "…"
            return strings
        if self.is_float():
            strings = util.format_floats(self, ksep=ksep)
            return self.__class__.fast(pad(strings), str)
        # np.timedelta64 is a subtype of np.integer, but can't be
        # formatted as an integer.
        if self.is_integer() and not self.is_timedelta():
            strings = ["{:,d}".format(x).replace(",", ksep) for x in self]
            return self.__class__.fast(pad(strings), str)
        if self.is_object():
            strings = truncate([str(x) for x in self])
            return self.__class__.fast(pad(strings), str)
        if self.is_string():
            strings = truncate([quote(x) for x in self])
            return self.__class__.fast(pad(strings), str)
        strings = [str(x) for x in self]
        return self.__class__.fast(pad(strings), str)

    def tolist(self):
        """
        Return vector as a list with elements of matching Python builtin type.

        Missing values are replaced with ``None``.
        """
        return np.where(self.is_na(), None, self).tolist()

    def unique(self):
        """
        Return unique elements.

        Elements are returned in the order of their first occurrence.

        >>> vector = di.Vector([1, 1, 1, 2, 2, 3])
        >>> vector.unique()
        """
        indices = np.unique(self, return_index=True)[1]
        # Sort in place using the baseclass method. Calling indices.sort()
        # would only work if indices happens to be a Vector, whose
        # overridden sort returns a copy, for a plain array it returns None.
        np.ndarray.sort(indices)
        return self[indices].copy()