library(tidyverse)
# Avoid hiding print output.
options(pillar.width=1000)
import dataiter as di
import numpy as np
import pandas as pd
# Avoid hiding print output.
pd.set_option("display.max_columns", 1000)
Some of the code below uses other parts of the tidyverse besides dplyr, such as readr. For simplicity, you can load them all via the tidyverse metapackage.
We often need NumPy too for certain calculations.
data = read_csv("data.csv")
write_csv(data, "data.csv")
data = di.read_csv("data.csv")
data.write_csv("data.csv")
data = pd.read_csv("data.csv")
data.to_csv("data.csv", index=False)
data = read_rds("data.rds")
write_rds(data, "data.rds")
data = di.read_npz("data.npz")
data.write_npz("data.npz")
data = pd.read_pickle("data.pkl")
data.to_pickle("data.pkl")
All three support multiple binary formats; the above are sensible defaults (assuming you don't need interoperability) that work out of the box.
nrow(data)
ncol(data)
data.nrow
data.ncol
len(data)
len(data.columns)
nrow, ncol = data.shape
# Column by name
data$x
data[["x"]]
# Column by index
data[[3]]
# Row by index
data[3,]
# Column element
data$x[3]
# Column by name
data.x
data["x"]
# Column by index
data.columns[2]
# Row by index
data.slice(2)
# Column element
data.x[2]
# Column by name
data.x
data["x"]
# Column by index
data.iloc[:,2]
# Row by index
data.iloc[2,:]
# Column element
data["x"][2]
Attribute access to columns (dot notation) is preferred.
Attribute access to columns (dot notation) does not work in all contexts, so bracket notation is more common. Pandas uses the term "axis=0" to refer to rows, "axis=1" to refer to columns, "index" to refer to row names, and "labels" to refer to row and column names. Certain operations use the "index" for implicit joins called "alignment".
data %>%
filter(year == 2021) %>%
arrange(desc(sales)) %>%
head(10)
(data
.filter(year=2021)
.sort(sales=-1)
.head(10))
(data
.loc[lambda x: x["year"] == 2021]
.sort_values("sales", ascending=False)
.head(10))
Since 4.1.0, R also has a native pipe, |>, which is mostly interchangeable with the above magrittr pipe.
Pandas is not really designed for method chaining, but it mostly works these days. Note also that the "inplace" arguments that many methods take are incompatible with method chaining if used, and are discouraged as they generally provide no benefit.
names = colnames(data)
colnames(data) = c("a", "b", "c")
names = data.colnames
data.colnames = ["a", "b", "c"]
names = data.columns
data.columns = ["a", "b", "c"]
rename(data, to="from")
data.rename(to="from")
data.rename(columns={"from": "to"}, errors="raise")
select(data, a, b, c)
select(data, -a, -b, -c)
data.select("a", "b", "c")
data.unselect("a", "b", "c")
data[["a", "b", "c"]]
data.drop(columns=["a", "b", "c"])
mutate(data, c=(a + b))
data.modify(c=(data.a + data.b))
data.modify(c=lambda x: x.a + x.b)
data.assign(c=(data["a"] + data["b"]))
data.assign(c=lambda x: x["a"] + x["b"])
arrange(data, a, b, desc(c))
data.sort(a=1, b=1, c=-1)
data.sort_values(["a", "b", "c"], ascending=[True, True, False])
head(data, 10)
tail(data, 10)
slice_sample(data, n=10)
data.head(10)
data.tail(10)
data.sample(10)
data.head(10)
data.tail(10)
data.sample(10)
filter(data, year == 2021)
data.filter(year=2021)
data.filter(data.year == 2021)
data.filter(lambda x: x.year == 2021)
data[data["year"] == 2021]
data.loc[data["year"] == 2021]
data[lambda x: x["year"] == 2021]
data.loc[lambda x: x["year"] == 2021]
data.query("year == 2021")
Dataiter also has filter_out
as a shorthand to negate the given
condition.
distinct(data, a, b, c, .keep_all=TRUE)
data.unique("a", "b", "c")
data.drop_duplicates(["a", "b", "c"])
bind_rows(data1, data2)
data1.rbind(data2)
pd.concat([data1, data2])
data1.append(data2)
The append method is deprecated; pd.concat is the recommended replacement.
bind_cols(data1, data2)
data1.cbind(data2)
data1 = data1.reset_index(drop=True)
data2 = data2.reset_index(drop=True)
pd.concat([data1, data2], axis=1)
Pandas wants to do "alignment" by "index" here. Resetting the indices prior to concatenation should give the same result as the plain concatenation in dplyr and Dataiter.
left_join (data1, data2, by="id")
inner_join(data1, data2, by="id")
full_join (data1, data2, by="id")
data1.left_join (data2, "id")
data1.inner_join(data2, "id")
data1.full_join (data2, "id")
data1.merge(data2, how="left", on="id")
data1.merge(data2, how="inner", on="id")
data1.merge(data2, how="outer", on="id")
dplyr and Pandas follow the SQL convention of joining all matching rows, i.e. if doing a
left join with ten rows on the left side, the result will have ten or more rows – ten if all
keys have zero or one match on the right side, twenty if all have two matches etc. Dataiter
differs by only joining the first match, on account of it usually being more practical and
less liable to produce unpleasant surprises. If Dataiter's a.left_join(b)
doesn't give you all the results you're looking for, you might want instead
either b.left_join(a)
or a.full_join(b)
. SQL-style joins are
currently unsupported, but may be added in the future.
semi_join(data1, data2, by="id")
anti_join(data1, data2, by="id")
data1.semi_join(data2, "id")
data1.anti_join(data2, "id")
# No singular operations
data %>%
group_by(year, month) %>%
mutate(fraction=sales/sum(sales))
(data
.group_by("year", "month")
.modify(fraction=lambda x: (
x.sales / x.sales.sum())))
# No singular operation
data %>%
group_by(year, month) %>%
summarise(
sales_total=sum(sales),
sales_per_day=mean(sales))
(data
.group_by("year", "month")
.aggregate(
sales_total=di.sum("sales"),
sales_per_day=di.mean("sales")))
(data
.group_by("year", "month")
.aggregate(
sales_total=lambda x: x.sales.sum(),
sales_per_day=lambda x: x.sales.mean()))
(data
.groupby(["year", "month"], as_index=False)
.agg(
sales_total=("sales", "sum"),
sales_per_day=("sales", "mean")))
(data
.groupby(["year", "month"], as_index=False)
.apply(lambda x: pd.Series({
"sales_total": x["sales"].sum(),
"sales_per_day": x["sales"].mean()})))
Both Dataiter and Pandas have two aggregation forms:
one for fast aggregation limited to common operations with a single column and another for
arbitrary calculation with access to all columns. In Dataiter, these forms are equivalent in
the sense that e.g. di.sum("sales")
returns a function that takes data as
argument and calculates the sum of the "sales" column, and also in the sense that, unlike
with Pandas, you can mix and match both forms within the same aggregate
call.
Pandas' agg
method arguments can take many forms; the above is called
"named aggregation". Likewise, the apply
method can be used in many ways; the
above is one of them. The first form is about equally fast in Dataiter and Pandas, the second
form is a lot faster in Dataiter.