An open API service indexing awesome lists of open source software.

https://github.com/xiaodaigh/tidystanza.jl

Attempting to implement some {tidyverse} APIs in Julia
https://github.com/xiaodaigh/tidystanza.jl

Last synced: 2 months ago
JSON representation

Attempting to implement some {tidyverse} APIs in Julia

Awesome Lists containing this project

README

        

## TidyStanza

Trying to implement {tidyverse}, including {dplyr}, APIs in Julia. This is not intended to be a sustained effort and is meant to be a fun exercise in trying to learn {tidyverse} and to teach Julia programming. So any prolonged maintenance is purely accidental!

### Examples:

#### `across` and `where`

* `TidyStanza.Across` and `TidyStanza.across` are synonyms and have the same API as `dplyr::across`
* `TidyStanza.Where` and `TidyStanza.where` are synonyms and have the same API as `dplyr::across(where(...), ...)`

By default, they are NOT exported, and the recommended way is to use `TidyStanza.across` and `TidyStanza.where`
to refer to them.

```julia
# from Julia 1.6 onwards you can instead write: import TidyStanza as tidy
import TidyStanza
const tidy = TidyStanza

tidy.across
tidy.where
```

However, in the examples below, for brevity, I have imported `across` and `where`
directly into the namespace.

```julia
using TidyStanza: across, where

### load some helper packages
using DataFrames
using Statistics # for using mean
using Chain: @chain # for the @chain macro
using RDatasets # for iris dataset

iris = dataset("datasets", "iris");

# a glimpse of the data
first(iris, 8)
```

```julia
# R"""
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), mean))
# """

@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), mean))
end
```

```julia
using CategoricalArrays: CategoricalArray
# R"""
# iris %>%
# as_tibble() %>%
# mutate(across(where(is.factor), as.character))
# """

# define a convenience function for checking if column is categorical
iscatarray(arr) = typeof(arr) <: CategoricalArray

@chain iris begin
transform(across(where(iscatarray), Vector{String}))
first(8)
end

@chain iris begin
transform(across(where(iscatarray), col->string.(col)))
first(8)
end
```

```julia
# A purrr-style formula
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), ~mean(.x, na.rm = TRUE)))
@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), x->mean(x |> skipmissing)))
end
```

```julia
# A named list of functions
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd)))

@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), (mean, std)))
end
```

```julia
# Use the .names argument to control the output names
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), mean, .names = "mean_{col}"))

@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), mean; names = "mean_{col}"))
end
```

```julia
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{col}_{fn}"))

@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), (mean = mean, std = std); names = "{col}_{fn}"))
end
```

```julia
# iris %>%
# group_by(Species) %>%
# summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{col}.fn{fn}"))

@chain iris begin
groupby(:Species)
combine(across(startswith("Sepal"), (mean, std); names = "{col}_fn{fn}"))
end
```

#### `pivot_wider`

```julia
df = DataFrame(x = repeat(1:3,inner = 2,outer = 2),
a = repeat(4:6,inner = 2,outer = 2),
b = repeat(7:9,inner = 2,outer = 2),
val1 = ["ce_val1_1","cf_val1_1","ce_val1_2","cf_val1_2","ce_val1_3","cf_val1_3","de_val1_1",
"df_val1_1","de_val1_2","df_val1_2","de_val1_3","df_val1_3"],
val2 = ["ce_val2_1","cf_val2_1","ce_val2_2","cf_val2_2","ce_val2_3","cf_val2_3","de_val2_1",
"df_val2_1","de_val2_2","df_val2_2","de_val2_3","df_val2_3"],
cname1 = repeat(["c", "d"], inner = 6),
cname2 = repeat(["e", "f"], 6)
)
```

```julia
using TidyStanza: pivot_wider
pivot_wider(df; names_from = [:cname1, :cname2], values_from = [:val1, :val2])
```

#### `relocate` - for relocating columns

This is for relocating columns and implements a replica of [`dplyr::relocate`](https://dplyr.tidyverse.org/reference/relocate.html)

```
using DataFrames
using Chain: @chain
using TidyStanza: relocate, any_of, last_col

# df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")

df = DataFrame(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")
```

```
# df %>% relocate(f)
@chain df relocate(:f)
```

```
# df %>% relocate(a, .after = c)
@chain df relocate(:a, after = :c)
```

```
# df %>% relocate(f, .before = b)
@chain df relocate(:f, before = :b)
```

```
# df %>% relocate(a, .after = last_col())
@chain df relocate(:a, after = names(df)[end])
```

```
@chain df relocate(:a, after = last_col())
```

```
middle_col() = df->names(df)[end ÷ 2]
@chain df relocate(:a, after = middle_col())
```

```
using TidyStanza: where

# df %>% relocate(where(is.character))
isstring(x) = eltype(x) <: AbstractString

@chain df relocate(where(isstring))
```

```
@chain df relocate(where(x->eltype(x) <: AbstractString))
```

```
# df %>% relocate(where(is.numeric), .after = last_col())
isnumeric(x) = eltype(x) <: Number

@chain df relocate(where(isnumeric), after = last_col())
```

```
# df %>% relocate(any_of(c("a", "e", "i", "o", "u")))
@chain df relocate(intersect(["a", "e", "i", "o", "u"], names(df)))
```

```
@chain df relocate(any_of(["a", "e", "i", "o", "u"]))
```

```
#df2 <- tibble(a = 1, b = "a", c = 1, d = "a")

df2 = DataFrame(a = 1, b = "a", c = 1, d = "a")
```

```
#df2 %>% relocate(where(is.numeric), .after = where(is.character))

@chain df2 relocate(where(isnumeric), after = where(isstring))
```

```
#df2 %>% relocate(where(is.numeric), .before = where(is.character))
@chain df2 relocate(where(isnumeric), before = where(isstring))
```

## Why Stanza?
The "verse" in tidyverse refers to the universe, but "verse" is also a [technical term in poetry](https://en.wikipedia.org/wiki/Verse_(poetry)), and so is [stanza](https://en.wikipedia.org/wiki/Stanza).