https://github.com/xiaodaigh/tidystanza.jl

Attempting to implement some {tidyverse} APIs in Julia
https://github.com/xiaodaigh/tidystanza.jl

Last synced: 2 months ago
JSON representation

Attempting to implement some {tidyverse} APIs in Julia

Host: GitHub
URL: https://github.com/xiaodaigh/tidystanza.jl
Owner: xiaodaigh
License: mit
Created: 2020-09-05T05:22:06.000Z (over 4 years ago)
Default Branch: master
Last Pushed: 2023-02-26T03:58:27.000Z (about 2 years ago)
Last Synced: 2025-01-20T11:12:08.465Z (4 months ago)
Language: Julia
Homepage:
Size: 55.7 KB
Stars: 20
Watchers: 3
Forks: 4
Open Issues: 2
Metadata Files:
- Readme: README.jmd
- License: LICENSE

Awesome Lists containing this project

README

## TidyStanza

Trying to implement {tidyverse}, including {dplyr}, APIs in Julia. This is not intended to be a sustained effort and is meant to be a fun exercise in trying to learn {tidyverse} and to teach Julia programming. So any prolonged maintenance is purely accidental!

### Examples:

#### `across` and `where`

* `TidyStanza.Across` and `TidyStanza.across` are synonyms and have the same API as `dplyr::across`

* `TidyStanza.Where` and `TidyStanza.where` are synonyms and have the same API as `dplyr::across(where(...), ...)`

By default, they are NOT exported, and the recommended way to refer to them is as `TidyStanza.across` and `TidyStanza.where`.

```julia

# from Julia 1.6 onwards you can instead write: import TidyStanza as tidy

import TidyStanza

const tidy = TidyStanza

tidy.across

tidy.where

```

However, in the examples below, for brevity, I have imported `across` and `where` directly into the namespace.

```julia

using TidyStanza: across, where

### load some helper packages

using DataFrames

using Statistics # for using mean

using Chain: @chain # for the @chain macro

using RDatasets # for iris dataset

iris = dataset("datasets", "iris");

# a glimpse of the data

first(iris, 8)

```

```julia

# R"""

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), mean))

# """

@chain iris begin

  groupby(:Species)

  combine(across(startswith("Sepal"), mean))

 end

```

```julia

using CategoricalArrays: CategoricalArray

# R"""

# iris %>%

    # as_tibble() %>%

    # mutate(across(where(is.factor), as.character))

# """

# convenience predicate: true when the column is a CategoricalArray

iscatarray(arr) = arr isa CategoricalArray

@chain iris begin

  transform(across(where(iscatarray), Vector{String}))  

  first(8)

 end

@chain iris begin

  transform(across(where(iscatarray), col->string.(col)))

  first(8)

end

```

```julia

# A purrr-style formula

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), ~mean(.x, na.rm = TRUE)))

@chain iris begin

  groupby(:Species)

  combine(across(startswith("Sepal"), x->mean(x |> skipmissing)))

end

```

```julia

# A named list of functions

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd)))

@chain iris begin

    groupby(:Species)

    combine(across(startswith("Sepal"), (mean, std)))

end

```

```julia

# Use the .names argument to control the output names

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), mean, .names = "mean_{col}"))

@chain iris begin

    # each step receives the previous result as its first argument,

    # so the steps are written on separate lines without `|>`

    groupby(:Species)

    combine(across(startswith("Sepal"), mean; names = "mean_{col}"))

end

```

```julia

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{col}_{fn}"))

@chain iris begin

    groupby(:Species)

    combine(across(startswith("Sepal"), (mean = mean, std = std); names = "{col}_{fn}"))

end

```

```julia

# iris %>%

#   group_by(Species) %>%

#   summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{col}_fn{fn}"))

@chain iris begin

    groupby(:Species)

    combine(across(startswith("Sepal"), (mean, std); names = "{col}_fn{fn}"))

end

```

#### `pivot_wider`

```julia

df = DataFrame(x = repeat(1:3,inner = 2,outer = 2),

       a = repeat(4:6,inner = 2,outer = 2),

       b = repeat(7:9,inner = 2,outer = 2),

       val1 = ["ce_val1_1","cf_val1_1","ce_val1_2","cf_val1_2","ce_val1_3","cf_val1_3","de_val1_1",

               "df_val1_1","de_val1_2","df_val1_2","de_val1_3","df_val1_3"],

       val2 = ["ce_val2_1","cf_val2_1","ce_val2_2","cf_val2_2","ce_val2_3","cf_val2_3","de_val2_1",

               "df_val2_1","de_val2_2","df_val2_2","de_val2_3","df_val2_3"],

       cname1 = repeat(["c", "d"], inner = 6),

       cname2 = repeat(["e", "f"], 6)

       )

```

```julia

using TidyStanza: pivot_wider

pivot_wider(df; names_from = [:cname1, :cname2], values_from = [:val1, :val2])

```

#### `relocate` - for relocating columns

This is for relocating columns and implements a replica of [`dplyr::relocate`](https://dplyr.tidyverse.org/reference/relocate.html)

```

using DataFrames

using Chain: @chain

using TidyStanza: relocate, any_of, last_col

# df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")

df = DataFrame(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")

```

```

# df %>% relocate(f)

@chain df relocate(:f)

```

```

# df %>% relocate(a, .after = c)

@chain df relocate(:a, after = :c)

```

```

# df %>% relocate(f, .before = b)

@chain df relocate(:f, before = :b)

```

```

# df %>% relocate(a, .after = last_col())

@chain df relocate(:a, after = names(df)[end])

```

```

@chain df relocate(:a, after = last_col())

```

```

middle_col() = df->names(df)[end ÷ 2]

@chain df relocate(:a, after = middle_col())

```

```

using TidyStanza: where

# df %>% relocate(where(is.character))

# predicate: true when the element type of `x` is a string type
function isstring(x)
    return eltype(x) <: AbstractString
end

@chain df relocate(where(isstring))

```

```

@chain df relocate(where(x->eltype(x) <: AbstractString))

```

```

# df %>% relocate(where(is.numeric), .after = last_col())

# predicate: true when the element type of `x` is a subtype of Number
function isnumeric(x)
    return eltype(x) <: Number
end

@chain df relocate(where(isnumeric), after = last_col())

```

```

# df %>% relocate(any_of(c("a", "e", "i", "o", "u")))

@chain df relocate(intersect(["a", "e", "i", "o", "u"], names(df)))

```

```

@chain df relocate(any_of(["a", "e", "i", "o", "u"]))

```

```

#df2 <- tibble(a = 1, b = "a", c = 1, d = "a")

df2 = DataFrame(a = 1, b = "a", c = 1, d = "a")

```

```

# df2 %>% relocate(where(is.numeric), .after = where(is.character))

@chain df2 relocate(where(isnumeric), after = where(isstring))

```

```

# df2 %>% relocate(where(is.numeric), .before = where(is.character))

@chain df2 relocate(where(isnumeric), before = where(isstring))

```

## Why Stanza?

The "verse" in tidyverse refers to the universe, but "verse" is also a [technical term in poetry](https://en.wikipedia.org/wiki/Verse_(poetry)), and so is [stanza](https://en.wikipedia.org/wiki/Stanza).

ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Awesome

https://github.com/xiaodaigh/tidystanza.jl

Awesome Lists containing this project

README