https://github.com/poopoothegorilla/fastframe
DataFrame project that utilizes Apache Arrow
https://github.com/poopoothegorilla/fastframe
apache-arrow data-science dataframe golang
Last synced: 4 months ago
JSON representation
DataFrame project that utilizes Apache Arrow
- Host: GitHub
- URL: https://github.com/poopoothegorilla/fastframe
- Owner: poopoothegorilla
- License: apache-2.0
- Created: 2020-04-26T18:40:44.000Z (over 5 years ago)
- Default Branch: master
- Last Pushed: 2020-07-08T03:09:36.000Z (over 5 years ago)
- Last Synced: 2025-05-07T05:12:29.299Z (5 months ago)
- Topics: apache-arrow, data-science, dataframe, golang
- Language: Go
- Homepage:
- Size: 218 KB
- Stars: 8
- Watchers: 6
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: README.md
- License: LICENSE
Awesome Lists containing this project
README
# FastFrame
[](https://goreportcard.com/report/github.com/poopoothegorilla/fastframe)WARNING: DO NOT USE IN PRODUCTION EVERYTHING IS EXPERIMENTAL
FastFrame is a DataFrame project which utilizes Apache Arrow
(https://github.com/apache/arrow/tree/master/go) for the underlying data
storage.## Benchmarks
### Series
Some of the benchmarks show where columnar / arrow based computation are beneficial.
https://github.com/poopoothegorilla/fastframe/wiki/Benchmarks:-Series
### DataFrame
Currently, the code for the joins could be improved.
https://github.com/poopoothegorilla/fastframe/wiki/Benchmarks:-Datafame## Example
This is an example of calculating cosine similarity
see dataframe/example... files
```go
func generateMovieRatingsDF(pool memory.Allocator) dataframe.DataFrame {
f := strings.NewReader(`"user_name","user_id","movie_id","rating"
jim,1,2,3.5
jim,1,29,3.5
jim,1,32,3.5
jim,1,47,3.5
jim,1,50,3.5
jim,1,112,3.5
joe,2,3,4
joe,2,29,5
joe,2,62,5
joe,2,70,5
joe,2,47,4
joe,2,110,4
joe,2,242,3
joe,2,260,5
fig,3,62,4
fig,3,50,3.5
fig,3,70,2
fig,3,112,5`)return dataframe.NewFromCSV(pool, csv.NewReader(f), -1, nil)
}func Example() {
pool := memory.NewGoAllocator()rawMovieRatingsDF := generateMovieRatingsDF(pool)
castList := map[string]arrow.DataType{
"user_id": arrow.PrimitiveTypes.Int32,
"movie_id": arrow.PrimitiveTypes.Int64,
"rating": arrow.PrimitiveTypes.Float64,
}
movieRatingsDF := rawMovieRatingsDF.Cast(castList)
rawMovieRatingsDF.Release()fmt.Println("MOVIE RATINGS:")
table := array.NewTableReader(movieRatingsDF, 5)
n := 0
for table.Next() {
rec := table.Record()
for i, col := range rec.Columns() {
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
}
n++
}
table.Release()// CREATE pivot table
pivot := movieRatingsDF.Pivot("user_id", "movie_id", "rating")
fmt.Println("PIVOT:")
table = array.NewTableReader(pivot, 3)
n = 0
for table.Next() {
rec := table.Record()
for i, col := range rec.Columns() {
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
}
n++
}
table.Release()matrix := pivot.CosineSimilarity()
pivot.Release()fmt.Println("MATRIX:")
table = array.NewTableReader(matrix, 2)
n = 0
for table.Next() {
rec := table.Record()
for i, col := range rec.Columns() {
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
}
n++
}
table.Release()
matrix.Release()
// Output:
// MOVIE RATINGS:
// rec[0]["user_name"]: ["jim" "jim" "jim" "jim" "jim"]
// rec[0]["user_id"]: [1 1 1 1 1]
// rec[0]["movie_id"]: [2 29 32 47 50]
// rec[0]["rating"]: [3.5 3.5 3.5 3.5 3.5]
// rec[1]["user_name"]: ["jim" "joe" "joe" "joe" "joe"]
// rec[1]["user_id"]: [1 2 2 2 2]
// rec[1]["movie_id"]: [112 3 29 62 70]
// rec[1]["rating"]: [3.5 4 5 5 5]
// rec[2]["user_name"]: ["joe" "joe" "joe" "joe" "fig"]
// rec[2]["user_id"]: [2 2 2 2 3]
// rec[2]["movie_id"]: [47 110 242 260 62]
// rec[2]["rating"]: [4 4 3 5 4]
// rec[3]["user_name"]: ["fig" "fig" "fig"]
// rec[3]["user_id"]: [3 3 3]
// rec[3]["movie_id"]: [50 70 112]
// rec[3]["rating"]: [3.5 2 5]
// PIVOT:
// rec[0]["user_id"]: [1 2 3]
// rec[0]["2"]: [3.5 0 0]
// rec[0]["29"]: [3.5 5 0]
// rec[0]["32"]: [3.5 0 0]
// rec[0]["47"]: [3.5 4 0]
// rec[0]["50"]: [3.5 0 3.5]
// rec[0]["112"]: [3.5 0 5]
// rec[0]["3"]: [0 4 0]
// rec[0]["62"]: [0 5 4]
// rec[0]["70"]: [0 5 2]
// rec[0]["110"]: [0 4 0]
// rec[0]["242"]: [0 3 0]
// rec[0]["260"]: [0 5 0]
// MATRIX:
// rec[0]["0"]: [1 0.30588186723552135]
// rec[0]["1"]: [0.30588186723552135 1]
// rec[0]["2"]: [0.46616560472322743 0.3485753093366648]
// rec[1]["0"]: [0.46616560472322743]
// rec[1]["1"]: [0.3485753093366648]
// rec[1]["2"]: [1]
}
```## Similar Projects
Bullseye (https://github.com/go-bullseye/bullseye): Another DataFrame project backed by Arrow. This project might merge with this project.