Ecosyste.ms: Awesome
An open API service indexing awesome lists of open source software.
https://github.com/qlibs/swar
C++20 SIMD Within A Register library
https://github.com/qlibs/swar
cpp20 performance simd swar
Last synced: 3 months ago
JSON representation
C++20 SIMD Within A Register library
- Host: GitHub
- URL: https://github.com/qlibs/swar
- Owner: qlibs
- Created: 2024-07-26T10:36:58.000Z (5 months ago)
- Default Branch: main
- Last Pushed: 2024-08-20T20:33:37.000Z (5 months ago)
- Last Synced: 2024-09-29T23:42:57.910Z (3 months ago)
- Topics: cpp20, performance, simd, swar
- Homepage:
- Size: 22.5 KB
- Stars: 19
- Watchers: 4
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: .github/README.md
Awesome Lists containing this project
README
//
[Overview](#Overview) / [Examples](#Examples) / [API](#API) / [FAQ](#FAQ)

## SWAR: [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) Within A Register library
[![MIT Licence](http://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/license/mit)
[![Version](https://img.shields.io/github/v/release/qlibs/swar)](https://github.com/qlibs/swar/releases)
[![Build](https://img.shields.io/badge/build-green.svg)](https://godbolt.org/z/xob1nGYoP)
[![Try it online](https://img.shields.io/badge/try%20it-online-blue.svg)](https://godbolt.org/z/55K55hqWb)

> https://en.wikipedia.org/wiki/SWAR
### Use cases
- Performance (branchless)
- Portable (uses 'normal' registers)

### Features
- Single header (https://raw.githubusercontent.com/qlibs/swar/main/swar - for integration see [FAQ](#faq))
- Minimal [API](#api)
- Verifies itself upon include (can be disabled with `-DNTEST` - see [FAQ](#faq))

### Requirements
- C++20 ([clang++13+, g++12](https://en.cppreference.com/w/cpp/compiler_support))
---
### Overview
> API (https://godbolt.org/z/b4v9aTEYs)
```cpp
constexpr u8 data[]{1, 2, 3, 5, 5, 6, 7, 8};
constexpr swar lhs{data}; // copy_from
constexpr swar rhs{5}; // broadcast (native: u64)

static_assert(8u == lhs.size());
static_assert(sizeof(u64) == sizeof(lhs));

constexpr auto match = lhs == rhs;
static_assert(any_of(match));
static_assert(some_of(match));
static_assert(not all_of(match));
static_assert(not none_of(match));

static_assert(3u == find_first_set(match));
static_assert(4u == find_last_set(match));
static_assert(2u == popcount(match));
static_assert(match[3u] and match[4u]);

static_assert(sizeof(u32) == sizeof(swar));
static_assert(sizeof(u64) == sizeof(swar));
static_assert(sizeof(u32) == sizeof(swar));
static_assert(sizeof(u64) == sizeof(swar));
static_assert(sizeof(u64) == sizeof(swar));
static_assert(sizeof(u128) == sizeof(swar));

// and more (see API)...
```

> Performance (https://godbolt.org/z/ManGb8aso)
```cpp
auto eq(swar lhs, swar rhs) {
return lhs == rhs;
}
```

```cpp
eq: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rdx, -9187201950435737472
xor rdi, rsi
movabs rax, 72340172838076672
or rdi, rdx
sub rax, rdi
and rax, rdx
ret
```

```cpp
auto contains(swar lhs, u8 value) {
const auto rhs = swar{value};
const auto match = lhs == rhs;
return any_of(match);
}
```

```cpp
contains: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, -9187201950435737472
imul rsi, rax
sub rax, 1
xor rdi, rsi
or rdi, rdx
sub rax, rdi
test rax, rdx
setne al
ret
```

```cpp
auto find(swar lhs, u8 value) {
const auto rhs = swar{value};
const auto match = lhs == rhs;
return any_of(match) * find_first_set(match);
}
```

```cpp
find: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, 72340172838076672
imul rsi, rax
movabs rax, -9187201950435737472
xor rdi, rsi
or rdi, rax
sub rdx, rdi
and rdx, rax
xor eax, eax
rep bsf rax, rdx
test rdx, rdx
mov edx, 0
cmove rax, rdx
ret
```

---
### Examples
> swar vs simd (https://godbolt.org/z/YsG8evqr8)
```cpp
template auto eq(T lhs, T rhs) { return lhs == rhs; }
```

```cpp
eq(swar, swar): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rdx, -9187201950435737472
xor rdi, rsi
movabs rax, 72340172838076672
or rdi, rdx
sub rax, rdi
and rax, rdx
ret

eq(simd, simd): // $CXX -O3 -mavx512f
vpcmpeqb xmm0, xmm0, xmm1
ret
```

```cpp
template auto contains(T lhs, auto value) {
const auto rhs = T{value};
const auto match = lhs == rhs;
return any_of(match);
}
```

```cpp
contains(swar, swar): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, -9187201950435737472
imul rsi, rax
sub rax, 1
xor rdi, rsi
or rdi, rdx
sub rax, rdi
test rax, rdx
setne al
ret

contains(simd, simd): // $CXX -O3 -mavx512f
vmovd xmm1, edi
vpbroadcastb xmm1, xmm1
vpcmpeqb xmm0, xmm1, xmm0
vptest xmm0, xmm0
setne al
ret
```

```cpp
template auto find(T lhs, auto value) {
const auto rhs = T{value};
const auto match = lhs == rhs;
return any_of(match) * find_first_set(match);
}
```

```cpp
find(swar, swar): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
movabs rax, 72340172838076673
movzx esi, sil
movabs rdx, 72340172838076672
imul rsi, rax
movabs rax, -9187201950435737472
xor rdi, rsi
or rdi, rax
sub rdx, rdi
and rdx, rax
xor eax, eax
rep bsf rax, rdx
test rdx, rdx
mov edx, 0
cmove rax, rdx
ret

find(simd, simd): // $CXX -O3 -mavx512f
vmovd xmm1, edi
vpbroadcastb xmm1, xmm1
vpcmpeqb xmm0, xmm1, xmm0
vpmovmskb eax, xmm0
or eax, 65536
rep bsf ecx, eax
xor eax, eax
vptest xmm0, xmm0
cmovne eax, ecx
ret
```

---
### API
```cpp
template>
requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar {
using value_type = T;
using abi_type = TAbi;

constexpr swar() noexcept = default;
constexpr swar(const swar&) noexcept = default;
constexpr swar(swar&&) noexcept = default;
constexpr explicit swar(const auto value) noexcept;
constexpr explicit swar(const auto* mem) noexcept;
constexpr explicit swar(const auto& gen) noexcept;
[[nodiscard]] constexpr explicit operator abi_type() const noexcept;
[[nodiscard]] constexpr auto operator[](size_t) const noexcept -> T;
[[nodiscard]] static constexpr auto size() noexcept -> size_t;
[[nodiscard]] friend constexpr auto operator==(const swar&, const swar&) noexcept;
};

template>
requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar_mask {
using value_type = bool; /// predefined
using abi_type = TAbi;

constexpr swar_mask() noexcept = default;
constexpr swar_mask(const swar_mask&) noexcept = default;
constexpr swar_mask(swar_mask&&) noexcept = default;
constexpr explicit swar_mask(const abi_type value) noexcept;

[[nodiscard]] constexpr auto operator[](const size_t index) const noexcept -> bool;
[[nodiscard]] static constexpr auto size() noexcept -> size_t { return Width; }
};

template
[[nodiscard]] constexpr auto all_of(const swar_mask& s) noexcept -> bool;

template
[[nodiscard]] constexpr auto any_of(const swar_mask& s) noexcept -> bool;

template
[[nodiscard]] constexpr auto some_of(const swar_mask& s) noexcept -> bool;

template
[[nodiscard]] constexpr auto none_of(const swar_mask& s) noexcept -> bool;

template
[[nodiscard]] constexpr auto find_first_set(const swar_mask& s) noexcept;

template
[[nodiscard]] constexpr auto find_last_set(const swar_mask& s) noexcept;

template
[[nodiscard]] constexpr auto popcount(const swar_mask& s) noexcept;

template inline constexpr bool is_swar_v = /* unspecified */;
template inline constexpr bool is_swar_mask_v = /* unspecified */;
```

---
### FAQ
- How to disable running tests at compile-time?
> When `-DNTEST` is defined, the `static_assert` tests won't be executed upon include.
> Note: Use with caution, as disabling the tests means there are no guarantees upon include that the given compiler/env combination works as expected.

- How to integrate with [CMake.FetchContent](https://cmake.org/cmake/help/latest/module/FetchContent.html)?
```
include(FetchContent)

FetchContent_Declare(
qlibs.swar
GIT_REPOSITORY https://github.com/qlibs/swar
GIT_TAG v1.0.0
)

FetchContent_MakeAvailable(qlibs.swar)
```

```
target_link_libraries(${PROJECT_NAME} PUBLIC qlibs.swar);
```

- Acknowledgments
> https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, https://wg21.link/P1928