{"id":49679151,"url":"https://github.com/raullenchai/rapid-mlx","last_synced_at":"2026-06-12T07:00:45.072Z","repository":{"id":340469237,"uuid":"1166182351","full_name":"raullenchai/Rapid-MLX","owner":"raullenchai","description":"The fastest local AI engine for Apple Silicon. 4.2x faster than Ollama, 0.08s cached TTFT, 100% tool calling. 17 tool parsers, prompt cache, reasoning separation, cloud routing. Drop-in OpenAI replacement. Works with Claude Code, Cursor, Aider.","archived":false,"fork":false,"pushed_at":"2026-06-12T05:06:22.000Z","size":23951,"stargazers_count":2752,"open_issues_count":32,"forks_count":339,"subscribers_count":57,"default_branch":"main","last_synced_at":"2026-06-12T06:13:24.183Z","etag":null,"topics":["apple-silicon","claude-code","cursor","deepseek","fastapi","hacktoberfest","inference","llm","local-llm","m1","m2","m3","macos","mlx","ollama-alternative","openai-api","python","qwen","tool-calling"],"latest_commit_sha":null,"homepage":"https://pypi.org/project/rapid-mlx","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":"waybarrios/vllm-mlx","license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/raullenchai.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":"CONTRIBUTING.md","funding":".github/FUNDING.yml","license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":"ROADMAP.md","authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null},"funding":{"github":"raullenchai"}},"created_at":"2026-02-25T00:41:44.000Z","updated_at":"2026-06-12T05:36:20.000Z","dependencies_parsed_at":null,"dependency_job_id":"80192dfd-a386-4551-86dc-68b863235d0f","html_url":"https://github.com/raullenchai/Rapid-MLX","commit_stats
":null,"previous_names":["raullenchai/vllm-mlx"],"tags_count":118,"template":false,"template_full_name":null,"purl":"pkg:github/raullenchai/Rapid-MLX","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/raullenchai%2FRapid-MLX","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/raullenchai%2FRapid-MLX/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/raullenchai%2FRapid-MLX/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/raullenchai%2FRapid-MLX/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/raullenchai","download_url":"https://codeload.github.com/raullenchai/Rapid-MLX/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/raullenchai%2FRapid-MLX/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":34232790,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-05-26T15:22:16.424Z","status":"online","status_checked_at":"2026-06-12T02:00:06.859Z","response_time":109,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["apple-silicon","claude-code","cursor","deepseek","fastapi","hacktoberfest","inference","llm","local-llm","m1","m2","m3","macos","mlx","ollama-alternative","openai-api","python","qwen","tool-calling"],"created_at":"2026-05-07T05:05:53.683Z","updated_at":"2026-06-12T07:00:45.028Z","avatar_url":"https:/
/github.com/raullenchai.png","language":"Python","funding_links":["https://github.com/sponsors/raullenchai"],"categories":[],"sub_categories":[],"readme":"\u003cp align=\"center\"\u003e\n  \u003cimg src=\"https://raw.githubusercontent.com/raullenchai/Rapid-MLX/main/docs/assets/logo.png\" alt=\"Rapid-MLX\" width=\"200\"\u003e\n\u003c/p\u003e\n\n\u003ch1 align=\"center\"\u003eRapid-MLX\u003c/h1\u003e\n\n\u003cp align=\"center\"\u003e\n  \u003cstrong\u003eRun AI on your Mac. Faster than anything else.\u003c/strong\u003e\n\u003c/p\u003e\n\n\u003cp align=\"center\"\u003e\n  \u003ca href=\"LICENSE\"\u003e\u003cimg src=\"https://img.shields.io/badge/License-Apache_2.0-blue.svg\" alt=\"License\"\u003e\u003c/a\u003e\n  \u003ca href=\"https://www.python.org/downloads/\"\u003e\u003cimg src=\"https://img.shields.io/badge/python-3.10+-blue.svg\" alt=\"Python 3.10+\"\u003e\u003c/a\u003e\n  \u003ca href=\"tests/\"\u003e\u003cimg src=\"https://img.shields.io/badge/tests-3300%2B-brightgreen.svg\" alt=\"Tests\"\u003e\u003c/a\u003e\n  \u003ca href=\"https://support.apple.com/en-us/HT211814\"\u003e\u003cimg src=\"https://img.shields.io/badge/Apple_Silicon-M1%20|%20M2%20|%20M3%20|%20M4-black.svg?logo=apple\" alt=\"Apple Silicon\"\u003e\u003c/a\u003e\n  \u003ca href=\"https://github.com/raullenchai/Rapid-MLX/stargazers\"\u003e\u003cimg src=\"https://img.shields.io/github/stars/raullenchai/Rapid-MLX?style=social\" alt=\"GitHub stars\"\u003e\u003c/a\u003e\n\u003c/p\u003e\n\n\u003cp align=\"center\"\u003e\n  Run local AI models on your Mac — no cloud, no API costs. 
Works with Cursor, Claude Code, and any OpenAI-compatible app.\n\u003c/p\u003e\n\n\u003cp align=\"center\"\u003e\n  \u003cimg src=\"https://raw.githubusercontent.com/raullenchai/Rapid-MLX/main/docs/assets/demo.gif\" alt=\"Rapid-MLX demo — install, serve Gemma 4, chat, tool calling\" width=\"700\"\u003e\n  \u003cbr\u003e\n  \u003cem\u003epip install → serve Gemma 4 26B → chat + tool calling → works with PydanticAI, LangChain, Aider, and more.\u003c/em\u003e\n\u003c/p\u003e\n\n| Your Mac | Model | Speed (tok/s = words/sec) | What works |\n|:---|:---:|:---:|:---:|\n| **16 GB** MacBook Air | Qwen3.5-4B | 160 tok/s | Chat, coding, tools |\n| **32+ GB** Mac Mini / Studio | Nemotron-Nano 30B | 141 tok/s | 🆕 Fastest 30B, 100% tools |\n| **32+ GB** Mac Mini / Studio | Qwen3.6-35B | 95 tok/s | 256 experts, 262K context |\n| **64 GB** Mac Mini / Studio | Qwen3.5-35B | 83 tok/s | Best balance of smart + fast |\n| **96+ GB** Mac Studio / Pro | Qwen3.5-122B | 57 tok/s | Frontier-level intelligence |\n| **128+ GB** Mac Studio Ultra | 🆕 DeepSeek V4 Flash 158B-A13B | 31-56 tok/s | Day-0 frontier MoE, 1M context |\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cb\u003eNew to local AI? Quick glossary\u003c/b\u003e\u003c/summary\u003e\n\n- **tok/s** (tokens per second) — roughly how many words the AI generates per second. Higher = faster.\n- **4bit / 8bit** — compression levels for models. 4bit uses less memory (recommended); 8bit is higher quality.\n- **TTFT** (Time To First Token) — how long before the AI starts responding.\n- **Tool calling** — the AI can call functions in your code. Used by Cursor, Claude Code, and coding assistants.\n- **OpenAI API compatible** — Rapid-MLX speaks the same language as ChatGPT's API, so any app that works with ChatGPT can work with Rapid-MLX by just changing the server address.\n- **Ollama / llama.cpp** — other popular tools for running local AI. 
Rapid-MLX is 2-4x faster on Apple Silicon.\n\n\u003c/details\u003e\n\n---\n\n## Quick Start\n\n**Step 1 — Install** (pick one):\n\n```bash\n# Homebrew (recommended — just works, no Python version issues)\nbrew install raullenchai/rapid-mlx/rapid-mlx\n\n# pip (requires Python 3.10+ — macOS ships 3.9, so install Python first if needed)\npip install rapid-mlx\n\n# Or one-liner with auto-setup (installs Python if needed)\ncurl -fsSL https://raullenchai.github.io/Rapid-MLX/install.sh | bash\n```\n\n\u003e **Vision/multimodal models** (Gemma 4, Qwen-VL, etc.) need extras: `pip install 'rapid-mlx[vision]'`. Text-only install is ~460 MB; vision adds ~322 MB. See [Optional Extras](#optional-extras) for the full list.\n\n\u003e **\"No matching distribution\" error?** Your Python is too old. Run `python3 --version` — if it says 3.9, install a newer Python: `brew install python@3.12` then `python3.12 -m pip install rapid-mlx`\n\n\u003e **`Tapping homebrew/core` / `Operation not permitted` during `brew install`?** Brew 5.x's install sandbox can't auto-tap `homebrew/core` mid-install. Pre-tap it once, then retry:\n\u003e ```bash\n\u003e brew tap homebrew/core --force   # ~1.3 GB, one-time\n\u003e brew install raullenchai/rapid-mlx/rapid-mlx\n\u003e ```\n\n**Step 2 — Talk to a model right now** (one command, no second terminal):\n```bash\nrapid-mlx chat\n```\nDefaults to `qwen3.5-4b`. First run downloads the model (~2.5 GB) — you'll see a progress bar. Drops you into a REPL when it's ready. Type `/help` for slash commands, `/exit` to quit. Pass `--think` to surface chain-of-thought.\n\n**Step 2b — Or serve a model for use from other apps:**\n```bash\nrapid-mlx serve qwen3.5-4b\n```\nSame model, same download — but this starts an OpenAI-compatible HTTP server instead of a REPL. Wait for `Ready: http://localhost:8000/v1`.\n\n\u003e Want vision? 
`pip install 'rapid-mlx[vision]'` then `rapid-mlx serve gemma-4-26b` (~14 GB).\n\n**Step 3 — Hit the API** (from a second terminal tab):\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"model\":\"default\",\"messages\":[{\"role\":\"user\",\"content\":\"Say hello\"}]}'\n```\n\nThat's it — you now have an OpenAI-compatible AI server on `localhost:8000`. Point any app at `http://localhost:8000/v1` and it just works.\n\n**Step 4 — Share it publicly** (optional — get a `https://` URL anyone can hit):\n```bash\nrapid-mlx share qwen3.6-27b-8bit\n```\nThis spawns the same local serve and tunnels it through `rapidserver.quicksilverpro.io` over a WebSocket. Your terminal prints a public OpenAI-compatible endpoint plus a bearer key — point any chat UI or OpenAI SDK at it. Bearer auth, a locked-down CORS allowlist, and a default 120 RPM rate-limit are wired on the spawned child; closing the terminal tears the tunnel down.\n\nThe default chat surface is our hosted Big-AGI fork (tool calling, personas, voice — no signup); any OpenAI-compatible client also works, e.g. `OPENAI_API_BASE_URL=\u003cshare-url\u003e/v1 OPENAI_API_KEY=\u003cbearer\u003e open-webui serve`.\n\n\u003e Pick a 27B-class model or larger for a usable share experience — 4B is fine for local dev but too small for live chat (`rapid-mlx models` lists all aliases).\n\n\u003e **Want a Claude Code-like TUI?** Rapid-MLX is the *backend* — pair it with an open-source agent CLI like [OpenCode](https://github.com/sst/opencode) or [codex](https://github.com/openai/codex) for the full slash-commands / tool-use / multi-turn experience. Run `rapid-mlx agents opencode --setup` (or `codex --setup`) to wire it up automatically.\n\n\u003e **Tip:** Run `rapid-mlx models` to see all available model aliases. 
For a smaller/faster model, try `rapid-mlx serve qwen3.5-9b` (~5 GB).\n\n\u003cdetails\u003e\n\u003csummary\u003eMore install options\u003c/summary\u003e\n\n**From source** (for development):\n```bash\ngit clone https://github.com/raullenchai/Rapid-MLX.git\ncd Rapid-MLX \u0026\u0026 pip install -e .\n```\n\n**Vision models** (adds mlx-vlm + opencv + torch, ~322 MB extra):\n```bash\npip install 'rapid-mlx[vision]'\n```\n\n**Audio** (TTS/STT via mlx-audio):\n```bash\npip install 'rapid-mlx[audio]'\n```\n\u003c/details\u003e\n\n**Try it with Python** (make sure the server is running, then `pip install openai`):\n\n```python\nfrom openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"not-needed\")  # any value works, no real key needed\n\nresponse = client.chat.completions.create(\n    model=\"default\",\n    messages=[{\"role\": \"user\", \"content\": \"Say hello\"}],\n)\nprint(response.choices[0].message.content)\n```\n\n---\n\n## Works With\n\n### Agent Harnesses (MHI-tested)\n\n| Harness | Type | Notes |\n|---------|------|-------|\n| [Hermes Agent](https://github.com/NousResearch/hermes-agent) | Agent | 62 tools, multi-turn ([test](tests/integrations/test_hermes.py)) |\n| [PydanticAI](https://ai.pydantic.dev) | Framework | Typed agents, structured output ([test](tests/integrations/test_pydantic_ai_full.py)) |\n| [LangChain](https://langchain.com) | Framework | `ChatOpenAI`, tools, streaming ([test](tests/integrations/test_langchain.py)) |\n| [smolagents](https://github.com/huggingface/smolagents) | Framework | CodeAgent + ToolCallingAgent ([test](tests/integrations/test_smolagents_full.py)) |\n| [OpenClaude](https://github.com/Gitlawb/openclaude) (Anthropic SDK) | Agent | `CLAUDE_CODE_USE_OPENAI=1` ([test](tests/integrations/test_anthropic_sdk.py)) |\n| [Aider](https://aider.chat) | Agent | CLI edit-and-commit, architect mode ([test](tests/integrations/test_aider.sh)) |\n| [Goose](https://github.com/block/goose) | Agent | Ollama 
provider via `OLLAMA_HOST` |\n| [OpenCode](https://github.com/sst/opencode) | TUI Agent | Claude Code-like terminal UX, OpenAI-compat provider |\n| [Claw Code](https://github.com/ultraworkers/claw-code) | Agent | OpenAI \u0026 Anthropic endpoints |\n\n### UI / IDE Clients\n\n| Client | Status | Setup |\n|--------|--------|-------|\n| [Cursor](https://cursor.com) | Compatible | Settings → OpenAI Base URL |\n| [Continue.dev](https://continue.dev) | Compatible | VS Code / JetBrains extension |\n| [LibreChat](https://librechat.ai) | Tested | Docker ([test](tests/integrations/test_librechat_docker.py)) |\n| [Open WebUI](https://github.com/open-webui/open-webui) | Tested | Docker ([test](tests/integrations/test_openwebui.py)) |\n| Any OpenAI-compatible app | Compatible | Point at `http://localhost:8000/v1` |\n\n### Model-Harness Index (MHI)\n\nMHI measures how well a model works with a specific agent harness. It combines three dimensions:\n\n| Dimension | Weight | What it measures | Source |\n|---|---|---|---|\n| **Tool Calling** | 50% | Can the model+harness execute function calls correctly? | `rapid-mlx agents --test` |\n| **HumanEval** | 30% | Can the model generate correct code? | [HumanEval](https://github.com/openai/human-eval) (10 tasks) |\n| **MMLU** | 20% | Does the harness degrade base knowledge? 
| [tinyMMLU](https://huggingface.co/datasets/tinyBenchmarks/tinyMMLU) (10 tasks) |\n\n**MHI = 0.50 × ToolCalling + 0.30 × HumanEval + 0.20 × MMLU** (scale 0-100)\n\n| Model | Best MHI | Best Harness | Tool Calling |\n|---|---|---|---|\n| **Qwopus 27B** | **92** | All (Hermes, PydanticAI, LangChain, smolagents) | 100% |\n| **Qwen3.5 27B** | **82** | Hermes / PydanticAI / LangChain | 100% |\n| **Llama 3.3 70B** | **83** | smolagents (text-based) | 100% |\n| **Nemotron Nano 30B** | **59** | PydanticAI / LangChain | 91-93% |\n| **Gemma 4 26B** | **62** | Hermes / smolagents | 100% |\n\n\u003cdetails\u003e\n\u003csummary\u003eFull MHI table (25 model-harness combinations) + methodology\u003c/summary\u003e\n\n**MHI = 0.50 × ToolCalling + 0.30 × HumanEval + 0.20 × MMLU** (scale 0-100)\n\nRun `rapid-mlx agents` to see all supported agents and `python3 scripts/mhi_eval.py` to compute MHI on your own setup.\n\n| Model + Harness | Tool Calling | HumanEval | MMLU | **MHI** |\n|---|---|---|---|---|\n| **Qwopus 27B** + Hermes | 100% | 80% | 90% | **92** |\n| **Qwopus 27B** + PydanticAI | 100% | 80% | 90% | **92** |\n| **Qwen3.5 27B** + Hermes | 100% | 40% | 100% | **82** |\n| **Llama 3.3 70B** + smolagents | 100% | 50% | 90% | **83** |\n| **DeepSeek-R1 32B** + smolagents | 100% | 30% | 100% | **79** |\n| **Gemma 4 26B** + Hermes | 100% | 0% | 60% | **62** |\n| **Nemotron Nano 30B** + PydanticAI | 93% | 0% | 60% | **59** |\n\n\u003c/details\u003e\n\n**Quick setup for popular apps:**\n\n**Cursor:** Settings → Models → Add Model:\n```\nOpenAI API Base:  http://localhost:8000/v1\nAPI Key:          not-needed\nModel name:       default          (or qwen3.5-9b — either works)\n```\nCursor's agent/composer mode uses tool calls automatically — Rapid-MLX handles them natively with Qwen3.5 models, no extra flags needed.\n\n**Claw Code:**\n```bash\nexport OPENAI_BASE_URL=http://localhost:8000/v1\nexport OPENAI_API_KEY=not-needed\nclaw --model \"openai/default\" prompt \"summarize this 
repo\"\n```\n\n**OpenClaude:**\n```bash\nCLAUDE_CODE_USE_OPENAI=1 OPENAI_BASE_URL=http://localhost:8000/v1 \\\nOPENAI_API_KEY=not-needed OPENAI_MODEL=default openclaude -p \"hello\"\n```\n\n**Hermes Agent** (`~/.hermes/config.yaml`):\n```yaml\nmodel:\n  provider: \"custom\"\n  default: \"default\"\n  base_url: \"http://localhost:8000/v1\"\n  context_length: 32768\n```\n\n**Goose:**\n```bash\nGOOSE_PROVIDER=ollama OLLAMA_HOST=http://localhost:8000 \\\nGOOSE_MODEL=default goose run --text \"hello\"\n```\n\n**Claude Code:**\n```bash\nOPENAI_BASE_URL=http://localhost:8000/v1 claude\n```\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eMore client setup instructions\u003c/strong\u003e\u003c/summary\u003e\n\n**Continue.dev** (`~/.continue/config.yaml`):\n```yaml\nmodels:\n  - name: rapid-mlx\n    provider: openai\n    model: default\n    apiBase: http://localhost:8000/v1\n    apiKey: not-needed\n```\n\n**Aider:**\n```bash\naider --openai-api-base http://localhost:8000/v1 --openai-api-key not-needed\n```\n\n**Swival** (`~/.swival/config.toml`):\n```toml\n[profiles.rapidmlx]\nprovider = \"generic\"\nbase_url = \"http://127.0.0.1:8000\"\nmodel = \"default\"\n```\n\nRun with:\n```bash\nswival --profile rapidmlx \"summarize this repo\"\n```\n\n**Open WebUI** (Docker one-liner):\n```bash\ndocker run -d -p 3000:8080 \\\n  --add-host=host.docker.internal:host-gateway \\\n  -e ENABLE_OLLAMA_API=False \\\n  -e OPENAI_API_BASE_URL=http://host.docker.internal:8000/v1 \\\n  -e OPENAI_API_KEY=not-needed \\\n  -v open-webui:/app/backend/data \\\n  --name open-webui \\\n  ghcr.io/open-webui/open-webui:main\n```\n\n**OpenCode** (`opencode.json` in your project root):\n```json\n{\n  \"provider\": {\n    \"openai\": {\n      \"api\": \"http://localhost:8000/v1\",\n      \"models\": {\n        \"default\": {\n          \"name\": \"rapid-mlx local\",\n          \"limit\": { \"context\": 32768, \"output\": 8192 }\n        }\n      },\n      \"options\": { \"apiKey\": 
\"not-needed\" }\n    }\n  }\n}\n```\n\n**PydanticAI** (`pip install pydantic-ai`):\n```python\nfrom pydantic_ai import Agent\nfrom pydantic_ai.models.openai import OpenAIChatModel\nfrom pydantic_ai.providers.openai import OpenAIProvider\n\nmodel = OpenAIChatModel(\n    model_name=\"default\",\n    provider=OpenAIProvider(\n        base_url=\"http://localhost:8000/v1\",\n        api_key=\"not-needed\",\n    ),\n)\nagent = Agent(model)\nprint(agent.run_sync(\"What is 2+2?\").output)\n```\n\n**smolagents** (`pip install smolagents`):\n```python\nfrom smolagents import CodeAgent, OpenAIServerModel\n\nmodel = OpenAIServerModel(\n    model_id=\"default\",\n    api_base=\"http://localhost:8000/v1\",\n    api_key=\"not-needed\",\n)\nagent = CodeAgent(tools=[], model=model)\nagent.run(\"What is 5 multiplied by 7?\")\n```\n\n**LibreChat** (`librechat.yaml`, under `endpoints.custom`):\n```yaml\n- name: \"Rapid-MLX\"\n  apiKey: \"rapid-mlx\"\n  baseURL: \"http://localhost:8000/v1/\"\n  models:\n    default: [\"default\"]\n    fetch: true\n  titleConvo: true\n  titleModel: \"current_model\"\n  modelDisplayLabel: \"Rapid-MLX\"\n```\n\n**Anthropic SDK** (`pip install anthropic`):\n```python\nfrom anthropic import Anthropic\nclient = Anthropic(base_url=\"http://localhost:8000\", api_key=\"not-needed\")\n\nmessage = client.messages.create(\n    model=\"default\",\n    max_tokens=1024,\n    messages=[{\"role\": \"user\", \"content\": \"Say hello\"}],\n)\nprint(message.content[0].text)\n```\n\n\u003c/details\u003e\n\n---\n\n## Choose Your Model\n\n### What fits my Mac?\n\nThe model has to fit in your Mac's RAM. 
If your Mac slows down or Activity Monitor shows red memory pressure, pick a smaller model from the table below.\n\n| Your Mac | Best Model | RAM Used | Speed | Quality |\n|----------|-----------|---------|-------|---------|\n| **16 GB** MacBook Air/Pro | [Qwen3.5-4B 4bit](https://huggingface.co/mlx-community/Qwen3.5-4B-MLX-4bit) | 2.4 GB | 160 tok/s | Good for chat and simple tasks |\n| **24 GB** MacBook Pro | [Qwen3.5-9B 4bit](https://huggingface.co/mlx-community/Qwen3.5-9B-4bit) | 5.1 GB | 108 tok/s | Great all-rounder |\n| **32 GB** Mac Mini / Studio | [Qwen3.5-27B 4bit](https://huggingface.co/mlx-community/Qwen3.5-27B-4bit) | 15.3 GB | 39 tok/s | Solid coding model |\n| **32 GB** Mac Mini / Studio | 🆕 [Nemotron-Nano 30B 4bit](https://huggingface.co/lmstudio-community/NVIDIA-Nemotron-3-Nano-30B-A3B-MLX-4bit) | 18 GB | 141 tok/s | Fastest 30B, 100% tool calling |\n| **32 GB** Mac Mini / Studio | [Qwen3.6-35B-A3B 4bit](https://huggingface.co/mlx-community/Qwen3.6-35B-A3B-4bit) | 20 GB | 95 tok/s | 256 MoE experts, 262K context |\n| **36 GB** MacBook Pro M3/M4 Pro | [Qwen3.5-27B 4bit](https://huggingface.co/mlx-community/Qwen3.5-27B-4bit) | 15.3 GB | 39 tok/s | Same as 32 GB — extra headroom for long contexts |\n| **48 GB** Mac Mini / Studio | [Qwen3.5-35B-A3B 8bit](https://huggingface.co/mlx-community/Qwen3.5-35B-A3B-8bit) | 37 GB | 83 tok/s | **Sweet spot** — smart + fast |\n| **64 GB** Mac Mini / Studio | [Qwen3.5-35B-A3B 8bit](https://huggingface.co/mlx-community/Qwen3.5-35B-A3B-8bit) | 37 GB | 83 tok/s | Same model, more room for KV cache |\n| **96 GB** Mac Studio / Pro | [Qwen3.5-122B mxfp4](https://huggingface.co/nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx) | 65 GB | 57 tok/s | Best model, fits comfortably |\n| **128 GB** Mac Studio / Pro | 🆕 [DeepSeek V4 Flash 2-bit DQ](https://huggingface.co/mlx-community/DeepSeek-V4-Flash-2bit-DQ) | 91 GB | 56 tok/s | 158B-A13B frontier MoE, day-0 (chat only) |\n| **192 GB** Mac Studio / Pro | [Qwen3.5-122B 
8bit](https://huggingface.co/mlx-community/Qwen3.5-122B-A10B-8bit) | 130 GB | 44 tok/s | Maximum quality |\n| **256 GB** Mac Studio Ultra | 🆕 [DeepSeek V4 Flash 8-bit](https://huggingface.co/mlx-community/DeepSeek-V4-Flash-8bit) | 136 GB | 31 tok/s | 158B-A13B frontier MoE, 1M context (chat only) |\n\n\u003e **4bit vs 8bit:** 4bit models are compressed to use less memory (recommended for most users). 8bit models are higher quality but need more RAM. \"mxfp4\" is a high-quality 4bit format.\n\n### Full model lineup\n\n66 short aliases across 13 families ship today. Run `rapid-mlx models` for the live list with quant tier, MoE / hybrid flags, and DFlash eligibility.\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eShow all 66 aliases by family\u003c/strong\u003e\u003c/summary\u003e\n\n| Family | Aliases | Notable |\n|---|---|---|\n| **Qwen3.5** | `qwen3.5-4b`, `-4b-8bit`, `-9b`, `-9b-8bit`, `-27b`, `-27b-8bit` ✨, `-35b`, `-35b-4bit`, `-122b`, `-122b-8bit` | DeltaNet hybrid; **27b-8bit DFlash-eligible** |\n| **Qwen3.6** | `qwen3.6-27b`, `-27b-8bit` ✨, `-27b-ud`, `-35b`, `-35b-6bit`, `-35b-8bit`, `-35b-dwq`, `-35b-ud` | 262K ctx, 256 MoE experts; **27b-8bit DFlash-eligible** |\n| **Qwen3** | `qwen3-0.6b-8bit`, `-4b-8bit`, `-8b-8bit`, `qwen3-coder`, `qwen3-coder-30b`, `qwen3-vl-4b`, `-8b`, `-30b` | Coding + vision |\n| **Qwopus** | `qwopus-9b`, `qwopus-27b`, `qwopus-27b-8bit` | 92 MHI on tool calling |\n| **DeepSeek** | `deepseek-r1-8b`, `-32b`, `deepseek-v4-flash` (2/4/8-bit) | R1 reasoning + V4 Flash 158B-A13B day-0 |\n| **Gemma** | `gemma-3n-e4b`, `gemma-4-26b`, `-31b`, `-31b-8bit`, `gemma3-1b`, `-12b`, `-27b` | Vision-capable (gemma-4) |\n| **Llama / Hermes** | `llama3-1b`, `-3b`, `llama-3.1-8b-8bit`, `hermes3-8b`, `hermes4-70b` | |\n| **GLM** | `glm4.5-air`, `glm4.7-9b` | |\n| **GPT-OSS** | `gpt-oss-20b` | Harmony native |\n| **MiniMax / Kimi** | `minimax-m2.5`, `minimax-m2.7`, `kimi-48b`, `kimi-k2.5` | |\n| **Mistral / Devstral** | `mistral-24b`, 
`devstral-24b`, `devstral-v2-24b`, `ministral-3b` | |\n| **Other** | `phi4-14b`, `smollm3-3b`, `nemotron-30b` / `-nano`, `bonsai-1.7b/4b/8b`, `granite4-tiny` | |\n\n✨ = DFlash speculative decoding supported (opt in with `--enable-dflash`). `rapid-mlx info \u003calias\u003e` shows per-alias capabilities.\n\n\u003c/details\u003e\n\n### Copy-paste commands\n\nPick the one that matches your Mac. Short aliases work — run `rapid-mlx models` to see all available models.\n\n```bash\n# 16 GB — lightweight, fast\nrapid-mlx serve qwen3.5-4b --port 8000\n\n# 24 GB — best small model\nrapid-mlx serve qwen3.5-9b --port 8000\n\n# 32 GB — solid coding model\nrapid-mlx serve qwen3.5-27b --port 8000\n\n# 32 GB — Nemotron Nano (fastest 30B, 141 tok/s, NVIDIA MoE)\nrapid-mlx serve nemotron-30b --port 8000\n\n# 32+ GB — Qwen 3.6 (256 experts, 262K context)\nrapid-mlx serve qwen3.6-35b --port 8000\n\n# 64 GB — sweet spot\nrapid-mlx serve qwen3.5-35b --prefill-step-size 8192 --port 8000  # faster first response\n\n# 96+ GB — best model\nrapid-mlx serve qwen3.5-122b --prefill-step-size 8192 --port 8000\n\n# Coding agent — fast MoE, great for Claude Code / Cursor\nrapid-mlx serve qwen3-coder --prefill-step-size 8192 --port 8000  # MoE = only uses part of the model, so it's fast\n\n# Vision — image understanding (see note below)\nrapid-mlx serve qwen3-vl-4b --mllm --port 8000\n```\n\n\u003e **Vision deps:** Install into the same environment where rapid-mlx lives:\n\u003e - `install.sh` users: `~/.rapid-mlx/bin/pip install 'rapid-mlx[vision]'`\n\u003e - `pip` users: `pip install 'rapid-mlx[vision]'` (in the same venv)\n\u003e - `brew` users: `$(brew --prefix)/opt/rapid-mlx/libexec/bin/pip install 'rapid-mlx[vision]'`\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eParser auto-detection \u0026 manual overrides\u003c/strong\u003e\u003c/summary\u003e\n\nParsers are **auto-detected from the model name** — you don't need to specify `--tool-call-parser` or `--reasoning-parser` for 
supported families. Explicit flags always override auto-detection.\n\n| Model Family | Auto-detected `--tool-call-parser` | Auto-detected `--reasoning-parser` | Notes |\n|-------------|---------------------|---------------------|-------|\n| Qwen3.5 (all sizes) | `hermes` | `qwen3` | **Recommended** — 100% tool calling |\n| 🆕 Qwen3.6 | `qwen3_coder_xml` | `qwen3` | XML tool format, 262K context |\n| Qwen3-Coder-Next | `hermes` | *(none)* | Fast coding, non-thinking mode |\n| DeepSeek R1-0528 / V3.1 | `deepseek_v31` | `deepseek_r1` | Dedicated V3.1 parser |\n| DeepSeek R1 (older) | `deepseek` | `deepseek_r1` | With reasoning |\n| DeepSeek V3 / V2.5 | `deepseek` | *(none)* | No reasoning parser |\n| GLM-4.7 | `glm47` | *(none)* | 100% tool calling |\n| MiniMax-M2.5 | `minimax` | `minimax` | XML tool format |\n| GPT-OSS | `harmony` | `harmony` | Native format |\n| Kimi-Linear | `kimi` | *(none)* | Kimi tool format |\n| Llama 3.x | `llama` | *(none)* | JSON tool format |\n| Mistral / Devstral | `hermes` | *(none)* | Hermes-compatible |\n| Gemma | `hermes` | *(none)* | Hermes-compatible |\n| Phi-3/4 | `hermes` | *(none)* | Hermes-compatible |\n\nAll 17 parsers include automatic recovery — if a quantized model outputs broken tool calls as text, they're auto-converted back to structured format.\n\n\u003c/details\u003e\n\n---\n\n## Benchmarks\n\nTested on **Mac Studio M3 Ultra (256GB)**. Rapid-MLX uses Apple's [MLX framework](https://github.com/ml-explore/mlx) — purpose-built for unified memory with native Metal compute kernels — which is why it beats C++-based engines (Ollama, llama.cpp) on most models. 
Ollama numbers tested with **v0.20.4** (latest, with MLX backend).\n\n| Model | Rapid-MLX | Best Alternative | Speedup |\n|-------|----------|-----------------|---------|\n| **Phi-4 Mini 14B** | **180** tok/s | 77 (mlx-lm) / 56 (Ollama) | **2.3x** / **3.2x** |\n| **Qwen3.5-4B** | **160** tok/s | 155 (mlx-lm serve) | **1.0x** |\n| **Nemotron-Nano 30B** | **141** tok/s · 100% tools | — | — |\n| 🆕 **DeepSeek V4 Flash 158B-A13B** (2-bit DQ) | **56** tok/s | — (only MLX engine, day-0) | — |\n| 🆕 **DeepSeek V4 Flash 158B-A13B** (8-bit) | **31** tok/s | — (only MLX engine, day-0) | — |\n| **GPT-OSS 20B** | **127** tok/s · 100% tools | 79 (mlx-lm serve) | **1.6x** |\n| **Qwen3.5-9B** | **108** tok/s | 41 (Ollama) | **2.6x** |\n| **Qwen3.6-35B-A3B** | **95** tok/s · 100% tools | — | — |\n| **Kimi-Linear-48B** | **94** tok/s · 100% tools | — (only engine) | — |\n| **Gemma 4 26B-A4B** | **85** tok/s | 68 (Ollama) | **1.3x** |\n| **Gemma 4 E4B** | **83** tok/s | — | — |\n| **Qwen3.5-35B-A3B** | **83** tok/s · 100% tools | 75 (oMLX) | **1.1x** |\n| **Qwen3-Coder 80B** | **74** tok/s · 100% tools | 69 (mlx-lm serve) | **1.1x** |\n| **Qwen3.5-122B** | **44** tok/s · 100% tools | 43 (mlx-lm serve) | ~1.0x |\n| **Gemma 4 31B** | **31** tok/s | — | — |\n\n*Full benchmark data with all models, TTFT tables, DeltaNet snapshots, and engine comparison below.*\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eTTFT — Prompt Cache Advantage\u003c/strong\u003e\u003c/summary\u003e\n\nPrompt cache keeps multi-turn conversations fast. For standard transformers, KV cache trimming gives sub-100ms TTFT. 
For hybrid RNN models (Qwen3.5 DeltaNet), we use state snapshots — the first technique to bring prompt cache to non-trimmable architectures on MLX.\n\n**Pure KV cache (transformers):**\n\n| Model | Rapid-MLX (cached) | mlx-lm serve | Speedup |\n|-------|-------------------|-------------------|---------|\n| Kimi-Linear-48B | **0.08s** | — | — |\n| Llama 3.2 3B | **0.10s** | — | — |\n| Hermes-3-Llama 8B | **0.10s** | 0.18s | 1.8x |\n| Phi-4 Mini 14B | **0.13s** | 0.15s | 1.2x |\n| Devstral-Small-2 24B | **0.13s** | 0.38s | 2.9x |\n| Mistral Small 24B | **0.13s** | 0.38s | 2.9x |\n| GLM-4.7-Flash 9B | **0.13s** | 0.23s | 1.8x |\n| GLM-4.5-Air | **0.14s** | 0.47s | 3.4x |\n| Qwen3-Coder-Next 80B | **0.16s** | 0.27s | 1.7x |\n| GPT-OSS 20B | **0.16s** | 0.27s | 1.7x |\n| Qwen3.5-9B | **0.22s** | 0.26s | 1.2x |\n| Gemma 4 E4B | **0.25s** | — (day-0) | — |\n| Gemma 4 26B-A4B | **0.25s** | — (day-0) | — |\n| Gemma 4 31B | **0.34s** | 0.57s (mlx-vlm bf16) | **1.7x** |\n\n**DeltaNet state snapshots (hybrid RNN + attention):**\n\nQwen3.5 uses Gated DeltaNet (75% RNN) + full attention (25% KV). 
Other engines recreate the entire cache from scratch every request — we snapshot the RNN state at the system prompt boundary, restoring in ~0.1ms instead of re-running hundreds of tokens through the recurrent layers.\n\n| Model | Cold TTFT | Snapshot TTFT | Speedup |\n|-------|-----------|---------------|---------|\n| Qwen3-Coder-Next 6bit (48L) | 0.66s | **0.16s** | **4.3x** |\n| Qwen3.5-35B-A3B 8bit (40L) | 0.49s | **0.19s** | **2.6x** |\n| Qwen3.5-27B 4bit (40L) | 0.58s | **0.27s** | **2.1x** |\n| Qwen3.5-9B 4bit (40L) | 0.27s | **0.22s** | **1.2x** |\n| Qwen3.5-4B 4bit (32L) | 0.24s | **0.16s** | **1.5x** |\n\n\u003c/details\u003e\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eCapability Comparison\u003c/strong\u003e\u003c/summary\u003e\n\n| Feature | Rapid-MLX | oMLX | Ollama | llama.cpp | mlx-lm serve |\n|---------|-----------|------|--------|-----------|-------------|\n| **Tool calling** | 100% (Qwen/GLM/GPT-OSS/Kimi) | N/A | 100% (Qwen) | 80% (Phi-4) | N/A |\n| **Tool call recovery** | 100% | N/A | 100% | 100% | N/A |\n| **Tool injection fallback** | Yes | No | No | No | No |\n| **Think-tag leak** | 0% | N/A | 0% | 0% | N/A |\n| **Prompt cache** | KV + DeltaNet | No | No | No | No |\n| **Vision** | Yes | Yes | Yes | No | No |\n| **Audio (STT/TTS)** | Yes | No | No | No | No |\n| **17 tool parsers** | Yes | No | No | No | No |\n| **Cloud routing** | Yes | No | No | No | No |\n| **Streaming** | Yes | Yes | Yes | Yes | Yes |\n| **OpenAI API** | Yes | Yes | Yes | Yes | Yes |\n\n\u003c/details\u003e\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eOptimization Techniques Per Model\u003c/strong\u003e\u003c/summary\u003e\n\n| Technique | What it does | Models |\n|-----------|-------------|--------|\n| **KV prompt cache** | Trim KV cache to common prefix, skip re-prefill | All transformer models |\n| **DeltaNet state snapshots** | Deep-copy RNN state at prefix boundary, restore in ~0.1ms | Qwen3.5 (4B, 9B, 27B, 35B, 122B), Qwen3-Coder-Next 
|\n| **Hybrid cache sync** | Keep trimmable KV + non-trimmable RNN layers in sync | Qwen3.5 (Gated DeltaNet + attention) |\n| **Tool logits bias** | Jump-forward decoding — bias logits toward structured tokens | All models with `--enable-tool-logits-bias` |\n| **Auto tool recovery** | Detect broken text-format tool calls, convert to structured | All 17 parser formats (incl. Gemma 4) |\n| **TurboQuant V-cache** | Rotate + Lloyd-Max compress V cache (86% savings on dense models) | All models with `--kv-cache-turboquant` |\n| **KV cache quantization** | Quantize prefix cache entries to reduce memory | All models with `--kv-cache-quantization` |\n| **DFlash speculative decoding** | Block-diffusion drafter, parallel draft + verify | `qwen3.5-27b-8bit`, `qwen3.6-27b-8bit` (single-user) |\n| **SuffixDecoding** | Drafter-free, statistical n-gram lookup speculative decoding | All BatchedEngine models with `--suffix-decoding` |\n| **Prefill chunking** | Configurable step size for large-prompt throughput | All models |\n| **Cloud routing** | Offload high-token requests to cloud LLM when local is slow | All models with `--cloud-model` |\n\n\u003c/details\u003e\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eEval benchmarks (20 models, 4 suites)\u003c/strong\u003e\u003c/summary\u003e\n\nTool calling (30 scenarios), coding (HumanEval+), reasoning (MATH-500), general knowledge (MMLU-Pro). 
Top models:\n\n| Model | Decode | Tools | Code | Reason | General | Avg |\n|-------|--------|-------|------|--------|---------|-----|\n| Qwen3.5-122B 8bit | 44 t/s | 87% | 90% | 90% | 90% | **89%** |\n| Qwen3.5-35B 8bit | 83 t/s | 90% | 90% | 80% | 80% | **85%** |\n| Qwen3-Coder-Next 4bit | 74 t/s | 90% | 90% | 70% | 70% | **80%** |\n| Qwen3.5-27B 4bit | 39 t/s | 83% | 90% | 50% | 80% | **76%** |\n| Qwen3.5-9B 4bit | 108 t/s | 83% | 70% | 60% | 70% | **71%** |\n\nRun your own: `python scripts/benchmark_engines.py --engine rapid-mlx ollama --runs 3`\n\n\u003c/details\u003e\n\n---\n\n## Features\n\n### Tool Calling\n\nFull OpenAI-compatible tool calling with 17 parser formats and **automatic recovery when quantized models break**. Models at 4-bit degrade after multiple tool rounds — Rapid-MLX auto-detects broken output and converts it back to structured `tool_calls`.\n\n### Reasoning Separation\n\nModels with chain-of-thought (Qwen3, DeepSeek-R1) output reasoning in a separate `reasoning_content` field — cleanly separated from `content` in streaming mode. Works with Qwen3, DeepSeek-R1, MiniMax, and GPT-OSS reasoning formats.\n\n### Prompt Cache\n\nPersistent cache across requests — only new tokens are prefilled on each turn. For standard transformers, KV cache trimming. For hybrid models (Qwen3.5 DeltaNet), RNN state snapshots restore non-trimmable layers from memory instead of re-computing. 2-5x faster TTFT on all architectures. Always on, no flags needed.\n\n### Smart Cloud Routing\n\nLarge-context requests auto-route to a cloud LLM (GPT-5, Claude, etc.) when local prefill would be slow. Routing based on new tokens after cache hit. 
`--cloud-model openai/gpt-5 --cloud-threshold 20000`\n\n### Multimodal\n\nVision, audio (STT/TTS), video understanding, and text embeddings — all through the same OpenAI-compatible API.\n\n### DFlash Speculative Decoding (single-user)\n\nz-lab's block-diffusion drafter (via mlx-vlm) accelerates single-stream generation on validated Qwen3.5/3.6 27B aliases. Opt in with `--enable-dflash`:\n\n| Alias | Drafter | Avg speedup | Min / Max |\n|---|---|---|---|\n| `qwen3.6-27b-8bit` | `z-lab/Qwen3.6-27B-DFlash` | **1.49×** | 1.06× / 2.07× |\n| `qwen3.5-27b-8bit` | `z-lab/Qwen3.5-27B-DFlash` | **1.31×** | 0.59× / 2.15× |\n\n```bash\npip install 'rapid-mlx[dflash]'\nrapid-mlx info qwen3.5-27b-8bit       # check per-gate eligibility\nrapid-mlx serve qwen3.5-27b-8bit --enable-dflash\n```\n\n**Workload sensitivity**: speedup varies by entropy. Coding / math / summarization typically see **1.5-2.7×**; high-entropy creative writing and long-form chat can dip to **0.6-0.9×** because the drafter's training distribution diverges from open-ended generation. This is a known pattern in spec-decode literature ([arXiv 2604.14682](https://arxiv.org/abs/2604.14682), [AdaEDL](https://arxiv.org/abs/2410.18351)) — not a bug. Other Qwen3.5/3.6 sizes (35B-A3B MoE, 122B-A10B MoE) were benched and rejected because their average speedup was below the gate.\n\n**v1 limitations**: DFlash mode runs a dedicated single-user server (mlx-vlm doesn't expose a batched DFlash kernel yet). Tool calling, MCP, and embeddings aren't available in DFlash mode — restart without `--enable-dflash` for those.\n\nAlso: logprobs API, structured JSON output (`response_format`), continuous batching, KV cache quantization (`--kv-cache-quantization`), and [3300+ tests](tests/).\n\n---\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eServer Flags Reference\u003c/strong\u003e\u003c/summary\u003e\n\n\u003e You don't need any flags to get started — the defaults work for most setups. 
These are for advanced tuning.\n\n### Core\n\n| Flag | Description | Default |\n|------|-------------|---------|\n| `\u003cmodel\u003e` | HuggingFace model name, local path, or alias (positional arg) | *(required)* |\n| `--host` | Host to bind to | `0.0.0.0` |\n| `--port` | Port to bind to | `8000` |\n| `--max-tokens` | Default max tokens for generation | `32768` |\n\n### Tool Calling \u0026 Reasoning\n\n| Flag | Description | Default |\n|------|-------------|---------|\n| `--tool-call-parser` | Parser: `hermes`, `minimax`, `qwen`, `llama`, `deepseek`, etc. | *(auto-detected)* |\n| `--reasoning-parser` | Parser: `qwen3`, `deepseek_r1`, `minimax`, `gpt_oss`, `harmony`, `glm4`, `gemma4` | *(auto-detected)* |\n| `--enable-tool-logits-bias` | Jump-forward decoding for faster tool calls | off |\n\n### Performance\n\n| Flag | Description | Default |\n|------|-------------|---------|\n| `--prefill-step-size` | Tokens per prefill chunk | `2048` |\n| `--kv-cache-turboquant` | TurboQuant V-cache compression (3-4 bit, 86% savings on dense models) | off |\n| `--kv-cache-quantization` | Quantize prefix cache entries for memory savings | off |\n| `--enable-prefix-cache` / `--disable-prefix-cache` | Cache common prefixes across requests | on |\n| `--enable-dflash` | DFlash speculative decoding (single-user; `qwen3.5-27b-8bit` / `qwen3.6-27b-8bit`) | off |\n| `--suffix-decoding` | Drafter-free n-gram speculative decoding (BatchedEngine path) | off |\n| `--enable-mtp` | MTP head speculative decoding (requires MTP-trained model) | off |\n| `--gpu-memory-utilization` | Fraction of device memory to use (0.0-1.0) | `0.90` |\n\n### Cloud Routing\n\n| Flag | Description | Default |\n|------|-------------|---------|\n| `--cloud-model` | litellm model string (e.g. 
`openai/gpt-5`) | *(disabled)* |\n| `--cloud-threshold` | New token threshold to trigger cloud routing | `20000` |\n\n### Security \u0026 Other\n\n| Flag | Description | Default |\n|------|-------------|---------|\n| `--api-key` | API key for authentication | *(no auth)* |\n| `--rate-limit` | Requests per minute per client | *(unlimited)* |\n| `--timeout` | Request timeout in seconds | `1800` |\n| `--mllm` | Force multimodal (vision) mode | auto-detect |\n| `--mcp-config` | MCP configuration file for tool integration | *(none)* |\n| `--embedding-model` | Pre-load embedding model at startup | *(none)* |\n\n\u003c/details\u003e\n\n\u003cdetails\u003e\n\u003csummary\u003e\u003cstrong\u003eCommon Issues\u003c/strong\u003e\u003c/summary\u003e\n\n**\"parameters not found in model\" warnings at startup** — Normal for VLMs. Vision weights are auto-skipped.\n\n**Out of memory / very slow (\u003c5 tok/s)** — Model too big. Check [What fits my Mac?](#what-fits-my-mac) Try a smaller quantization (4bit) or smaller model.\n\n**Empty responses** — Remove `--reasoning-parser` for non-thinking models.\n\n**Tool calls as plain text** — Set the correct `--tool-call-parser` for your model. Even without it, Rapid-MLX auto-recovers most cases.\n\n**Other issues?** Run `rapid-mlx doctor` for self-diagnostics.\n\n**Slow first response** — Two different causes: (1) Qwen3.5 models reason before answering — add `--no-thinking` to skip reasoning for faster responses, or (2) cold start on long prompts — add `--prefill-step-size 8192` to speed up processing. Subsequent turns hit prompt cache and are 10-30x faster.\n\n\u003c/details\u003e\n\n---\n\n## Optional Extras\n\nThe base `pip install rapid-mlx` is ~460 MB and covers all text-only models. 
Vision, audio, and other features ship as opt-in extras:\n\n| Extra | Install | Adds | What it unlocks |\n|---|---|---|---|\n| `vision` | `pip install 'rapid-mlx[vision]'` | ~322 MB | Gemma 4, Qwen-VL, video understanding (mlx-vlm + opencv + torch) |\n| `audio` | `pip install 'rapid-mlx[audio]'` | ~600 MB | TTS / STT (mlx-audio + spacy + scipy) |\n| `embeddings` | `pip install 'rapid-mlx[embeddings]'` | ~50 MB | `/v1/embeddings` endpoint (mlx-embeddings) |\n| `chat` | `pip install 'rapid-mlx[chat]'` | ~150 MB | Built-in Gradio chat UI |\n| `guided` | `pip install 'rapid-mlx[guided]'` | ~80 MB | Schema-constrained JSON generation (outlines) |\n| `all` | `pip install 'rapid-mlx[all]'` | ~1.1 GB | Vision + audio + chat + embeddings |\n\nIf you installed via Homebrew and want vision/audio support, use `pip install 'rapid-mlx[vision]'` (or `[audio]`) inside your own Python 3.10+ venv — that gives you the full feature set without rebuilding the brew formula.\n\n---\n\n## Troubleshooting\n\nRun the built-in self-diagnostic (works from `pip install`, no dev tools needed):\n\n```bash\nrapid-mlx doctor\n```\n\n```\nRapid-MLX Doctor\n============================================================\n  [metal] OK        # Apple Silicon Metal GPU available\n  [imports] OK      # Core modules import cleanly\n  [cli] OK          # CLI commands respond\n  [model_load] OK   # Inference pipeline works\nResult: PASS\n```\n\n---\n\n## Telemetry\n\nRapid-MLX **can** send anonymous usage data to help us prioritise the right models and catch regressions. 
**It is off by default and never starts collecting without your explicit opt-in.**\n\n### What we collect (only if you opt in)\n\n- Subcommand names (`serve` / `chat` / `agents` / `bench` / `doctor`)\n- Model alias names (`qwen3.5-9b`) or canonical HF repo IDs (`mlx-community/...`) — local paths are redacted to `\u003clocal\u003e`\n- Bucketed counts: prompt/completion tokens, TTFT, tokens/sec — never exact values\n- Error categories + a hash fingerprint of the failure site (exception class name + per-frame `file:function:lineno` only — never the message text or absolute paths)\n- OS, arch, Apple chip name, RAM (rounded to GB), Python major.minor\n\n### What we never collect\n\n- Prompts, completions, tool-call arguments, file contents, or any user-generated text\n- Local file paths, working directory, or model paths beyond their HF repo ID\n- IPs or hostnames (Phase 2 will route through a Cloudflare Worker that strips IPs before forwarding to the aggregator; Phase 1 ships no transport at all)\n- API keys, environment variable values, auth headers\n- Stack trace messages or argument values\n\n### Manage it\n\n```bash\nrapid-mlx telemetry status     # show current state and why\nrapid-mlx telemetry preview    # print the exact JSON payload that would be sent\nrapid-mlx telemetry enable     # opt in\nrapid-mlx telemetry disable    # opt out\nrapid-mlx telemetry reset      # delete consent + client-id files (re-prompts on next run)\n```\n\n### Force-disable in scripts / CI\n\nEither of these always wins, regardless of stored consent:\n\n```bash\nRAPID_MLX_TELEMETRY=0 rapid-mlx serve qwen3.5-9b\nrapid-mlx --no-telemetry serve qwen3.5-9b\n```\n\nThere is intentionally **no env-var equivalent for force-on** — opting in must be an explicit one-time `rapid-mlx telemetry enable`. CI agents will never silently contribute.\n\n### Where the code lives\n\nEverything is in [`vllm_mlx/telemetry/`](vllm_mlx/telemetry/) — read it. 
Phase 1 (this release) ships the consent mechanism and CLI surface; **no network code is in the codebase yet**. Phase 2 will add the transport behind the same opt-in gate; the schema is documented in [`vllm_mlx/telemetry/schema.py`](vllm_mlx/telemetry/schema.py). Tracking issue: [#236](https://github.com/raullenchai/Rapid-MLX/issues/236).\n\n---\n\n## Development\n\n### Quick start\n\n```bash\ngit clone https://github.com/raullenchai/Rapid-MLX.git\ncd Rapid-MLX\npip install -e \".[dev]\"\n```\n\n### Testing\n\nTwo layers: **user-facing doctor** (ships with pip) and **dev test suite** (source checkout only).\n\n#### Dev test commands\n\n| Command | What | Time | Needs server? |\n|---------|------|------|---------------|\n| `make lint` | ruff lint | ~10s | No |\n| `make test` | pytest unit suite (3300+ tests) | ~30s | No |\n| `make smoke` | lint + unit | ~1 min | No |\n| `make stress` | 8-scenario stress test | ~5 min | Yes |\n| `make soak` | 10-min agent soak test | 10 min | Yes |\n\nFor stress/soak, start a server first:\n```bash\nrapid-mlx serve mlx-community/Qwen3.5-4B-MLX-4bit --enable-auto-tool-choice --tool-call-parser hermes\n# In another terminal:\nmake stress\n```\n\nOr use the script directly for more options:\n```bash\npython scripts/dev_test.py smoke              # lint + unit\npython scripts/dev_test.py stress --port 8000 # custom port\npython scripts/dev_test.py full               # everything\n```\n\n#### Regression harness (multi-model)\n\n```bash\nmake check              # 1 model (~10 min, auto starts server)\nmake full               # 3 models + 12 agent profiles (~1 hr)\nmake benchmark          # all local models (overnight)\n```\n\n### Architecture\n\n```\nvllm_mlx/\n  server.py              # App factory + model loading + CLI entry\n  config/                # ServerConfig singleton\n  service/\n    helpers.py           # Shared request helpers\n    postprocessor.py     # Streaming pipeline (100% test coverage)\n  routes/\n    chat.py            
  # /v1/chat/completions\n    completions.py       # /v1/completions\n    anthropic.py         # /v1/messages (Anthropic API)\n    health.py, models.py, embeddings.py, audio.py, mcp_routes.py\n  engine/                # BatchedEngine (continuous batching)\n  reasoning/             # 7 reasoning parsers (Qwen3, DeepSeek, MiniMax, ...)\n  tool_parsers/          # 17 tool call parsers\n  speculative/           # DFlash, SuffixDecoding, MTP drafters\n  agents/                # 12 agent profiles (YAML)\n  runtime/               # Model registry, cache persistence\n  doctor/                # User self-diagnostic\nscripts/                 # Dev-only (NOT shipped with pip)\n  dev_test.py            # Unified test entry point\n  stress_test.py         # 8-scenario stress test\n  agent_soak_test.py     # 10-min agent soak test\n  mhi_eval.py            # Compute MHI scores against a running server\ntests/                   # pytest unit tests (3300+)\nharness/                 # Regression baselines + thresholds\n```\n\n---\n\n## Roadmap\n\n| Technique | Expected Gain | Status |\n|-----------|---------------|--------|\n| [DFlash](https://arxiv.org/abs/2602.06036) — block-diffusion drafter, single-user | 1.3-2× decode | **Shipping** (qwen3.5-27b-8bit, qwen3.6-27b-8bit) |\n| [SuffixDecoding](https://arxiv.org/abs/2411.04975) — drafter-free n-gram speculative | 1.1-1.5× decode | Shipping (`--suffix-decoding`, per-model tier sweep ongoing) |\n| MTP — Multi-Token Prediction head | 1.4-1.7× decode | Experimental (requires MTP-trained checkpoint) |\n| [EAGLE-3](https://arxiv.org/abs/2503.01840) — feature-level draft on Metal | 3-6.5× decode | Not started |\n| [ReDrafter](https://arxiv.org/abs/2403.09919) — Apple's RNN draft head | 1.4-1.5× decode | Not started |\n\n---\n\n## Contributing\n\nWe welcome contributions of all sizes! 
See [CONTRIBUTING.md](CONTRIBUTING.md) for setup and guidelines.\n\n**Easy first contributions** (no model download needed):\n- [Add a model alias](https://github.com/raullenchai/Rapid-MLX/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) — map a short name to a HuggingFace model ID\n- [Request model support](https://github.com/raullenchai/Rapid-MLX/issues/new?template=model_support.yml) — tell us which model you want\n\n**Testing contributions** (needs a Mac with Apple Silicon):\n- Benchmark a model and share results\n- Test with your favorite AI client (Cursor, Aider, LangChain, etc.)\n- [Report a bug](https://github.com/raullenchai/Rapid-MLX/issues/new?template=bug_report.yml)\n\n### Contributors\n\n\u003ca href=\"https://github.com/raullenchai/Rapid-MLX/graphs/contributors\"\u003e\n  \u003cimg src=\"https://contrib.rocks/image?repo=raullenchai/Rapid-MLX\" /\u003e\n\u003c/a\u003e\n\n## Star History\n\n\u003ca href=\"https://star-history.com/#raullenchai/Rapid-MLX\u0026Date\"\u003e\n  \u003cpicture\u003e\n    \u003csource media=\"(prefers-color-scheme: dark)\" srcset=\"https://api.star-history.com/svg?repos=raullenchai/Rapid-MLX\u0026type=Date\u0026theme=dark\" /\u003e\n    \u003csource media=\"(prefers-color-scheme: light)\" srcset=\"https://api.star-history.com/svg?repos=raullenchai/Rapid-MLX\u0026type=Date\" /\u003e\n    \u003cimg alt=\"Star History Chart\" src=\"https://api.star-history.com/svg?repos=raullenchai/Rapid-MLX\u0026type=Date\" /\u003e\n  \u003c/picture\u003e\n\u003c/a\u003e\n\n## License\n\nApache 2.0 — see [LICENSE](LICENSE).\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fraullenchai%2Frapid-mlx","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fraullenchai%2Frapid-mlx","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fraullenchai%2Frapid-mlx/lists"}