{"id":22686416,"url":"https://github.com/NVlabs/Sana","last_synced_at":"2025-08-07T00:32:44.689Z","repository":{"id":259013233,"uuid":"871368615","full_name":"NVlabs/Sana","owner":"NVlabs","description":"SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer","archived":false,"fork":false,"pushed_at":"2025-07-17T16:32:14.000Z","size":259740,"stargazers_count":4376,"open_issues_count":67,"forks_count":282,"subscribers_count":77,"default_branch":"main","last_synced_at":"2025-07-17T19:16:37.151Z","etag":null,"topics":["diffusion","dit","pytorch","sana","text-to-image-generation","transformers"],"latest_commit_sha":null,"homepage":"https://nvlabs.github.io/Sana","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":"apache-2.0","status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/NVlabs.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":"LICENSE","code_of_conduct":null,"threat_model":null,"audit":null,"citation":"CITATION.cff","codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null}},"created_at":"2024-10-11T20:19:45.000Z","updated_at":"2025-07-17T16:32:12.000Z","dependencies_parsed_at":"2024-11-01T16:23:24.899Z","dependency_job_id":"0b992947-7c43-4c16-857a-f04d8f8c6ece","html_url":"https://github.com/NVlabs/Sana","commit_stats":null,"previous_names":["nvlabs/sana"],"tags_count":2,"template":false,"template_full_name":null,"purl":"pkg:github/NVlabs/Sana","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVlabs%2FSana","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVlabs%2FSana/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVlabs%2FSana/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/
repositories/NVlabs%2FSana/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/NVlabs","download_url":"https://codeload.github.com/NVlabs/Sana/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/NVlabs%2FSana/sbom","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":269180602,"owners_count":24373833,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","status":"online","status_checked_at":"2025-08-06T02:00:09.910Z","response_time":99,"last_error":null,"robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":true,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["diffusion","dit","pytorch","sana","text-to-image-generation","transformers"],"created_at":"2024-12-09T23:01:16.769Z","updated_at":"2025-08-07T00:32:44.662Z","avatar_url":"https://github.com/NVlabs.png","language":"Python","funding_links":[],"categories":["Python","Repos"],"sub_categories":[],"readme":"\u003cp align=\"center\" style=\"border-radius: 10px\"\u003e\n  \u003cimg src=\"asset/logo.png\" width=\"35%\" alt=\"logo\"/\u003e\n\u003c/p\u003e\n\n# ⚡️Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer\n\n### \u003cdiv align=\"center\"\u003e ICLR 2025 Oral Presentation \u003cdiv\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003ca href=\"https://nvlabs.github.io/Sana/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Project\u0026message=Github\u0026color=blue\u0026logo=github-pages\"\u003e\u003c/a\u003e \u0026ensp;\n  
\u003ca href=\"https://hanlab.mit.edu/projects/sana/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Page\u0026message=MIT\u0026color=darkred\u0026logo=github-pages\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://arxiv.org/abs/2410.10629\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Arxiv\u0026message=Sana\u0026color=red\u0026logo=arxiv\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://nv-sana.mit.edu/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Demo:6x3090\u0026message=SANA\u0026color=yellow\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://nv-sana.mit.edu/4bit/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Demo:1x3090\u0026message=4bit\u0026color=yellow\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://nv-sana.mit.edu/ctrlnet/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Demo:1x3090\u0026message=ControlNet\u0026color=yellow\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://nv-sana.mit.edu/sprint/\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Demo:1x3090\u0026message=SANA-Sprint\u0026color=yellow\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://huggingface.co/spaces/Efficient-Large-Model/SanaSprint\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Huggingface Demo\u0026message=SANA-Sprint\u0026color=yellow\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://replicate.com/chenxwh/sana\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=API:H100\u0026message=Replicate\u0026color=pink\"\u003e\u003c/a\u003e \u0026ensp;\n  \u003ca href=\"https://discord.gg/rde6eaE5Ta\"\u003e\u003cimg src=\"https://img.shields.io/static/v1?label=Discuss\u0026message=Discord\u0026color=purple\u0026logo=discord\"\u003e\u003c/a\u003e \u0026ensp;\n\u003c/div\u003e\n\n\u003cp align=\"center\" border-radius=\"10px\"\u003e\n  \u003cimg src=\"asset/Sana.jpg\" width=\"90%\" 
alt=\"teaser_page1\"/\u003e\n\u003c/p\u003e\n\n## 💡 TLDR: Explore everything you want here!\n\n### 🚶 Basic:\n\n**Demo**: [SANA-1.5](https://nv-sana.mit.edu/) | [SANA-ControlNet](https://nv-sana.mit.edu/ctrlnet/) | [SANA-4bit](https://nv-sana.mit.edu/4bit/) | [SANA-Sprint](https://nv-sana.mit.edu/sprint/) | [SANA-Sprint (HF)](https://huggingface.co/spaces/Efficient-Large-Model/SanaSprint) \u003cbr\u003e\n**ComfyUI**: [ComfyUI Guidance](asset/docs/ComfyUI/comfyui.md) \u003cbr\u003e\n**Model Zoo:** [Model Card Collects All Models](asset/docs/model_zoo.md) \u003cbr\u003e\n**Env Preparation:** [One-Click Env Install](#-1-dependencies-and-installation) \u003cbr\u003e\n**Inference:** \u003cbr\u003e      1) [diffusers:SanaPipeline](#1-how-to-use-sanapipeline-with-diffusers) \u003cbr\u003e      2) [diffusers:SanaPAGPipeline](#2-how-to-use-sanapagpipeline-with-diffusers) \u003cbr\u003e      3) [Ours:SanaPipeline](#3-how-to-use-sana-in-this-repo) \u003cbr\u003e      4) [Inference with Docker](#4-run-sana-inference-with-docker) \u003cbr\u003e      5) [Inference with TXT or JSON Files](#5-run-inference-with-txt-or-json-files) \u003cbr\u003e\n**Training and Data:** \u003cbr\u003e      1) [Image-Text Pairs](#1-train-with-image-text-pairs-in-directory) \u003cbr\u003e      2) [Multi-Scale Webdataset](#2-train-with-multi-scale-webdataset) \u003cbr\u003e      3) [TAR File Multi-Scale Webdataset](#3-train-with-tar-file) \u003cbr\u003e      4) [FSDP Launch](#3-train-with-tar-file) \u003cbr\u003e      5) [LoRA Training](asset/docs/sana_lora_dreambooth.md) \u003cbr\u003e      6) [SANA-Sprint Diffusers Training](https://github.com/huggingface/diffusers/blob/main/examples/research_projects/sana/README.md) \u003cbr\u003e\n\n### 🏃 Applications:\n\n**2K \u0026 4K Resolution Generation**: [SANA is Capable to Generate 2K \u0026 4K Images (Only 8BG)](asset/docs/model_zoo.md#-3-2k--4k-models) \u003cbr\u003e\n**ControlNet**: [Train\u0026Inference Guidance](asset/docs/sana_controlnet.md) | [Model 
Zoo](asset/docs/model_zoo.md#sana) | [Demo](https://nv-sana.mit.edu/ctrlnet/) \u003cbr\u003e\n**Dreambooth / LoRA Training**: [Train\u0026Inference Guidance](asset/docs/sana_lora_dreambooth.md) \u003cbr\u003e\n**Quantization**: [Inference with 8bit](asset/docs/quantize/8bit_sana.md) | [Inference with 4bit (8GB)](asset/docs/quantize/4bit_sana.md) | [4bit Model](asset/docs/model_zoo.md#sana) | [4bit Demo](https://svdquant.mit.edu/) | [4bit Demo2](https://nv-sana.mit.edu/4bit/) \u003cbr\u003e\n**8bit Optimizer**: [How to Config](https://github.com/NVlabs/Sana/blob/main/configs/sana_config/1024ms/Sana_1600M_img1024_CAME8bit.yaml#L86) \u003cbr\u003e\n**Inference Scaling:** [SANA Generate VILA Pick Inference Scaling](asset/docs/inference_scaling/inference_scaling.md) \u003cbr\u003e\n**Metrics:** [Metric Toolkit: (FID, CLIP-Score, GenEval, DPG-Bench)](#-4-metric-toolkit) \u003cbr\u003e\n\n### 🚗 Advance:\n\n**SANA-Sprint: One-Step Diffusion**: [Arxiv](https://arxiv.org/pdf/2503.09641) | [Train\u0026Inference Guidance](asset/docs/sana_sprint.md) | [Model Zoo](asset/docs/model_zoo.md#sana-sprint) | [HF Weights](https://huggingface.co/collections/Efficient-Large-Model/sana-sprint-67d6810d65235085b3b17c76) \u003cbr\u003e\n**SANA-1.5: Efficient Model Scaling:** [Arxiv](https://arxiv.org/abs/2501.18427) | [Model Zoo](asset/docs/model_zoo.md#sana-15) | [HF Weights](https://huggingface.co/collections/Efficient-Large-Model/sana-15-67d6803867cb21c230b780e4) \u003cbr\u003e\n\n### 🚀 Future:\n\n**Mission**: [TODO](#to-do-list)\n\n## 🔥🔥 News\n\n- (🔥 New) \\[2025/6/25\\] [SANA-Sprint](https://nvlabs.github.io/Sana/Sprint/) was accepted to ICCV'25 🏖️\n- (🔥 New) \\[2025/6/4\\] SANA-Sprint [ComfyUI Node](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) is released [\\[Example\\]](asset/docs/ComfyUI/SANA-Sprint.json) | [\\[PR\\]](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels/pull/15).\n- (🔥 New) \\[2025/5/8\\] SANA-Sprint (One-step diffusion) diffusers training 
code is released [\\[Guidance\\]](https://github.com/huggingface/diffusers/blob/main/examples/research_projects/sana/README.md).\n- (🔥 New) \\[2025/5/4\\] **SANA-1.5 (Inference-time scaling) is accepted by ICML-2025.** 🎉🎉🎉\n- (🔥 New) \\[2025/3/22\\] 🔥**SANA-Sprint demo is hosted on Huggingface, try it!** 🎉 [\\[Demo Link\\]](https://huggingface.co/spaces/Efficient-Large-Model/SanaSprint)\n- (🔥 New) \\[2025/3/22\\] 🔥**SANA-1.5 is supported in ComfyUI!** 🎉: [ComfyUI Guidance](asset/docs/ComfyUI/comfyui.md) | [ComfyUI Work Flow SANA-1.5 4.8B](asset/docs/ComfyUI/SANA-1.5_FlowEuler.json)\n- (🔥 New) \\[2025/3/22\\] 🔥**SANA-Sprint code \u0026 weights are released!** 🎉 Include: [Training \u0026 Inference](asset/docs/sana_sprint.md) code and [Weights](asset/docs/model_zoo.md) / [HF](https://huggingface.co/collections/Efficient-Large-Model/sana-15-67d6803867cb21c230b780e4) are all released. [\\[Guidance\\]](asset/docs/sana_sprint.md)\n- (🔥 New) \\[2025/3/21\\] 🚀Sana + **Inference Scaling** is released. [\\[Guidance\\]](asset/docs/inference_scaling/inference_scaling.md)\n- (🔥 New) \\[2025/3/16\\] 🔥**SANA-1.5 code \u0026 weights are released!** 🎉 Include: [DDP/FSDP](#3-train-with-tar-file) | [TAR file WebDataset](#3-train-with-tar-file) | [Multi-Scale](#3-train-with-tar-file) Training code and [Weights](asset/docs/model_zoo.md) | [HF](https://huggingface.co/collections/Efficient-Large-Model/sana-15-67d6803867cb21c230b780e4) are all released.\n- (🔥 New) \\[2025/3/14\\] 🏃**SANA-Sprint is coming out!** 🎉 A new one/few-step generator of Sana. 0.1s per 1024px image on H100, 0.3s on RTX 4090. Find out more details: [\\[Page\\]](https://nvlabs.github.io/Sana/Sprint/) | [\\[Arxiv\\]](https://arxiv.org/abs/2503.09641). Code is coming very soon along with `diffusers`\n- (🔥 New) \\[2025/2/10\\] 🚀Sana + ControlNet is released. 
[\\[Guidance\\]](asset/docs/sana_controlnet.md) | [\\[Model\\]](asset/docs/model_zoo.md) | [\\[Demo\\]](https://nv-sana.mit.edu/ctrlnet/)\n- (🔥 New) \\[2025/1/30\\] Release CAME-8bit optimizer code. Saving more GPU memory during training. [\\[How to config\\]](https://github.com/NVlabs/Sana/blob/main/configs/sana_config/1024ms/Sana_1600M_img1024_CAME8bit.yaml#L86)\n- (🔥 New) \\[2025/1/29\\] 🎉 🎉 🎉**SANA 1.5 is out! Figure out how to do efficient training \u0026 inference scaling!** 🚀[\\[Tech Report\\]](https://arxiv.org/abs/2501.18427)\n- (🔥 New) \\[2025/1/24\\] 4bit-Sana is released, powered by [SVDQuant and Nunchaku](https://github.com/mit-han-lab/nunchaku) inference engine. Now run your Sana within **8GB** GPU VRAM [\\[Guidance\\]](asset/docs/quantize/4bit_sana.md) [\\[Demo\\]](https://svdquant.mit.edu/) [\\[Model\\]](asset/docs/model_zoo.md)\n- (🔥 New) \\[2025/1/24\\] DCAE-1.1 is released, better reconstruction quality. [\\[Model\\]](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.1) [\\[diffusers\\]](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers)\n- (🔥 New) \\[2025/1/23\\] **Sana is accepted as Oral by ICLR-2025.** 🎉🎉🎉\n\n\u003cdetails\u003e\n  \u003csummary\u003eClick to show all updates\u003c/summary\u003e\n\n- (🔥 New) \\[2025/1/12\\] DC-AE tiling makes Sana-4K inferences 4096x4096px images within 22GB GPU memory. With model offload and 8bit/4bit quantize. The 4K Sana run within **8GB** GPU VRAM. [\\[Guidance\\]](asset/docs/model_zoo.md#-3-2k--4k-models)\n- (🔥 New) \\[2025/1/11\\] Sana code-base license changed to Apache 2.0.\n- (🔥 New) \\[2025/1/10\\] Inference Sana with 8bit quantization.[\\[Guidance\\]](asset/docs/quantize/8bit_sana.md#quantization)\n- (🔥 New) \\[2025/1/8\\] 4K resolution [Sana models](asset/docs/model_zoo.md) is supported in [Sana-ComfyUI](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) and [work flow](asset/docs/ComfyUI/Sana_FlowEuler_4K.json) is also prepared. 
[\\[4K guidance\\]](asset/docs/ComfyUI/comfyui.md)\n- (🔥 New) \\[2025/1/8\\] 1.6B 4K resolution [Sana models](asset/docs/model_zoo.md) are released: [\\[BF16 pth\\]](https://huggingface.co/Efficient-Large-Model/Sana_1600M_4Kpx_BF16) or [\\[BF16 diffusers\\]](https://huggingface.co/Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers). 🚀 Get your 4096x4096 resolution images within 20 seconds! Find more samples in [Sana page](https://nvlabs.github.io/Sana/). Thanks [SUPIR](https://github.com/Fanghua-Yu/SUPIR) for their wonderful work and support.\n- (🔥 New) \\[2025/1/2\\] Bug in the `diffusers` pipeline is solved. [Solved PR](https://github.com/huggingface/diffusers/pull/10431)\n- (🔥 New) \\[2025/1/2\\] 2K resolution [Sana models](asset/docs/model_zoo.md) is supported in [Sana-ComfyUI](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) and [work flow](asset/docs/ComfyUI/Sana_FlowEuler_2K.json) is also prepared.\n- ✅ \\[2024/12\\] 1.6B 2K resolution [Sana models](asset/docs/model_zoo.md) are released: [\\[BF16 pth\\]](https://huggingface.co/Efficient-Large-Model/Sana_1600M_2Kpx_BF16) or [\\[BF16 diffusers\\]](https://huggingface.co/Efficient-Large-Model/Sana_1600M_2Kpx_BF16_diffusers). 🚀 Get your 2K resolution images within 4 seconds! Find more samples in [Sana page](https://nvlabs.github.io/Sana/). Thanks [SUPIR](https://github.com/Fanghua-Yu/SUPIR) for their wonderful work and support.\n- ✅ \\[2024/12\\] `diffusers` supports Sana-LoRA fine-tuning! Sana-LoRA's training and convergence speed is super fast. [\\[Guidance\\]](asset/docs/sana_lora_dreambooth.md) or  [\\[diffusers docs\\]](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_sana.md).\n- ✅ \\[2024/12\\] `diffusers` has Sana! 
[All Sana models in diffusers safetensors](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) are released and diffusers pipeline `SanaPipeline`, `SanaPAGPipeline`, `DPMSolverMultistepScheduler(with FlowMatching)` are all supported now. We prepare a [Model Card](asset/docs/model_zoo.md) for you to choose.\n- ✅ \\[2024/12\\] 1.6B BF16 [Sana model](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_BF16) is released for stable fine-tuning.\n- ✅ \\[2024/12\\] We release the [ComfyUI node](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) for Sana. [\\[Guidance\\]](asset/docs/ComfyUI/comfyui.md)\n- ✅ \\[2024/11\\] All multi-linguistic (Emoji \u0026 Chinese \u0026 English) SFT models are released: [1.6B-512px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing), [1.6B-1024px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing), [600M-512px](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px), [600M-1024px](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px). 
The metric performance is shown [here](#performance)\n- ✅ \\[2024/11\\] Sana Replicate API is launching at [Sana-API](https://replicate.com/chenxwh/sana).\n- ✅ \\[2024/11\\] 1.6B [Sana models](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) are released.\n- ✅ \\[2024/11\\] Training \u0026 Inference \u0026 Metrics code are released.\n- ✅ \\[2024/11\\] Working on [`diffusers`](https://github.com/huggingface/diffusers/pull/9982).\n- \\[2024/10\\] [Demo](https://nv-sana.mit.edu/) is released.\n- \\[2024/10\\] [DC-AE Code](https://github.com/mit-han-lab/efficientvit/blob/master/applications/dc_ae/README.md) and [weights](https://huggingface.co/collections/mit-han-lab/dc-ae-670085b9400ad7197bb1009b) are released!\n- \\[2024/10\\] [Paper](https://arxiv.org/abs/2410.10629) is on Arxiv!\n\n\u003c/details\u003e\n\n## 💡 Introduction\n\nWe introduce Sana, a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution.\nSana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.\nCore designs include:\n\n(1) [**DC-AE**](https://hanlab.mit.edu/projects/dc-ae): unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. \\\n(2) **Linear DiT**: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. \\\n(3) **Decoder-only text encoder**: we replaced T5 with a modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. 
\\\n(4) **Efficient training and sampling**: we propose **Flow-DPM-Solver** to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence.\n\nAs a result, Sana-0.6B is very competitive with modern giant diffusion models (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024 × 1024 resolution image. Sana enables content creation at low cost.\n\n\u003cp align=\"center\" border-radius=\"10px\"\u003e\n  \u003cimg src=\"asset/model-incremental.jpg\" width=\"90%\" alt=\"teaser_page2\"/\u003e\n\u003c/p\u003e\n\n## Performance\n\n| Methods (1024x1024)                                                                              | Throughput (samples/s) | Latency (s) | Params (B) | Speedup | FID 👇      | CLIP 👆      | GenEval 👆  | DPG 👆        |\n|--------------------------------------------------------------------------------------------------|------------------------|-------------|------------|---------|-------------|--------------|-------------|---------------|\n| FLUX-dev                                                                                         | 0.04                   | 23.0        | 12.0       | 1.0×    | 10.15       | 27.47        | 0.67        | 84.0          |\n| **Sana-0.6B**                                                                                    | 1.7                    | 0.9         | 0.6        | 39.5×   | _5.81_      | 28.36        | 0.64        | 83.6          |\n| **[Sana-0.6B](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px)**                   | 1.7                    | 0.9         | 0.6        | 39.5×   | **5.61**    | 28.80        | 0.68        | _84.2_        |\n| **[Sana-1.6B](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing)**        | 1.0                    | 1.2         | 1.6        | 23.3×   | 5.92        | _28.94_      
| _0.69_      | \u003cu\u003e84.5\u003c/u\u003e   |\n| **[Sana-1.5 1.6B](https://huggingface.co/Efficient-Large-Model/SANA1.5_1.6B_1024px_diffusers)**  | 1.0                    | 1.2         | 1.6        | 23.3×   | \u003cu\u003e5.70\u003c/u\u003e | \u003cu\u003e29.12\u003c/u\u003e | **0.82**    | \u003cu\u003e84.5\u003c/u\u003e   |\n| **[Sana-1.5 4.8B](https://huggingface.co/Efficient-Large-Model/SANA1.5_4.8B_1024px_diffusers)**  | 0.26                   | 4.2         | 4.8        | 6.5×    | 5.99        | **29.23**    | \u003cu\u003e0.81\u003c/u\u003e | **84.7**      |\n\n\u003cdetails\u003e\n  \u003csummary\u003e\u003ch4\u003eClick to show all performance\u003c/h4\u003e\u003c/summary\u003e\n\n| Methods                      | Throughput (samples/s) | Latency (s) | Params (B) | Speedup   | FID 👇      | CLIP 👆      | GenEval 👆  | DPG 👆      |\n|------------------------------|------------------------|-------------|------------|-----------|-------------|--------------|-------------|-------------|\n| _**512 × 512 resolution**_   |                        |             |            |           |             |              |             |             |\n| PixArt-α                     | 1.5                    | 1.2         | 0.6        | 1.0×      | 6.14        | 27.55        | 0.48        | 71.6        |\n| PixArt-Σ                     | 1.5                    | 1.2         | 0.6        | 1.0×      | _6.34_      | _27.62_      | \u003cu\u003e0.52\u003c/u\u003e | _79.5_      |\n| **Sana-0.6B**                | 6.7                    | 0.8         | 0.6        | 5.0×      | \u003cu\u003e5.67\u003c/u\u003e | \u003cu\u003e27.92\u003c/u\u003e | _0.64_      | \u003cu\u003e84.3\u003c/u\u003e |\n| **Sana-1.6B**                | 3.8                    | 0.6         | 1.6        | 2.5×      | **5.16**    | **28.19**    | **0.66**    | **85.5**    |\n| _**1024 × 1024 resolution**_ |                        |             |            |           |             |              |         
    |             |\n| LUMINA-Next                  | 0.12                   | 9.1         | 2.0        | 2.8×      | 7.58        | 26.84        | 0.46        | 74.6        |\n| SDXL                         | 0.15                   | 6.5         | 2.6        | 3.5×      | 6.63        | _29.03_      | 0.55        | 74.7        |\n| PlayGroundv2.5               | 0.21                   | 5.3         | 2.6        | 4.9×      | _6.09_      | **29.13**    | 0.56        | 75.5        |\n| Hunyuan-DiT                  | 0.05                   | 18.2        | 1.5        | 1.2×      | 6.54        | 28.19        | 0.63        | 78.9        |\n| PixArt-Σ                     | 0.4                    | 2.7         | 0.6        | 9.3×      | 6.15        | 28.26        | 0.54        | 80.5        |\n| DALLE3                       | -                      | -           | -          | -         | -           | -            | _0.67_      | 83.5        |\n| SD3-medium                   | 0.28                   | 4.4         | 2.0        | 6.5×      | 11.92       | 27.83        | 0.62        | \u003cu\u003e84.1\u003c/u\u003e |\n| FLUX-dev                     | 0.04                   | 23.0        | 12.0       | 1.0×      | 10.15       | 27.47        | _0.67_      | _84.0_      |\n| FLUX-schnell                 | 0.5                    | 2.1         | 12.0       | 11.6×     | 7.94        | 28.14        | **0.71**    | **84.8**    |\n| **Sana-0.6B**                | 1.7                    | 0.9         | 0.6        | **39.5×** | \u003cu\u003e5.81\u003c/u\u003e | 28.36        | 0.64        | 83.6        |\n| **Sana-1.6B**                | 1.0                    | 1.2         | 1.6        | **23.3×** | **5.76**    | \u003cu\u003e28.67\u003c/u\u003e | \u003cu\u003e0.66\u003c/u\u003e | **84.8**    |\n\n\u003c/details\u003e\n\n## Contents\n\n- [Env](#-1-dependencies-and-installation)\n- [Demo](#-2-how-to-play-with-sana-inference)\n- [Model Zoo](asset/docs/model_zoo.md)\n- 
[Training](#-3-how-to-train-sana)\n- [Testing](#-4-metric-toolkit)\n- [TODO](#to-do-list)\n- [Citation](#bibtex)\n\n# 🔧 1. Dependencies and Installation\n\n- Python \u003e= 3.10.0 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))\n- [PyTorch \u003e= 2.0.1+cu12.1](https://pytorch.org/)\n\n```bash\ngit clone https://github.com/NVlabs/Sana.git\ncd Sana\n\n./environment_setup.sh sana\n# or you can install each components step by step following environment_setup.sh\n```\n\n# 💻 2. How to Play with Sana (Inference)\n\n## 💰Hardware requirement\n\n- 9GB VRAM is required for 0.6B model and 12GB VRAM for 1.6B model. Our later quantization version will require less than 8GB for inference.\n- All the tests are done on A100 GPUs. Different GPU version may be different.\n\n## 🔛 Choose your model: [Model card](asset/docs/model_zoo.md)\n\n## 🔛 Quick start with [Gradio](https://www.gradio.app/guides/quickstart)\n\n```bash\n# official online demo\nDEMO_PORT=15432 \\\npython app/app_sana.py \\\n    --share \\\n    --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \\\n    --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px_BF16/checkpoints/Sana_1600M_1024px_BF16.pth \\\n    --image_size=1024\n```\n\n### 1. How to use `SanaPipeline` with `🧨diffusers`\n\n\u003e \\[!IMPORTANT\\]\n\u003e Upgrade your `diffusers\u003e=0.32.0.dev` to make the `SanaPipeline` and `SanaPAGPipeline` available!\n\u003e\n\u003e ```bash\n\u003e pip install git+https://github.com/huggingface/diffusers\n\u003e ```\n\u003e\n\u003e Make sure to specify `pipe.transformer` to default `torch_dtype` and `variant` according to [Model Card](asset/docs/model_zoo.md).\n\u003e\n\u003e Set `pipe.text_encoder` to BF16 and `pipe.vae` to FP32 or BF16. 
For more info, [docs](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana#sanapipeline) are here.\n\n```python\n# run `pip install git+https://github.com/huggingface/diffusers` before use Sana in diffusers\nimport torch\nfrom diffusers import SanaPipeline\n\npipe = SanaPipeline.from_pretrained(\n    \"Efficient-Large-Model/SANA1.5_1.6B_1024px_diffusers\",\n    torch_dtype=torch.bfloat16,\n)\npipe.to(\"cuda\")\n\npipe.vae.to(torch.bfloat16)\npipe.text_encoder.to(torch.bfloat16)\n\nprompt = 'a cyberpunk cat with a neon sign that says \"Sana\"'\nimage = pipe(\n    prompt=prompt,\n    height=1024,\n    width=1024,\n    guidance_scale=4.5,\n    num_inference_steps=20,\n    generator=torch.Generator(device=\"cuda\").manual_seed(42),\n)[0]\n\nimage[0].save(\"sana.png\")\n```\n\n### 2. How to use `SanaPAGPipeline` with `🧨diffusers`\n\n\u003cdetails\u003e\n\u003csummary\u003eClick to show all\u003c/summary\u003e\n\n```python\n# run `pip install git+https://github.com/huggingface/diffusers` before use Sana in diffusers\nimport torch\nfrom diffusers import SanaPAGPipeline\n\npipe = SanaPAGPipeline.from_pretrained(\n  \"Efficient-Large-Model/SANA1.5_1.6B_1024px_diffusers\",\n  torch_dtype=torch.bfloat16,\n  pag_applied_layers=\"transformer_blocks.8\",\n)\npipe.to(\"cuda\")\n\npipe.text_encoder.to(torch.bfloat16)\npipe.vae.to(torch.bfloat16)\n\nprompt = 'a cyberpunk cat with a neon sign that says \"Sana\"'\nimage = pipe(\n    prompt=prompt,\n    guidance_scale=5.0,\n    pag_scale=2.0,\n    num_inference_steps=20,\n    generator=torch.Generator(device=\"cuda\").manual_seed(42),\n)[0]\nimage[0].save('sana.png')\n```\n\n\u003c/details\u003e\n\n### 3. 
How to use Sana in this repo\n\n\u003cdetails\u003e\n\u003csummary\u003eClick to show all\u003c/summary\u003e\n\n```python\nimport torch\nfrom app.sana_pipeline import SanaPipeline\nfrom torchvision.utils import save_image\n\ndevice = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\ngenerator = torch.Generator(device=device).manual_seed(42)\n\nsana = SanaPipeline(\"configs/sana1-5_config/1024ms/Sana_1600M_1024px_allqknorm_bf16_lr2e5.yaml\")\nsana.from_pretrained(\"hf://Efficient-Large-Model/SANA1.5_1.6B_1024px/checkpoints/SANA1.5_1.6B_1024px.pth\")\nprompt = 'a cyberpunk cat with a neon sign that says \"Sana\"'\n\nimage = sana(\n    prompt=prompt,\n    height=1024,\n    width=1024,\n    guidance_scale=4.5,\n    pag_guidance_scale=1.0,\n    num_inference_steps=20,\n    generator=generator,\n)\nsave_image(image, 'output/sana.png', nrow=1, normalize=True, value_range=(-1, 1))\n```\n\n\u003c/details\u003e\n\n### 4. Run Sana (Inference) with Docker\n\n\u003cdetails\u003e\n\u003csummary\u003eClick to show all\u003c/summary\u003e\n\n```\n# Pull related models\nhuggingface-cli download google/gemma-2b-it\nhuggingface-cli download google/shieldgemma-2b\nhuggingface-cli download mit-han-lab/dc-ae-f32c32-sana-1.1\nhuggingface-cli download Efficient-Large-Model/Sana_1600M_1024px\n\n# Run with docker\ndocker build . -t sana\ndocker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \\\n    -v ~/.cache:/root/.cache \\\n    sana\n```\n\n\u003c/details\u003e\n\n### 5. 
Run inference with TXT or JSON files\n\n```bash\n# Run samples in a txt file\npython scripts/inference.py \\\n      --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \\\n      --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth \\\n      --txt_file=asset/samples/samples_mini.txt\n\n# Run samples in a json file\npython scripts/inference.py \\\n      --config=configs/sana_config/1024ms/Sana_1600M_img1024.yaml \\\n      --model_path=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth \\\n      --json_file=asset/samples/samples_mini.json\n```\n\nwhere each line of [`asset/samples/samples_mini.txt`](asset/samples/samples_mini.txt) contains a prompt to generate\n\n# 🔥 3. How to Train Sana\n\n## 💰Hardware requirement\n\n- 32GB VRAM is required for both 0.6B and 1.6B model's training\n\n### 1). Train with image-text pairs in directory\n\nWe provide a training example here and you can also select your desired config file from [config files dir](configs/sana_config) based on your data structure.\n\nTo launch Sana training, you will first need to prepare data in the following formats. 
[Here](asset/example_data) is an example for the data structure for reference.\n\n```bash\nasset/example_data\n├── AAA.txt\n├── AAA.png\n├── BCC.txt\n├── BCC.png\n├── ......\n├── CCC.txt\n└── CCC.png\n```\n\nThen Sana's training can be launched via\n\n```bash\n# Example of training Sana 0.6B with 512x512 resolution from scratch\nbash train_scripts/train.sh \\\n  configs/sana_config/512ms/Sana_600M_img512.yaml \\\n  --data.data_dir=\"[asset/example_data]\" \\\n  --data.type=SanaImgDataset \\\n  --model.multi_scale=false \\\n  --train.train_batch_size=32\n\n# Example of fine-tuning Sana 1.6B with 1024x1024 resolution\nbash train_scripts/train.sh \\\n  configs/sana_config/1024ms/Sana_1600M_img1024.yaml \\\n  --data.data_dir=\"[asset/example_data]\" \\\n  --data.type=SanaImgDataset \\\n  --model.load_from=hf://Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth \\\n  --model.multi_scale=false \\\n  --train.train_batch_size=8\n```\n\n### 2). Train with Multi-Scale WebDataset\n\nWe also provide conversion scripts to convert your data to the required format. You can refer to the [data conversion scripts](tools/convert_ImgDataset_to_WebDatasetMS_format.py) for more details.\n\n```bash\npython tools/convert_ImgDataset_to_WebDatasetMS_format.py\n```\n\nThen Sana's training can be launched via\n\n```bash\n# Example of training Sana 0.6B with 512x512 resolution from scratch\nbash train_scripts/train.sh \\\n  configs/sana_config/512ms/Sana_600M_img512.yaml \\\n  --data.data_dir=\"[asset/example_data_tar]\" \\\n  --data.type=SanaWebDatasetMS \\\n  --model.multi_scale=true \\\n  --train.train_batch_size=32\n```\n\n### 3). Train with TAR file\n\nWe prepared a toy TAR dataset containing 100 random images from Journey-DB, duplicated for testing purposes. 
Note that this dataset is not intended for training.\n\n```bash\nhuggingface-cli download Efficient-Large-Model/toy_data --repo-type dataset --local-dir ./data/toy_data --local-dir-use-symlinks False\n```\n\nThen, you are ready to run with FSDP or DDP:\n\n```bash\n# DDP\n# Example of training Sana 1.6B with 1024x1024 resolution from scratch\nbash train_scripts/train.sh \\\n      configs/sana1-5_config/1024ms/Sana_1600M_1024px_allqknorm_bf16_lr2e5.yaml \\\n      --data.data_dir=\"[data/toy_data]\" \\\n      --data.type=SanaWebDatasetMS \\\n      --model.multi_scale=true \\\n      --data.load_vae_feat=true \\\n      --train.train_batch_size=2\n```\n\n```bash\n# FSDP\n# Example of training Sana 1.6B with 1024x1024 resolution from scratch\nbash train_scripts/train.sh \\\n      configs/sana1-5_config/1024ms/Sana_1600M_1024px_AdamW_fsdp.yaml \\\n      --data.data_dir=\"[data/toy_data]\" \\\n      --data.type=SanaWebDatasetMS \\\n      --model.multi_scale=true \\\n      --data.load_vae_feat=true \\\n      --train.use_fsdp=true \\\n      --train.train_batch_size=2\n```\n\n# 💻 4. Metric toolkit\n\nRefer to [Toolkit Manual](asset/docs/metrics_toolkit.md).\n\n# 🚀 5. Inference Scaling\n\nWe trained a specialized [NVILA-2B](https://huggingface.co/Efficient-Large-Model/NVILA-Lite-2B-Verifier) model to score images, which we named VISA (VIla as SAna verifier). 
By selecting the top 4 images from 2,048 candidates, we enhanced the GenEval performance of SD1.5 and SANA-1.5-4.8B v2, increasing their scores from 0.42 to 0.87 and from 0.81 to 0.96, respectively.\nFor details, refer to the [Inference Scaling Manual](asset/docs/inference_scaling/inference_scaling.md).\n\n| Method                         | Overall | Single | Two  | Counting | Colors | Position | Color Attribution |\n|--------------------------------|---------|--------|------|----------|--------|----------|------------------|\n| SD1.5                          | 0.42    | 0.98   | 0.39 | 0.31     | 0.72   | 0.04     | 0.06             |\n| **+ Inference Scaling**        | **0.87** | **1.00** | **0.97** | **0.93** | **0.96** | **0.75** | **0.62** |\n| SANA-1.5 4.8B v2              | 0.81    | 0.99   | 0.86 | 0.86     | 0.84   | 0.59     | 0.65             |\n| **+ Inference Scaling**        | **0.96** | **1.00** | **1.00** | **0.97** | **0.94** | **0.96** | **0.87** |\n\n# 🏃 6. SANA-Sprint\n\nOur SANA-Sprint models focus on timestep distillation, achieving high-quality generation with 1-4 inference steps. 
Refer to [SANA-Sprint Manual](asset/docs/sana_sprint.md) for more details.\n\n\u003cdiv align=\"center\"\u003e\n  \u003ca href=\"https://www.youtube.com/watch?v=nI_Ohgf8eOU\" target=\"_blank\"\u003e\n    \u003cimg src=\"https://img.youtube.com/vi/nI_Ohgf8eOU/0.jpg\" alt=\"Demo Video of SANA-Sprint\" style=\"width: 60%; margin: 0 auto; display: block\"\u003e\n  \u003c/a\u003e\n\u003c/div\u003e\n\n# 💪To-Do List\n\nWe will try our best to achieve\n\n- \\[✅\\] Training code\n- \\[✅\\] Inference code\n- \\[✅\\] Model zoo\n- \\[✅\\] [ComfyUI Nodes](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels)(SANA, SANA-1.5, SANA-Sprint)\n- \\[✅\\] DC-AE Diffusers\n- \\[✅\\] Sana merged in Diffusers(https://github.com/huggingface/diffusers/pull/9982)\n- \\[✅\\] LoRA training by [@paul](https://github.com/sayakpaul)(`diffusers`: https://github.com/huggingface/diffusers/pull/10234)\n- \\[✅\\] 2K/4K resolution models.(Thanks to [@SUPIR](https://github.com/Fanghua-Yu/SUPIR) for providing a 4K super-resolution model)\n- \\[✅\\] 8bit / 4bit Laptop development\n- \\[✅\\] ControlNet (train \u0026 inference \u0026 models)\n- \\[✅\\] FSDP Training\n- \\[✅\\] SANA-1.5 (Larger model size / Inference Scaling)\n- \\[✅\\] SANA-Sprint: Few-step generator\n- \\[🚀\\] Video Generation\n\n# 🤗Acknowledgements\n\n**Thanks to the following open-source codebases for their wonderful work!**\n\n- [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)\n- [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma)\n- [Efficient-ViT](https://github.com/mit-han-lab/efficientvit)\n- [ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels)\n- [SVDQuant and Nunchaku](https://github.com/mit-han-lab/nunchaku)\n- [diffusers](https://github.com/huggingface/diffusers)\n\n## Contribution\n\nThanks goes to these wonderful contributors:\n\n\u003ca href=\"https://github.com/NVlabs/Sana/graphs/contributors\"\u003e\n  \u003cimg src=\"https://contrib.rocks/image?repo=NVlabs/Sana\" 
/\u003e\n\u003c/a\u003e\n\n## 🌟 Star History\n\n[![Star History Chart](https://api.star-history.com/svg?repos=NVlabs/sana\u0026type=Date)](https://www.star-history.com/#NVlabs/sana\u0026Date)\n\n# 📖BibTeX\n\n```\n@misc{xie2024sana,\n      title={Sana: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformer},\n      author={Enze Xie and Junsong Chen and Junyu Chen and Han Cai and Haotian Tang and Yujun Lin and Zhekai Zhang and Muyang Li and Ligeng Zhu and Yao Lu and Song Han},\n      year={2024},\n      eprint={2410.10629},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV},\n      url={https://arxiv.org/abs/2410.10629},\n    }\n@misc{xie2025sana,\n      title={SANA 1.5: Efficient Scaling of Training-Time and Inference-Time Compute in Linear Diffusion Transformer},\n      author={Xie, Enze and Chen, Junsong and Zhao, Yuyang and Yu, Jincheng and Zhu, Ligeng and Lin, Yujun and Zhang, Zhekai and Li, Muyang and Chen, Junyu and Cai, Han and others},\n      year={2025},\n      eprint={2501.18427},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV},\n      url={https://arxiv.org/abs/2501.18427},\n    }\n@misc{chen2025sanasprint,\n      title={SANA-Sprint: One-Step Diffusion with Continuous-Time Consistency Distillation},\n      author={Junsong Chen and Shuchen Xue and Yuyang Zhao and Jincheng Yu and Sayak Paul and Junyu Chen and Han Cai and Song Han and Enze Xie},\n      year={2025},\n      eprint={2503.09641},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV},\n      url={https://arxiv.org/abs/2503.09641},\n    }\n```\n","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2FNVlabs%2FSana","html_url":"https://awesome.ecosyste.ms/projects/github.com%2FNVlabs%2FSana","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2FNVlabs%2FSana/lists"}