Awesome-LLM-Compression
  
  
    Awesome LLM compression research papers and tools. 
    https://github.com/HuangOwen/Awesome-LLM-Compression
  
        Last synced: about 2 hours ago 
        JSON representation
    
- 
            
Papers
- 
                    
Quantization
- [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - han-lab/smoothquant)
 - [Paper
 - [Paper - tYCaP0phY_&name=supplementary_material)
 - [Paper
 - [Paper - DASLab/gptq)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - han-lab/llm-awq)
 - [Paper - QAT)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - ai/INT-FP-QSim)
 - [Paper - DASLab/QIGen)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - compressor)
 - [Paper - lora)
 - [Paper
 - [Paper - LLM)
 - [Paper
 - [Paper - Transformers)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - compressor)
 - [Paper - AMP)
 - [Paper - DASLab/QUIK)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - AI-research/outlier-free-transformers)
 - [Paper - extension-for-transformers)
 - [Paper
 - [Paper - 98/llm-mixed-q)
 - [Paper
 - [Paper - Watermark)
 - [Paper
 - [Paper - FP4)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - chee/QuIP)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - dmx/project-resq)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - yongqi/systematic-outliers)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - 778/SliM-LLM)
 - [Paper
 - [Paper
 - [Paper - easl/deltazip)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - llm)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - 778/BiLLM)
 - [Paper - RelaxML/quip-sharp)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - qlora)
 - [Paper
 - [Paper
 - [Paper - DuDa/BitDistiller)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - RelaxML/qtip)
 - [Paper - Lab/moe-quantization)
 - [Paper
 - [Paper
 - [Paper - han-lab/qserve)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Quantization) [[Model]](https://huggingface.co/LLMQ)
 - [Paper
 - [Paper - pretrain)
 - [Paper
 - [Paper
 - [Paper - Point-RND/GIFT_SW-v2-Gaussian-noise-Injected-Fine-Tuning-of-Salient-Weights-for-LLMs)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - DASLab/marlin) [[Code (Sparse Marlin)]](https://github.com/IST-DASLab/Sparse-Marlin)
 - [Paper
 - [Paper - fi/MobileQuant)
 - [Paper
 - [Paper
 - [Paper - ai-research/gptvq)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Ouyang)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - compensation)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - LLM)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Computing-Lab-Yale/TesseraQ)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - ml/SageAttention)
 - [Paper
 - [Paper
 - [Paper - lab/MX-QLLM)
 - [Paper
 - [Paper - HPCA-25)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - EIC/ShiftAddLLM)
 - [Paper
 - [Paper
 - [Paper - Inscrutable-X/TACQ)
 - [Paper
 - [Paper - round)
 - [Paper - EIC/Edge-LLM)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - MAC)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Group/Q-GaLore)
 - [Paper - AILab/flash-attention)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - lab/EfficientLLMs)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Computing-Lab-Yale/GPTQv2)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - mllab/GuidedQuant)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - QAF)
 - [Paper - all-the-way)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 
 - 
                    
Pruning and Sparsity
- [Paper
 - [Paper
 - [Paper
 - [Paper - PruMerge)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Pruner)
 - [Paper - DASLab/ZipLM)
 - [Paper
 - [Paper
 - [Paper - Group/essential_sparsity)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - DASLab/sparsegpt)
 - [Paper
 - [Paper - science/llm-interpret)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - llm)
 - [Paper
 - [Paper - Group/Junk_DNA_Hypothesis)
 - [Paper
 - [Paper
 - [Paper - nlp/LLM-Shearing)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Pruner)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - pruning)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Mozaffari/slim)
 - [Paper
 - [Paper
 - [Paper - Xing2/EfficientLLM)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Aware-Automated-Machine-Learning/tree/main/Shears)
 - [Paper
 - [Paper
 - [Paper - Pruner)
 - [Paper
 - [Paper - DASLab/EvoPress)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - han-lab/Quest)
 - [Paper - Barber)
 - [Paper
 - [Paper - Aware-Tuning)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - cybernetics/Relative-importance-and-activation-pruning)
 - [Paper - AI-Lab/Sirius)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Aware-Automated-Machine-Learning/tree/main/LoNAS)
 - [Paper - Aware-Automated-Machine-Learning/tree/main/SQFT)
 - [Paper - ai/ReplaceMe)
 - [Paper
 - [Paper - Zero)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - PEFT)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - He/LLM-Drop)
 - [Paper
 - [Paper - lab/shadow_llm/)
 - [Paper - nlp/Edge-Pruning)
 - [Paper - research/EEP)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Aware-Automated-Machine-Learning/tree/main/MultiPruner)
 - [Paper - Aware-Automated-Machine-Learning/tree/main/Mamba-Shedder)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - miu.github.io/Dobi-SVD.page/)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Aware-Automated-Machine-Learning/tree/main/LoNAS)
 
 - 
                    
Distillation
- [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - nlp/LaMini-LM)
 - [Paper
 - [Paper
 - [Paper - ai/gpt4all)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - kd)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Collaborative-Knowledge-Distillation)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - sy/lillama)
 - [Paper
 - [Paper - distillation)
 - [Paper
 - [Paper
 - [Paper - neo-66e3c882f5579b829ff57eba)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - NeMo-Minitron-8B-Base)
 
 - 
                    
Efficient Prompting
- [Paper
 - [Paper - for-Prompt-Compression)
 - [Paper - instruction-effectiveness)
 - [Paper - prompting)
 - [Paper - nlp/AutoCompressors)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - mllab/context-memory)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - COCO)
 - [Paper
 - [Paper - Influx)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 
 - 
                    
KV Cache Compression
- [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - ai/lexico)
 - [Paper
 - [Paper
 - [Paper - ai/SCOPE)
 - [Paper
 - [Paper
 - [Paper - KVCacheQuantization)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - yuan/KIVI)
 - [Paper
 - [Paper - sg/SimLayerKV)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - AdaQuant-8F58)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - NACL)
 - [Paper
 - [Paper - Lab/ZeroMerge)
 - [Paper
 - [Paper - TAO/VidKV)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - lab/xKV)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - chen/Titanus-for-LLM-acceleration)
 - [Paper
 - [Paper - nics/PM-KVQ)
 - [Paper - mllab/KVzip)
 - [Paper
 - [Paper - kv-compression)
 - [Paper
 
 - 
                    
Survey
- [Paper
 - [Paper
 - [Paper - MLSys-Lab/Efficient-LLMs-Survey)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Low-Precision-Training)
 - [Paper - LLMs-on-device) [[Download On-device LLMs]](https://nexaai.com/models)
 - [Paper - Knowledge-Distillation-of-LLMs)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - for-Prompt-Compression)
 - [Paper
 - [Paper - LLM-Inference-Engine)
 - [Paper - LLM-Survey)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 
 - 
                    
Other
- [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - Scheduling)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - research/LongLoRA)
 - [Paper
 - [Paper
 - [Paper - han-lab/streaming-llm)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - research/Dataset_Quantization)
 - [Paper - zha/Align)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - han-lab/duo-attention)
 - [Paper - IPADS/PowerInfer)
 - [Paper - Lab-UMD/Unified-MoE-Compression)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - lin/RapidIn)
 - [Paper
 - [Paper - AILab/flash-attention)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - MLSys-Lab/SVD-LLM)
 - [Paper
 - [Paper
 - [Paper
 - [Paper - ai-lab/Consistency_LLM)
 - [Paper - AI-Lab/TriForce)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - ml/SageAttention)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - MLSys-Lab/SVD-LLM)
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper
 - [Paper - lab/PromptDistill)
 - [Paper
 
 
 - 
                    
 - 
            
Tools
 - 
            
Star History
- 
                    
Other
- [![Star History Chart](https://api.star-history.com/svg?repos=HuangOwen/Awesome-LLM-Compression&type=Date)](https://star-history.com/#HuangOwen/Awesome-LLM-Compression&Date)
 
 
 - 
                    
 
            Programming Languages
          
          
        
            Categories
          
          
        
            Sub Categories
          
          
        
            Keywords
          
          
              
                quantization
                9
              
              
                llm
                9
              
              
                llama
                7
              
              
                large-language-models
                6
              
              
                pytorch
                4
              
              
                gpt
                3
              
              
                pruning
                3
              
              
                post-training-quantization
                3
              
              
                llama3
                3
              
              
                qwen
                2
              
              
                cpp
                2
              
              
                ai
                2
              
              
                qlora
                2
              
              
                awq
                2
              
              
                deep-learning
                2
              
              
                c
                2
              
              
                smoothquant
                2
              
              
                transformer
                2
              
              
                alpaca
                2
              
              
                chatglm
                2
              
              
                llama2
                2
              
              
                instruction-tuning
                2
              
              
                lora
                2
              
              
                llamacpp
                2
              
              
                transformers
                2
              
              
                quantization-aware-training
                2
              
              
                language-model
                2
              
              
                lama
                1
              
              
                lamacpp
                1
              
              
                model-para
                1
              
              
                python
                1
              
              
                tensorrt
                1
              
              
                sparse
                1
              
              
                rwkv
                1
              
              
                chatbot
                1
              
              
                onnxruntime
                1
              
              
                chatgpt
                1
              
              
                cot
                1
              
              
                moss
                1
              
              
                p-tuning
                1
              
              
                parameter-efficient
                1
              
              
                tabul
                1
              
              
                tabular-data
                1
              
              
                tabular-model
                1
              
              
                amd
                1
              
              
                cuda
                1
              
              
                deepseek
                1
              
              
                hpu
                1
              
              
                inference
                1
              
              
                inferentia
                1