https://github.com/mddct/usm-tokenizer

semantic tokenizer for speech and music
https://github.com/mddct/usm-tokenizer
Last synced: 11 months ago
JSON representation
semantic tokenizer for speech and music
Host: GitHub
URL: https://github.com/mddct/usm-tokenizer
Owner: Mddct
Created: 2025-05-18T08:56:18.000Z (about 1 year ago)
Default Branch: main
Last Pushed: 2025-07-06T05:41:49.000Z (11 months ago)
Last Synced: 2025-07-06T06:28:54.598Z (11 months ago)
Language: Python
Size: 32.2 KB
Stars: 20
Watchers: 2
Forks: 3
Open Issues: 3
Metadata Files:
- Readme: README.md
Awesome Lists containing this project

README

          # usm-tokenizer

## ongoing

- [ ] wenet xxxformers with ctc

- [ ] sequence-vq  https://github.com/Mddct/sequence-vector-quantize/blob/main/quantize.py

```

% arXiv preprint 2501.18593 (cs.CV).
@misc{chen2025diffusionautoencodersscalableimage,
  author        = {Yinbo Chen and Rohit Girdhar and Xiaolong Wang and Sai Saketh Rambhatla and Ishan Misra},
  title         = {Diffusion Autoencoders are Scalable Image Tokenizers},
  year          = {2025},
  eprint        = {2501.18593},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2501.18593},
}

% arXiv preprint 2506.09644 (cs.CV).
% Braces keep the acronym "DGAE" upper-case under styles that sentence-case titles.
@misc{liu2025dgaediffusionguidedautoencoderefficient,
  author        = {Dongxu Liu and Yuang Peng and Haomiao Tang and Yuwei Chen and Chunrui Han and Zheng Ge and Daxin Jiang and Mingxue Liao},
  title         = {{DGAE}: Diffusion-Guided Autoencoder for Efficient Latent Representation Learning},
  year          = {2025},
  eprint        = {2506.09644},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2506.09644},
}

% arXiv preprint 2506.00385 (cs.SD).
% Braces protect the model name "MagiCodec" and the proper adjective "Gaussian"
% from being lower-cased by styles that sentence-case titles.
@misc{song2025magicodecsimplemaskedgaussianinjected,
  author        = {Yakun Song and Jiawei Chen and Xiaobin Zhuang and Chenpeng Du and Ziyang Ma and Jian Wu and Jian Cong and Dongya Jia and Zhuo Chen and Yuping Wang and Yuxuan Wang and Xie Chen},
  title         = {{MagiCodec}: Simple Masked {Gaussian}-Injected Codec for High-Fidelity Reconstruction and Generation},
  year          = {2025},
  eprint        = {2506.00385},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2506.00385},
}

% arXiv preprint 2501.01108 (cs.SD).
% Braces keep the mixed-case model name "MuQ" intact under sentence-casing styles.
@misc{zhu2025muqselfsupervisedmusicrepresentation,
  author        = {Haina Zhu and Yizhi Zhou and Hangting Chen and Jianwei Yu and Ziyang Ma and Rongzhi Gu and Yi Luo and Wei Tan and Xie Chen},
  title         = {{MuQ}: Self-Supervised Music Representation Learning with Mel Residual Vector Quantization},
  year          = {2025},
  eprint        = {2501.01108},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2501.01108},
}

% arXiv preprint 2306.12925 (cs.CL).
% Braces keep the mixed-case model name "AudioPaLM" intact under sentence-casing styles.
% "El Badawy, Dalia" uses the comma form so the two-word surname is not split
% (First-Last form would parse "El" as a given name) - NOTE(review): confirm surname.
@misc{rubenstein2023audiopalmlargelanguagemodel,
  author        = {Paul K. Rubenstein and Chulayuth Asawaroengchai and Duc Dung Nguyen and Ankur Bapna and Zalán Borsos and Félix de Chaumont Quitry and Peter Chen and El Badawy, Dalia and Wei Han and Eugene Kharitonov and Hannah Muckenhirn and Dirk Padfield and James Qin and Danny Rozenberg and Tara Sainath and Johan Schalkwyk and Matt Sharifi and Michelle Tadmor Ramanovich and Marco Tagliasacchi and Alexandru Tudor and Mihajlo Velimirović and Damien Vincent and Jiahui Yu and Yongqiang Wang and Vicky Zayats and Neil Zeghidour and Yu Zhang and Zhishuai Zhang and Lukas Zilka and Christian Frank},
  title         = {{AudioPaLM}: A Large Language Model That Can Speak and Listen},
  year          = {2023},
  eprint        = {2306.12925},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2306.12925},
}

```

```

% arXiv preprint 2301.11325 (cs.SD).
% Braces keep the mixed-case model name "MusicLM" intact under sentence-casing styles.
@misc{agostinelli2023musiclmgeneratingmusictext,
  author        = {Andrea Agostinelli and Timo I. Denk and Zalán Borsos and Jesse Engel and Mauro Verzetti and Antoine Caillon and Qingqing Huang and Aren Jansen and Adam Roberts and Marco Tagliasacchi and Matt Sharifi and Neil Zeghidour and Christian Frank},
  title         = {{MusicLM}: Generating Music From Text},
  year          = {2023},
  eprint        = {2301.11325},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2301.11325},
}

```

```

% arXiv preprint 2410.06084 (cs.LG).
% Braces keep the acronym "CFG" upper-case under styles that sentence-case titles.
@misc{cideron2024diversityrewardedcfgdistillation,
  author        = {Geoffrey Cideron and Andrea Agostinelli and Johan Ferret and Sertan Girgin and Romuald Elie and Olivier Bachem and Sarah Perrin and Alexandre Ramé},
  title         = {Diversity-Rewarded {CFG} Distillation},
  year          = {2024},
  eprint        = {2410.06084},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2410.06084},
}

```

```

% arXiv preprint 2412.18603; the identifier was previously carried in
% journal/volume ("CoRR", "abs/2412.18603"). It now uses the same
% eprint/archivePrefix/url fields as the other entries in this list.
% primaryClass is left out because the original entry did not record it.
@misc{park2024long,
  author        = {Se Jin Park and Julian Salazar and Aren Jansen and Keisuke Kinoshita and Yong Man Ro and R. J. Skerry{-}Ryan},
  title         = {Long-Form Speech Generation with Spoken Language Models},
  year          = {2024},
  eprint        = {2412.18603},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2412.18603},
}

```

```

% arXiv preprint 2305.09636 (cs.SD).
% Braces keep the mixed-case model name "SoundStorm" intact under sentence-casing styles.
@misc{borsos2023soundstormefficientparallelaudio,
  author        = {Zalán Borsos and Matt Sharifi and Damien Vincent and Eugene Kharitonov and Neil Zeghidour and Marco Tagliasacchi},
  title         = {{SoundStorm}: Efficient Parallel Audio Generation},
  year          = {2023},
  eprint        = {2305.09636},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2305.09636},
}

```

```

% arXiv preprint 2505.07916 (eess.AS).
% Braces keep the product name "MiniMax-Speech" intact under sentence-casing styles.
@misc{zhang2025minimaxspeechintrinsiczeroshottexttospeech,
  author        = {Bowen Zhang and Congchao Guo and Geng Yang and Hang Yu and Haozhe Zhang and Heidi Lei and Jialong Mai and Junjie Yan and Kaiyue Yang and Mingqi Yang and Peikai Huang and Ruiyang Jin and Sitan Jiang and Weihua Cheng and Yawei Li and Yichen Xiao and Yiying Zhou and Yongmao Zhang and Yuan Lu and Yucen He},
  title         = {{MiniMax-Speech}: Intrinsic Zero-Shot Text-to-Speech with a Learnable Speaker Encoder},
  year          = {2025},
  eprint        = {2505.07916},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2505.07916},
}

```

```

% arXiv preprint 2302.03540 (cs.SD).
@misc{kharitonov2023speakreadprompthighfidelity,
  author        = {Eugene Kharitonov and Damien Vincent and Zalán Borsos and Raphaël Marinier and Sertan Girgin and Olivier Pietquin and Matt Sharifi and Marco Tagliasacchi and Neil Zeghidour},
  title         = {Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision},
  year          = {2023},
  eprint        = {2302.03540},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2302.03540},
}

```

```

% arXiv preprint 2406.02430 (eess.AS).
% Braces keep the model name "Seed-TTS" (with its acronym) intact under sentence-casing styles.
@misc{anastassiou2024seedttsfamilyhighqualityversatile,
  author        = {Philip Anastassiou and Jiawei Chen and Jitong Chen and Yuanzhe Chen and Zhuo Chen and Ziyi Chen and Jian Cong and Lelai Deng and Chuang Ding and Lu Gao and Mingqing Gong and Peisong Huang and Qingqing Huang and Zhiying Huang and Yuanyuan Huo and Dongya Jia and Chumin Li and Feiya Li and Hui Li and Jiaxin Li and Xiaoyang Li and Xingxing Li and Lin Liu and Shouda Liu and Sichao Liu and Xudong Liu and Yuchen Liu and Zhengxi Liu and Lu Lu and Junjie Pan and Xin Wang and Yuping Wang and Yuxuan Wang and Zhen Wei and Jian Wu and Chao Yao and Yifeng Yang and Yuanhao Yi and Junteng Zhang and Qidi Zhang and Shuo Zhang and Wenjie Zhang and Yang Zhang and Zilin Zhao and Dejian Zhong and Xiaobin Zhuang},
  title         = {{Seed-TTS}: A Family of High-Quality Versatile Speech Generation Models},
  year          = {2024},
  eprint        = {2406.02430},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2406.02430},
}

```

```

% arXiv preprint 2407.05407 (cs.SD).
% Braces keep the mixed-case model name "CosyVoice" intact under sentence-casing styles.
@misc{du2024cosyvoicescalablemultilingualzeroshot,
  author        = {Zhihao Du and Qian Chen and Shiliang Zhang and Kai Hu and Heng Lu and Yexin Yang and Hangrui Hu and Siqi Zheng and Yue Gu and Ziyang Ma and Zhifu Gao and Zhijie Yan},
  title         = {{CosyVoice}: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens},
  year          = {2024},
  eprint        = {2407.05407},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2407.05407},
}

```

```

% arXiv preprint 2412.10117 (cs.SD).
% Braces keep the mixed-case model name "CosyVoice" intact under sentence-casing styles.
@misc{du2024cosyvoice2scalablestreaming,
  author        = {Zhihao Du and Yuxuan Wang and Qian Chen and Xian Shi and Xiang Lv and Tianyu Zhao and Zhifu Gao and Yexin Yang and Changfeng Gao and Hui Wang and Fan Yu and Huadai Liu and Zhengyan Sheng and Yue Gu and Chong Deng and Wen Wang and Shiliang Zhang and Zhijie Yan and Jingren Zhou},
  title         = {{CosyVoice} 2: Scalable Streaming Speech Synthesis with Large Language Models},
  year          = {2024},
  eprint        = {2412.10117},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2412.10117},
}

```

```

% arXiv preprint 2412.02612 (cs.CL).
% Braces keep the model name "GLM-4-Voice" (with its acronym) intact under sentence-casing styles.
@misc{zeng2024glm4voiceintelligenthumanlikeendtoend,
  author        = {Aohan Zeng and Zhengxiao Du and Mingdao Liu and Kedong Wang and Shengmin Jiang and Lei Zhao and Yuxiao Dong and Jie Tang},
  title         = {{GLM-4-Voice}: Towards Intelligent and Human-Like End-to-End Spoken Chatbot},
  year          = {2024},
  eprint        = {2412.02612},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2412.02612},
}

```
ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Awesome

https://github.com/mddct/usm-tokenizer

Awesome Lists containing this project

README