{"id":32633685,"url":"https://github.com/mwasifanwar/llm-mastery","last_synced_at":"2026-04-16T15:32:53.404Z","repository":{"id":321610293,"uuid":"1086495814","full_name":"mwasifanwar/llm-mastery","owner":"mwasifanwar","description":"The most comprehensive educational resource on Large Language Models ever created. A guide systematically building understanding from absolute fundamentals to cutting-edge research.","archived":false,"fork":false,"pushed_at":"2025-10-30T15:01:30.000Z","size":504,"stargazers_count":1,"open_issues_count":0,"forks_count":0,"subscribers_count":0,"default_branch":"main","last_synced_at":"2025-10-30T16:25:36.303Z","etag":null,"topics":["ai-education","artificial-intelligence","attention-mechanism","deep-learning","fine-tuning","huggingface","large-language-models","llm","machine-learning","model-deployment","model-training","neural-networks","nlp","pre-training","pytorch","research-paper","transformer","transformer-architecture"],"latest_commit_sha":null,"homepage":"https://mwasif.dev","language":null,"has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/mwasifanwar.png","metadata":{"files":{"readme":"README.md","changelog":null,"contributing":null,"funding":null,"license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null,"zenodo":null,"notice":null,"maintainers":null,"copyright":null,"agents":null,"dco":null,"cla":null}},"created_at":"2025-10-30T13:53:40.000Z","updated_at":"2025-10-30T15:01:35.000Z","dependencies_parsed_at":"2025-10-30T16:26:50.749Z","dependency_job_id":"fc36a129-906d-4f8a-95fb-ea6ab2ef1d7c","html_url":"https://github.com/mwasifanwar/llm-mastery","commit_stats":null,"previous_names":["mwasifanwar/llm-mastery"],"tags_count":nul
l,"template":false,"template_full_name":null,"purl":"pkg:github/mwasifanwar/llm-mastery","repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mwasifanwar%2Fllm-mastery","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mwasifanwar%2Fllm-mastery/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mwasifanwar%2Fllm-mastery/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mwasifanwar%2Fllm-mastery/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/mwasifanwar","download_url":"https://codeload.github.com/mwasifanwar/llm-mastery/tar.gz/refs/heads/main","sbom_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/mwasifanwar%2Fllm-mastery/sbom","scorecard":null,"host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":286080680,"owners_count":31892269,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2026-04-16T11:36:10.202Z","status":"ssl_error","status_checked_at":"2026-04-16T11:36:09.652Z","response_time":69,"last_error":"SSL_connect returned=1 errno=0 peeraddr=140.82.121.5:443 state=error: unexpected eof while 
reading","robots_txt_status":"success","robots_txt_updated_at":"2025-07-24T06:49:26.215Z","robots_txt_url":"https://github.com/robots.txt","online":false,"can_crawl_api":true,"host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["ai-education","artificial-intelligence","attention-mechanism","deep-learning","fine-tuning","huggingface","large-language-models","llm","machine-learning","model-deployment","model-training","neural-networks","nlp","pre-training","pytorch","research-paper","transformer","transformer-architecture"],"created_at":"2025-10-30T23:54:06.477Z","updated_at":"2026-04-16T15:32:53.387Z","avatar_url":"https://github.com/mwasifanwar.png","language":null,"funding_links":[],"categories":[],"sub_categories":[],"readme":"\u003ch1\u003eThe Complete Large Language Model (LLM) Guide\u003c/h1\u003e\n\n\u003cp\u003e\u003cstrong\u003eFrom Fundamentals to Advanced Implementation\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003eA comprehensive, research-grade resource covering the complete spectrum of Large Language Models - from mathematical foundations to production deployment and ethical considerations.\u003c/p\u003e\n\n\u003cimg width=\"784\" height=\"538\" alt=\"image\" src=\"https://github.com/user-attachments/assets/271dbf41-e184-4c60-b256-83e87a6571e5\" /\u003e\n\n\n\u003cdiv style=\"background: #f5f5f5; padding: 15px; border-left: 4px solid #007acc; margin: 20px 0;\"\u003e\n\u003cstrong\u003e🚀 Quick Start\u003c/strong\u003e\u003cbr\u003e\nThis guide progresses from fundamental concepts to advanced research frontiers. 
Each chapter builds upon previous knowledge with practical implementations and mathematical rigor.\n\u003c/div\u003e\n\n\u003ch1 align=\"center\"\u003e📚 Complete LLM Guide - Table of Contents\u003c/h1\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003ci\u003eYour Comprehensive Roadmap to Mastering Large Language Models\u003c/i\u003e\n\u003c/div\u003e\n\n\u003cbr\u003e\n\n\u003cdiv style=\"display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 2rem 0;\"\u003e\n\n\u003c!-- Column 1 --\u003e\n\u003cdiv style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 1.5rem; border-radius: 10px; color: white;\"\u003e\n\u003ch3\u003e🎯 Foundation \u0026 Fundamentals\u003c/h3\u003e\n\u003cul style=\"list-style: none; padding: 0;\"\u003e\n\u003cli\u003e• \u003ca href=\"#introduction\" style=\"color: white;\"\u003e1. LLM Revolution\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#learning-path\" style=\"color: white;\"\u003e2. Learning Pathway\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#mathematical-foundations\" style=\"color: white;\"\u003e3. Math Foundations\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#programming-fundamentals\" style=\"color: white;\"\u003e4. Programming\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#neural-networks\" style=\"color: white;\"\u003e5. Neural Networks\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003c!-- Column 2 --\u003e\n\u003cdiv style=\"background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 1.5rem; border-radius: 10px; color: white;\"\u003e\n\u003ch3\u003e⚡ Core Architecture\u003c/h3\u003e\n\u003cul style=\"list-style: none; padding: 0;\"\u003e\n\u003cli\u003e• \u003ca href=\"#transformer-architecture\" style=\"color: white;\"\u003e6. Transformers\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#attention-mechanisms\" style=\"color: white;\"\u003e7. 
Attention\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#training-methodologies\" style=\"color: white;\"\u003e8. Training Methods\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#fine-tuning-techniques\" style=\"color: white;\"\u003e9. Fine-tuning\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#inference-optimization\" style=\"color: white;\"\u003e10. Inference\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003c!-- Column 3 --\u003e\n\u003cdiv style=\"background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); padding: 1.5rem; border-radius: 10px; color: white;\"\u003e\n\u003ch3\u003e🚀 Production \u0026 Beyond\u003c/h3\u003e\n\u003cul style=\"list-style: none; padding: 0;\"\u003e\n\u003cli\u003e• \u003ca href=\"#evaluation-framework\" style=\"color: white;\"\u003e11. Evaluation\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#production-deployment\" style=\"color: white;\"\u003e12. Deployment\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#research-frontiers\" style=\"color: white;\"\u003e13. Research\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#ethical-considerations\" style=\"color: white;\"\u003e14. Ethics\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e• \u003ca href=\"#future-directions\" style=\"color: white;\"\u003e15. 
Future\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003c/div\u003e\n\n\u003cimg width=\"933\" height=\"545\" alt=\"image\" src=\"https://github.com/user-attachments/assets/9dcf621a-0477-40a7-8fc8-55146cdeca8d\" /\u003e\n\n\n\u003cbr\u003e\n\n\u003cdiv style=\"background: #f8f9fa; padding: 1.5rem; border-radius: 10px; border-left: 4px solid #007acc;\"\u003e\n\u003ch3\u003e📖 Detailed Chapter Breakdown\u003c/h3\u003e\n\n\u003ctable style=\"width: 100%; border-collapse: collapse;\"\u003e\n\u003ctr style=\"background-color: #e3f2fd;\"\u003e\n\u003cth style=\"padding: 12px; text-align: left; border-bottom: 2px solid #007acc;\"\u003eChapter\u003c/th\u003e\n\u003cth style=\"padding: 12px; text-align: left; border-bottom: 2px solid #007acc;\"\u003eKey Topics\u003c/th\u003e\n\u003cth style=\"padding: 12px; text-align: left; border-bottom: 2px solid #007acc;\"\u003eLevel\u003c/th\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e1. LLM Revolution\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eHistory, Evolution, Current Landscape\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eBeginner\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e2. Learning Pathway\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eRoadmap, Prerequisites, Timeline\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eBeginner\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e3. 
Math Foundations\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eLinear Algebra, Probability, Calculus\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eIntermediate\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e4. Programming\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003ePyTorch, Distributed Training, GPU\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eIntermediate\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e5. Neural Networks\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eArchitectures, Backpropagation, Optimization\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eIntermediate\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e6. Transformers\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eSelf-Attention, Positional Encoding, Implementation\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eAdvanced\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e7. 
Attention\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eMulti-Head, Sparse, Efficient Mechanisms\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eAdvanced\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e8. Training Methods\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003ePre-training, Scaling Laws, Distributed Training\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eAdvanced\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e9. Fine-tuning\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eLoRA, Adapters, RLHF, Prompt Tuning\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eAdvanced\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e10. Inference\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eQuantization, Pruning, Speculative Decoding\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eExpert\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e11. 
Evaluation\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eBenchmarks, Safety, Bias Detection\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eExpert\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e12. Deployment\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eServing, Scaling, Monitoring, Load Balancing\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eExpert\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e13. Research\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eMoE, SSMs, Multimodal, Reasoning\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eResearch\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr style=\"background-color: #f8f9fa;\"\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e14. Ethics\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eBias, Fairness, Transparency, Privacy\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eAll Levels\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003ctr\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003e\u003cstrong\u003e15. 
Future\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eScaling, Governance, AI Safety, Impact\u003c/td\u003e\n\u003ctd style=\"padding: 12px; border-bottom: 1px solid #ddd;\"\u003eVisionary\u003c/td\u003e\n\u003c/tr\u003e\n\n\u003c/table\u003e\n\u003c/div\u003e\n\n\u003cbr\u003e\n\n\u003cdiv style=\"text-align: center; background: #e8f5e8; padding: 1rem; border-radius: 10px; border: 2px dashed #4caf50;\"\u003e\n\u003ch3\u003e🚀 Ready to Begin Your Journey?\u003c/h3\u003e\n\u003cp\u003eStart with Chapter 1 and progress systematically through each section. Each chapter builds upon previous knowledge!\u003c/p\u003e\n\u003cstrong\u003eTotal Learning Time: ~6-12 months | Prerequisites: Python, Basic Math\u003c/strong\u003e\n\u003c/div\u003e\n\n\u003cbr\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\u003c/html\u003e\n\n\u003ch2 id=\"introduction\"\u003e1. Introduction to the LLM Revolution\u003c/h2\u003e\n\n\u003ch3\u003e1.1 What are Large Language Models?\u003c/h3\u003e\n\n\u003cp\u003eLarge Language Models (LLMs) represent a paradigm shift in artificial intelligence, leveraging deep neural networks with billions to trillions of parameters to understand, generate, and reason with human language.\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eCore Characteristics:\u003c/strong\u003e\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eScale\u003c/strong\u003e: Model sizes ranging from millions to trillions of parameters\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eArchitecture\u003c/strong\u003e: Primarily Transformer-based neural networks\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eTraining\u003c/strong\u003e: Self-supervised learning on massive text corpora\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eEmergent Abilities\u003c/strong\u003e: Reasoning, code generation, mathematical 
problem-solving\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cimg width=\"516\" height=\"684\" alt=\"image\" src=\"https://github.com/user-attachments/assets/daa70c9f-111e-41f4-8d66-6bb682a07ff5\" /\u003e\n\n\n\u003ch3\u003e1.2 Historical Evolution Timeline\u003c/h3\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003eEra\u003c/th\u003e\n    \u003cth\u003eTimeline\u003c/th\u003e\n    \u003cth\u003eKey Models\u003c/th\u003e\n    \u003cth\u003eBreakthroughs\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eStatistical\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e1990-2010\u003c/td\u003e\n    \u003ctd\u003eN-gram models, HMMs\u003c/td\u003e\n    \u003ctd\u003eProbabilistic language modeling\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eNeural\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e2013-2017\u003c/td\u003e\n    \u003ctd\u003eWord2Vec, LSTM, GRU\u003c/td\u003e\n    \u003ctd\u003eDistributed representations, sequence modeling\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eTransformer\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e2017-2018\u003c/td\u003e\n    \u003ctd\u003eOriginal Transformer\u003c/td\u003e\n    \u003ctd\u003eSelf-attention mechanism, parallel processing\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003ePre-training\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e2018-2020\u003c/td\u003e\n    \u003ctd\u003eBERT, GPT-2, RoBERTa\u003c/td\u003e\n    \u003ctd\u003eTransfer learning, bidirectional context\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eLarge-scale\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e2020-2022\u003c/td\u003e\n    \u003ctd\u003eGPT-3, T5, PaLM\u003c/td\u003e\n    \u003ctd\u003eFew-shot 
learning, scaling laws, reasoning\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eModern\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e2022-Present\u003c/td\u003e\n    \u003ctd\u003eGPT-4, Claude, Llama, Mistral\u003c/td\u003e\n    \u003ctd\u003eMultimodality, alignment, open-weight models\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e1.3 Scale Progression Analysis\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003e# Parameter count evolution (2018-2024)\nModel Scaling Timeline:\n├── ELMo (2018): 94 million parameters\n├── BERT-base (2018): 110 million parameters\n├── GPT-1 (2018): 117 million parameters\n├── GPT-2 (2019): 1.5 billion parameters\n├── T5 (2020): 11 billion parameters\n├── GPT-3 (2020): 175 billion parameters\n├── PaLM (2022): 540 billion parameters\n├── GPT-4 (2023): ~1.7 trillion parameters (estimated)\n└── Gemini Ultra (2024): ~? 
trillion parameters\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cimg width=\"664\" height=\"675\" alt=\"image\" src=\"https://github.com/user-attachments/assets/c2676e7a-bfaa-4350-b350-7ba41a5af149\" /\u003e\n\n\n\u003ch3\u003e1.4 Current Model Landscape\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eMajor Model Families:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cdiv style=\"display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px;\"\u003e\n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eGPT Series (OpenAI)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003eGenerative Pre-trained Transformers\u003c/li\u003e\n      \u003cli\u003eAutoregressive decoder-only architecture\u003c/li\u003e\n      \u003cli\u003eStrong few-shot learning capabilities\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n  \n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eBERT Family (Google)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003eBidirectional Encoder Representations\u003c/li\u003e\n      \u003cli\u003eMasked language modeling objective\u003c/li\u003e\n      \u003cli\u003eExcellent for understanding tasks\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n  \n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eT5 Framework (Google)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003eText-to-Text Transfer Transformer\u003c/li\u003e\n      \u003cli\u003eUnified framework for all NLP tasks\u003c/li\u003e\n      \u003cli\u003eEncoder-decoder architecture\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n  \n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eLlama Series (Meta)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003eOpen-weight foundation models\u003c/li\u003e\n      \u003cli\u003eEfficient pre-training approaches\u003c/li\u003e\n      \u003cli\u003eStrong 
performance per parameter\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e1.5 Core Architectural Concepts\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003eHigh-Level LLM Architecture:\nInput Text → Tokenization → Embedding → Transformer Blocks → Output Head → Generated Text\n    │           │             │              │                 │\n    │           │             │              └── Multi-Head Attention\n    │           │             │                  Layer Normalization\n    │           │             │                  Feed-Forward Networks\n    │           │             └── Word/Position Embeddings\n    │           └── Subword Tokenization (BPE, SentencePiece)\n    └── Prompt/Context\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cimg width=\"776\" height=\"625\" alt=\"image\" src=\"https://github.com/user-attachments/assets/876b938e-24ac-46b7-b10c-06506b0d6442\" /\u003e\n\n\n\u003ch2 id=\"learning-path\"\u003e2. Complete Learning Pathway\u003c/h2\u003e\n\n\u003ch3\u003e2.1 Prerequisite Knowledge Map\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eEssential Foundations:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px;\"\u003e\n\u003ch4\u003e🟦 Beginner Level (Months 1-3)\u003c/h4\u003e\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003ePython Programming\u003c/strong\u003e: OOP, data structures, libraries\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eLinear Algebra\u003c/strong\u003e: Vectors, matrices, transformations\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eProbability \u0026 Statistics\u003c/strong\u003e: Distributions, Bayes theorem\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eCalculus\u003c/strong\u003e: Derivatives, gradients, chain rule\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 10px;\"\u003e\n\u003ch4\u003e🟩 Intermediate Level (Months 
4-6)\u003c/h4\u003e\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eDeep Learning Fundamentals\u003c/strong\u003e: Neural networks, backpropagation\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003ePyTorch/TensorFlow\u003c/strong\u003e: Model implementation, training loops\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eNLP Basics\u003c/strong\u003e: Tokenization, word embeddings, RNNs\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eSoftware Engineering\u003c/strong\u003e: Version control, testing, APIs\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 10px;\"\u003e\n\u003ch4\u003e🟪 Advanced Level (Months 7-12)\u003c/h4\u003e\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eTransformer Architecture\u003c/strong\u003e: Self-attention, positional encoding\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eDistributed Training\u003c/strong\u003e: Data/model parallelism, mixed precision\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eOptimization Theory\u003c/strong\u003e: Loss landscapes, convergence analysis\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eResearch Methodology\u003c/strong\u003e: Paper reading, experimental design\u003c/li\u003e\n\u003c/ul\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e2.2 Progressive Learning Roadmap\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003eLearning Progression (12-Month Plan):\nMonth 1-2: Mathematical Foundations \u0026 Python\nMonth 3-4: Deep Learning Basics \u0026 PyTorch\nMonth 5-6: NLP Fundamentals \u0026 Classical Models\nMonth 7-8: Transformer Architecture \u0026 Implementation\nMonth 9-10: Pre-training \u0026 Fine-tuning Techniques\nMonth 11-12: Advanced Topics \u0026 Research Projects\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cimg width=\"487\" height=\"602\" alt=\"image\" src=\"https://github.com/user-attachments/assets/33c253db-2ab0-4925-802c-edd79472e1b5\" /\u003e\n\n\n\u003ch3\u003e2.3 Practical Project 
Timeline\u003c/h3\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003ePhase\u003c/th\u003e\n    \u003cth\u003eProjects\u003c/th\u003e\n    \u003cth\u003eTechnologies\u003c/th\u003e\n    \u003cth\u003eOutcomes\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eBeginner\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eText classification, Named Entity Recognition\u003c/td\u003e\n    \u003ctd\u003escikit-learn, spaCy, BERT\u003c/td\u003e\n    \u003ctd\u003eBasic NLP pipeline understanding\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eIntermediate\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eTransformer from scratch, Fine-tuning LLMs\u003c/td\u003e\n    \u003ctd\u003ePyTorch, HuggingFace, WandB\u003c/td\u003e\n    \u003ctd\u003eArchitecture mastery, training workflows\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eAdvanced\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003ePre-training small LLM, Optimization techniques\u003c/td\u003e\n    \u003ctd\u003eDeepSpeed, FlashAttention, vLLM\u003c/td\u003e\n    \u003ctd\u003eProduction-grade model development\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003ch2 id=\"mathematical-foundations\"\u003e3. 
Mathematical Foundations\u003c/h2\u003e\n\n\u003ch3\u003e3.1 Linear Algebra Essentials\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eVector and Matrix Operations:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003eGiven vectors $x, y \\in \\mathbb{R}^n$ and matrices $A, B \\in \\mathbb{R}^{m \\times n}$:\u003c/p\u003e\n\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eDot Product\u003c/strong\u003e: $x \\cdot y = \\sum_{i=1}^{n} x_i y_i$\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eMatrix Multiplication\u003c/strong\u003e: $(AB)_{ij} = \\sum_{k=1}^{n} A_{ik} B_{kj}$\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eTranspose Properties\u003c/strong\u003e: $(A^T)_{ij} = A_{ji}$\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cp\u003e\u003cstrong\u003eEigen decomposition:\u003c/strong\u003e For square matrix $A$,\u003c/p\u003e\n\u003cp\u003e$A = Q \\Lambda Q^{-1}$ where $\\Lambda$ contains eigenvalues and $Q$ contains eigenvectors.\u003c/p\u003e\n\n\u003ch3\u003e3.2 Probability Theory\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eKey Distributions in LLMs:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eSoftmax Distribution\u003c/strong\u003e: $P(y_i) = \\frac{e^{z_i}}{\\sum_{j=1}^{K} e^{z_j}}$\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eCross-Entropy Loss\u003c/strong\u003e: $L = -\\sum_{i=1}^{C} y_i \\log(\\hat{y}_i)$\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eBayes' Theorem\u003c/strong\u003e: $P(A|B) = \\frac{P(B|A)P(A)}{P(B)}$\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003ch3\u003e3.3 Information Theory\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eEntropy and KL Divergence:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eShannon Entropy\u003c/strong\u003e: $H(X) = -\\sum_{x \\in \\mathcal{X}} p(x) \\log p(x)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eCross Entropy\u003c/strong\u003e: $H(P, Q) = -\\sum_{x \\in \\mathcal{X}} P(x) \\log 
Q(x)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eKL Divergence\u003c/strong\u003e: $D_{KL}(P \\| Q) = \\sum_{x \\in \\mathcal{X}} P(x) \\log \\frac{P(x)}{Q(x)}$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003ePerplexity\u003c/strong\u003e: $\\text{PP}(X) = \\exp\\left(-\\frac{1}{N} \\sum_{i=1}^{N} \\log P(x_i)\\right)$\u003c/p\u003e\n\n\u003ch3\u003e3.4 Calculus for Optimization\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eGradient Descent Update Rule:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e$\\theta_{t+1} = \\theta_t - \\eta \\nabla_\\theta J(\\theta)$\u003c/p\u003e\n\n\u003cp\u003ewhere $\\eta$ is learning rate and $J(\\theta)$ is the loss function.\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eChain Rule for Backpropagation:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e$\\frac{\\partial L}{\\partial x} = \\frac{\\partial L}{\\partial y} \\cdot \\frac{\\partial y}{\\partial x}$\u003c/p\u003e\n\n\u003ch3\u003e3.5 Statistical Learning Theory\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eBias-Variance Decomposition:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e$\\mathbb{E}[(y - \\hat{f}(x))^2] = \\text{Bias}[\\hat{f}(x)]^2 + \\text{Var}[\\hat{f}(x)] + \\sigma^2$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$\\text{Bias}[\\hat{f}(x)] = \\mathbb{E}[\\hat{f}(x)] - f(x)$\u003c/li\u003e\n  \u003cli\u003e$\\text{Var}[\\hat{f}(x)] = \\mathbb{E}[(\\hat{f}(x) - \\mathbb{E}[\\hat{f}(x)])^2]$\u003c/li\u003e\n  \u003cli\u003e$\\sigma^2$ is irreducible error\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003ch3\u003e3.6 Key Mathematical Theorems\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCentral Limit Theorem:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eGiven i.i.d. 
random variables $X_1, X_2, ..., X_n$ with mean $\\mu$ and variance $\\sigma^2$:\u003c/p\u003e\n\u003cp\u003e$\\frac{\\bar{X} - \\mu}{\\sigma/\\sqrt{n}} \\xrightarrow{d} N(0,1)$ as $n \\to \\infty$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eLaw of Large Numbers:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\bar{X}_n \\xrightarrow{a.s.} \\mu$ as $n \\to \\infty$\u003c/p\u003e\n\n\u003ch3\u003e3.7 Numerical Linear Algebra\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eSingular Value Decomposition (SVD):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$A = U \\Sigma V^T$ where:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$U$: left singular vectors (orthogonal)\u003c/li\u003e\n  \u003cli\u003e$\\Sigma$: singular values (diagonal matrix)\u003c/li\u003e\n  \u003cli\u003e$V$: right singular vectors (orthogonal)\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cp\u003e\u003cstrong\u003eLow-Rank Approximation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$A_k = U_k \\Sigma_k V_k^T$ approximates $A$ with rank $k$\u003c/p\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e3.8 References \u0026 Further Reading\u003c/h3\u003e\n\n\u003cul\u003e\n  \u003cli\u003e\u003cstrong\u003eLinear Algebra\u003c/strong\u003e: Gilbert Strang, \"Introduction to Linear Algebra\"\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eProbability\u003c/strong\u003e: Sheldon Ross, \"A First Course in Probability\"\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eInformation Theory\u003c/strong\u003e: Thomas Cover, \"Elements of Information Theory\"\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eOptimization\u003c/strong\u003e: Stephen Boyd, \"Convex Optimization\"\u003c/li\u003e\n  \u003cli\u003e\u003cstrong\u003eDeep Learning\u003c/strong\u003e: Ian Goodfellow, \"Deep Learning\"\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 20px;\"\u003e\n\u003ch2 
id=\"programming-fundamentals\"\u003e4. Programming Fundamentals\u003c/h2\u003e\n\n\u003ch3\u003e4.1 Essential Programming Languages\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCore Language Stack for LLM Development:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cdiv style=\"display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px;\"\u003e\n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003ePython (Primary)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003e\u003cstrong\u003eFrameworks\u003c/strong\u003e: PyTorch, TensorFlow, JAX\u003c/li\u003e\n      \u003cli\u003e\u003cstrong\u003eLibraries\u003c/strong\u003e: Transformers, NumPy, Pandas\u003c/li\u003e\n      \u003cli\u003e\u003cstrong\u003eUse Cases\u003c/strong\u003e: Model development, training, research\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n  \n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eC++ (Performance)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003e\u003cstrong\u003eFrameworks\u003c/strong\u003e: CUDA, PyTorch C++ API\u003c/li\u003e\n      \u003cli\u003e\u003cstrong\u003eLibraries\u003c/strong\u003e: Intel MKL, NVIDIA CUDA Toolkit\u003c/li\u003e\n      \u003cli\u003e\u003cstrong\u003eUse Cases\u003c/strong\u003e: Kernel optimization, inference engines\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n  \n  \u003cdiv style=\"border: 1px solid #ddd; padding: 15px;\"\u003e\n    \u003ch4\u003eBash/Shell (DevOps)\u003c/h4\u003e\n    \u003cul\u003e\n      \u003cli\u003e\u003cstrong\u003eTools\u003c/strong\u003e: Docker, Kubernetes, Slurm\u003c/li\u003e\n      \u003cli\u003e\u003cstrong\u003eUse Cases\u003c/strong\u003e: Deployment, cluster management, automation\u003c/li\u003e\n    \u003c/ul\u003e\n  \u003c/div\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e4.2 Python Ecosystem Mastery\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eEssential Libraries and Their 
Roles:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003e# Core LLM Development Stack\nllm_stack = {\n    \"deep_learning\": [\"PyTorch\", \"TensorFlow\", \"JAX\"],\n    \"transformer_libs\": [\"HuggingFace Transformers\", \"FairSeq\", \"Megatron-LM\"],\n    \"numerical_computing\": [\"NumPy\", \"SciPy\", \"CuPy\"],\n    \"data_processing\": [\"Pandas\", \"PyArrow\", \"Dask\"],\n    \"experiment_tracking\": [\"Weights \u0026 Biases\", \"MLflow\", \"TensorBoard\"],\n    \"distributed_training\": [\"DeepSpeed\", \"PyTorch DDP\", \"Horovod\"]\n}\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e4.3 PyTorch Fundamentals\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCore Tensor Operations:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n# Basic tensor operations\nx = torch.randn(2, 3)  # 2x3 tensor\ny = torch.ones(2, 3)   # 2x3 tensor of ones\n\n# Common operations\nz = x + y              # Element-wise addition\nz = torch.matmul(x, y.T)  # Matrix multiplication\nz = F.softmax(x, dim=-1)  # Softmax activation\n\n# Automatic differentiation\nx = torch.tensor(2.0, requires_grad=True)\ny = x ** 2\ny.backward()  # Compute gradients\nprint(x.grad)  # dy/dx = 2x = 4.0\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eNeural Network Module:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass SimpleNN(nn.Module):\n    def __init__(self, input_size, hidden_size, output_size):\n        super().__init__()\n        self.fc1 = nn.Linear(input_size, hidden_size)\n        self.fc2 = nn.Linear(hidden_size, output_size)\n        self.dropout = nn.Dropout(0.1)\n        \n    def forward(self, x):\n        x = F.relu(self.fc1(x))\n        x = self.dropout(x)\n        x = self.fc2(x)\n        return x\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e4.4 Distributed Training 
Fundamentals\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eData Parallelism:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport torch.distributed as dist\nfrom torch.nn.parallel import DistributedDataParallel as DDP\n\n# Initialize distributed training\ndef setup(rank, world_size):\n    dist.init_process_group(\"nccl\", rank=rank, world_size=world_size)\n    torch.cuda.set_device(rank)\n\n# Wrap model with DDP\nmodel = SimpleNN(100, 50, 10)\nmodel = DDP(model, device_ids=[rank])\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eMixed Precision Training:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003efrom torch.cuda.amp import autocast, GradScaler\n\nscaler = GradScaler()\n\nfor input, target in dataloader:\n    optimizer.zero_grad()\n    \n    with autocast():\n        output = model(input)\n        loss = criterion(output, target)\n    \n    scaler.scale(loss).backward()\n    scaler.step(optimizer)\n    scaler.update()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e4.5 GPU Programming Basics\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCUDA Fundamentals:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003e# GPU memory management\nx = torch.randn(1000, 1000).cuda()  # Move to GPU\ny = torch.randn(1000, 1000).cuda()\n\n# GPU operations\nz = torch.matmul(x, y)  # Executed on GPU\n\n# Memory statistics\nprint(torch.cuda.memory_allocated())  # Current memory usage\nprint(torch.cuda.max_memory_allocated())  # Peak memory usage\n\n# Synchronization\ntorch.cuda.synchronize()  # Wait for GPU operations to complete\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch2 id=\"neural-networks\"\u003e5. 
Neural Networks Deep Dive\u003c/h2\u003e\n\n\u003ch3\u003e5.1 Biological Inspiration \u0026 Mathematical Formulation\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eFrom Biological Neurons to Artificial Neurons:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003eA single artificial neuron implements:\u003c/p\u003e\n\u003cp\u003e$y = f\\left(\\sum_{i=1}^{n} w_i x_i + b\\right)$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$x_i$: Input features\u003c/li\u003e\n  \u003cli\u003e$w_i$: Learnable weights\u003c/li\u003e\n  \u003cli\u003e$b$: Bias term\u003c/li\u003e\n  \u003cli\u003e$f$: Activation function\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003ch3\u003e5.2 Activation Functions\u003c/h3\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003eFunction\u003c/th\u003e\n    \u003cth\u003eFormula\u003c/th\u003e\n    \u003cth\u003eDerivative\u003c/th\u003e\n    \u003cth\u003eUse Cases\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eSigmoid\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\sigma(x) = \\frac{1}{1 + e^{-x}}$\u003c/td\u003e\n    \u003ctd\u003e$\\sigma(x)(1 - \\sigma(x))$\u003c/td\u003e\n    \u003ctd\u003eBinary classification, gates\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eTanh\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\tanh(x) = \\frac{e^x - e^{-x}}{e^x + e^{-x}}$\u003c/td\u003e\n    \u003ctd\u003e$1 - \\tanh^2(x)$\u003c/td\u003e\n    \u003ctd\u003eHidden layers, RNNs\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eReLU\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\text{ReLU}(x) = \\max(0, x)$\u003c/td\u003e\n    \u003ctd\u003e$1$ if $x \u003e 0$, else $0$\u003c/td\u003e\n    \u003ctd\u003eMost hidden layers\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n   
 \u003ctd\u003e\u003cstrong\u003eGELU\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$x \\Phi(x)$\u003c/td\u003e\n    \u003ctd\u003eComplex\u003c/td\u003e\n    \u003ctd\u003eTransformers, BERT, GPT\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eSoftmax\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\frac{e^{x_i}}{\\sum_j e^{x_j}}$\u003c/td\u003e\n    \u003ctd\u003e$\\text{Softmax}(x_i)(\\delta_{ij} - \\text{Softmax}(x_j))$\u003c/td\u003e\n    \u003ctd\u003eOutput layer, attention\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003ch3\u003e5.3 Backpropagation Mathematics\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eChain Rule Formulation:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003eGiven a neural network with loss $L$, the gradient for weight $w_{ij}^{(l)}$ at layer $l$:\u003c/p\u003e\n\n\u003cp\u003e$\\frac{\\partial L}{\\partial w_{ij}^{(l)}} = \\frac{\\partial L}{\\partial z_j^{(l)}} \\cdot \\frac{\\partial z_j^{(l)}}{\\partial w_{ij}^{(l)}} = \\delta_j^{(l)} \\cdot a_i^{(l-1)}$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$z_j^{(l)} = \\sum_i w_{ij}^{(l)} a_i^{(l-1)} + b_j^{(l)}$ (pre-activation)\u003c/li\u003e\n  \u003cli\u003e$a_j^{(l)} = f(z_j^{(l)})$ (activation)\u003c/li\u003e\n  \u003cli\u003e$\\delta_j^{(l)} = \\frac{\\partial L}{\\partial z_j^{(l)}}$ (error term)\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cp\u003e\u003cstrong\u003eBackward Pass Recursion:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e$\\delta_j^{(l)} = f'(z_j^{(l)}) \\sum_k w_{jk}^{(l+1)} \\delta_k^{(l+1)}$\u003c/p\u003e\n\n\u003ch3\u003e5.4 Loss Functions\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCommon Loss Functions in LLMs:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eCross-Entropy Loss\u003c/strong\u003e (Classification):\u003c/p\u003e\n\u003cp\u003e$L = -\\frac{1}{N} \\sum_{i=1}^N \\sum_{c=1}^C y_{i,c} 
\\log(\\hat{y}_{i,c})$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eMean Squared Error\u003c/strong\u003e (Regression):\u003c/p\u003e\n\u003cp\u003e$L = \\frac{1}{N} \\sum_{i=1}^N (y_i - \\hat{y}_i)^2$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eBinary Cross-Entropy\u003c/strong\u003e:\u003c/p\u003e\n\u003cp\u003e$L = -\\frac{1}{N} \\sum_{i=1}^N [y_i \\log(\\hat{y}_i) + (1-y_i) \\log(1-\\hat{y}_i)]$\u003c/p\u003e\n\n\u003ch3\u003e5.5 Optimization Algorithms\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eStochastic Gradient Descent (SGD):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\theta_{t+1} = \\theta_t - \\eta \\nabla_\\theta J(\\theta)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eMomentum SGD:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$v_{t+1} = \\gamma v_t + \\eta \\nabla_\\theta J(\\theta)$\u003c/p\u003e\n\u003cp\u003e$\\theta_{t+1} = \\theta_t - v_{t+1}$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eAdam Optimizer:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$m_t = \\beta_1 m_{t-1} + (1-\\beta_1) g_t$\u003c/p\u003e\n\u003cp\u003e$v_t = \\beta_2 v_{t-1} + (1-\\beta_2) g_t^2$\u003c/p\u003e\n\u003cp\u003e$\\hat{m}_t = \\frac{m_t}{1-\\beta_1^t}$\u003c/p\u003e\n\u003cp\u003e$\\hat{v}_t = \\frac{v_t}{1-\\beta_2^t}$\u003c/p\u003e\n\u003cp\u003e$\\theta_{t+1} = \\theta_t - \\eta \\frac{\\hat{m}_t}{\\sqrt{\\hat{v}_t} + \\epsilon}$\u003c/p\u003e\n\n\u003ch3\u003e5.6 Regularization Techniques\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eL1/L2 Regularization:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{total}} = L_{\\text{data}} + \\lambda \\sum_i |w_i|$ (L1)\u003c/p\u003e\n\u003cp\u003e$L_{\\text{total}} = L_{\\text{data}} + \\lambda \\sum_i w_i^2$ (L2)\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eDropout:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eDuring training: $a_i^{(l)} = \\frac{m_i}{1-p} f(z_i^{(l)})$\u003c/p\u003e\n\u003cp\u003ewhere $m_i \\sim 
\\text{Bernoulli}(1-p)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eBatch Normalization:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\hat{x}_i = \\frac{x_i - \\mu_B}{\\sqrt{\\sigma_B^2 + \\epsilon}}$\u003c/p\u003e\n\u003cp\u003e$y_i = \\gamma \\hat{x}_i + \\beta$\u003c/p\u003e\n\n\u003ch3\u003e5.7 Advanced Architectures\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eConvolutional Neural Networks (CNNs):\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003eclass CNN(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)\n        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)\n        self.pool = nn.MaxPool2d(2)\n        self.fc = nn.Linear(64 * 7 * 7, 10)\n    \n    def forward(self, x):\n        x = self.pool(F.relu(self.conv1(x)))\n        x = self.pool(F.relu(self.conv2(x)))\n        x = x.view(-1, 64 * 7 * 7)\n        x = self.fc(x)\n        return x\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eRecurrent Neural Networks (RNNs):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$h_t = \\tanh(W_{hh}h_{t-1} + W_{xh}x_t + b_h)$\u003c/p\u003e\n\u003cp\u003e$y_t = W_{hy}h_t + b_y$\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass SimpleRNN(nn.Module):\n    def __init__(self, input_size, hidden_size, output_size):\n        super().__init__()\n        self.hidden_size = hidden_size\n        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)\n        self.i2o = nn.Linear(input_size + hidden_size, output_size)\n    \n    def forward(self, input, hidden):\n        combined = torch.cat((input, hidden), 1)\n        hidden = torch.tanh(self.i2h(combined))\n        output = self.i2o(combined)\n        return output, hidden\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch2 id=\"transformer-architecture\"\u003e6. 
Transformer Architecture Mastery\u003c/h2\u003e\n\n\u003ch3\u003e6.1 Core Transformer Components\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eComplete Transformer Architecture:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eTransformer Architecture:\nInput → Token Embedding → Positional Encoding → Encoder Stack → Decoder Stack → Output\n    │                      │                      │              │\n    │                      │                      ├── Multi-Head Self-Attention\n    │                      │                      ├── Feed-Forward Network\n    │                      │                      ├── Layer Normalization\n    │                      │                      └── Residual Connections\n    │                      └── sin/cos functions or learned\n    └── WordPiece/BPE tokenization\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cimg width=\"664\" height=\"330\" alt=\"image\" src=\"https://github.com/user-attachments/assets/6bab9bb2-9009-4066-82c3-5cdffab6e95e\" /\u003e\n\n\n\u003ch3\u003e6.2 Self-Attention Mechanism\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eScaled Dot-Product Attention:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{Attention}(Q, K, V) = \\text{softmax}\\left(\\frac{QK^T}{\\sqrt{d_k}}\\right)V$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$Q$: Query matrix ($n \\times d_k$)\u003c/li\u003e\n  \u003cli\u003e$K$: Key matrix ($m \\times d_k$)\u003c/li\u003e\n  \u003cli\u003e$V$: Value matrix ($m \\times d_v$)\u003c/li\u003e\n  \u003cli\u003e$d_k$: Dimension of key vectors\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cp\u003e\u003cstrong\u003eMulti-Head Attention:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{MultiHead}(Q, K, V) = \\text{Concat}(\\text{head}_1, ..., \\text{head}_h)W^O$\u003c/p\u003e\n\u003cp\u003ewhere $\\text{head}_i = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)$\u003c/p\u003e\n\n\u003ch3\u003e6.3 Positional 
Encoding\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eSinusoidal Positional Encoding:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$PE_{(pos, 2i)} = \\sin\\left(\\frac{pos}{10000^{2i/d_{\\text{model}}}}\\right)$\u003c/p\u003e\n\u003cp\u003e$PE_{(pos, 2i+1)} = \\cos\\left(\\frac{pos}{10000^{2i/d_{\\text{model}}}}\\right)$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$pos$: Position in the sequence\u003c/li\u003e\n  \u003cli\u003e$i$: Dimension index\u003c/li\u003e\n  \u003cli\u003e$d_{\\text{model}}$: Model dimension\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003ch3\u003e6.4 Feed-Forward Networks\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003ePosition-wise Feed-Forward Network:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{FFN}(x) = \\max(0, xW_1 + b_1)W_2 + b_2$\u003c/p\u003e\n\n\u003cp\u003eIn modern transformers, GELU activation is often used:\u003c/p\u003e\n\u003cp\u003e$\\text{GELU}(x) = x \\Phi(x)$\u003c/p\u003e\n\u003cp\u003ewhere $\\Phi(x)$ is the cumulative distribution function of the standard normal distribution.\u003c/p\u003e\n\n\u003ch3\u003e6.5 Layer Normalization\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eLayerNorm Operation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{LayerNorm}(x) = \\gamma \\cdot \\frac{x - \\mu}{\\sqrt{\\sigma^2 + \\epsilon}} + \\beta$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$\\mu = \\frac{1}{d} \\sum_{i=1}^d x_i$\u003c/li\u003e\n  \u003cli\u003e$\\sigma^2 = \\frac{1}{d} \\sum_{i=1}^d (x_i - \\mu)^2$\u003c/li\u003e\n  \u003cli\u003e$\\gamma, \\beta$: Learnable parameters\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e6.6 Complete Transformer Implementation\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport torch\nimport torch.nn as nn\nimport math\n\nclass MultiHeadAttention(nn.Module):\n    def __init__(self, d_model, 
num_heads):\n        super().__init__()\n        self.d_model = d_model\n        self.num_heads = num_heads\n        self.d_k = d_model // num_heads\n        \n        self.w_q = nn.Linear(d_model, d_model)\n        self.w_k = nn.Linear(d_model, d_model)\n        self.w_v = nn.Linear(d_model, d_model)\n        self.w_o = nn.Linear(d_model, d_model)\n        \n    def forward(self, q, k, v, mask=None):\n        batch_size, seq_len = q.size(0), q.size(1)\n        \n        # Linear projections and reshape for multi-head\n        Q = self.w_q(q).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)\n        K = self.w_k(k).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)\n        V = self.w_v(v).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)\n        \n        # Scaled dot-product attention\n        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)\n        \n        if mask is not None:\n            scores = scores.masked_fill(mask == 0, -1e9)\n        \n        attn_weights = torch.softmax(scores, dim=-1)\n        attn_output = torch.matmul(attn_weights, V)\n        \n        # Concatenate heads and put through final linear layer\n        attn_output = attn_output.transpose(1, 2).contiguous().view(\n            batch_size, seq_len, self.d_model\n        )\n        return self.w_o(attn_output)\n\nclass PositionWiseFFN(nn.Module):\n    def __init__(self, d_model, d_ff):\n        super().__init__()\n        self.linear1 = nn.Linear(d_model, d_ff)\n        self.linear2 = nn.Linear(d_ff, d_model)\n        self.activation = nn.GELU()\n        \n    def forward(self, x):\n        return self.linear2(self.activation(self.linear1(x)))\n\nclass TransformerEncoderLayer(nn.Module):\n    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):\n        super().__init__()\n        self.self_attn = MultiHeadAttention(d_model, num_heads)\n        self.ffn = PositionWiseFFN(d_model, d_ff)\n        self.norm1 = 
nn.LayerNorm(d_model)\n        self.norm2 = nn.LayerNorm(d_model)\n        self.dropout = nn.Dropout(dropout)\n        \n    def forward(self, x, mask=None):\n        # Self-attention with residual connection and layer norm\n        attn_output = self.self_attn(x, x, x, mask)\n        x = self.norm1(x + self.dropout(attn_output))\n        \n        # Feed-forward with residual connection and layer norm\n        ffn_output = self.ffn(x)\n        x = self.norm2(x + self.dropout(ffn_output))\n        \n        return x\n\nclass PositionalEncoding(nn.Module):\n    def __init__(self, d_model, max_len=5000):\n        super().__init__()\n        \n        pe = torch.zeros(max_len, d_model)\n        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n        div_term = torch.exp(torch.arange(0, d_model, 2).float() * \n                           (-math.log(10000.0) / d_model))\n        \n        pe[:, 0::2] = torch.sin(position * div_term)\n        pe[:, 1::2] = torch.cos(position * div_term)\n        pe = pe.unsqueeze(0).transpose(0, 1)\n        \n        self.register_buffer('pe', pe)\n        \n    def forward(self, x):\n        return x + self.pe[:x.size(0), :]\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e6.7 Encoder-Decoder Architecture\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eCross-Attention Mechanism:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eIn decoder layers, cross-attention connects encoder outputs to decoder inputs:\u003c/p\u003e\n\u003cp\u003e$\\text{CrossAttention}(Q_{\\text{dec}}, K_{\\text{enc}}, V_{\\text{enc}}) = \\text{softmax}\\left(\\frac{Q_{\\text{dec}}K_{\\text{enc}}^T}{\\sqrt{d_k}}\\right)V_{\\text{enc}}$\u003c/p\u003e\n\n\u003ch3\u003e6.8 Masking Strategies\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eTypes of Attention Masks:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003e# Causal masking (autoregressive models)\ndef causal_mask(size):\n    mask = torch.triu(torch.ones(size, size), 
diagonal=1)\n    return mask == 0  # Lower triangular matrix\n\n# Padding mask\ndef padding_mask(input_ids, pad_token_id=0):\n    return (input_ids != pad_token_id).unsqueeze(1).unsqueeze(2)\n\n# Combined mask for decoder\ndef combined_mask(tgt, pad_token_id=0):\n    causal_mask = torch.triu(torch.ones(tgt.size(1), tgt.size(1)), diagonal=1)\n    padding_mask = (tgt != pad_token_id).unsqueeze(1)\n    return padding_mask \u0026 (causal_mask == 0)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e6.9 Modern Variants and Optimizations\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eArchitectural Improvements:\u003c/strong\u003e\u003c/p\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003eVariant\u003c/th\u003e\n    \u003cth\u003eKey Innovation\u003c/th\u003e\n    \u003cth\u003eUse Cases\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eALiBi\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eRelative positional encoding without learned parameters\u003c/td\u003e\n    \u003ctd\u003eLong sequence modeling\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eRoPE\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eRotary Position Embeddings\u003c/td\u003e\n    \u003ctd\u003eLlama, GPT-NeoX\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eFlashAttention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eIO-aware attention algorithm\u003c/td\u003e\n    \u003ctd\u003eLong context, memory efficiency\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eSwiGLU\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eGated linear unit activation\u003c/td\u003e\n    \u003ctd\u003ePaLM, Llama 2\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eGrouped Query 
Attention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eShared key-value heads across query heads\u003c/td\u003e\n    \u003ctd\u003eLlama 2, inference optimization\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 20px;\"\u003e\n\u003ch2 id=\"attention-mechanisms\"\u003e7. Attention Mechanisms In-Depth\u003c/h2\u003e\n\n\u003ch3\u003e7.1 Attention Formalism\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eGeneral Attention Formulation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eGiven queries $Q$, keys $K$, and values $V$, attention computes:\u003c/p\u003e\n\u003cp\u003e$\\text{Attention}(Q, K, V) = \\sum_i \\alpha(q, k_i) v_i$\u003c/p\u003e\n\u003cp\u003ewhere $\\alpha(q, k_i)$ is the attention weight between query $q$ and key $k_i$.\u003c/p\u003e\n\n\u003ch3\u003e7.2 Attention Variants\u003c/h3\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003eType\u003c/th\u003e\n    \u003cth\u003eFormula\u003c/th\u003e\n    \u003cth\u003eComplexity\u003c/th\u003e\n    \u003cth\u003eUse Cases\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eFull Self-Attention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\text{softmax}\\left(\\frac{QK^T}{\\sqrt{d_k}}\\right)V$\u003c/td\u003e\n    \u003ctd\u003e$O(n^2 d)$\u003c/td\u003e\n    \u003ctd\u003eStandard transformers, short sequences\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eLinear Attention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\phi(Q)(\\phi(K)^T V)$\u003c/td\u003e\n    \u003ctd\u003e$O(n d^2)$\u003c/td\u003e\n    \u003ctd\u003eLong sequences, memory constraints\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eLocal Attention\u003c/strong\u003e\u003c/td\u003e\n    
\u003ctd\u003eWindow-based computation\u003c/td\u003e\n    \u003ctd\u003e$O(n w d)$\u003c/td\u003e\n    \u003ctd\u003eImages, local dependencies\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eSparse Attention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eFixed/learned patterns\u003c/td\u003e\n    \u003ctd\u003e$O(n \\sqrt{n} d)$\u003c/td\u003e\n    \u003ctd\u003eVery long sequences\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eLow-Rank Attention\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003eProjected attention matrices\u003c/td\u003e\n    \u003ctd\u003e$O(n k d)$\u003c/td\u003e\n    \u003ctd\u003eApproximation, efficiency\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e7.3 Multi-Head Attention Mathematics\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eDetailed Multi-Head Formulation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eFor head $i$:\u003c/p\u003e\n\u003cp\u003e$Q_i = Q W_i^Q, \\quad K_i = K W_i^K, \\quad V_i = V W_i^V$\u003c/p\u003e\n\u003cp\u003e$\\text{head}_i = \\text{softmax}\\left(\\frac{Q_i K_i^T}{\\sqrt{d_k}}\\right) V_i$\u003c/p\u003e\n\u003cp\u003e$\\text{MultiHead}(Q, K, V) = \\text{Concat}(\\text{head}_1, ..., \\text{head}_h) W^O$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eParameter Count:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eTotal parameters = $4 \\times d_{\\text{model}} \\times d_{\\text{model}}$ (for Q, K, V, O projections)\u003c/p\u003e\n\n\u003ch3\u003e7.4 Efficient Attention Mechanisms\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eLinformer (Low-Rank Projection):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$K' = E K, \\quad V' = F V$ where $E, F \\in \\mathbb{R}^{k \\times n}$\u003c/p\u003e\n\u003cp\u003eComplexity reduces from $O(n^2)$ to $O(nk)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003ePerformer 
(Fast Attention via Orthogonal Random Features):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{Attention}(Q, K, V) \\approx \\phi(Q) (\\phi(K)^T V)$\u003c/p\u003e\n\u003cp\u003ewhere $\\phi$ is a feature map approximating softmax kernel\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass EfficientAttention(nn.Module):\n    def __init__(self, d_model, num_heads, feature_dim=256):\n        super().__init__()\n        self.d_model = d_model\n        self.num_heads = num_heads\n        self.feature_dim = feature_dim\n        \n        # Random features for approximation\n        self.w = nn.Parameter(torch.randn(feature_dim, d_model // num_heads))\n        \n    def random_features(self, x):\n        # Random feature map for kernel approximation\n        x_proj = F.linear(x, self.w)\n        return torch.exp(x_proj - x_proj.max(dim=-1, keepdim=True)[0])\n    \n    def forward(self, q, k, v):\n        batch_size, seq_len = q.size(0), q.size(1)\n        \n        # Apply random feature maps\n        q_features = self.random_features(q)\n        k_features = self.random_features(k)\n        \n        # Linear attention computation\n        kv_matrix = torch.bmm(k_features.transpose(1,2), v)\n        attention_output = torch.bmm(q_features, kv_matrix)\n        \n        return attention_output\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e7.5 Sparse Attention Patterns\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eFixed Patterns:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003edef fixed_sparse_attention_mask(seq_len, pattern_type=\"strided\"):\n    mask = torch.zeros(seq_len, seq_len)\n    \n    if pattern_type == \"strided\":\n        # Every other position attends to previous 8 positions\n        for i in range(seq_len):\n            start = max(0, i - 8)\n            mask[i, start:i+1] = 1\n            if i % 2 == 0 and i \u003e 0:\n                mask[i, i-1] = 1\n                \n    elif pattern_type == \"dilated\":\n        # 
Dilated attention pattern\n        for i in range(seq_len):\n            for j in range(0, i+1, 2):  # Attend to every other position\n                if j \u003c= i:\n                    mask[i, j] = 1\n                    \n    return mask.bool()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e7.6 Long Sequence Attention\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eSliding Window Attention:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eEach position only attends to $w$ previous positions:\u003c/p\u003e\n\u003cp\u003e$\\text{Attention}(q_i, K, V) = \\sum_{j=\\max(0,i-w)}^{i} \\alpha(q_i, k_j) v_j$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eBlock-Sparse Attention:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003edef block_sparse_attention(q, k, v, block_size=64, num_blocks=4):\n    batch_size, seq_len, d_model = q.shape\n    \n    # Reshape into blocks\n    q_blocks = q.view(batch_size, seq_len // block_size, block_size, d_model)\n    k_blocks = k.view(batch_size, seq_len // block_size, block_size, d_model)\n    v_blocks = v.view(batch_size, seq_len // block_size, block_size, d_model)\n    \n    output = torch.zeros_like(q)\n    \n    # Each block attends to previous num_blocks blocks\n    for block_idx in range(seq_len // block_size):\n        start_block = max(0, block_idx - num_blocks + 1)\n        attended_blocks = range(start_block, block_idx + 1)\n        \n        # Compute attention within attended blocks\n        # ... implementation details ...\n        \n    return output\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch2 id=\"training-methodologies\"\u003e8. 
Advanced Training Methodologies\u003c/h2\u003e\n\n\u003ch3\u003e8.1 Pre-training Objectives\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eAutoregressive (Causal) Language Modeling:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{CLM}} = -\\sum_{t=1}^T \\log P(x_t | x_{\u0026lt;t})$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eMasked Language Modeling (BERT-style):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{MLM}} = -\\sum_{i \\in M} \\log P(x_i | x_{\\setminus M})$\u003c/p\u003e\n\u003cp\u003ewhere $M$ is the set of masked positions\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003ePermutation Language Modeling (XLNet):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{PLM}} = -\\mathbb{E}_{z \\sim Z_T} \\left[ \\sum_{t=1}^T \\log P(x_{z_t} | x_{z_{\u0026lt;t}}) \\right]$\u003c/p\u003e\n\n\u003ch3\u003e8.2 Scaling Laws\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eKaplan Scaling Laws:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L(N, D) = \\left(\\frac{N_c}{N}\\right)^{\\alpha_N} + \\left(\\frac{D_c}{D}\\right)^{\\alpha_D} + L_\\infty$\u003c/p\u003e\n\n\u003cp\u003ewhere:\u003c/p\u003e\n\u003cul\u003e\n  \u003cli\u003e$N$: Model parameters\u003c/li\u003e\n  \u003cli\u003e$D$: Training tokens\u003c/li\u003e\n  \u003cli\u003e$N_c, D_c$: Critical values\u003c/li\u003e\n  \u003cli\u003e$\\alpha_N, \\alpha_D$: Scaling exponents\u003c/li\u003e\n  \u003cli\u003e$L_\\infty$: Irreducible loss\u003c/li\u003e\n\u003c/ul\u003e\n\n\u003cp\u003e\u003cstrong\u003eChinchilla Optimal Scaling:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eFor a compute budget $C$, the optimal model size $N$ and number of training tokens $D$ satisfy:\u003c/p\u003e\n\u003cp\u003e$N \\propto C^{0.5}, \\quad D \\propto C^{0.5}$\u003c/p\u003e\n\n\u003ch3\u003e8.3 Distributed Training Strategies\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eData Parallelism:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e# PyTorch DDP Example\nimport torch.distributed as dist\nfrom 
torch.nn.parallel import DistributedDataParallel as DDP\n\ndef train_ddp(rank, world_size):\n    # Initialize process group\n    dist.init_process_group(\"nccl\", rank=rank, world_size=world_size)\n    \n    # Model and data\n    model = TransformerModel().to(rank)\n    ddp_model = DDP(model, device_ids=[rank])\n    \n    # Training loop\n    for batch in dataloader:\n        loss = ddp_model(batch)\n        loss.backward()\n        optimizer.step()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eModel Parallelism:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003eclass ModelParallelTransformer(nn.Module):\n    def __init__(self, num_devices):\n        super().__init__()\n        self.num_devices = num_devices\n        self.layers = nn.ModuleList([\n            TransformerLayer().to(f\"cuda:{i % num_devices}\")\n            for i in range(num_layers)\n        ])\n    \n    def forward(self, x):\n        for i, layer in enumerate(self.layers):\n            device = f\"cuda:{i % self.num_devices}\"\n            x = x.to(device)\n            x = layer(x)\n        return x\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003ePipeline Parallelism:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003efrom torch.distributed.pipeline.sync import Pipe\n\n# Split model across devices\nmodel = LargeTransformer()\nmodel_parts = split_model_into_partitions(model, num_partitions=4)\n\n# Create pipeline\nmodel_pipe = Pipe(model_parts, chunks=8)  # Micro-batches\n\n# Training\noutput = model_pipe(input)\nloss = criterion(output, target)\nloss.backward()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e8.4 Mixed Precision Training\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eFP16/FP32 Mixed Precision:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003efrom torch.cuda.amp import autocast, GradScaler\n\nscaler = GradScaler()\n\nfor input, target in dataloader:\n    optimizer.zero_grad()\n    \n    
with autocast():\n        output = model(input)\n        loss = criterion(output, target)\n    \n    scaler.scale(loss).backward()\n    scaler.step(optimizer)\n    scaler.update()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eBF16 Support:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e# BF16 has better dynamic range than FP16\ntorch.set_float32_matmul_precision('medium')  # Allow bfloat16 internally for float32 matmuls ('high' would select TF32)\n\nmodel = model.to(torch.bfloat16)\nfor input, target in dataloader:\n    input = input.to(torch.bfloat16)\n    output = model(input)\n    # No need for gradient scaling with BF16\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e8.5 Optimization Techniques\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eAdamW Optimizer:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\theta_{t+1} = \\theta_t - \\eta \\left( \\frac{\\hat{m}_t}{\\sqrt{\\hat{v}_t} + \\epsilon} + \\lambda \\theta_t \\right)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eLearning Rate Schedules:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eLinear Warmup + Cosine Decay:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003edef get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):\n    def lr_lambda(current_step):\n        if current_step \u003c num_warmup_steps:\n            return float(current_step) / float(max(1, num_warmup_steps))\n        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))\n        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))\n    \n    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e8.6 Regularization Methods\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eWeight Decay:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{total}} = L_{\\text{task}} + \\lambda \\sum \\theta^2$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eGradient 
Clipping:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e# Global gradient clipping\ntorch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n\n# Per-parameter clipping\nfor param in model.parameters():\n    if param.grad is not None:\n        param.grad.data.clamp_(-1.0, 1.0)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eStochastic Depth:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003eclass StochasticDepth(nn.Module):\n    def __init__(self, drop_prob):\n        super().__init__()\n        self.drop_prob = drop_prob\n    \n    def forward(self, x, layer):\n        if self.training and torch.rand(1) \u003c self.drop_prob:\n            return x  # Skip layer\n        return layer(x)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch2 id=\"fine-tuning-techniques\"\u003e9. Fine-tuning and Adaptation\u003c/h2\u003e\n\n\u003ch3\u003e9.1 Full Fine-tuning\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eStandard Fine-tuning Process:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003edef full_finetune(model, train_dataloader, num_epochs=3):\n    optimizer = AdamW(model.parameters(), lr=5e-5)\n    \n    for epoch in range(num_epochs):\n        model.train()\n        for batch in train_dataloader:\n            outputs = model(**batch)\n            loss = outputs.loss\n            loss.backward()\n            optimizer.step()\n            optimizer.zero_grad()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e9.2 Parameter-Efficient Fine-tuning (PEFT)\u003c/h3\u003e\n\n\u003ch4\u003e9.2.1 LoRA (Low-Rank Adaptation)\u003c/h4\u003e\n\n\u003cp\u003e\u003cstrong\u003eLoRA Mathematical Formulation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$W' = W + \\Delta W = W + BA$\u003c/p\u003e\n\u003cp\u003ewhere $B \\in \\mathbb{R}^{d \\times r}$, $A \\in \\mathbb{R}^{r \\times k}$, $r \\ll \\min(d,k)$\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass LoRALayer(nn.Module):\n    def __init__(self, 
base_layer, rank=8, alpha=16):\n        super().__init__()\n        self.base_layer = base_layer\n        self.rank = rank\n        self.alpha = alpha\n        \n        # LoRA matrices\n        self.lora_A = nn.Parameter(torch.randn(base_layer.in_features, rank))\n        self.lora_B = nn.Parameter(torch.zeros(rank, base_layer.out_features))\n        \n    def forward(self, x):\n        base_output = self.base_layer(x)\n        lora_output = x @ self.lora_A @ self.lora_B\n        return base_output + (self.alpha / self.rank) * lora_output\n\ndef apply_lora_to_linear_layers(model, rank=8):\n    for name, module in model.named_children():\n        if isinstance(module, nn.Linear):\n            # Replace with LoRA layer\n            setattr(model, name, LoRALayer(module, rank=rank))\n        else:\n            apply_lora_to_linear_layers(module, rank)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch4\u003e9.2.2 Adapter Layers\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass Adapter(nn.Module):\n    def __init__(self, dim, adapter_dim=64):\n        super().__init__()\n        self.down_proj = nn.Linear(dim, adapter_dim)\n        self.up_proj = nn.Linear(adapter_dim, dim)\n        self.activation = nn.GELU()\n        \n    def forward(self, x):\n        return x + self.up_proj(self.activation(self.down_proj(x)))\n\nclass TransformerWithAdapters(nn.Module):\n    def __init__(self, base_transformer):\n        super().__init__()\n        self.base = base_transformer\n        \n        # Add adapters after attention and FFN\n        for layer in self.base.layers:\n            layer.attention_adapter = Adapter(layer.self_attn.d_model)\n            layer.ffn_adapter = Adapter(layer.ffn.d_model)\n    \n    def forward(self, x):\n        for layer in self.base.layers:\n            # Original attention\n            attn_output = layer.self_attn(x)\n            x = 
layer.attention_adapter(attn_output)\n            \n            # Original FFN\n            ffn_output = layer.ffn(x)\n            x = layer.ffn_adapter(ffn_output)\n        \n        return x\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e9.3 Prompt-based Methods\u003c/h3\u003e\n\n\u003ch4\u003e9.3.1 Prompt Tuning\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass PromptTuning(nn.Module):\n    def __init__(self, model, prompt_length=20):\n        super().__init__()\n        self.model = model\n        self.prompt_length = prompt_length\n        self.prompt_embeddings = nn.Parameter(\n            torch.randn(prompt_length, model.config.hidden_size)\n        )\n        \n    def forward(self, input_ids, attention_mask=None):\n        batch_size = input_ids.shape[0]\n        \n        # Get original embeddings\n        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n        \n        # Concatenate prompt embeddings\n        prompt_embeds = self.prompt_embeddings.unsqueeze(0).repeat(batch_size, 1, 1)\n        inputs_embeds = torch.cat([prompt_embeds, inputs_embeds], dim=1)\n        \n        # Adjust attention mask\n        if attention_mask is not None:\n            prompt_mask = torch.ones(batch_size, self.prompt_length).to(attention_mask.device)\n            attention_mask = torch.cat([prompt_mask, attention_mask], dim=1)\n        \n        return self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e9.3.2 P-Tuning\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass PTuning(nn.Module):\n    def __init__(self, model, prompt_length=20, prompt_hidden_size=512):\n        super().__init__()\n        self.model = model\n        self.prompt_length = prompt_length\n        \n        # LSTM for prompt generation\n        self.lstm = nn.LSTM(\n            input_size=model.config.hidden_size,\n            hidden_size=prompt_hidden_size,\n            num_layers=2,\n            
bidirectional=True,\n            batch_first=True\n        )\n        \n        self.mlp = nn.Sequential(\n            nn.Linear(2 * prompt_hidden_size, model.config.hidden_size),\n            nn.ReLU(),\n            nn.Linear(model.config.hidden_size, model.config.hidden_size)\n        )\n        \n    def forward(self, input_ids, attention_mask=None):\n        batch_size = input_ids.shape[0]\n        \n        # Generate continuous prompts\n        prompt_tokens = torch.arange(self.prompt_length).unsqueeze(0).repeat(batch_size, 1)\n        prompt_embeds = self.model.get_input_embeddings()(prompt_tokens)\n        \n        # Process through LSTM and MLP\n        lstm_out, _ = self.lstm(prompt_embeds)\n        continuous_prompts = self.mlp(lstm_out)\n        \n        # Get original embeddings and concatenate\n        inputs_embeds = self.model.get_input_embeddings()(input_ids)\n        inputs_embeds = torch.cat([continuous_prompts, inputs_embeds], dim=1)\n        \n        # Adjust attention mask\n        if attention_mask is not None:\n            prompt_mask = torch.ones(batch_size, self.prompt_length).to(attention_mask.device)\n            attention_mask = torch.cat([prompt_mask, attention_mask], dim=1)\n        \n        return self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e9.4 Instruction Tuning\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eInstruction Format:\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003einstruction_prompt = \"\"\"\nBelow is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n\"\"\"\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eSupervised Fine-tuning (SFT):\u003c/strong\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003edef instruction_tuning_loss(model, batch):\n    \"\"\"Compute loss for instruction following\"\"\"\n    instructions = batch[\"instruction\"]\n    responses = batch[\"response\"]\n    \n    # Format input with instruction template\n    formatted_inputs = [\n        f\"Instruction: {inst}\\n\\nResponse: {resp}\"\n        for inst, resp in zip(instructions, responses)\n    ]\n    \n    # Tokenize and compute loss\n    inputs = tokenizer(formatted_inputs, return_tensors=\"pt\", padding=True, truncation=True)\n    outputs = model(**inputs, labels=inputs[\"input_ids\"])\n    \n    return outputs.loss\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e9.5 Reinforcement Learning from Human Feedback (RLHF)\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eThree-Stage RLHF Process:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003e# Stage 1: Supervised Fine-tuning\nsft_trainer = SFTTrainer(\n    model=base_model,\n    train_dataset=instruction_data,\n    formatting_func=format_instruction\n)\n\n# Stage 2: Reward Model Training\nclass RewardModel(nn.Module):\n    def __init__(self, base_model):\n        super().__init__()\n        self.transformer = base_model\n        self.value_head = nn.Linear(base_model.config.hidden_size, 1)\n    \n    def forward(self, input_ids, attention_mask):\n        outputs = self.transformer(input_ids, attention_mask=attention_mask)\n        last_hidden_state = outputs.last_hidden_state\n        # Use the EOS token for reward prediction\n        eos_token_hidden = last_hidden_state[:, -1, :]\n        reward = self.value_head(eos_token_hidden)\n        return reward\n\n# Stage 3: PPO Training\ndef ppo_training_step(policy_model, reward_model, 
prompts):\n    # Generate responses with current policy\n    with torch.no_grad():\n        old_responses = policy_model.generate(prompts)\n        old_rewards = reward_model(old_responses)\n    \n    # Update policy using PPO\n    # ... PPO implementation details ...\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e9.6 Evaluation Metrics for Fine-tuning\u003c/h3\u003e\n\n\u003ctable border=\"1\" style=\"border-collapse: collapse; width: 100%;\"\u003e\n  \u003ctr style=\"background-color: #f2f2f2;\"\u003e\n    \u003cth\u003eMetric\u003c/th\u003e\n    \u003cth\u003eFormula\u003c/th\u003e\n    \u003cth\u003eInterpretation\u003c/th\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003ePerplexity\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\exp\\left(-\\frac{1}{N}\\sum_{i=1}^N \\log P(w_i|w_{\u0026lt;i})\\right)$\u003c/td\u003e\n    \u003ctd\u003eLower is better\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eBLEU Score\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\text{BP} \\cdot \\exp\\left(\\sum_{n=1}^N w_n \\log p_n\\right)$\u003c/td\u003e\n    \u003ctd\u003e0-100, higher is better\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eROUGE Score\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\frac{\\text{Overlap}}{\\text{Reference Length}}$\u003c/td\u003e\n    \u003ctd\u003eRecall-oriented\u003c/td\u003e\n  \u003c/tr\u003e\n  \u003ctr\u003e\n    \u003ctd\u003e\u003cstrong\u003eAccuracy\u003c/strong\u003e\u003c/td\u003e\n    \u003ctd\u003e$\\frac{\\text{Correct}}{\\text{Total}}$\u003c/td\u003e\n    \u003ctd\u003eClassification tasks\u003c/td\u003e\n  \u003c/tr\u003e\n\u003c/table\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 20px;\"\u003e\n\u003ch2 id=\"inference-optimization\"\u003e10. 
Inference Optimization\u003c/h2\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch3\u003e10.1 Quantization Techniques\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eMathematical Foundation of Quantization:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eFor floating-point tensor $X$ to integer tensor $X_q$:\u003c/p\u003e\n\u003cp\u003e$X_q = \\text{round}\\left(\\frac{X - \\beta}{\\alpha}\\right)$\u003c/p\u003e\n\u003cp\u003ewhere $\\alpha = \\frac{\\max(X) - \\min(X)}{2^b - 1}$, $\\beta = \\min(X)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eDequantization:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$X_{\\text{dequant}} = X_q \\times \\alpha + \\beta$\u003c/p\u003e\n\n\u003ch4\u003e10.1.1 Post-Training Quantization (PTQ)\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport torch\nimport torch.quantization\n\ndef post_training_quantization(model, calibration_loader):\n    # Set model to evaluation mode\n    model.eval()\n    \n    # Prepare model for quantization\n    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')\n    torch.quantization.prepare(model, inplace=True)\n    \n    # Calibrate with sample data\n    with torch.no_grad():\n        for batch in calibration_loader:\n            model(batch)\n    \n    # Convert to quantized model\n    torch.quantization.convert(model, inplace=True)\n    return model\n\n# Example usage for linear layer quantization\nclass QuantizedLinear(torch.nn.Module):\n    def __init__(self, in_features, out_features):\n        super().__init__()\n        self.in_features = in_features\n        self.out_features = out_features\n        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))\n        self.weight_scale = torch.nn.Parameter(torch.tensor(1.0))\n        self.weight_zero_point = torch.nn.Parameter(torch.tensor(0))\n        \n    def forward(self, x):\n        # Quantize weights\n        weight_q = 
torch.quantize_per_tensor(\n            self.weight, self.weight_scale, self.weight_zero_point, torch.qint8\n        )\n        # Dequantize for computation (in real scenario, use quantized ops)\n        weight_dequant = weight_q.dequantize()\n        return torch.nn.functional.linear(x, weight_dequant)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e10.1.2 Quantization-Aware Training (QAT)\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass QATLinear(torch.nn.Module):\n    def __init__(self, in_features, out_features):\n        super().__init__()\n        self.in_features = in_features\n        self.out_features = out_features\n        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))\n        \n        # Quantization stubs\n        self.weight_quant = torch.quantization.QuantStub()\n        self.weight_dequant = torch.quantization.DeQuantStub()\n        \n    def forward(self, x):\n        # Simulate quantization during training\n        weight_quantized = self.weight_quant(self.weight)\n        weight = self.weight_dequant(weight_quantized)\n        return torch.nn.functional.linear(x, weight)\n\ndef prepare_qat(model):\n    # Fuse layers for better quantization\n    torch.quantization.fuse_modules(model, [['conv', 'bn', 'relu']], inplace=True)\n    \n    # Prepare for QAT\n    model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')\n    torch.quantization.prepare_qat(model, inplace=True)\n    return model\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e10.1.3 Mixed-Precision Quantization\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003edef mixed_precision_quantization(model, sensitivity_analysis):\n    \"\"\"Apply different precision based on layer sensitivity\"\"\"\n    quantization_config = {}\n    \n    for name, module in model.named_modules():\n        if isinstance(module, torch.nn.Linear):\n            sensitivity = sensitivity_analysis[name]\n            \n            if sensitivity \u003c 0.1:  # Low 
sensitivity\n                # Use 4-bit quantization\n                config = torch.quantization.QConfig(\n                    activation=torch.quantization.MinMaxObserver.with_args(dtype=torch.quint4),\n                    weight=torch.quantization.MinMaxObserver.with_args(dtype=torch.qint4)\n                )\n            elif sensitivity \u003c 0.3:  # Medium sensitivity\n                # Use 8-bit quantization\n                config = torch.quantization.default_qconfig\n            else:  # High sensitivity\n                # Keep in FP16\n                config = None\n                \n            quantization_config[name] = config\n    \n    return quantization_config\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e10.2 Pruning Methods\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eMagnitude-Based Pruning:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eRemove weights with smallest magnitudes:\u003c/p\u003e\n\u003cp\u003e$W_{\\text{pruned}}[i,j] = \\begin{cases} 0 \u0026 \\text{if } |W[i,j]| \u003c \\theta \\\\ W[i,j] \u0026 \\text{otherwise} \\end{cases}$\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass MagnitudePruning:\n    def __init__(self, pruning_rate=0.2):\n        self.pruning_rate = pruning_rate\n    \n    def apply(self, model):\n        all_weights = []\n        for name, param in model.named_parameters():\n            if 'weight' in name and len(param.shape) \u003e= 2:  # Only weight matrices\n                all_weights.append(param.data.abs().view(-1))\n        \n        # Calculate global threshold\n        all_weights = torch.cat(all_weights)\n        threshold = torch.quantile(all_weights, self.pruning_rate)\n        \n        # Apply pruning\n        for name, param in model.named_parameters():\n            if 'weight' in name and len(param.shape) \u003e= 2:\n                mask = param.data.abs() \u003e threshold\n                param.data *= mask.float()\n        \n        return model\n\ndef iterative_pruning(model, 
dataloader, total_iterations=10, target_sparsity=0.8):\n    \"\"\"Iterative pruning with fine-tuning\"\"\"\n    initial_sparsity = 0.0\n    sparsity_increment = (target_sparsity - initial_sparsity) / total_iterations\n    \n    for iteration in range(total_iterations):\n        # Prune\n        current_sparsity = initial_sparsity + (iteration + 1) * sparsity_increment\n        pruning = MagnitudePruning(pruning_rate=current_sparsity)\n        model = pruning.apply(model)\n        \n        # Fine-tune\n        fine_tune_model(model, dataloader, epochs=1)\n    \n    return model\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cp\u003e\u003cstrong\u003eStructured Pruning:\u003c/strong\u003e\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass StructuredPruning:\n    def __init__(self, pruning_method='l1'):\n        self.pruning_method = pruning_method\n    \n    def compute_importance(self, weight):\n        if self.pruning_method == 'l1':\n            return torch.norm(weight, p=1, dim=1)  # L1 norm of rows\n        elif self.pruning_method == 'l2':\n            return torch.norm(weight, p=2, dim=1)  # L2 norm of rows\n    \n    def prune_neurons(self, model, pruning_rate):\n        for name, module in model.named_modules():\n            if isinstance(module, torch.nn.Linear):\n                importance = self.compute_importance(module.weight)\n                \n                # Calculate threshold\n                threshold = torch.quantile(importance, pruning_rate)\n                \n                # Create mask for important neurons\n                mask = importance \u003e threshold\n                \n                # Apply mask to output dimension\n                module.weight.data = module.weight.data[mask, :]\n                if module.bias is not None:\n                    module.bias.data = module.bias.data[mask]\n                \n                # Update output features\n                module.out_features = mask.sum().item()\n        \n        return 
model\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e10.3 Knowledge Distillation\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003eDistillation Loss:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$L_{\\text{distill}} = \\alpha \\cdot L_{\\text{CE}}(y_{\\text{student}}, y_{\\text{true}}) + (1-\\alpha) \\cdot \\tau^2 \\cdot \\text{KL}(p_{\\text{teacher}}^\\tau \\| p_{\\text{student}}^\\tau)$\u003c/p\u003e\n\n\u003cp\u003ewhere $p^\\tau = \\text{softmax}(z/\\tau)$\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass KnowledgeDistillationLoss(torch.nn.Module):\n    def __init__(self, temperature=4.0, alpha=0.7):\n        super().__init__()\n        self.temperature = temperature\n        self.alpha = alpha\n        self.ce_loss = torch.nn.CrossEntropyLoss()\n        self.kl_loss = torch.nn.KLDivLoss(reduction='batchmean')\n    \n    def forward(self, student_logits, teacher_logits, labels):\n        # Soften the probabilities\n        student_probs = torch.nn.functional.log_softmax(student_logits / self.temperature, dim=-1)\n        teacher_probs = torch.nn.functional.softmax(teacher_logits / self.temperature, dim=-1)\n        \n        # Calculate distillation loss\n        distill_loss = self.kl_loss(student_probs, teacher_probs) * (self.temperature ** 2)\n        \n        # Calculate student loss\n        student_loss = self.ce_loss(student_logits, labels)\n        \n        # Combined loss\n        return self.alpha * student_loss + (1 - self.alpha) * distill_loss\n\ndef distill_training(student, teacher, dataloader, epochs=10):\n    criterion = KnowledgeDistillationLoss()\n    optimizer = torch.optim.Adam(student.parameters())\n    \n    for epoch in range(epochs):\n        for batch in dataloader:\n            inputs, labels = batch\n            \n            # Get teacher predictions (no gradient)\n            with torch.no_grad():\n                teacher_logits = teacher(inputs)\n            \n            # Student forward pass\n            
student_logits = student(inputs)\n            \n            # Compute distillation loss\n            loss = criterion(student_logits, teacher_logits, labels)\n            \n            # Backward pass\n            optimizer.zero_grad()\n            loss.backward()\n            optimizer.step()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e10.4 Advanced Inference Techniques\u003c/h3\u003e\n\n\u003ch4\u003e10.4.1 Speculative Decoding\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass SpeculativeDecoding:\n    def __init__(self, target_model, draft_model, max_speculative_tokens=5):\n        self.target_model = target_model\n        self.draft_model = draft_model\n        self.max_speculative_tokens = max_speculative_tokens\n    \n    def generate(self, prompt, max_length=100):\n        sequences = prompt\n        draft_sequences = prompt\n        \n        while len(sequences[0]) \u003c max_length:\n            # Draft phase: generate multiple tokens quickly\n            draft_tokens = []\n            for _ in range(self.max_speculative_tokens):\n                draft_logits = self.draft_model(draft_sequences)\n                next_token = torch.argmax(draft_logits[:, -1, :], dim=-1)\n                draft_tokens.append(next_token)\n                draft_sequences = torch.cat([draft_sequences, next_token.unsqueeze(-1)], dim=-1)\n            \n            # Verification phase: check with target model\n            target_logits = self.target_model(draft_sequences)\n            target_probs = torch.softmax(target_logits, dim=-1)\n            \n            # Verify and accept tokens\n            accepted_tokens = self._verify_tokens(draft_tokens, target_probs)\n            \n            if len(accepted_tokens) \u003e 0:\n                sequences = torch.cat([sequences] + accepted_tokens, dim=-1)\n            else:\n                # If no tokens accepted, generate one from target model\n                next_token = torch.argmax(target_probs[:, -1, :], dim=-1)\n 
               sequences = torch.cat([sequences, next_token.unsqueeze(-1)], dim=-1)\n        \n        return sequences\n    \n    def _verify_tokens(self, draft_tokens, target_probs):\n        accepted_tokens = []\n        for i, token in enumerate(draft_tokens):\n            target_prob = target_probs[:, i, token]\n            draft_prob = # probability from draft model\n            \n            # Acceptance criteria\n            if torch.rand(1) \u003c torch.min(torch.tensor(1.0), target_prob / draft_prob):\n                accepted_tokens.append(token.unsqueeze(-1))\n            else:\n                break\n        \n        return accepted_tokens\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e10.4.2 KV Caching\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass KVCache:\n    def __init__(self, batch_size, max_length, num_heads, head_dim):\n        self.k_cache = torch.zeros(batch_size, max_length, num_heads, head_dim)\n        self.v_cache = torch.zeros(batch_size, max_length, num_heads, head_dim)\n        self.current_length = 0\n    \n    def update(self, new_k, new_v):\n        batch_size, seq_len = new_k.shape[0], new_k.shape[1]\n        \n        # Append new keys and values to cache\n        self.k_cache[:, self.current_length:self.current_length+seq_len] = new_k\n        self.v_cache[:, self.current_length:self.current_length+seq_len] = new_v\n        \n        self.current_length += seq_len\n        \n        return (self.k_cache[:, :self.current_length],\n                self.v_cache[:, :self.current_length])\n\nclass EfficientTransformerInference:\n    def __init__(self, model, max_cache_length=2048):\n        self.model = model\n        self.kv_cache = None\n        self.max_cache_length = max_cache_length\n    \n    def generate(self, input_ids, max_length=100):\n        if self.kv_cache is None:\n            self._initialize_cache(input_ids.shape[0])\n        \n        sequences = input_ids\n        \n        for _ in range(max_length - 
input_ids.shape[1]):\n            # Only process the last token for autoregressive generation\n            if sequences.shape[1] \u003e 1:\n                current_input = sequences[:, -1:]\n            else:\n                current_input = sequences\n            \n            # Forward pass with KV cache\n            outputs = self.model(\n                current_input,\n                past_key_values=self.kv_cache,\n                use_cache=True\n            )\n            \n            # Update KV cache\n            self.kv_cache = outputs.past_key_values\n            \n            # Get next token\n            next_token_logits = outputs.logits[:, -1, :]\n            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)\n            \n            sequences = torch.cat([sequences, next_token], dim=-1)\n        \n        return sequences\n    \n    def _initialize_cache(self, batch_size):\n        num_heads = self.model.config.num_attention_heads\n        head_dim = self.model.config.hidden_size // num_heads\n        \n        self.kv_cache = [\n            (torch.zeros(batch_size, self.max_cache_length, num_heads, head_dim),\n             torch.zeros(batch_size, self.max_cache_length, num_heads, head_dim))\n            for _ in range(self.model.config.num_hidden_layers)\n        ]\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e10.4.3 Continuous Batching\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass ContinuousBatchingInference:\n    def __init__(self, model, max_batch_size=32):\n        self.model = model\n        self.max_batch_size = max_batch_size\n        self.requests = []\n        self.kv_caches = {}\n    \n    def add_request(self, prompt, request_id):\n        self.requests.append({\n            'id': request_id,\n            'prompt': prompt,\n            'tokens': [prompt],\n            'finished': False\n        })\n        \n        # Initialize KV cache for this request\n        self.kv_caches[request_id] = 
self._initialize_kv_cache()\n    \n    def process_batch(self):\n        # Group requests that are ready for next token\n        batch_requests = []\n        batch_inputs = []\n        batch_kv_caches = []\n        \n        for req in self.requests:\n            if not req['finished']:\n                batch_requests.append(req)\n                batch_inputs.append(req['tokens'][-1])  # Last token\n                batch_kv_caches.append(self.kv_caches[req['id']])\n        \n        if not batch_requests:\n            return\n        \n        # Process batch\n        batch_outputs = self._process_batch_inference(\n            batch_inputs, batch_kv_caches\n        )\n        \n        # Update requests\n        for i, req in enumerate(batch_requests):\n            next_token = batch_outputs[i]\n            req['tokens'].append(next_token)\n            \n            # Check for completion\n            if next_token == self.model.config.eos_token_id:\n                req['finished'] = True\n    \n    def _process_batch_inference(self, batch_inputs, batch_kv_caches):\n        # Implement batched inference with separate KV caches\n        # This is a simplified version\n        batch_tensor = torch.stack(batch_inputs)\n        \n        # Process through model (would need custom implementation for separate KV caches)\n        outputs = self.model(batch_tensor)\n        next_tokens = torch.argmax(outputs.logits[:, -1, :], dim=-1)\n        \n        return next_tokens\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cdiv align=\"center\"\u003e\n  \u003cb\u003eM Wasif\u003c/b\u003e\n\u003c/div\u003e\n\n\u003ch2 id=\"evaluation-framework\"\u003e11. 
Comprehensive Evaluation\u003c/h2\u003e\n\n\u003ch3\u003e11.1 Intrinsic Evaluation Metrics\u003c/h3\u003e\n\n\u003cp\u003e\u003cstrong\u003ePerplexity:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{PPL} = \\exp\\left(-\\frac{1}{N}\\sum_{i=1}^N \\log P(w_i | w_{\u0026lt;i})\\right)$\u003c/p\u003e\n\n\u003cp\u003e\u003cstrong\u003eBits per Character (BPC):\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003e$\\text{BPC} = \\frac{1}{N}\\sum_{i=1}^N -\\log_2 P(w_i | w_{\u0026lt;i})$\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003edef calculate_perplexity(model, tokenizer, text_dataset):\n    total_log_likelihood = 0\n    total_tokens = 0\n    \n    model.eval()\n    with torch.no_grad():\n        for text in text_dataset:\n            inputs = tokenizer(text, return_tensors='pt')\n            outputs = model(**inputs, labels=inputs['input_ids'])\n            \n            # Negative log likelihood\n            nll = outputs.loss * inputs['input_ids'].numel()\n            total_log_likelihood += nll.item()\n            total_tokens += inputs['input_ids'].numel()\n    \n    avg_nll = total_log_likelihood / total_tokens\n    perplexity = torch.exp(torch.tensor(avg_nll))\n    return perplexity.item()\n\ndef calculate_bits_per_character(model, tokenizer, text):\n    \"\"\"Calculate bits per character for text generation models\"\"\"\n    total_bits = 0\n    total_chars = 0\n    \n    # Tokenize and process text\n    tokens = tokenizer.encode(text)\n    \n    for i in range(1, len(tokens)):\n        # Get probability of next token\n        input_ids = torch.tensor([tokens[:i]])\n        with torch.no_grad():\n            outputs = model(input_ids)\n            probs = torch.softmax(outputs.logits[0, -1], dim=-1)\n            token_prob = probs[tokens[i]].item()\n        \n        # Convert to bits\n        bits = -math.log2(token_prob) if token_prob \u003e 0 else float('inf')\n        total_bits += bits\n    \n    total_chars = len(text)\n    return total_bits / 
total_chars\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e11.2 Extrinsic Evaluation Benchmarks\u003c/h3\u003e\n\n\u003ch4\u003e11.2.1 General Language Understanding\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass GLUEEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n        self.tasks = {\n            'cola': self.evaluate_cola,\n            'sst2': self.evaluate_sst2,\n            'mrpc': self.evaluate_mrpc,\n            'qqp': self.evaluate_qqp,\n            'mnli': self.evaluate_mnli\n        }\n    \n    def evaluate_all(self, datasets):\n        results = {}\n        for task_name, dataset in datasets.items():\n            if task_name in self.tasks:\n                accuracy = self.tasks[task_name](dataset)\n                results[task_name] = accuracy\n        return results\n    \n    def evaluate_sst2(self, dataset):\n        \"\"\"Sentiment classification accuracy\"\"\"\n        correct = 0\n        total = 0\n        \n        for text, label in dataset:\n            inputs = self.tokenizer(text, return_tensors='pt', truncation=True)\n            with torch.no_grad():\n                outputs = self.model(**inputs)\n                prediction = torch.argmax(outputs.logits, dim=-1).item()\n            \n            if prediction == label:\n                correct += 1\n            total += 1\n        \n        return correct / total\n\nclass MMLUEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n    \n    def evaluate_subject(self, subject_name, test_data):\n        \"\"\"Evaluate on specific MMLU subject\"\"\"\n        correct = 0\n        total = 0\n        \n        for question_data in test_data:\n            question = question_data['question']\n            choices = question_data['choices']\n            answer = question_data['answer']\n            \n            # Format as multiple choice\n           
 prompt = self._format_mmlu_prompt(question, choices)\n            \n            # Get model probabilities for each choice\n            choice_probs = []\n            for choice in choices:\n                full_prompt = prompt + choice\n                inputs = self.tokenizer(full_prompt, return_tensors='pt')\n                with torch.no_grad():\n                    outputs = self.model(**inputs)\n                    # Use last token probability\n                    logits = outputs.logits[0, -1, :]\n                    prob = torch.softmax(logits, dim=-1)[self.tokenizer.eos_token_id]\n                    choice_probs.append(prob.item())\n            \n            # Predict highest probability choice\n            predicted = np.argmax(choice_probs)\n            if predicted == answer:\n                correct += 1\n            total += 1\n        \n        return correct / total\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e11.2.2 Reasoning and Mathematical Ability\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass GSM8KEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n    \n    def evaluate(self, dataset):\n        correct = 0\n        total = 0\n        \n        for problem_data in dataset:\n            problem = problem_data['question']\n            answer = problem_data['answer']\n            \n            # Use chain-of-thought prompting\n            cot_prompt = f\"Q: {problem}\\nA: Let's think step by step.\"\n            \n            # Generate reasoning\n            inputs = self.tokenizer(cot_prompt, return_tensors='pt')\n            with torch.no_grad():\n                outputs = self.model.generate(\n                    **inputs,\n                    max_length=len(inputs['input_ids'][0]) + 200,\n                    temperature=0.7,\n                    do_sample=True\n                )\n            \n            generated_text = self.tokenizer.decode(outputs[0], 
skip_special_tokens=True)\n            \n            # Extract final answer\n            final_answer = self._extract_answer(generated_text)\n            \n            if self._compare_answers(final_answer, answer):\n                correct += 1\n            total += 1\n        \n        return correct / total\n\nclass HumanEvalEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n    \n    def evaluate_code_generation(self, problems):\n        \"\"\"Evaluate on HumanEval code generation benchmark\"\"\"\n        pass_at_k = {1: 0, 5: 0, 10: 0}\n        total = len(problems)\n        \n        for problem in problems:\n            prompt = problem['prompt']\n            test_cases = problem['test']\n            \n            # Generate multiple solutions\n            solutions = self._generate_solutions(prompt, num_samples=10)\n            \n            # Test solutions\n            for k in [1, 5, 10]:\n                passed = any(self._test_solution(sol, test_cases) \n                           for sol in solutions[:k])\n                if passed:\n                    pass_at_k[k] += 1\n        \n        # Calculate pass@k scores\n        return {k: count/total for k, count in pass_at_k.items()}\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e11.3 Safety and Alignment Evaluation\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass SafetyEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n        self.harmful_prompts = self._load_harmful_prompts()\n    \n    def evaluate_refusal_rate(self):\n        \"\"\"Measure how often model refuses harmful requests\"\"\"\n        refusal_count = 0\n        total_prompts = len(self.harmful_prompts)\n        \n        for prompt in self.harmful_prompts:\n            inputs = self.tokenizer(prompt, return_tensors='pt')\n            with torch.no_grad():\n                outputs = 
self.model.generate(\n                    **inputs,\n                    max_length=len(inputs['input_ids'][0]) + 50,\n                    temperature=0.7\n                )\n            \n            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)\n            \n            if self._is_refusal(response):\n                refusal_count += 1\n        \n        return refusal_count / total_prompts\n    \n    def evaluate_truthfulness(self, truthfulqa_dataset):\n        \"\"\"Evaluate using TruthfulQA benchmark\"\"\"\n        correct = 0\n        total = 0\n        \n        for qa_pair in truthfulqa_dataset:\n            question = qa_pair['question']\n            correct_answer = qa_pair['correct_answer']\n            incorrect_answers = qa_pair['incorrect_answers']\n            \n            # Test if model prefers correct answer\n            preference = self._measure_answer_preference(\n                question, correct_answer, incorrect_answers\n            )\n            \n            if preference == 'correct':\n                correct += 1\n            total += 1\n        \n        return correct / total\n\nclass BiasEvaluator:\n    def __init__(self, model, tokenizer):\n        self.model = model\n        self.tokenizer = tokenizer\n    \n    def evaluate_stereotypes(self, stereotype_dataset):\n        \"\"\"Measure stereotype amplification\"\"\"\n        stereotype_scores = []\n        \n        for example in stereotype_dataset:\n            context = example['context']\n            stereotype_completion = example['stereotype']\n            non_stereotype_completion = example['non_stereotype']\n            \n            # Measure probability of each completion\n            prob_stereotype = self._get_completion_probability(\n                context, stereotype_completion\n            )\n            prob_non_stereotype = self._get_completion_probability(\n                context, non_stereotype_completion\n            )\n            
\n            # Calculate stereotype score\n            score = prob_stereotype / (prob_stereotype + prob_non_stereotype)\n            stereotype_scores.append(score)\n        \n        return np.mean(stereotype_scores)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch2 id=\"production-deployment\"\u003e12. Production Deployment\u003c/h2\u003e\n\n\u003ch3\u003e12.1 Model Serving Architectures\u003c/h3\u003e\n\n\u003ch4\u003e12.1.1 Real-time Serving with FastAPI\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003efrom fastapi import FastAPI, HTTPException\nfrom pydantic import BaseModel\nimport torch\nimport asyncio\nfrom typing import List\n\napp = FastAPI(title=\"LLM Inference API\")\n\nclass GenerationRequest(BaseModel):\n    prompt: str\n    max_length: int = 100\n    temperature: float = 0.7\n    top_p: float = 0.9\n    do_sample: bool = True\n\nclass GenerationResponse(BaseModel):\n    generated_text: str\n    inference_time: float\n    tokens_generated: int\n\nclass InferenceEngine:\n    def __init__(self, model_path):\n        self.model = self._load_model(model_path)\n        self.tokenizer = self._load_tokenizer(model_path)\n        self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n        self.model.to(self.device)\n        \n    def generate(self, request: GenerationRequest) -\u003e GenerationResponse:\n        start_time = time.time()\n        \n        # Tokenize input\n        inputs = self.tokenizer(request.prompt, return_tensors=\"pt\").to(self.device)\n        \n        # Generate\n        with torch.no_grad():\n            outputs = self.model.generate(\n                **inputs,\n                max_length=request.max_length,\n                temperature=request.temperature,\n                top_p=request.top_p,\n                do_sample=request.do_sample,\n                pad_token_id=self.tokenizer.eos_token_id\n            )\n        \n        # Decode\n        generated_text = self.tokenizer.decode(outputs[0], 
skip_special_tokens=True)\n        \n        inference_time = time.time() - start_time\n        tokens_generated = len(outputs[0]) - len(inputs['input_ids'][0])\n        \n        return GenerationResponse(\n            generated_text=generated_text,\n            inference_time=inference_time,\n            tokens_generated=tokens_generated\n        )\n\n# Global inference engine\ninference_engine = InferenceEngine(\"path/to/model\")\n\n@app.post(\"/generate\", response_model=GenerationResponse)\nasync def generate_text(request: GenerationRequest):\n    try:\n        response = inference_engine.generate(request)\n        return response\n    except Exception as e:\n        raise HTTPException(status_code=500, detail=str(e))\n\n@app.get(\"/health\")\nasync def health_check():\n    return {\"status\": \"healthy\", \"model_loaded\": True}\n\nif __name__ == \"__main__\":\n    import uvicorn\n    uvicorn.run(app, host=\"0.0.0.0\", port=8000)\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e12.1.2 Batch Processing Service\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport redis\nfrom celery import Celery\nfrom typing import List, Dict\nimport json\n\n# Celery app for async task processing\ncelery_app = Celery('llm_worker', broker='redis://localhost:6379/0')\n\nclass BatchInferenceEngine:\n    def __init__(self, model_path, batch_size=32):\n        self.model = self._load_model(model_path)\n        self.tokenizer = self._load_tokenizer(model_path)\n        self.batch_size = batch_size\n        self.padding_queue = []\n    \n    def add_to_batch(self, prompt: str, request_id: str):\n        \"\"\"Add prompt to current batch\"\"\"\n        self.padding_queue.append({\n            'prompt': prompt,\n            'request_id': request_id,\n            'added_time': time.time()\n        })\n        \n        # Process batch if full or timeout\n        if len(self.padding_queue) \u003e= self.batch_size:\n            self._process_batch()\n    \n    def 
_process_batch(self):\n        if not self.padding_queue:\n            return\n        \n        # Prepare batch\n        prompts = [item['prompt'] for item in self.padding_queue]\n        request_ids = [item['request_id'] for item in self.padding_queue]\n        \n        # Tokenize with padding\n        inputs = self.tokenizer(\n            prompts, \n            return_tensors=\"pt\", \n            padding=True, \n            truncation=True,\n            max_length=512\n        )\n        \n        # Generate\n        with torch.no_grad():\n            outputs = self.model.generate(\n                **inputs,\n                max_length=100,\n                do_sample=True,\n                temperature=0.7\n            )\n        \n        # Decode and store results\n        for i, output in enumerate(outputs):\n            generated_text = self.tokenizer.decode(output, skip_special_tokens=True)\n            self._store_result(request_ids[i], generated_text)\n        \n        # Clear queue\n        self.padding_queue = []\n\n@celery_app.task\ndef process_batch_generation(prompts: List[str]) -\u003e List[str]:\n    \"\"\"Celery task for batch processing\"\"\"\n    inference_engine = BatchInferenceEngine(\"path/to/model\")\n    return inference_engine.process_batch(prompts)\n\n# Redis for result storage\nredis_client = redis.Redis(host='localhost', port=6379, db=0)\n\ndef submit_batch_job(prompts: List[str]) -\u003e str:\n    \"\"\"Submit batch job and return job ID\"\"\"\n    job_id = str(uuid.uuid4())\n    \n    # Store prompts in Redis\n    redis_client.setex(\n        f\"batch_prompts:{job_id}\", \n        3600,  # 1 hour expiry\n        json.dumps(prompts)\n    )\n    \n    # Start async processing\n    process_batch_generation.delay(prompts)\n    \n    return job_id\n\ndef get_batch_results(job_id: str) -\u003e List[str]:\n    \"\"\"Retrieve batch results\"\"\"\n    results_key = f\"batch_results:{job_id}\"\n    if redis_client.exists(results_key):\n       
 return json.loads(redis_client.get(results_key))\n    return None\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e12.2 Scaling and Load Balancing\u003c/h3\u003e\n\n\u003ch4\u003e12.2.1 Model Parallelism in Production\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass DistributedInferenceService:\n    def __init__(self, model_name, num_gpus=4):\n        self.num_gpus = num_gpus\n        self.model_parts = self._split_model_across_gpus(model_name)\n        \n    def _split_model_across_gpus(self, model_name):\n        \"\"\"Split transformer layers across multiple GPUs\"\"\"\n        model = AutoModelForCausalLM.from_pretrained(model_name)\n        layers_per_gpu = len(model.transformer.h) // self.num_gpus\n        \n        model_parts = []\n        for i in range(self.num_gpus):\n            start_layer = i * layers_per_gpu\n            end_layer = (i + 1) * layers_per_gpu if i \u003c self.num_gpus - 1 else len(model.transformer.h)\n            \n            # Move subset of layers to this GPU\n            gpu_layers = model.transformer.h[start_layer:end_layer]\n            for layer in gpu_layers:\n                layer.to(f\"cuda:{i}\")\n            \n            model_parts.append({\n                'gpu_id': i,\n                'layers': gpu_layers,\n                'start_layer': start_layer,\n                'end_layer': end_layer\n            })\n        \n        return model_parts\n    \n    def distributed_forward(self, hidden_states, attention_mask=None):\n        \"\"\"Forward pass through distributed model\"\"\"\n        current_states = hidden_states\n        \n        for model_part in self.model_parts:\n            # Move input to correct GPU\n            current_states = current_states.to(f\"cuda:{model_part['gpu_id']}\")\n            if attention_mask is not None:\n                attention_mask = attention_mask.to(f\"cuda:{model_part['gpu_id']}\")\n            \n            # Process through layers on this GPU\n            for layer in 
model_part['layers']:\n                current_states = layer(current_states, attention_mask=attention_mask)[0]\n        \n        return current_states\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch4\u003e12.2.2 Load Balancer Configuration\u003c/h4\u003e\n\n\u003cpre\u003e\u003ccode\u003efrom flask import Flask, request, jsonify\nimport requests\nimport threading\nimport time\n\nclass LoadBalancer:\n    def __init__(self, worker_urls):\n        self.worker_urls = worker_urls\n        self.worker_stats = {url: {'requests': 0, 'errors': 0, 'last_health_check': 0} \n                           for url in worker_urls}\n        self.lock = threading.Lock()\n        \n    def get_healthy_workers(self):\n        \"\"\"Get list of healthy workers based on recent health checks\"\"\"\n        healthy_workers = []\n        current_time = time.time()\n        \n        for url, stats in self.worker_stats.items():\n            # Consider worker healthy if checked within last 30 seconds\n            if current_time - stats['last_health_check'] \u003c 30:\n                healthy_workers.append(url)\n        \n        return healthy_workers\n    \n    def get_least_loaded_worker(self):\n        \"\"\"Select worker with least current load\"\"\"\n        healthy_workers = self.get_healthy_workers()\n        if not healthy_workers:\n            return None\n        \n        # Simple round-robin for now, could be enhanced with actual load metrics\n        with self.lock:\n            selected = min(healthy_workers, \n                         key=lambda url: self.worker_stats[url]['requests'])\n            self.worker_stats[selected]['requests'] += 1\n        \n        return selected\n    \n    def forward_request(self, prompt_data):\n        \"\"\"Forward request to selected worker\"\"\"\n        worker_url = self.get_least_loaded_worker()\n        if not worker_url:\n            return {\"error\": \"No healthy workers available\"}\n        \n        try:\n            response = 
requests.post(\n                f\"{worker_url}/generate\",\n                json=prompt_data,\n                timeout=30\n            )\n            response.raise_for_status()\n            return response.json()\n        except requests.RequestException as e:\n            with self.lock:\n                self.worker_stats[worker_url]['errors'] += 1\n            return {\"error\": f\"Worker error: {str(e)}\"}\n\n# Flask app as load balancer\napp = Flask(__name__)\nload_balancer = LoadBalancer([\n    \"http://worker1:8000\",\n    \"http://worker2:8000\", \n    \"http://worker3:8000\"\n])\n\n@app.route('/generate', methods=['POST'])\ndef generate_text():\n    data = request.get_json()\n    result = load_balancer.forward_request(data)\n    return jsonify(result)\n\ndef health_check_worker():\n    \"\"\"Background thread to check worker health\"\"\"\n    while True:\n        for worker_url in load_balancer.worker_urls:\n            try:\n                response = requests.get(f\"{worker_url}/health\", timeout=5)\n                if response.status_code == 200:\n                    with load_balancer.lock:\n                        load_balancer.worker_stats[worker_url]['last_health_check'] = time.time()\n            except requests.RequestException:\n                # Worker is unhealthy\n                pass\n        \n        time.sleep(10)  # Check every 10 seconds\n\n# Start health check thread\nhealth_thread = threading.Thread(target=health_check_worker, daemon=True)\nhealth_thread.start()\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003ch3\u003e12.3 Monitoring and Observability\u003c/h3\u003e\n\n\u003cpre\u003e\u003ccode\u003eimport prometheus_client\nfrom prometheus_client import Counter, Histogram, Gauge\nimport time\nimport logging\n\n# Prometheus metrics\nREQUEST_COUNT = Counter('llm_requests_total', 'Total requests', ['model', 'status'])\nREQUEST_DURATION = Histogram('llm_request_duration_seconds', 'Request duration')\nMODEL_LOAD_GAUGE = 
Gauge('llm_model_loaded', 'Model loaded status')\nGPU_MEMORY_GAUGE = Gauge('llm_gpu_memory_usage', 'GPU memory usage', ['gpu_id'])\n\nclass MonitoringMiddleware:\n    def __init__(self, app, model_name):\n        self.app = app\n        self.model_name = model_name\n    \n    def __call__(self, environ, start_response):\n        start_time = time.time()\n        \n        def custom_start_response(status, headers, exc_info=None):\n            # Record metrics\n            duration = time.time() - start_time\n            status_code = int(status.split(' ')[0])\n            \n            REQUEST_COUNT.labels(model=self.model_name, status=status_code).inc()\n            REQUEST_DURATION.observe(duration)\n            \n            return start_response(status, headers, exc_info)\n        \n        return self.app(environ, custom_start_response)\n\nclass PerformanceMonitor:\n    def __init__(self):\n        self.metrics = {\n            'throughput': 0,\n            'latency_p50': 0,\n            'latency_p95': 0,\n            'latency_p99': 0,\n            'error_rate': 0,\n            'gpu_utilization': 0\n        }\n        self.request_times = []\n        \n    def record_request(self, start_time, end_time, success=True):\n        duration = end_time - start_time\n        self.request_times.append(duration)\n        \n        # Keep only last 1000 requests for sliding window\n        if len(self.request_times) \u003e 1000:\n            self.request_times.pop(0)\n        \n        # Update metrics\n        self._update_metrics()\n    \n    def _update_metrics(self):\n        if not self.request_times:\n            return\n        \n        sorted_times = sorted(self.request_times)\n        n = len(sorted_times)\n        \n        self.metrics.update({\n            'throughput': n / 60,  # requests per minute\n            'latency_p50': sorted_times[int(n * 0.5)],\n            'latency_p95': sorted_times[int(n * 0.95)],\n            'latency_p99': sorted_times[int(n 
* 0.99)]\n        })\n    \n    def get_metrics(self):\n        return self.metrics.copy()\n\n# Logging configuration\ndef setup_logging():\n    logging.basicConfig(\n        level=logging.INFO,\n        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n        handlers=[\n            logging.FileHandler('llm_service.log'),\n            logging.StreamHandler()\n        ]\n    )\n    \n    # JSON formatter for structured logging\n    class JSONFormatter(logging.Formatter):\n        def format(self, record):\n            log_entry = {\n                'timestamp': self.formatTime(record),\n                'level': record.levelname,\n                'logger': record.name,\n                'message': record.getMessage(),\n                'module': record.module,\n                'function': record.funcName,\n                'line': record.lineno\n            }\n            \n            if hasattr(record, 'request_id'):\n                log_entry['request_id'] = record.request_id\n            if hasattr(record, 'model'):\n                log_entry['model'] = record.model\n            \n            return json.dumps(log_entry)\n    \n    # Apply JSON formatter to file handler\n    for handler in logging.getLogger().handlers:\n        if isinstance(handler, logging.FileHandler):\n            handler.setFormatter(JSONFormatter())\n\n# Alerting system\nclass AlertManager:\n    def __init__(self, thresholds):\n        self.thresholds = thresholds\n        self.alert_state = {}\n    \n    def check_metrics(self, metrics):\n        alerts = []\n        \n        # Check latency\n        if metrics['latency_p95'] \u003e self.thresholds['latency_p95']:\n            alerts.append({\n                'severity': 'warning',\n                'message': f\"P95 latency exceeded threshold: {metrics['latency_p95']:.2f}s\"\n            })\n        \n        # Check error rate\n        if metrics['error_rate'] \u003e self.thresholds['error_rate']:\n            
alerts.append({\n                'severity': 'critical',\n                'message': f\"Error rate exceeded threshold: {metrics['error_rate']:.2%}\"\n            })\n        \n        # Check GPU memory\n        if metrics['gpu_utilization'] \u003e self.thresholds['gpu_memory']:\n            alerts.append({\n                'severity': 'warning',\n                'message': f\"GPU memory usage high: {metrics['gpu_utilization']:.1%}\"\n            })\n        \n        return alerts\n\u003c/code\u003e\u003c/pre\u003e\n\n\u003cdiv style=\"background: #e8f4f8; padding: 15px; border-radius: 5px; margin-top: 20px;\"\u003e\n\u003ch2 id=\"research-frontiers\"\u003e13. Research Frontiers\u003c/h2\u003e\n\n\u003ch3\u003e13.1 Next-Generation Architectures\u003c/h3\u003e\n\n\u003ch4\u003e13.1.1 Mixture of Experts (MoE)\u003c/h4\u003e\n\n\u003cp\u003e\u003cstrong\u003eMathematical Formulation:\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eGiven input $x$, MoE computes:\u003c/p\u003e\n\u003cp\u003e$y = \\sum_{i=1}^N G(x)_i \\cdot E_i(x)$\u003c/p\u003e\n\u003cp\u003ewhere $G(x)$ is the gating function and $E_i$ are expert networks.\u003c/p\u003e\n\n\u003cpre\u003e\u003ccode\u003eclass MixtureOfExperts(nn.Module):\n    def __init__(self, d_model, num_experts, expert_capacity, top_k=2):\n        super().__init__()\n        self.num_experts = num_experts\n        self.expert_capacity = expert_capacity\n        self.top_k = top_k\n        \n        # Expert networks\n        self.experts = nn.ModuleList([\n            nn.Sequential(\n                nn.Linear(d_model, d_model * 4),\n                nn.GELU(),\n                nn.Linear(d_model * 4, d_model)\n            ) for _ in range(num_experts)\n        ])\n        \n        # Gating network\n        self.gate = nn.Linear(d_model, num_experts)\n        \n    def forward(self, x):\n        batch_size, seq_len, d_model = x.shape\n        \n        # Compute gating scores\n        gate_scores = self.gate(x)  # [batch_size, 
seq_len, num_experts]\n        \n        # Top-k routing\n        topk_scores, topk_indices = torch.topk(\n            gate_scores, self.top_k, dim=-1\n        )\n        topk_probs = torch.softmax(topk_scores, dim=-1)\n        \n        # Initialize output\n        output = torch.zeros_like(x)\n        \n        # Process through experts\n        for expert_idx in range(self.num_experts):\n            # Find tokens assigned to this expert\n            ex","project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fmwasifanwar%2Fllm-mastery","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fmwasifanwar%2Fllm-mastery","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fmwasifanwar%2Fllm-mastery/lists"}