{"id":13539065,"url":"https://github.com/corkami/collisions","last_synced_at":"2025-05-14T03:09:09.361Z","repository":{"id":37445015,"uuid":"167916892","full_name":"corkami/collisions","owner":"corkami","description":"Hash collisions and exploitations","archived":false,"fork":false,"pushed_at":"2025-02-20T08:06:40.000Z","size":33896,"stargazers_count":3167,"open_issues_count":4,"forks_count":196,"subscribers_count":44,"default_branch":"master","last_synced_at":"2025-04-12T19:43:02.713Z","etag":null,"topics":["collisions","exploitation","hash","md5","scripts","sha1"],"latest_commit_sha":null,"homepage":"","language":"Python","has_issues":true,"has_wiki":null,"has_pages":null,"mirror_url":null,"source_name":null,"license":null,"status":null,"scm":"git","pull_requests_enabled":true,"icon_url":"https://github.com/corkami.png","metadata":{"files":{"readme":"README.html","changelog":null,"contributing":null,"funding":".github/FUNDING.yml","license":null,"code_of_conduct":null,"threat_model":null,"audit":null,"citation":null,"codeowners":null,"security":null,"support":null,"governance":null,"roadmap":null,"authors":null,"dei":null,"publiccode":null,"codemeta":null},"funding":{"github":"corkami","patreon":"corkami","custom":["https://paypal.me/corkami"]}},"created_at":"2019-01-28T07:20:07.000Z","updated_at":"2025-04-08T04:46:34.000Z","dependencies_parsed_at":"2024-08-01T09:22:34.242Z","dependency_job_id":"d181f216-666e-4312-a71f-baeaf1e58078","html_url":"https://github.com/corkami/collisions","commit_stats":null,"previous_names":[],"tags_count":0,"template":false,"template_full_name":null,"repository_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/corkami%2Fcollisions","tags_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/corkami%2Fcollisions/tags","releases_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/corkami%2Fcollisions/releases","manifests_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories/corkami%2Fcol
lisions/manifests","owner_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners/corkami","download_url":"https://codeload.github.com/corkami/collisions/tar.gz/refs/heads/master","host":{"name":"GitHub","url":"https://github.com","kind":"github","repositories_count":254059508,"owners_count":22007768,"icon_url":"https://github.com/github.png","version":null,"created_at":"2022-05-30T11:31:42.601Z","updated_at":"2022-07-04T15:15:14.044Z","host_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub","repositories_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repositories","repository_names_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/repository_names","owners_url":"https://repos.ecosyste.ms/api/v1/hosts/GitHub/owners"}},"keywords":["collisions","exploitation","hash","md5","scripts","sha1"],"created_at":"2024-08-01T09:01:19.767Z","updated_at":"2025-05-14T03:09:04.339Z","avatar_url":"https://github.com/corkami.png","language":"Python","readme":"\u003c!DOCTYPE html\u003e\n\u003chtml xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\"\u003e\n\u003chead\u003e\n  \u003cmeta charset=\"utf-8\" /\u003e\n  \u003cmeta name=\"generator\" content=\"pandoc\" /\u003e\n  \u003cmeta name=\"viewport\" content=\"width=device-width, initial-scale=1.0, user-scalable=yes\" /\u003e\n  \u003ctitle\u003eREADME\u003c/title\u003e\n  \u003cstyle type=\"text/css\"\u003e\n      code{white-space: pre-wrap;}\n      span.smallcaps{font-variant: small-caps;}\n      span.underline{text-decoration: underline;}\n      div.column{display: inline-block; vertical-align: top; width: 50%;}\n  \u003c/style\u003e\n  \u003cstyle type=\"text/css\"\u003e\na.sourceLine { display: inline-block; line-height: 1.25; }\na.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }\na.sourceLine:empty { height: 1.2em; position: absolute; }\n.sourceCode { overflow: visible; }\ncode.sourceCode { white-space: pre; position: relative; }\ndiv.sourceCode { margin: 1em 0; 
}\npre.sourceCode { margin: 0; }\n@media screen {\ndiv.sourceCode { overflow: auto; }\n}\n@media print {\ncode.sourceCode { white-space: pre-wrap; }\na.sourceLine { text-indent: -1em; padding-left: 1em; }\n}\npre.numberSource a.sourceLine\n  { position: relative; }\npre.numberSource a.sourceLine:empty\n  { position: absolute; }\npre.numberSource a.sourceLine::before\n  { content: attr(data-line-number);\n    position: absolute; left: -5em; text-align: right; vertical-align: baseline;\n    border: none; pointer-events: all;\n    -webkit-touch-callout: none; -webkit-user-select: none;\n    -khtml-user-select: none; -moz-user-select: none;\n    -ms-user-select: none; user-select: none;\n    padding: 0 4px; width: 4em;\n    color: #aaaaaa;\n  }\npre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }\ndiv.sourceCode\n  {  }\n@media screen {\na.sourceLine::before { text-decoration: underline; }\n}\ncode span.al { color: #ff0000; font-weight: bold; } /* Alert */\ncode span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */\ncode span.at { color: #7d9029; } /* Attribute */\ncode span.bn { color: #40a070; } /* BaseN */\ncode span.bu { } /* BuiltIn */\ncode span.cf { color: #007020; font-weight: bold; } /* ControlFlow */\ncode span.ch { color: #4070a0; } /* Char */\ncode span.cn { color: #880000; } /* Constant */\ncode span.co { color: #60a0b0; font-style: italic; } /* Comment */\ncode span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */\ncode span.do { color: #ba2121; font-style: italic; } /* Documentation */\ncode span.dt { color: #902000; } /* DataType */\ncode span.dv { color: #40a070; } /* DecVal */\ncode span.er { color: #ff0000; font-weight: bold; } /* Error */\ncode span.ex { } /* Extension */\ncode span.fl { color: #40a070; } /* Float */\ncode span.fu { color: #06287e; } /* Function */\ncode span.im { } /* Import */\ncode span.in { color: #60a0b0; font-weight: bold; 
font-style: italic; } /* Information */\ncode span.kw { color: #007020; font-weight: bold; } /* Keyword */\ncode span.op { color: #666666; } /* Operator */\ncode span.ot { color: #007020; } /* Other */\ncode span.pp { color: #bc7a00; } /* Preprocessor */\ncode span.sc { color: #4070a0; } /* SpecialChar */\ncode span.ss { color: #bb6688; } /* SpecialString */\ncode span.st { color: #4070a0; } /* String */\ncode span.va { color: #19177c; } /* Variable */\ncode span.vs { color: #4070a0; } /* VerbatimString */\ncode span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */\n  \u003c/style\u003e\n  \u003c!--[if lt IE 9]\u003e\n    \u003cscript src=\"//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js\"\u003e\u003c/script\u003e\n  \u003c![endif]--\u003e\n\u003c/head\u003e\n\u003cbody\u003e\n\u003c!-- pandoc -s -f gfm -t html README.md -o README.html --\u003e\n\n\u003ch1 id=\"hash-collisions-and-exploitations\"\u003eHash collisions and exploitations\u003c/h1\u003e\n\u003cp\u003eBy Ange Albertini and Marc Stevens.\u003c/p\u003e\n\u003ch2 id=\"faq-tldr\"\u003eFAQ (TL;DR)\u003c/h2\u003e\n\u003cp\u003eQ: Is it possible to make a file get an arbitrary MD2/MD4/MD5/MD6/SHA1/SHA2/SHA3, or the same hash as another file?\u003cbr/\u003e A: No.\u003c/p\u003e\n\u003cp\u003eQ: Can one create 2 different files with the same hash?\u003cbr/\u003e A: With MD5, in \u003ca href=\"#fastcoll-md5\"\u003ea few seconds on a standard computer\u003c/a\u003e. With SHA1, it's \u003ca href=\"#shattered-sha1\"\u003epossible\u003c/a\u003e but not practical for end-users (Complexity: 2^61.2 Price: $11k).\u003c/p\u003e\n\u003cp\u003eQ: Can one make 2 different files get the same hash by appending stuff?\u003cbr/\u003e A: With MD5, in \u003ca href=\"#hashclash-md5\"\u003ea few hours on a standard computer\u003c/a\u003e. 
With SHA1, it's \u003ca href=\"#shambles-sha-1\"\u003epossible\u003c/a\u003e but not practical for end-users (Complexity: 2^63.4 Price: $45K).\u003c/p\u003e\n\u003cp\u003eQ: Will the 2 files remain valid?\u003cbr/\u003e A: In general, yes, as most file formats tolerate appended data. On the other hand, file signatures will likely be broken.\u003c/p\u003e\n\u003cp\u003eQ: Can one make 2 different files with arbitrary contents and the same hash?\u003cbr/\u003e A: Yes, it can be instant by relying on special file structures:\u003cbr/\u003e\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003ea special format header (or pair) with tricks, acting as a switch between 2 contents (some formats won't allow such tricks).\u003cbr/\u003e\u003c/li\u003e\n\u003cli\u003epre-computed collisions, based on the specific header(s).\u003cbr/\u003e\u003c/li\u003e\n\u003cli\u003etwo contents of specific formats, both present after the collision (added after the computation).\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eQ: Which formats can I get instant MD5-colliding file pairs for?\u003cbr/\u003e A: \u003ca href=\"scripts/jpg.py\"\u003eJPG\u003c/a\u003e, \u003ca href=\"scripts/png.py\"\u003ePNG\u003c/a\u003e, \u003ca href=\"scripts/gif.py\"\u003eGIF\u003c/a\u003e, \u003ca href=\"scripts/gz.py\"\u003eGZIP\u003c/a\u003e, \u003ca href=\"scripts/pe.py\"\u003ePortable Executable\u003c/a\u003e, \u003ca href=\"scripts/mp4.py\"\u003eMP4\u003c/a\u003e, \u003ca href=\"scripts/jp2.py\"\u003eJPEG2000\u003c/a\u003e, \u003ca href=\"scripts/pdf.py\"\u003ePDF\u003c/a\u003e, \u003ca href=\"scripts/zinsider.py\"\u003eDOCX/PPTX/XLSX\u003c/a\u003e, \u003ca href=\"scripts/zinsider.py\"\u003eEPUB\u003c/a\u003e, \u003ca href=\"scripts/zinsider.py\"\u003e3MF\u003c/a\u003e, \u003ca href=\"scripts/zinsider.py\"\u003eXPS\u003c/a\u003e. 
Just run the specific script.\u003c/p\u003e\n\u003cp\u003eQ: What about for SHA1?\u003cbr/\u003e A: For SHA1, \u003ca href=\"https://github.com/nneonneo/sha1collider\"\u003eJPG in a PDF\u003c/a\u003e is computed and implemented.\u003c/p\u003e\n\u003cp\u003eQ: What about formats already supported for MD5 (JPG, PNG...), but for SHA1 instead?\u003cbr/\u003e A: They're most likely supported with SHA1 too, but their collisions haven't been computed.\u003c/p\u003e\n\u003cp\u003eQ: Are computations faster for similar (but different) contents?\u003cbr/\u003e A: No. Any tiny difference requires a full computation.\u003c/p\u003e\n\u003cp\u003eQ: Which formats don't have such a shortcut?\u003cbr/\u003e A: ELF, Mach-O, Java Class, TAR, ZIP (among others...)\u003c/p\u003e\n\u003cp\u003eQ: Are classic collisions (in a few hours) still possible with these formats?\u003cbr/\u003e A: Yes, as long as any amount of appended data is tolerated (i.e. likely not ZIP or Class).\u003c/p\u003e\n\u003cp\u003eQ: Do you provide examples of collisions?\u003cbr/\u003e A: \u003ca href=\"examples/free/README.md\"\u003eYes\u003c/a\u003e.\u003c/p\u003e\n\u003ch2 id=\"table-of-contents\"\u003eTable of Contents\u003c/h2\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#introduction\"\u003eIntroduction\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#status\"\u003eStatus\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#attacks\"\u003eAttacks\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#identical-prefix\"\u003eIdentical prefix\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#fastcoll-md5\"\u003eFastColl (MD5)\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#unicoll-md5\"\u003eUniColl (MD5)\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#shattered-sha1\"\u003eShattered (SHA1)\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#chosen-prefix-collisions\"\u003eChosen-prefix 
collisions\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#hashclash-md5\"\u003eHashClash (MD5)\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#shambles-sha-1\"\u003eShambles (SHA1)\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#attacks-summary\"\u003eAttacks summary\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#exploitations\"\u003eExploitations\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#standard-strategy\"\u003eStandard strategy\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#jpg\"\u003eJPG\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#custom-scans\"\u003ecustom scans\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#png\"\u003ePNG\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#incompatibility\"\u003eincompatibility\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#gif\"\u003eGIF\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#gzip\"\u003eGZIP\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#lz4--zstandard\"\u003eLZ4 / Zstandard\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#portable-executable\"\u003ePortable Executable\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#mp4-and-others\"\u003eMP4 and others\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#jpeg2000\"\u003eJPEG2000\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#pdf\"\u003ePDF\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#jpg-in-pdf\"\u003eJPG in PDF\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#zip\"\u003eZIP\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#zip-based-formats\"\u003eZip-based 
formats\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#others\"\u003eOthers\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#uncommon-strategies\"\u003eUncommon strategies\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#multicolls-multiple-collisions-chain\"\u003eMultiColls: multiple collisions chain\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#hashquines\"\u003eHashquines\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#validity\"\u003eValidity\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#polycolls-collisions-of-different-file-types\"\u003ePolyColls: collisions of different file types\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#pe---jpg\"\u003ePE - JPG\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#pdf---pe\"\u003ePDF - PE\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#pdf---png\"\u003ePDF - PNG\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#pileups-multi-collision\"\u003ePileUps (multi-collision)\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#pe---png---mp4---pdf\"\u003ePE - PNG - MP4 - PDF\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#use-cases\"\u003eUse cases\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#gotta-collide-em-all\"\u003eGotta collide 'em all!\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#incriminating-files\"\u003eIncriminating files\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#failures\"\u003eFailures\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#elf\"\u003eELF\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#mach-o\"\u003eMach-O\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca 
href=\"#java-class\"\u003eJava Class\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#tar\"\u003eTAR\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#exploitations-summary\"\u003eExploitations summary\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#test-files\"\u003eTest files\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#detection\"\u003eDetection\u003c/a\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"#safe-hashes\"\u003eSafe hashes\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#references\"\u003eReferences\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#credits\"\u003eCredits\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"#conclusion\"\u003eConclusion\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch1 id=\"introduction\"\u003eIntroduction\u003c/h1\u003e\n\u003cp\u003eThe goal is to explore extensively existing attacks - and show on the way how weak MD5 is (instant collisions of any JPG, PNG, PDF, MP4, PE...) 
- and also explore in detail common file formats to determine how they can be exploited with present or with future attacks.\u003c/p\u003e\n\u003cp\u003eIndeed, the same file format trick can be used on several hashes (the same JPG tricks were used for \u003ca href=\"https://archive.org/stream/pocorgtfo14#page/n49/mode/1up\"\u003eMD5\u003c/a\u003e, \u003ca href=\"https://malicioussha1.github.io/\"\u003emalicious SHA-1\u003c/a\u003e and \u003ca href=\"http://shattered.io\"\u003eSHA1\u003c/a\u003e), as long as the collisions follow the same byte patterns.\u003c/p\u003e\n\u003cp\u003eThis document is \u003cstrong\u003enot\u003c/strong\u003e about new attacks (the most recent one was documented in 2012), but about new forms of exploitation of existing attacks.\u003c/p\u003e\n\u003ch1 id=\"status\"\u003eStatus\u003c/h1\u003e\n\u003cp\u003eCurrent status of known attacks:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003eget a file to get another file's hash or a given hash: \u003cstrong\u003eimpossible\u003c/strong\u003e\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eit's still not even practical with \u003ca href=\"https://eprint.iacr.org/2008/089.pdf\"\u003eMD2\u003c/a\u003e or \u003ca href=\"https://who.paris.inria.fr/Gaetan.Leurent/files/MD4_FSE08.pdf\"\u003eMD4\u003c/a\u003e.\u003c/li\u003e\n\u003cli\u003eworks for simpler hashes(*) \u003c!-- Thanks Sven! 
--\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eget two different files with the same MD5: \u003cstrong\u003einstant\u003c/strong\u003e\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eexamples: \u003ca href=\"examples/single-ipc1.bin\"\u003e1\u003c/a\u003e ⟷ \u003ca href=\"examples/single-ipc2.bin\"\u003e2\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003emake two arbitrary files get the same MD5: \u003cstrong\u003ea few hours\u003c/strong\u003e (72 hours.core)\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eexamples: \u003ca href=\"examples/single-cpc1.bin\"\u003e1\u003c/a\u003e ⟷ \u003ca href=\"examples/single-cpc2.bin\"\u003e2\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003emake two arbitrary files of specific file formats (PNG, JPG, PE...) get the same MD5: \u003cstrong\u003einstant\u003c/strong\u003e\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eread below\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eget two different files with the same SHA1: 6500 years.core\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eget two different PDFs with the same SHA-1 to show a different picture: \u003ca href=\"https://github.com/nneonneo/sha1collider\"\u003einstant\u003c/a\u003e (the prefixes are already computed)\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003e(*) example with \u003ca href=\"https://docs.python.org/3/library/crypt.html\"\u003ecrypt\u003c/a\u003e - thanks \u003ca href=\"https://twitter.com/svblxyz\"\u003eSven\u003c/a\u003e!\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e\u0026gt;\u0026gt;\u0026gt; import crypt\n\u0026gt;\u0026gt;\u0026gt; crypt.crypt(\u0026quot;5dUD\u0026amp;66\u0026quot;, \u0026quot;br\u0026quot;)\n\u0026#39;brokenOz4KxMc\u0026#39;\n\u0026gt;\u0026gt;\u0026gt; crypt.crypt(\u0026quot;O!\u0026gt;\u0026#39;,%$\u0026quot;, 
\u0026quot;br\u0026quot;)\n\u0026#39;brokenOz4KxMc\u0026#39;\n\u003c/code\u003e\u003c/pre\u003e\n\u003ch1 id=\"attacks\"\u003eAttacks\u003c/h1\u003e\n\u003cp\u003eMD5 and SHA1 work with blocks of 64 bytes.\u003c/p\u003e\n\u003cp\u003eIf two contents A \u0026amp; B have the same hash, then appending the same contents C to both will keep the same hash.\u003c/p\u003e\n\u003cpre class=\"text\"\u003e\u003ccode\u003ehash(A) = hash(B) -\u0026gt; hash(A + C) = hash(B + C)\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eCollisions work by inserting at a block boundary a number of computed collision blocks that depends on what came before in the file. These collision blocks are very random-looking with some minor differences (that follow a specific pattern for each attack) and they will introduce tiny differences while eventually getting hashes the same value after these blocks.\u003c/p\u003e\n\u003cp\u003eThese differences are abused to craft valid files with specific properties.\u003c/p\u003e\n\u003cp\u003eFile formats also work top-down, and most of them work by byte-level chunks.\u003c/p\u003e\n\u003cp\u003eSome 'comment' chunks can be inserted to align file chunks to block boundaries, to align specific structures to collision blocks differences, to hide the rest of the collision blocks randomness from the file parsers, and to hide otherwise valid content from the parser (so that it will see another content).\u003c/p\u003e\n\u003cp\u003eThese 'comment' chunks are often not officially real comments: they are just used as data containers that are ignored by the parser (for example, PNG chunks with a lowercase-starting ID are ancillary, not critical).\u003c/p\u003e\n\u003cp\u003eMost of the time, a difference in the collision blocks is used to modify the length of a comment chunk, which is typically declared just before the data of this chunk: in the gap between the smaller and the longer version of this chunk, another comment chunk is declared to jump over one file's 
content \u003ccode\u003eA\u003c/code\u003e. After this file content \u003ccode\u003eA\u003c/code\u003e, just append another file content \u003ccode\u003eB\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/layout.png\" /\u003e\u003c/p\u003e\n\u003cp\u003eSince file formats usually define a terminator that will make parsers stop after it, \u003ccode\u003eA\u003c/code\u003e will terminate parsing, which will make the appended content \u003ccode\u003eB\u003c/code\u003e ignored.\u003c/p\u003e\n\u003cp\u003eSo typically at least two comments are needed - often three:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003ealignment\u003c/li\u003e\n\u003cli\u003ehide collision blocks\u003c/li\u003e\n\u003cli\u003ehide one file content (for re-usable collisions)\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eThese common properties of file formats make it possible - they are not typically seen as weaknesses, but they can be detected or normalized out:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003edummy chunks - used as comments\u003c/li\u003e\n\u003cli\u003emore than one comment\u003c/li\u003e\n\u003cli\u003ehuge comments (lengths: 64b for MP4, 32b for PNG -\u0026gt; trivial collisions. 16b for JPG, 8b for GIF -\u0026gt; no generic collision for GIF, limited for JPG)\u003c/li\u003e\n\u003cli\u003estore any data in a comment (ASCII or UTF8 could be enforced)\u003c/li\u003e\n\u003cli\u003estore anything after the terminator (usually used only for malicious purposes) - can be avoided by using two comments finishing at the same offsets.\u003c/li\u003e\n\u003cli\u003eno integrity check. CRC32 in PNG are usually ignored. 
However they can be all correct since the collision blocks declare chunks of different lengths - so even if the chunk's data starts differently, the chunk lengths are different\u003c/li\u003e\n\u003cli\u003eflat structure: \u003ca href=\"https://en.wikipedia.org/wiki/Abstract_Syntax_Notation_One\"\u003eASN.1\u003c/a\u003e defines parent structure with the length of all the enclosed substructures, which prevents these constructs: you'd need to abuse a length, but also the length of the parent.\u003c/li\u003e\n\u003cli\u003eput a comment before the header - this makes generic re-usable collisions possible.\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch2 id=\"identical-prefix\"\u003eIdentical prefix\u003c/h2\u003e\n\u003col\u003e\n\u003cli\u003eDefine an arbitrary prefix - its content and length don't matter.\u003c/li\u003e\n\u003cli\u003eThe prefix is padded to the next 64-byte block.\u003c/li\u003e\n\u003cli\u003eCollision block(s) are computed depending on the prefix and appended. Both sides are very random. 
The differences are predetermined by the attack.\u003c/li\u003e\n\u003cli\u003eAfter this[these] block[s], the hash value is the same despite the file differences.\u003c/li\u003e\n\u003cli\u003eAny arbitrary identical suffix can be added.\u003c/li\u003e\n\u003c/ol\u003e\n\u003ctable\u003e\n\u003cthead\u003e\n\u003ctr class=\"header\"\u003e\n\u003cth style=\"text-align: center;\"\u003ePrefix\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003e=\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003ePrefix\u003c/th\u003e\n\u003c/tr\u003e\n\u003c/thead\u003e\n\u003ctbody\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision \u003cem\u003eA\u003c/em\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e≠\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision \u003cem\u003eB\u003c/em\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd style=\"text-align: center;\"\u003eSuffix\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e=\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003eSuffix\u003c/td\u003e\n\u003c/tr\u003e\n\u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003eBoth files are almost identical (their content have only a few bits of differences)\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eExploitation\u003c/strong\u003e:\u003c/p\u003e\n\u003cp\u003eBundle two contents, then either:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eData exploit: run code that checks for differences and displays one or the other (typically trivial since differences are known in advance).\u003c/li\u003e\n\u003cli\u003eStructure exploit: exploit file structure (typically, the length of a comment) to hide one content or show the other (depends on the file format and its parsers).\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eTwo files with this structure:\u003c/p\u003e\n\u003ctable\u003e\n\u003cthead\u003e\n\u003ctr class=\"header\"\u003e\n\u003cth style=\"text-align: 
center;\"\u003ePrefix\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003e=\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003ePrefix\u003c/th\u003e\n\u003c/tr\u003e\n\u003c/thead\u003e\n\u003ctbody\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision \u003cem\u003eA\u003c/em\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e≠\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision \u003cem\u003eB\u003c/em\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003cstrong\u003eA\u003c/strong\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e=\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003cdel\u003eA\u003c/del\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003cdel\u003eB\u003c/del\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e=\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003cstrong\u003eB\u003c/strong\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003ewill show either A or B.\u003c/p\u003e\n\u003cimg alt='identical prefix collisions' src=pics/identical.png width=350/\u003e\n\n\u003ch3 id=\"fastcoll-md5\"\u003e\u003ca href=\"https://www.win.tue.nl/hashclash/\"\u003eFastColl\u003c/a\u003e (MD5)\u003c/h3\u003e\n\u003cp\u003eFinal version in 2009.\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003etime: a few seconds of computation\u003c/li\u003e\n\u003cli\u003espace: two blocks\u003c/li\u003e\n\u003cli\u003edifferences: no control before, no control after. FastColl difference mask:\n\u003cpre\u003e\u003ccode\u003e.. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..\n.. .. .. X. .. .. .. .. .. .. .. .. .. .. .. ..\n.. .. .. .. .. .. .. .. .. .. .. .. .. X. .X ..\n.. .. .. .. .. .. .. .. .. .. .. X. .. .. .. 
..\n\u003c/code\u003e\u003c/pre\u003e\u003c/li\u003e\n\u003cli\u003eexploitation: hard\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eThe differences aren't near the start/end of the blocks, so it's very hard to exploit since you don't control any nearby byte. A potential solution is to brute-force the surrounding bytes - cf \u003ca href=\"https://github.com/angea/pocorgtfo#0x14\"\u003ePoCGTFO 14:10\u003c/a\u003e.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eExamples\u003c/strong\u003e:\u003c/p\u003e\n\u003cp\u003eWith an empty prefix:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003eMD5: fe6c446ee3a831ee010f33ac9c1b602c\nSHA256: c5dd2ef7c74cd2e80a0fd16f1dd6955c626b59def888be734219d48da6b9dbdd\n\n00:  37 75 C1 F1-C4 A7 5A E7-9C E0 DE 7A-5B 10 80 26  7u┴±─ºZτ£α▐z[►Ç\u0026amp;\n10:  02 AB D9 39-C9 6C 5F 02-12 C2 7F DA-CD 0D A3 B0  ☻½┘9╔l_☻↕┬⌂┌═♪ú░\n20:  8C ED FA F3-E1 A3 FD B4-EF 09 E7 FB-B1 C3 99 1D  îφ·≤ßú²┤∩○τ√▒├Ö↔\n30:  CD 91 C8 45-E6 6E FD 3D-C7 BB 61 52-3E F4 E0 38  ═æ╚Eµn²=╟╗aR\u0026gt;⌠α8  \\\n40:  49 11 85 69-EB CC 17 9C-93 4F 40 EB-33 02 AD 20  I◄àiδ╠↨£ôO@δ3☻¡ \n50:  A4 09 2D FB-15 FA 20 1D-D1 DB 17 CD-DD 29 59 1E  ñ○-√§· ↔╤█↨═▌)Y▲    ................\n60:  39 89 9E F6-79 46 9F E6-8B 85 C5 EF-DE 42 4F 46  9ë₧÷yFƒµïà┼∩▐BOF    ...X............\n70:  C2 78 75 9D-8B 65 F4 50-EA 21 C5 59-18 62 FF 7B  ┬xu¥ïe⌠PΩ!┼Y↑b {    .............XX.\n                                                                          ...........X....\n                                                                          ................\n00:  37 75 C1 F1-C4 A7 5A E7-9C E0 DE 7A-5B 10 80 26  7u┴±─ºZτ£α▐z[►Ç\u0026amp;    ...X............\n10:  02 AB D9 B9-C9 6C 5F 02-12 C2 7F DA-CD 0D A3 B0  ☻½┘╣╔l_☻↕┬⌂┌═♪ú░    .............XX.\n20:  8C ED FA F3-E1 A3 FD B4-EF 09 E7 FB-B1 43 9A 1D  îφ·≤ßú²┤∩○τ√▒CÜ↔    ...........X....\n30:  CD 91 C8 45-E6 6E FD 3D-C7 BB 61 D2-3E F4 E0 38  ═æ╚Eµn²=╟╗a╥\u0026gt;⌠α8\n40:  49 11 85 69-EB CC 17 9C-93 4F 40 EB-33 02 AD 20  I◄àiδ╠↨£ôO@δ3☻¡   /\n50:  A4 09 2D 
7B-15 FA 20 1D-D1 DB 17 CD-DD 29 59 1E  ñ○-{§· ↔╤█↨═▌)Y▲\n60:  39 89 9E F6-79 46 9F E6-8B 85 C5 EF-DE C2 4E 46  9ë₧÷yFƒµïà┼∩▐┬NF\n70:  C2 78 75 9D-8B 65 F4 50-EA 21 C5 D9-18 62 FF 7B  ┬xu¥ïe⌠PΩ!┼┘↑b {\n\nMD5: fe6c446ee3a831ee010f33ac9c1b602c\nSHA256: e27cf3073c704d0665da42d597d4d20131013204eecb6372a5bd60aeddd5d670\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eOther examples, with an identical prefix: \u003ca href=\"examples/fastcoll1.bin\"\u003e1\u003c/a\u003e ⟷ \u003ca href=\"examples/fastcoll2.bin\"\u003e2\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eVariant\u003c/strong\u003e: there is a \u003ca href=\"https://marc-stevens.nl/research/md5-1block-collision/\"\u003esingle-block MD5 collision\u003c/a\u003e but it takes five weeks of computation.\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/fastcoll.svg\"\u003erecording\u003c/a\u003e of a FastColl computation without any prefix and \u003ca href=\"examples/fastcoll-prefix.svg\"\u003eanother one\u003c/a\u003e with a prefix.\u003c/p\u003e\n\u003ch3 id=\"unicoll-md5\"\u003e\u003ca href=\"unicoll.md\"\u003eUniColl\u003c/a\u003e (MD5)\u003c/h3\u003e\n\u003cp\u003eDocumented in \u003ca href=\"https://www.cwi.nl/system/files/PhD-Thesis-Marc-Stevens-Attacks-on-Hash-Functions-and-Applications.pdf#page=199\"\u003e2012\u003c/a\u003e, implemented in \u003ca href=\"https://github.com/cr-marcstevens/hashclash/blob/95c2619a8078990056beb7aaa59104021714ee3c/scripts/poc_no.sh\"\u003e2017\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://github.com/cr-marcstevens/hashclash#create-you-own-identical-prefix-collision\"\u003eUniColl\u003c/a\u003e lets you control a few bytes in the collision blocks, before and after the first difference, which makes it an identical-prefix collision with some controllable differences, almost like a chosen-prefix collision. 
This is very handy, and even better the difference can be very predictable: in the case of \u003ccode\u003em2+= 2^8\u003c/code\u003e (a.k.a. \u003ccode\u003eN=1\u003c/code\u003e / \u003ccode\u003em2 9\u003c/code\u003e in HashClash \u003ca href=\"https://github.com/cr-marcstevens/hashclash/blob/master/scripts/poc_no.sh#L30\"\u003epoc_no.sh\u003c/a\u003e script), the difference is +1 on the 9th byte, which makes it very exploitable, as you can even think about the collision in your head: the 9th character of that sentence will be replaced with the next one: \u003ccode\u003e0\u003c/code\u003e replaced by \u003ccode\u003e1\u003c/code\u003e, \u003ccode\u003ea\u003c/code\u003e replaced by \u003ccode\u003eb\u003c/code\u003e..\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003etime: a few minutes (depends on the number of bytes you want to control)\u003c/li\u003e\n\u003cli\u003espace: two blocks\u003c/li\u003e\n\u003cli\u003edifferences:\n\u003cpre\u003e\u003ccode\u003e.. .. .. .. DD .. .. .. ..\n.. .. .. .. +1 .. .. .. ..\n\u003c/code\u003e\u003c/pre\u003e\u003c/li\u003e\n\u003cli\u003eexploitation: very easy - controlled bytes before and after the difference, and the difference is predictable. 
The only restrictions are alignment and that you 'only' control 10 bytes after the difference.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eExamples with \u003ccode\u003eN=1\u003c/code\u003e and 20 bytes of set text in the collision blocks:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e00:  55 6E 69 43-6F 6C 6C 20-31 20 70 72-65 66 69 78  UniColl 1 prefix\n10:  20 32 30 62-F5 48 34 B9-3B 1C 01 9F-C8 6B E6 44   20b⌡H4╣;∟☺ƒ╚kµD\n20:  FE F6 31 3A-63 DB 99 3E-77 4D C7 5A-6E B0 A6 88  ■÷1:c█Ö\u0026gt;wM╟Zn░ªê\n30:  04 05 FB 39-33 21 64 BF-0D A4 FE E2-A6 9D 83 36  ♦♣√93!d┐♪ñ■Γª¥â6  \\\n40:  4B 14 D7 F2-47 53 84 BA-12 2D 4F BB-83 78 6C 70  K¶╫≥GSä║↕-O╗âxlp\n50:  C6 EB 21 F2-F6 59 9A 85-14 73 04 DD-57 5F 40 3C  ╞δ!≥÷YÜà¶s♦▌W_@\u0026lt;    .........X......\n60:  E1 3F B0 DB-E8 B4 AA B0-D5 56 22 AF-B9 04 26 FC  ß?░█Φ┤¬░╒V\u0026quot;»╣♦\u0026amp;ⁿ    ................\n70:  9F D2 0C 00-86 C8 ED DE-85 7F 03 7B-05 28 D7 0F  ƒ╥♀ å╚φ▐à⌂♥{♣(╫☼    ................\n                                                                          ................\n                                                                          .........X......\n00:  55 6E 69 43-6F 6C 6C 20-31 21 70 72-65 66 69 78  UniColl 1!prefix    ................\n10:  20 32 30 62-F5 48 34 B9-3B 1C 01 9F-C8 6B E6 44   20b⌡H4╣;∟☺ƒ╚kµD    ................\n20:  FE F6 31 3A-63 DB 99 3E-77 4D C7 5A-6E B0 A6 88  ■÷1:c█Ö\u0026gt;wM╟Zn░ªê    ................\n30:  04 05 FB 39-33 21 64 BF-0D A4 FE E2-A6 9D 83 36  ♦♣√93!d┐♪ñ■Γª¥â6\n40:  4B 14 D7 F2-47 53 84 BA-12 2C 4F BB-83 78 6C 70  K¶╫≥GSä║↕,O╗âxlp  /\n50:  C6 EB 21 F2-F6 59 9A 85-14 73 04 DD-57 5F 40 3C  ╞δ!≥÷YÜà¶s♦▌W_@\u0026lt;\n60:  E1 3F B0 DB-E8 B4 AA B0-D5 56 22 AF-B9 04 26 FC  ß?░█Φ┤¬░╒V\u0026quot;»╣♦\u0026amp;ⁿ\n70:  9F D2 0C 00-86 C8 ED DE-85 7F 03 7B-05 28 D7 0F  ƒ╥♀ å╚φ▐à⌂♥{♣(╫☼\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eUniColl has less control than a true chosen-prefix collision, but it's much faster especially since it takes only two 
blocks.\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/unicoll.svg\"\u003erecording\u003c/a\u003e of a UniColl computation.\u003c/p\u003e\n\u003ch3 id=\"shattered-sha1\"\u003e\u003ca href=\"http://shattered.io\"\u003eShattered\u003c/a\u003e (SHA1)\u003c/h3\u003e\n\u003cp\u003eDocumented in \u003ca href=\"https://marc-stevens.nl/research/papers/EC13-S.pdf\"\u003e2013\u003c/a\u003e, computed in \u003ca href=\"http://shattered.io\"\u003e2017\u003c/a\u003e.\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003etime: 6500 years.CPU and 110 year.GPU\u003c/li\u003e\n\u003cli\u003espace: two blocks\u003c/li\u003e\n\u003cli\u003edifferences:\n\u003cpre\u003e\u003ccode\u003e.. .. .. DD ?? ?? ?? ??\nor\n?? ?? ?? DD .. .. .. ..\n\u003c/code\u003e\u003c/pre\u003e\u003c/li\u003e\n\u003cli\u003eexploitation: medium. The differences are right at the start and at the end of the collision blocks. So no control before \u003cstrong\u003eand\u003c/strong\u003e after a length in the prefix/in the suffix: PNG stores its length before the chunk type, so it won't work. 
However it will work with JP2 files when they use the JFIF form (the same as JPG), and likely MP4 and other atom/box formats if you use long lengths on 64bits (in this case, they're placed \u003cem\u003eafter\u003c/em\u003e the atom type).\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eThe difference between collision blocks of each side is this Xor mask:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e0C 00 00 02 C0 00 00 10 B4 00 00 1C 3C 00 00 04\nBC 00 00 1A 20 00 00 10 24 00 00 1C EC 00 00 14\n0C 00 00 02 C0 00 00 10 B4 00 00 1C 2C 00 00 04\nBC 00 00 18 B0 00 00 10 00 00 00 0C B8 00 00 10\n\u003c/code\u003e\u003c/pre\u003e\n\u003cimg alt='Shattered PoCs side by side' src=pics/shattered.png width=1000 /\u003e\n\n\u003cp\u003eExamples: \u003ca href=\"https://github.com/angea/pocorgtfo#0x18\"\u003ePoC||GTFO 0x18\u003c/a\u003e is using the computed SHA1 prefixes, re-using the image directly from PDFLaTeX source (see \u003ca href=\"https://archive.org/stream/pocorgtfo18#page/n62/mode/1up\"\u003earticle 18:10\u003c/a\u003e), but also checking the value of the prefixes via JavaScript in the HTML page (the file is polyglot, ZIP HTML and PDF).\u003c/p\u003e\n\u003ch2 id=\"chosen-prefix-collisions\"\u003eChosen-prefix collisions\u003c/h2\u003e\n\u003cp\u003eThey allow to collide any content.\u003c/p\u003e\n\u003ctable\u003e\n\u003cthead\u003e\n\u003ctr class=\"header\"\u003e\n\u003cth style=\"text-align: center;\"\u003e𝓐\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003e≠\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003e𝔅\u003c/th\u003e\n\u003c/tr\u003e\n\u003c/thead\u003e\n\u003ctbody\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision \u003cem\u003eA\u003c/em\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e≠\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003eCollision 
\u003cem\u003eB\u003c/em\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003c/tbody\u003e\n\u003c/table\u003e\n\u003col\u003e\n\u003cli\u003etake two arbitrary prefixes\u003c/li\u003e\n\u003cli\u003epad the shortest to be as long as the longest. both are padded to the next block - minus 12 bytes\u003c/li\u003e\n\u003c/ol\u003e\n\u003cul\u003e\n\u003cli\u003ethese 12 bytes of random data will be added on both sides to randomize the birthday search\u003c/li\u003e\n\u003c/ul\u003e\n\u003col start=\"3\"\u003e\n\u003cli\u003e\u003cp\u003eX near-collision blocks will be computed and appended.\u003c/p\u003e\n\u003cp\u003eThe fewer blocks, the longer the computation.\u003c/p\u003e\n\u003cp\u003eEx: \u003ca href=\"https://www.win.tue.nl/hashclash/SingleBlock/\"\u003e400 kHours for one block\u003c/a\u003e. 72 hours.cores for nine blocks with \u003ca href=\"https://github.com/cr-marcstevens/hashclash\"\u003eHashClash\u003c/a\u003e.\u003c/p\u003e\u003c/li\u003e\n\u003c/ol\u003e\n\u003cimg alt='chosen-prefix collisions' src=pics/chosen.png width=400/\u003e\n\n\u003cp\u003eChosen-prefix collisions are almighty, but they can take a long time just for a pair of files.\u003c/p\u003e\n\u003ch3 id=\"hashclash-md5\"\u003e\u003ca href=\"https://github.com/cr-marcstevens/hashclash\"\u003eHashClash\u003c/a\u003e (MD5)\u003c/h3\u003e\n\u003cp\u003eFinal version in \u003ca href=\"https://www.win.tue.nl/hashclash/ChosenPrefixCollisions/\"\u003e2009\u003c/a\u003e.\u003c/p\u003e\n\u003cp\u003eExamples: let's collide \u003ccode\u003eyes\u003c/code\u003e and \u003ccode\u003eno\u003c/code\u003e. 
It took three hours on 24 cores.\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e\u0026#39;yes\u0026#39; prefix:\n000:  79 65 73 0A-3D 62 84 11-01 75 D3 4D-EB 80 93 DE  yes◙=bä◄☺u╙MδÇô▐   - Prefix, padding\n010:  31 C1 D9 30-45 FB BE 1E-71 F0 0A 63-75 A8 30 AA  1┴┘0E√╛▲q≡◙cu¿0¬\n020:  98 17 CA E3-A2 6B 8E 3D-44 A9 8F F2-0E 67 96 48  ÿ↨╩πókÄ=D⌐Å≥♫gûH\n030:  97 25 A6 FB-00 00 00 00-49 08 09 33-F0 62 C4 E8  ù%ª√    I◘○3≡b─Φ\n\n040:  D5 F1 54 CD-CA A1 42 90-7F 9D 3D 9A-67 C4 1B 0F  ╒±T═╩íBÉ⌂¥=Üg─←☼  - Collision blocks start\n050:  04 9F 19 E8-92 C3 AA 19-43 31 1A DB-DA 96 01 54  ♦ƒ↓ΦÆ├¬↓C1→█┌û☺T\n060:  85 B5 9A 88-D8 A5 0E FB-CD 66 9A DA-4F 20 8A AA  à╡Üê╪Ñ♫√═fÜ┌O è¬\n070:  BA E3 9C F0-78 31 8F D1-14 5F 3E B9-0F 9F 3E 19  ║π£≡x1Å╤¶_\u0026gt;╣☼ƒ\u0026gt;↓\n\n080:  09 9C BB A9-45 89 BA A8-03 E6 C0 31-A0 54 D6 26  ○£╗⌐Eë║¿♥µ└1áT╓\u0026amp;\n090:  3F 80 4C 06-0F C7 D9 19-09 D3 DA 14-FD CB 39 84  ?ÇL♠☼╟┘↓○╙┌¶²╦9ä\n0A0:  1F 0D 77 5F-55 AA 7A 07-4C 24 8B 13-0A 54 A2 BC  ▼♪w_U¬z•L$ï‼◙Tó╝\n0B0:  C5 12 7D 4F-E0 5E F2 23-C5 07 61 E4-80 91 B2 13  ┼↕}Oα^≥#┼•aΣÇæ▓‼\n\n0C0:  E7 79 07 2A-CF 1B 66 39-8C F0 8E 7E-75 25 22 1D  τy•*╧←f9î≡Ä~u%\u0026quot;↔\n0D0:  A7 3B 49 4A-32 A4 3A 07-61 26 64 EA-6B 83 A2 8D  º;IJ2ñ:•a\u0026amp;dΩkâóì\n0E0:  BE A3 FF BE-4E 71 AE 18-E2 D0 86 4F-20 00 30 26  ╛ú ╛Nq«↑Γ╨åO  0\u0026amp;\n0F0:  0A 71 DE 1F-40 B4 F4 8F-9C 50 5C 78-DD CD 72 89  ◙q▐▼@┤⌠Å£P\\x▌═rë\n\n100:  BA D1 BF F9-96 80 E3 06-96 F3 B9 7C-77 2D EB 25  ║╤┐∙ûÇπ♠û≤╣|w-δ%\n110:  1E 56 70 D7-14 1F 55 4D-EC 11 58 59-92 45 E1 33  ▲Vp╫¶▼UM∞◄XYÆEß3\n120:  3E 0E A1 6E-FF D9 90 AD-F6 A0 AD 0E-C6 D6 88 12  \u0026gt;♫ín ┘É¡÷á¡♫╞╓ê↕\n130:  B8 74 F2 9E-DD 53 F7 88-19 73 85 39-AA 9B E0 8D  ╕t≥₧▌S≈ê↓sà9¬¢αì\n                                                                          \\\n140:  82 BF 9C 5E-58 42 1E 3B-94 CF 5B 54-73 5F A8 4A  é┐£^XB▲;ö╧[Ts_¿J\n150:  FD 5B 64 CF-59 D1 96 74-14 B3 0C AF-11 1C F9 47  ²[d╧Y╤ût¶│♀»◄∟∙G      ................\n160:  C5 7A 2C F7-D5 24 F5 EB-BE 54 3E 12-B0 24 67 3F  
┼z,≈╒$⌡δ╛T\u0026gt;↕░$g?      ................\n170:  01 DD 95 76-8D 0D 58 FB-50 23 70 3A-BD ED BE AC  ☺▌òvì♪X√P#p:╜φ╛¼      ...............X\n                                                                             ................\n180:  B8 32 DB AE-E8 DC 3A 83-7A C8 D5 0F-08 90 1D 99  ╕2█«Φ▄:âz╚╒☼◘É↔Ö\n190:  2D 7D 17 34-4E A8 21 98-61 1A 65 DA-FC 9B A4 BA  -}↨4N¿!ÿa→e┌ⁿ¢ñ║      ................\n1A0:  E1 42 2B 86-0C 94 2A F6-D6 A4 81 B5-2B 0B E9 37  ßB+å♀ö*÷╓ñü╡+♂Θ7      ................\n1B0:  44 D2 E4 23-14 7C 16 B8-84 90 8B E0-A1 A7 BD 27  D╥Σ#¶|▬╕äÉïαíº╜\u0026#39;      ..............X.\n                                                                             ................\n1C0:  C7 7E E6 17-1A 93 C5 EE-59 70 91 26-4E 9D C7 7C  ╟~µ↨→ô┼εYpæ\u0026amp;N¥╟|\n1D0:  1D 3D AB F1-B4 F4 F1 D9-86 48 75 77-6E FE 98 84  ↔=½±┤⌠±┘åHuwn■ÿä      ................\n1E0:  EF 3C 1C C7-16 5A 1F 83-60 EC 5C FE-CA 17 0C 74  ∩\u0026lt;∟╟▬Z▼â`∞\\■╩↨♀t      ................\n1F0:  EB 8E 9D F6-90 A3 CD 08-65 D5 5A 4C-2E C6 BE 54  δÄ¥÷Éú═◘e╒ZL.╞╛T      ...............X\n                                                                             ................\n\n\u0026#39;no\u0026#39; prefix:                                                                 ................\n000:  6E 6F 0A E5-5F D0 83 01-9B 4D 55 06-61 AB 88 11  no◙σ_╨â☺¢MU♠a½ê◄      ................\n010:  8A FA 4D 34-B3 75 59 46-56 97 EF 6C-4A 07 90 CC  è·M4│uYFVù∩lJ•É╠      ............X...\n020:  FE 19 D7 CF-6F 92 03 9C-91 AA A5 DA-56 92 C1 04  ■↓╫╧oÆ♥£æ¬Ñ┌VÆ┴♦      ................\n030:  E6 4C 08 A3-00 00 00 00-8D B6 4E 47-FF AF 7A 3C  µL◘ú    ì╢NG »z\u0026lt;\n                                                                             ................\n040:  D5 F1 54 CD-CA A1 42 90-7F 9D 3D 9A-67 C4 1B 0F  ╒±T═╩íBÉ⌂¥=Üg─←☼      ................\n050:  04 9F 19 E8-92 C3 AA 19-43 31 1A DB-DA 96 01 54  ♦ƒ↓ΦÆ├¬↓C1→█┌û☺T      ............X...\n060:  85 B5 9A 88-D8 A5 0E FB-CD 66 9A DA-4F 20 8A A9  
à╡Üê╪Ñ♫√═fÜ┌O è⌐      ................\n070:  BA E3 9C F0-78 31 8F D1-14 5F 3E B9-0F 9F 3E 19  ║π£≡x1Å╤¶_\u0026gt;╣☼ƒ\u0026gt;↓\n                                                                             ................\n080:  09 9C BB A9-45 89 BA A8-03 E6 C0 31-A0 54 D6 26  ○£╗⌐Eë║¿♥µ└1áT╓\u0026amp;      ................\n090:  3F 80 4C 06-0F C7 D9 19-09 D3 DA 14-FD CB 39 84  ?ÇL♠☼╟┘↓○╙┌¶²╦9ä      .............X..\n0A0:  1F 0D 77 5F-55 AA 7A 07-4C 24 8B 13-0A 54 B2 BC  ▼♪w_U¬z•L$ï‼◙T▓╝      ................\n0B0:  C5 12 7D 4F-E0 5E F2 23-C5 07 61 E4-80 91 B2 13  ┼↕}Oα^≥#┼•aΣÇæ▓‼\n                                                                             ................\n0C0:  E7 79 07 2A-CF 1B 66 39-8C F0 8E 7E-75 25 22 1D  τy•*╧←f9î≡Ä~u%\u0026quot;↔      ................\n0D0:  A7 3B 49 4A-32 A4 3A 07-61 26 64 EA-6B 83 A2 8D  º;IJ2ñ:•a\u0026amp;dΩkâóì      ...............X\n0E0:  BE A3 FF BE-4E 71 AE 18-E2 D0 86 4F-20 00 30 22  ╛ú ╛Nq«↑Γ╨åO  0\u0026quot;      ................\n0F0:  0A 71 DE 1F-40 B4 F4 8F-9C 50 5C 78-DD CD 72 89  ◙q▐▼@┤⌠Å£P\\x▌═rë\n                                                                           /\n100:  BA D1 BF F9-96 80 E3 06-96 F3 B9 7C-77 2D EB 25  ║╤┐∙ûÇπ♠û≤╣|w-δ%\n110:  1E 56 70 D7-14 1F 55 4D-EC 11 58 59-92 45 E1 33  ▲Vp╫¶▼UM∞◄XYÆEß3\n120:  3E 0E A1 6E-FF D9 90 AD-F6 A0 AD 0E-CA D6 88 12  \u0026gt;♫ín ┘É¡÷á¡♫╩╓ê↕\n130:  B8 74 F2 9E-DD 53 F7 88-19 73 85 39-AA 9B E0 8D  ╕t≥₧▌S≈ê↓sà9¬¢αì\n\n140:  82 BF 9C 5E-58 42 1E 3B-94 CF 5B 54-73 5F A8 4A  é┐£^XB▲;ö╧[Ts_¿J\n150:  FD 5B 64 CF-59 D1 96 74-14 B3 0C AF-11 1C F9 47  ²[d╧Y╤ût¶│♀»◄∟∙G\n160:  C5 7A 2C F7-D5 24 F5 EB-BE 54 3E 12-70 24 67 3F  ┼z,≈╒$⌡δ╛T\u0026gt;↕p$g?\n170:  01 DD 95 76-8D 0D 58 FB-50 23 70 3A-BD ED BE AC  ☺▌òvì♪X√P#p:╜φ╛¼\n\n180:  B8 32 DB AE-E8 DC 3A 83-7A C8 D5 0F-08 90 1D 99  ╕2█«Φ▄:âz╚╒☼◘É↔Ö\n190:  2D 7D 17 34-4E A8 21 98-61 1A 65 DA-FC 9B A4 BA  -}↨4N¿!ÿa→e┌ⁿ¢ñ║\n1A0:  E1 42 2B 86-0C 94 2A F6-D6 A4 81 B5-2B 2B E9 37  ßB+å♀ö*÷╓ñü╡++Θ7\n1B0:  44 D2 E4 23-14 7C 
16 B8-84 90 8B E0-A1 A7 BD 27  D╥Σ#¶|▬╕äÉïαíº╜\u0026#39;\n\n1C0:  C7 7E E6 17-1A 93 C5 EE-59 70 91 26-4E 9D C7 7C  ╟~µ↨→ô┼εYpæ\u0026amp;N¥╟|\n1D0:  1D 3D AB F1-B4 F4 F1 D9-86 48 75 77-6E FE 98 84  ↔=½±┤⌠±┘åHuwn■ÿä\n1E0:  EF 3C 1C C7-16 5A 1F 83-60 EC 5C FE-CA 17 0C 54  ∩\u0026lt;∟╟▬Z▼â`∞\\■╩↨♀T\n1F0:  EB 8E 9D F6-90 A3 CD 08-65 D5 5A 4C-2E C6 BE 54  δÄ¥÷Éú═◘e╒ZL.╞╛T\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/cpc.html\"\u003elog\u003c/a\u003e of the whole operation.\u003c/p\u003e\n\u003ch3 id=\"shambles-sha-1\"\u003e\u003ca href=\"https://sha-mbles.github.io/\"\u003eShambles\u003c/a\u003e (SHA-1)\u003c/h3\u003e\n\u003cp\u003eShambles is a very expensive chosen-prefix collision that uses 9 blocks.\u003c/p\u003e\n\u003cp\u003eEach block has the same xor pattern as Shattered:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e0C 00 00 02 C0 00 00 10 B4 00 00 1C 3C 00 00 04\nBC 00 00 1A 20 00 00 10 24 00 00 1C EC 00 00 14\n0C 00 00 02 C0 00 00 10 B4 00 00 1C 2C 00 00 04\nBC 00 00 18 B0 00 00 10 00 00 00 0C B8 00 00 10\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eBut even if Shattered is much easier to exploit than FastColl, the constraints of the differences in the collision blocks are irrelevant since Shambles is a Chosen Prefix Collision.\u003c/p\u003e\n\u003ch2 id=\"attacks-summary\"\u003eAttacks summary\u003c/h2\u003e\n\u003ctable\u003e\n\u003cthead\u003e\n\u003ctr class=\"header\"\u003e\n\u003cth\u003eHash\u003c/th\u003e\n\u003cth\u003eName\u003c/th\u003e\n\u003cth\u003eDate\u003c/th\u003e\n\u003cth\u003eDuration\u003c/th\u003e\n\u003cth\u003ePrefix type\u003c/th\u003e\n\u003cth\u003eControl near diff\u003c/th\u003e\n\u003c/tr\u003e\n\u003c/thead\u003e\n\u003ctbody\u003e\n\u003ctr 
class=\"odd\"\u003e\n\u003ctd\u003eMD5\u003c/td\u003e\n\u003ctd\u003eFastColl\u003c/td\u003e\n\u003ctd\u003e2009\u003c/td\u003e\n\u003ctd\u003e2s\u003c/td\u003e\n\u003ctd\u003eIdentical\u003c/td\u003e\n\u003ctd\u003enone\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003eUniColl\u003c/td\u003e\n\u003ctd\u003e2012\u003c/td\u003e\n\u003ctd\u003e7-40min\u003c/td\u003e\n\u003ctd\u003eIdentical\u003c/td\u003e\n\u003ctd\u003e4-10 bytes\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003eHashClash\u003c/td\u003e\n\u003ctd\u003e2009\u003c/td\u003e\n\u003ctd\u003e72h\u003c/td\u003e\n\u003ctd\u003eChosen\u003c/td\u003e\n\u003ctd\u003en/a\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eSHA1\u003c/td\u003e\n\u003ctd\u003eShattered\u003c/td\u003e\n\u003ctd\u003e2013\u003c/td\u003e\n\u003ctd\u003e6500yr\u003c/td\u003e\n\u003ctd\u003eIdentical\u003c/td\u003e\n\u003ctd\u003eprefix \u0026amp; suffix\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003eShambles\u003c/td\u003e\n\u003ctd\u003e2020\u003c/td\u003e\n\u003ctd\u003e?\u003c/td\u003e\n\u003ctd\u003eChosen\u003c/td\u003e\n\u003ctd\u003en/a\u003c/td\u003e\n\u003c/tr\u003e\n\u003c/tbody\u003e\n\u003c/table\u003e\n\u003ch1 id=\"exploitations\"\u003eExploitations\u003c/h1\u003e\n\u003cp\u003eIdentical prefix collisions is usually seen as (very) limited, but chosen-prefix is time consuming.\u003c/p\u003e\n\u003cp\u003eAnother approach is to craft re-usable prefixes via either identical-prefix attack such as UniColl - or chosen-prefix to overcome some limitations - but re-use that prefix pair in 
combination with two payloads like a classic identical prefix attack.\u003c/p\u003e\n\u003cp\u003eOnce the prefix pair has been computed, it makes colliding two contents instant: it's just a matter of massaging file data (according to specific file formats) so that it fits the file format specifications and the precomputed prefix requirements.\u003c/p\u003e\n\u003ch2 id=\"standard-strategy\"\u003eStandard strategy\u003c/h2\u003e\n\u003cp\u003eClassic collisions of two valid files with the same file type.\u003c/p\u003e\n\u003ch3 id=\"jpg\"\u003eJPG\u003c/h3\u003e\n\u003cimg alt='a JPG file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/JPG.png width=500/\u003e\n\n\u003cp\u003eTheoretical limitations and workarounds:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003ethe \u003cem\u003eApplication\u003c/em\u003e segment should in theory be right after the \u003cem\u003eStart of Image\u003c/em\u003e marker. In practice, this is not necessary, so our collision can be generic: the only limitation is the size of the smallest image.\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003ea comment's length is stored on two bytes, so the amount it can store is limited to 65536 bytes (roughly the size of a 400x400 photo)\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003erather than jumping over a complete JPG file, one can split that file in its segments, and add jump trampolines between segments\u003c/p\u003e\n\u003cimg alt='comments over each image segments' src=pics/jpgcom1.png width=500/\u003e\n\n\u003cp\u003e\u003cem\u003ecomments over each image segments\u003c/em\u003e\u003c/p\u003e\n\u003cimg alt='how comments trampoline work' src=pics/jpgcom2.png width=500/\u003e\n\n\u003cp\u003e\u003cem\u003ehow comments trampoline work\u003c/em\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003ewhile most of a JPG structure is made of segments that are all limited to 65536 bytes in size, the actual compressed data is 
stored in the \u003cem\u003eEntropy Coded Segment\u003c/em\u003e which doesn't respect its limitations: its size is unknown in advance and grows beyond that limit. It grows with the size of the image, making most of the file size in a baseline (non progressive) image. To make the whole image fit into 64kb chunks, the easy way is to first try to save the image as progressive (which any software can do, and splits the ECS in typically up to six scans). The more advanced way is to use \u003cem\u003eJPEGTran\u003c/em\u003e with its 'wizard' \u003ccode\u003e--scans\u003c/code\u003e command line parameter and define custom scans.\u003c/p\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eThere's no other restriction besides the scans segments, so an MD5 collision of two arbitrary JPGs is \u003cem\u003einstant\u003c/em\u003e, and needs no chosen-prefix collision, just UniColl.\u003c/p\u003e\n\u003cp\u003eWith the \u003ca href=\"scripts/jpg.py\"\u003escript\u003c/a\u003e:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e21:07:35.65\u0026gt;jpg.py Ange.jpg Marc.jpg\n\n21:07:35.75\u0026gt;\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='identical prefix collisions' src=examples/collision1.jpg height=200/\u003e ⟷ \u003cimg alt='identical prefix collisions' src=examples/collision2.jpg height=200/\u003e\u003c/p\u003e\n\u003ch4 id=\"custom-scans\"\u003ecustom scans\u003c/h4\u003e\n\u003cp\u003e\u003cem\u003e2 MD5-colliding JPGs\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eHere's an example of \u003cem\u003eJPEGTran\u003c/em\u003e scans definition to turn \u003ca href=\"pics/pocorgtfo14.png\"\u003ea 1944x2508 RGB image\u003c/a\u003e into a 100% JPG with 20 scans in which they all fit in 64kb.\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e// \u0026lt;component\u0026gt;: \u0026lt;minbyte\u0026gt;-\u0026lt;maxbyte\u0026gt;, \u0026lt;minbit\u0026gt;, \u0026lt;maxbit\u0026gt;;\n\n// 0=luma\n0: 0-0, 0, 0;\n0: 1-1, 0, 0;\n0: 2-6, 0, 
0;\n0: 7-10, 0, 0;\n0: 11-13, 0, 0;\n0: 14-20, 0, 0;\n0: 21-26, 0, 0;\n0: 27-32, 0, 0;\n0: 33-40, 0, 0;\n0: 41-48, 0, 0;\n0: 49-54, 0, 0;\n0: 55-63, 0, 0;\n\n// 1=blueness\n1: 0-0, 0, 0;\n1: 1-16, 0, 0;\n1: 17-32, 0, 0;\n1: 33-63, 0, 0;\n\n// 2=redness\n2: 0-0, 0, 0;\n2: 1-16, 0, 0;\n2: 17-32, 0, 0;\n2: 33-63, 0, 0;\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eResult:\u003c/p\u003e\n\u003cimg alt='a 1944x2508 RGB image as a 100% JPG with 20 scans' src=examples/20scans.jpg width=300/\u003e\n\n\u003cp\u003e\u003cem\u003ea 1944x2508 RGB image as a 100% JPG with 20 scans\u003c/em\u003e\u003c/p\u003e\n\u003ch3 id=\"png\"\u003ePNG\u003c/h3\u003e\n\u003cimg alt='a PNG file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/PNG.png width=500/\u003e\n\n\u003cp\u003eTheoretical limitations and workarounds:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003ePNG uses CRC32 at the end of its chunks, but in practice they're ignored. They can be correct but it's not required.\u003c/li\u003e\n\u003cli\u003ethe image meta data (dimensions, color space...) are stored in the \u003ccode\u003eIHDR\u003c/code\u003e chunk, which should in theory be right after the signature (ie, before any potential comment), so it would mean that we can only precompute collisions of images with the same meta data. However, that chunk can actually be after a comment block (in the vast majority of readers, except Apple ones), so we can put the collision data before the header, which enables to collide any pair of PNG with a single precomputation.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eSince a PNG chunk has a length on four bytes, there's no need to modify the structure of either file: we can jump over a whole image in one go.\u003c/p\u003e\n\u003cp\u003eWe can insert as many discarded chunks as we want, so we can add one for alignment, then one which length will be altered by a UniColl. 
so the length will be \u003ccode\u003e00\u003c/code\u003e \u003ccode\u003e75\u003c/code\u003e and \u003ccode\u003e01\u003c/code\u003e \u003ccode\u003e75\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eSo an MD5 collision of two arbitrary PNG images is \u003cem\u003einstant\u003c/em\u003e, with no prerequisite (no computation, just some minor file changes), and needs no chosen-prefix collision, just UniColl.\u003c/p\u003e\n\u003cp\u003eWith the \u003ca href=\"scripts/png.py\"\u003escript\u003c/a\u003e:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e19:27:04.79\u0026gt;png.py nintendo.png sega.png\n\n19:27:04.87\u0026gt;\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='identical prefix collisions' src=examples/collision1.png width=40% /\u003e ⟷ \u003cimg alt='identical prefix collisions' src=examples/collision2.png width=40% /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003e2 MD5-colliding PNGs with different properties\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/pngGen.svg\"\u003erecording\u003c/a\u003e of the whole operation.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"examples/pngGen.svg\" alt=\"a recording of a universal (abusive) PNG collision\" /\u003e\u003c/p\u003e\n\u003ch4 id=\"incompatibility\"\u003eincompatibility\u003c/h4\u003e\n\u003cp\u003eMost readers accept flawlessly PNG files that start with a chunk that is not \u003ccode\u003eIHDR\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eHowever, some (such as Safari and Preview - any other?) don't tolerate it. In this case, the image header and its properties (dimensions, color space) must be first, before any collision blocks.\u003c/p\u003e\n\u003cp\u003eIn this case, both colliding files must have the same properties. 
Again, UniColl is enough, and of course the computed prefix pair can be reused for any other pair of files with the same properties.\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"scripts/pngStd.py\"\u003escript\u003c/a\u003e to collide any pair of such files that launches UniColl if needed to compute the prefix pair.\u003c/p\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='identical prefix collisions' src=examples/0a959025-1.png width=350/\u003e ⟷ \u003cimg alt='identical prefix collisions' src=examples/0a959025-2.png width=350/\u003e\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='identical prefix collisions' src=examples/aac2423a-1.png width=350/\u003e ⟷ \u003cimg alt='identical prefix collisions' src=examples/aac2423a-2.png width=350/\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003e2 pairs of MD5-colliding PNGs with identical properties for maximum compatibility\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/pngUniColl.svg\"\u003erecording\u003c/a\u003e of the whole operation when UniColl is invoked,\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"examples/pngUniColl.svg\" alt=\"a recording of PNG UniColl collision\" /\u003e\u003c/p\u003e\n\u003cp\u003eand \u003ca href=\"examples/pngSpec.svg\"\u003eanother one\u003c/a\u003e when the prefix has already been computed.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"examples/pngSpec.svg\" alt=\"a recording of precomputed PNG collision\" /\u003e\u003c/p\u003e\n\u003ch3 id=\"gif\"\u003eGIF\u003c/h3\u003e\n\u003cimg alt='a GIF file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/GIF.png width=500/\u003e\n\n\u003cp\u003eGIF is tricky:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eit stores its meta data in the header before any comment is possible, so there can't be a generic prefix for all GIF files.\u003c/li\u003e\n\u003cli\u003eif the file has a global palette, it is also stored before a comment is possible.\u003c/li\u003e\n\u003cli\u003eits comment 
chunks are limited to a single byte in length, so a maximum of 256 bytes!\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eHowever, the comment chunks follow a peculiar structure: it's a chain of \u003ccode\u003e\u0026lt;length:1\u0026gt;\u003c/code\u003e \u003ccode\u003e\u0026lt;data:length\u0026gt;\u003c/code\u003e until a null length is defined. So it makes any non-null byte a valid 'jump forward'. Which makes it suitable to be used with FastColl, as shown in \u003ca href=\"https://github.com/angea/pocorgtfo#0x14\"\u003ePoC||GTFO 14:11\u003c/a\u003e.\u003c/p\u003e\n\u003cp\u003eSo at least, even if we can't have a generic prefix, we can collide any pair of GIF of same metadata (dimensions, palette) and we only need a second of FastColl to compute its prefix.\u003c/p\u003e\n\u003cp\u003eNow the problem is that we can't jump over a whole image like PNG or over a big structure like JPG.\u003c/p\u003e\n\u003cp\u003eA possible workaround is to massage the compressed data or to chunk the image in tiny areas like in the case of the GIF hashquine, but this is not optimal.\u003c/p\u003e\n\u003cp\u003eAnother idea that works generically is that the image data is also stored using this \u003ccode\u003elength data\u003c/code\u003e sequence structure: so if we take two GIFs with no animation, we only have to:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003enormalize the palette\u003c/li\u003e\n\u003cli\u003eset the first frame duration to the maximum\u003c/li\u003e\n\u003cli\u003ecraft a comment that will jump to the start of the first frame data, so that the comment will sled over the image data as a comment, and end the same way: until a null length is encountered. Then the parser will meet the next frame, and display it.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eWith a minor setup (only a few hundred bytes of overhead), we can sled over any GIF image and work around the 256 bytes limitation. 
This idea was suggested by Marc, and it's brilliant!\u003c/p\u003e\n\u003cp\u003eSo in the end, the current GIF limitations for \u003cem\u003einstant\u003c/em\u003e MD5 collisions are:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eno animation\u003c/li\u003e\n\u003cli\u003ethe images have to be normalized to the same palette - see \u003ca href=\"https://www.lcdf.org/gifsicle/\"\u003e\u003ccode\u003egifsicle --use-colormap web\u003c/code\u003e\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003ethe images have to be the same dimensions\u003c/li\u003e\n\u003cli\u003eafter 11 minutes, both files will show the same image\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eAn easy shortcut to normalize still GIF images is to make them animation frames of the same image, then we can use a \u003ca href=\"scripts/gif.py\"\u003escript\u003c/a\u003e to re-use or compute FastColl blocks to make a file pair that shows each of them.\u003c/p\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='identical prefix collisions' src=examples/collision1.gif width=350/\u003e ⟷ \u003cimg alt='identical prefix collisions' src=examples/collision2.gif width=350/\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003e2 MD5-colliding GIFs - pics by \u003ca href=\"https://www.kidmograph.com/\"\u003eKidMoGraph\u003c/a\u003e\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"examples/gifFastColl.svg\"\u003erecording\u003c/a\u003e of the whole operation.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"examples/gifFastColl.svg\" alt=\"a recording of a GIF FastColl collision\" /\u003e\u003c/p\u003e\n\u003ch3 id=\"gzip\"\u003eGZIP\u003c/h3\u003e\n\u003cimg alt='a GZIP file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/GZip.png width=500/\u003e\n\n\u003cp\u003eGZIP specs v4.3: \u003ca href=\"https://datatracker.ietf.org/doc/html/rfc1952\"\u003eRFC 1952\u003c/a\u003e (1996).\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003ea Gzip file is made of one or more 'members' (gzip 
streams) concatenated. They will be all decompressed and their uncompressed content appended to each other - even if the member's uncompressed content is empty.\u003c/li\u003e\n\u003cli\u003ethese members can be separated with zeroes. Zeroes will be just skipped, except at file start. Any non-null byte will be checked for the signature \u003ccode\u003e1F 8B\u003c/code\u003e. If not matching the signature, the parsing will stop, which can be used to forcibly stop parsing between two payloads, but will trigger some warnings that might cause problems. Another strategy is to add one extra empty member at the end of the file, and make parsing of both payloads finish there - on the member or on its body.\u003c/li\u003e\n\u003cli\u003eThe optional \u003ccode\u003efilename\u003c/code\u003e and \u003ccode\u003efile comment\u003c/code\u003e are null-terminated whereas the \u003ccode\u003eExtra field\u003c/code\u003e is size16-defined, therefore abusable. It's made of one or more subfield(s), with an ID and its own sublength, but subfields are not enforced - very few are officially defined.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eTherefore an empty gzip member with an extra field is a perfect parasite host.\u003c/p\u003e\n\u003cp\u003eIf the top file is too big to fit in an extra field, then its uncompressed stream can be split in smaller files until they all fit in extra fields.\u003c/p\u003e\n\u003cp\u003eAfter the header of a member come its compressed body, its CRC32 and its uncompressed size (not enforced). Therefore an empty data body with its null CRC32 and size make a generic postwrap, which can even be shared by different member headers.\u003c/p\u003e\n\u003cp\u003eVarious implementations rely on the uncompressed size of the last member instead of the sum of all members. 
So our collided files will show that they are null-sized, because these files finish with an empty member used as trampoline.\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"scripts/gz.py\"\u003escript\u003c/a\u003e to generate instant MD5 collisions of two GZip files. It's taking most of its time to decompress and recompress data if the input files are big - the collision prefixes are pre-computed. Splitting members without decompressing is not possible as the uncompressed CRC32 needs to be calculated.\u003c/p\u003e\n\u003cp\u003eA \u003ccode\u003e.tar.gz\u003c/code\u003e is just the \u003ccode\u003egzip\u003c/code\u003e archive of a \u003ccode\u003etar\u003c/code\u003e archive. It will work fine with gzipped tar, unlike \u003ccode\u003etar\u003c/code\u003e itself.\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/collision1.tar.gz\"\u003ecollision1.tar.gz\u003c/a\u003e (Pacome) ⟷ \u003ca href=\"examples/collision2.tar.gz\"\u003ecollision2.tar.gz\u003c/a\u003e (Reg)\u003c/p\u003e\n\u003ch3 id=\"lz4--zstandard\"\u003eLZ4 / Zstandard\u003c/h3\u003e\n\u003cimg alt='a Zstandard file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/zstd_skip.png width=500/\u003e\n\u003cimg alt='an LZ4 file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/lz4.png width=500/\u003e\n\n\u003cp\u003eLZ4 and Zstandard are 2 different compression formats, with a similar overall structure: they're made of frames, each starting with a specific magic: \u003ccode\u003e0xFD2FB528\u003c/code\u003e for Zstandard frames, \u003ccode\u003e0x184D2204\u003c/code\u003e for Lz4 frames.\u003c/p\u003e\n\u003cp\u003eThey also \u003cstrong\u003eshare\u003c/strong\u003e the same 'skippable' TLV frames, starting with 4-byte \u003cem\u003emagics\u003c/em\u003e in the range \u003ccode\u003e0x184D2A50\u003c/code\u003e - \u003ccode\u003e0x184D2A5F\u003c/code\u003e, then the \u003cem\u003eLength\u003c/em\u003e of the user data (4 bytes, little-endian), then the 
\u003cem\u003eUser Data\u003c/em\u003e itself. These frames are entirely optional, of any length, and repeatable. The files can start with these frames. So these frames can be chained to make a perfect generic collision prefix, across 2 formats.\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"scripts/zstd-lz4.py\"\u003escript\u003c/a\u003e to generate instant MD5 collisions of two Zstd/Lz4 files. Like Gzip, 2 different archives will be visible from the outside no matter the content: for example, a \u003ccode\u003e.cpio.zst\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"examples/free/md5-1.lz4\"\u003emd5-1.lz4\u003c/a\u003e ⟷ \u003ca href=\"examples/free/md5-2.lz4\"\u003emd5-2.lz4\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"examples/free/md5-1.zstd\"\u003emd5-1.zstd\u003c/a\u003e ⟷ \u003ca href=\"examples/free/md5-2.zstd\"\u003emd5-2.zstd\u003c/a\u003e\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"examples/free/md5-c6a611ce.zstd\"\u003emd5-c6a611ce.zstd\u003c/a\u003e ⟷ \u003ca href=\"examples/free/md5-c6a611ce.lz4\"\u003emd5-c6a611ce.lz4\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch3 id=\"portable-executable\"\u003ePortable Executable\u003c/h3\u003e\n\u003cimg alt='a PE file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/PE.png width=600/\u003e\n\n\u003cp\u003eThe Portable Executable has a peculiar structure:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003ethe old DOS header is almost useless, and points to the next structure, the PE header. The DOS header has no other role. 
DOS headers can be exchanged between executables.\u003c/li\u003e\n\u003cli\u003ethe DOS header has to be at offset 0, and has a fixed length of a full block, and the pointer is at the end of the structure, beyond UniColl's reach: so only chosen-prefix collision is useful to collide PE files this way.\u003c/li\u003e\n\u003cli\u003eThe PE header and what follows defines the whole file.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eSo the strategy is:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003ethe PE header can be moved down to leave room for collision blocks after the DOS header.\u003c/li\u003e\n\u003cli\u003eThe DOS header can be exploited (via chosen-prefix collisions) to point to two different offsets, where two different PE headers will be moved.\u003c/li\u003e\n\u003cli\u003eThe sections can be put next to each other, after the \u003ccode\u003eDOS/Collisions/Header1/Header2\u003c/code\u003e structure. You just need to apply a delta to the offsets of the two section tables.\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eThis means that it's possible to instantly collide any pair of PE executables. 
Even if they use different subsystems or architecture.\u003c/p\u003e\n\u003cp\u003eWhile colliding executables is usually trivial via any loader, this kind of exploitation here is transparent: the code is identical and loaded at the same address.\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/collision1.exe\"\u003etweakPNG.exe\u003c/a\u003e (GUI) ⟷ \u003ca href=\"examples/collision2.exe\"\u003efastcoll.exe\u003c/a\u003e (CLI)\u003c/p\u003e\n\u003cp\u003eHere is a \u003ca href=\"scripts/pe.py\"\u003escript\u003c/a\u003e to generate instant MD5 collisions of Windows Executables.\u003c/p\u003e\n\u003cimg alt='collision of fastcoll.exe (CLI) and tweakPNG(GUI)' src=pics/pe.png width=500/\u003e\n\n\u003ch3 id=\"mp4-and-others\"\u003eMP4 and others\u003c/h3\u003e\n\u003cp\u003eThis format's container is a sequence of \u003ccode\u003eLength Type Value\u003c/code\u003e chunks called Atoms. The length is a 32-bit big-endian integer and covers itself, the type and the value, so the minimum normal length is 8 (the type is a 4 ASCII characters string).\u003c/p\u003e\n\u003cp\u003eIf the length is null, then the atom takes the rest of the file - such as \u003ccode\u003ejp2c\u003c/code\u003e atoms in JP2 files. If it's 1, then the Type is followed by a 64bit length, changing the atom to \u003ccode\u003eType Length Value\u003c/code\u003e, making it compatible with other collisions like Shattered.\u003c/p\u003e\n\u003cp\u003eSome atoms contain other atoms: in this case, they're called boxes. 
That's why this otherwise unnamed structure is called \u0026quot;atom/box\u0026quot;.\u003c/p\u003e\n\u003cp\u003eThis \u0026quot;atom/box\u0026quot; format used in MP4 is actually a derivative of Apple Quicktime, and is used by \u003ca href=\"http://www.ftyps.com/\"\u003emany other formats\u003c/a\u003e (JP2, HEIF, F4V).\u003c/p\u003e\n\u003cp\u003eThe first atom type is \u003cem\u003eusually\u003c/em\u003e \u003ccode\u003eftyp\u003c/code\u003e, which makes it possible to differentiate the actual file format.\u003c/p\u003e\n\u003cp\u003eThe format is quite permissive: just chain \u003ccode\u003efree\u003c/code\u003e atoms, abuse one's length with UniColl, then jump over the first payload.\u003c/p\u003e\n\u003cp\u003eFor MP4 files, the only thing to add is to adjust the \u003ccode\u003estco\u003c/code\u003e (Sample Table - Chunk Offsets) or \u003ccode\u003eco64\u003c/code\u003e (the 64 bit equivalent) tables, since they are absolute(!) offsets pointing to the \u003ccode\u003emdat\u003c/code\u003e movie data - and they are actually enforced!\u003c/p\u003e\n\u003cp\u003eThis gives a \u003ca href=\"scripts/mp4.py\"\u003escript\u003c/a\u003e that instantly collides any arbitrary video - and as mentioned, it may work on formats other than MP4.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/mp4.png\" alt=\"Nirvana - Smells like Teen Spirit / Weird Al Yankovic - Smells like Nirvana\" /\u003e\u003c/p\u003e\n\u003cp\u003eExamples (videos by \u003ca href=\"https://www.kidmograph.com/\"\u003eKidMoGraph\u003c/a\u003e):\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003e32b lengths (standard) \u003ca href=\"examples/collision1.mp4\"\u003ecollision1.mp4\u003c/a\u003e ⟷ \u003ca href=\"examples/collision2.mp4\"\u003ecollision2.mp4\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003cvideo width=300 controls\u003e \u003csource src=\"examples/collision1.mp4\" type=\"video/mp4\"\u003e🏭\u003c/video\u003e ⟷ \u003cvideo width=300 controls\u003e \u003csource src=\"examples/collision2.mp4\" 
type=\"video/mp4\"\u003e🛣️\u003c/video\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e64b lengths \u003ca href=\"examples/collisionl1.mp4\"\u003ecollisionl1.mp4\u003c/a\u003e ⟷ \u003ca href=\"examples/collisionl2.mp4\"\u003ecollisionl2.mp4\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003cvideo width=300 controls\u003e \u003csource src=\"examples/collisionl1.mp4\" type=\"video/mp4\"\u003e☀️\u003c/video\u003e ⟷ \u003cvideo width=300 controls\u003e \u003csource src=\"examples/collisionl2.mp4\" type=\"video/mp4\"\u003e🌙\u003c/video\u003e\u003c/p\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003e\u003cvideo\u003e\u003cimg src=\"pics/mp4-pocs.png\" alt=\"how it should look (but your markdown doesn\u0026#39;t render video tags)\" /\u003e\u003c/video\u003e\u003c/p\u003e\n\u003cp\u003eNote that some viewers (OS X, Safari, FireFox) don't allow a file that starts with an Atom that is not \u003ccode\u003eftyp\u003c/code\u003e. In this case, the prefix have to cover this, and it's not so generic, but besides it's the same strategy - only limited to a single file type.\u003c/p\u003e\n\u003ch4 id=\"jpeg2000\"\u003eJPEG2000\u003c/h4\u003e\n\u003cp\u003eJPEG2000 files usually start with the Atom/Box structure like MP4, then the last atom \u003ccode\u003ejp2c\u003c/code\u003e is typically until the end of the file (null length), then from this point on it follows the JFIF structure, like JPEG (starting with \u003ccode\u003eFF 4F\u003c/code\u003e as a segment marker).\u003c/p\u003e\n\u003cp\u003eThe pure-JFIF form is also tolerated, in which case collision is like JPEG: Shattered-compatible, but with comments limited to 64Kb.\u003c/p\u003e\n\u003cp\u003eOn the other hand, if you manipulate JPEG2000 files with the Atom/Box, you don't have this limitation.\u003c/p\u003e\n\u003cp\u003eAs mentioned before, if you're trying to collide this structure and if there are more restriction - for example starting with a \u003ccode\u003efree\u003c/code\u003e atom is not tolerated 
by some format - then you can compute another UniColl prefix pairs specific to this format: JPEG2000 seems to \u003ca href=\"https://github.com/uclouvain/openjpeg/blob/d2205ba2ee78faeea659263383446c4472b1f9df/src/bin/wx/OPJViewer/source/imagjpeg2000.cpp#L100-L111\"\u003eenforce\u003c/a\u003e a \u003ccode\u003e'jP '\u003c/code\u003e atom first before the usual \u003ccode\u003eftyp\u003c/code\u003e, but besides, that's the only restriction: there's no need to relocate anything.\u003c/p\u003e\n\u003cp\u003eSo the resulting \u003ca href=\"scripts/jp2.py\"\u003escript\u003c/a\u003e is even simpler!\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/jp2.png\" alt=\"Oded Goldreich / Neal Koblitz\" /\u003e\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/collision1.jp2\"\u003ecollision1.jp2\u003c/a\u003e ⟷ \u003ca href=\"examples/collision2.jp2\"\u003ecollision2.jp2\u003c/a\u003e\u003c/p\u003e\n\u003ch3 id=\"pdf\"\u003ePDF\u003c/h3\u003e\n\u003cimg alt='a PDF file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/PDF.png width=300/\u003e\n\n\u003cp\u003e\u003cstrong\u003eabout Shattered\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eShattered exploitation was not a PDF trick, but a JPG trick in a PDF.\u003c/p\u003e\n\u003cp\u003eIt only enabled a PDF to contain a JPG-compressed object that could have two different contents. 
Both PDFs needed to be totally identical otherwise.\u003c/p\u003e\n\u003cp\u003eNote that the documents can be totally normal, and can just clip the collision JPG and display it in different places, such as multi-page documents.\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/shattered1.pdf\"\u003ethe Shattered paper, modified\u003c/a\u003e ⟷ \u003ca href=\"examples/shattered2.pdf\"\u003ethe Shattered paper, original\u003c/a\u003e\u003c/p\u003e\n\u003cimg alt='the Shattered paper using a colliding JPG in the authors' src=pics/shattereddoc1.png width=350/\u003e\n\u003cimg alt='the Shattered paper using a colliding JPG in a figure' src=pics/shattereddoc2.png width=350/\u003e\n\n\u003cp\u003e\u003cem\u003ethe Shattered paper using a colliding JPG in two places\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003ePDF collisions with MD5\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eWith MD5 (and other collision patterns), we can do PDF collisions at document level, with no restrictions at all on either file!\u003c/p\u003e\n\u003cp\u003ePDF has a very different structure from other file formats. It uses object numbers and references to define a tree. 
The whole document depends on the Root element.\u003c/p\u003e\n\u003c!--\ndigraph {\n rankdir=LR;\n root -\u003e \"catalog#1\"\n \"catalog#1\" -\u003e \"pages#2\"\n \"pages#2\" -\u003e \"page#3\"\n \"page#3\" -\u003e \"pages#2\"\n \"page#3\" -\u003e \"content#4\"\n \"content#4\" -\u003e \"Hello World!\"\n}\n--\u003e\n\n\u003cp\u003e\u003cimg src=\"pics/pdf.svg\" /\u003e\u003c/p\u003e\n\u003cp\u003eThis (valid) PDF\u003c/p\u003e\n\u003cpre class=\"text\"\u003e\u003ccode\u003e%PDF-1.\n1 0 obj\u0026lt;\u0026lt;/Pages 2 0 R\u0026gt;\u0026gt;endobj\n2 0 obj\u0026lt;\u0026lt;/Kids[3 0 R]/Count 1\u0026gt;\u0026gt;endobj\n3 0 obj\u0026lt;\u0026lt;/Parent 2 0 R\u0026gt;\u0026gt;endobj\ntrailer \u0026lt;\u0026lt;/Root 1 0 R\u0026gt;\u0026gt;\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eis equivalent to:\u003c/p\u003e\n\u003cpre class=\"text\"\u003e\u003ccode\u003e%PDF-1.\n11 0 obj\u0026lt;\u0026lt;/Pages 12 0 R\u0026gt;\u0026gt;endobj\n12 0 obj\u0026lt;\u0026lt;/Kids[13 0 R]/Count 1\u0026gt;\u0026gt;endobj\n13 0 obj\u0026lt;\u0026lt;/Parent 12 0 R\u0026gt;\u0026gt;endobj\ntrailer \u0026lt;\u0026lt;/Root 11 0 R\u0026gt;\u0026gt;\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eTricks:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eStoring unused objects in a PDF is tolerated.\u003c/li\u003e\n\u003cli\u003eSkipping any object numbers is also OK. There's even an official way to skip numbers in the \u003ccode\u003eXREF\u003c/code\u003e table.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eSo storing two document trees in the same file is OK. 
We just need to make the root reference point to the root object of either document.\u003c/p\u003e\n\u003cp\u003eSo we just need to take two documents, renumber objects and references so that there is no overlap, craft a collision so that the element number referenced as Root object can be changed while keeping the same hash value, which is a perfect fit for UniColl with \u003ccode\u003eN=1\u003c/code\u003e, and adjust the \u003ccode\u003eXREF\u003c/code\u003e table accordingly.\u003c/p\u003e\n\u003c!--\ndigraph {\n rankdir=LR;\n \"trailer\" -\u003e \"catalog#1\" [color=green]\n \"catalog#1\" -\u003e \"pages#2\"\n \"pages#2\" -\u003e \"page#3\"\n \"page#3\" -\u003e \"pages#2\"\n \"page#3\" -\u003e \"content#4\"\n \"content#4\" -\u003e \"Hello World!\"\n trailer -\u003e \"catalog#11\" [color=red, style=dashed]\n \"catalog#11\" -\u003e \"pages#12\"\n \"pages#12\" -\u003e \"page#13\"\n \"page#13\" -\u003e \"pages#12\"\n \"page#13\" -\u003e \"content#14\"\n \"content#14\" -\u003e \"Bye World!\";\n}\n--\u003e\n\n\u003cp\u003e\u003cimg src=\"pics/pdfcollision.svg\" /\u003e\u003c/p\u003e\n\u003cp\u003eThis way, we can safely collide any pair of PDFs, no matter the page numbers, dimensions, images...\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003ecomments\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003ePDF can store foreign data in two ways:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eas a line comment, in which the only forbidden characters are newline (\u003ccode\u003e\\r\u003c/code\u003e and \u003ccode\u003e\\n\u003c/code\u003e). This can be used inside a dictionary object, to modify for example an object reference, via UniColl. 
So this is a valid PDF object even if it contains binary collision blocks - just retry until you have no newline characters:\n\u003cpre\u003e\u003ccode\u003e1 0 obj\n\u0026lt;\u0026lt; /Type /Catalog /MD5_is /REALLY_dead_now__ /Pages 2 0 R\n%¥┬•σe╕█╙X₧_~π▌╒εX∟■φe♦%τ8╞■[...]p╛╬ûFZ»‼v◘Åp↑╝%▓% ▼σφj╔◄dZ▀c²aU≤╨╩[├└─yNΓ5╔+▀╪yδ☻ß⌐░¼à(☺z₧\n\u0026gt;\u0026gt;\nendobj\n\u003c/code\u003e\u003c/pre\u003e\u003c/li\u003e\n\u003cli\u003eas a stream object, in which case any data is possible, but since we're inside an object, we can't alter the whole PDF structure, so it requires a chosen-prefix collision to modify the structure outside the containing stream object.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003e\u003cstrong\u003ecolliding text\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eThe first case makes it possible to highlight the beauty of UniColl, a collision where differences are predictable, so you can write poetry over colliding data - thanks \u003ca href=\"https://github.com/Jurph/word-decrementer\"\u003eJurph\u003c/a\u003e!\u003c/p\u003e\n\u003cp\u003eRather than modifying the structure of the document and fool parsers, we'll just use collision blocks directly to produce directly text, with alternate reading!\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e           V                      V\n  Now he hash MD5,       Now he hath MD5,\n  No enemy cares!        No enemy dares!\n   Only he gave           Only he have\n   the shards.            the shares.\n  Can’t be owned \u0026amp;       Can’t be pwned \u0026amp;\n  his true gold,         his true hold,\n  like One Frail,        like One Grail,\n  sound as fold.         
sound as gold.\n           ^                      ^\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/poeMD5_A.pdf\"\u003epoeMD5 A\u003c/a\u003e ⟷ \u003ca href=\"examples/poeMD5_B.pdf\"\u003epoeMD5 B\u003c/a\u003e\u003c/p\u003e\n\u003cimg alt='2 Poems colliding via UniColl' src=pics/poeMD5.png width=500/\u003e\n\n\u003cp\u003e\u003cem\u003eA true cryptographic artistic creation :)\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003e(Note I screwed up with Adobe compatibility, but that's my fault, not UniColl's)\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003ecolliding document structure\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eWhether you use UniColl as inline comment or chosen-prefix in a dummy stream object, the strategy is similar: shuffle object numbers around, then make Root object point to different objects, so unlike Shattered, this means instant collision of any arbitrary pair of PDFs, at document level.\u003c/p\u003e\n\u003cp\u003eA useful trick is that \u003ca href=\"https://mupdf.com/docs/manual-mutool-clean.html\"\u003e\u003ccode\u003emutool clean\u003c/code\u003e\u003c/a\u003e output is reliably predictable, so it can be used to normalize PDFs as input, and fix your merged PDF while keeping the important parts of the file unmodified. MuTool doesn't discard bogus key/values - unless asked, and keeps them in the same order, so using fake dictionary entries such as \u003ccode\u003e/MD5_is /REALLY_dead_now__\u003c/code\u003e is perfect to align things predictably without needing another kind of comments. 
However it won't keep comments in dictionaries (so no inline-comment trick)\u003c/p\u003e\n\u003cp\u003eAn easy way to do the object-shuffling operation without hassle is just to merge both PDF files via \u003ccode\u003emutool merge\u003c/code\u003e then split the \u003ccode\u003e/Pages\u003c/code\u003e object in two.\u003c/p\u003e\n\u003cp\u003eTo make room for this object, just merge in front of the two documents a dummy PDF.\u003c/p\u003e\n\u003cp\u003eOptionally, create a fake reference to the dangling array to prevent garbage collection from deleting the second set of pages.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eExample\u003c/strong\u003e: with this \u003ca href=\"scripts/pdf.py\"\u003escript\u003c/a\u003e, it takes \u003ca href=\"examples/pdf.log\"\u003eless than a second\u003c/a\u003e to collide the two public PDF papers like Spectre and Meltdown:\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/collision1.pdf\"\u003espectre.pdf\u003c/a\u003e ⟷ \u003ca href=\"examples/collision2.pdf\"\u003emeltdown.pdf\u003c/a\u003e\u003c/p\u003e\n\u003cimg alt='identical prefix PDF collisions' src=pics/pdf.png width=600/\u003e\n\n\u003cp\u003ePossible extension: chain UniColl blocks to also keep pairs of the various \u003ca href=\"https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf#page=81\"\u003enon-critical objects\u003c/a\u003e that can be referenced in the Root object - such as \u003ccode\u003eOutlines\u003c/code\u003e, \u003ccode\u003eNames\u003c/code\u003e, \u003ccode\u003eAcroForm\u003c/code\u003e and Additional Actions (\u003ccode\u003eAA\u003c/code\u003e) - in the original source files.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003ein PDFLaTeX\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eThe previous techniques work with just a pair of PDF files, but it's also possible to do it directly from TeX sources via \u003ca href=\"http://texdoc.net/texmf-dist/doc/pdftex/manual/pdftex-a.pdf\"\u003especific PDFTeX 
operators\u003c/a\u003e.\u003c/p\u003e\n\u003cp\u003eYou can define objects directly - including dummy key and values for alignments - and define empty objects to reserve some object slots by including this at the very start of your TeX sources:\u003c/p\u003e\n\u003cdiv class=\"sourceCode\" id=\"cb18\"\u003e\u003cpre class=\"sourceCode latex\"\u003e\u003ccode class=\"sourceCode latex\"\u003e\u003ca class=\"sourceLine\" id=\"cb18-1\" data-line-number=\"1\"\u003e\u003cspan class=\"co\"\u003e% set PDF version low to prevent stream XREF\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-2\" data-line-number=\"2\"\u003e\u003cspan class=\"fu\"\u003e\\pdfminorversion\u003c/span\u003e=3\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-3\" data-line-number=\"3\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-4\" data-line-number=\"4\"\u003e\u003cspan class=\"fu\"\u003e\\begingroup\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-5\" data-line-number=\"5\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-6\" data-line-number=\"6\"\u003e  \u003cspan class=\"co\"\u003e% disable compression to keep alignments\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-7\" data-line-number=\"7\"\u003e  \u003cspan class=\"fu\"\u003e\\pdfcompresslevel\u003c/span\u003e=0\u003cspan class=\"fu\"\u003e\\relax\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-8\" data-line-number=\"8\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-9\" data-line-number=\"9\"\u003e  \u003cspan class=\"fu\"\u003e\\immediate\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-10\" data-line-number=\"10\"\u003e  \u003cspan class=\"fu\"\u003e\\pdfobj\u003c/span\u003e{\u0026lt;\u0026lt;\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-11\" data-line-number=\"11\"\u003e    /Type /Catalog\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-12\" 
data-line-number=\"12\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-13\" data-line-number=\"13\"\u003e    \u003cspan class=\"co\"\u003e% cool alignment padding\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-14\" data-line-number=\"14\"\u003e    /MD5_is /REALLY_dead_now__\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-15\" data-line-number=\"15\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-16\" data-line-number=\"16\"\u003e    \u003cspan class=\"co\"\u003e% the first reference number should be on offset 0x49,\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-17\" data-line-number=\"17\"\u003e    \u003cspan class=\"co\"\u003e% so the \u0026#39;2\u0026#39; object number will be changed to \u0026#39;3\u0026#39; by UniColl\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-18\" data-line-number=\"18\"\u003e    /Pages 2 0 R\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-19\" data-line-number=\"19\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-20\" data-line-number=\"20\"\u003e    \u003cspan class=\"co\"\u003e% now padding so that the collision blocks (ends at 0xC0) are covered\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-21\" data-line-number=\"21\"\u003e    /0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-22\" data-line-number=\"22\"\u003e    \u003cspan class=\"co\"\u003e% with an extra character to be replaced by a return char\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-23\" data-line-number=\"23\"\u003e    /0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-24\" data-line-number=\"24\"\u003e  \u0026gt;\u0026gt;}\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-25\" data-line-number=\"25\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-26\" 
data-line-number=\"26\"\u003e  \u003cspan class=\"co\"\u003e% the original catalog of the shifted doc\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-27\" data-line-number=\"27\"\u003e  \u003cspan class=\"fu\"\u003e\\immediate\\pdfobj\u003c/span\u003e{\u0026lt;\u0026lt;/Type/Pages/Count 1/Kids[8 0 R]\u0026gt;\u0026gt;}\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-28\" data-line-number=\"28\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-29\" data-line-number=\"29\"\u003e  \u003cspan class=\"co\"\u003e% the original catalog of the host doc\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-30\" data-line-number=\"30\"\u003e  \u003cspan class=\"fu\"\u003e\\immediate\\pdfobj\u003c/span\u003e{\u0026lt;\u0026lt;/Type/Pages/Count 1/Kids[33 0 R]\u0026gt;\u0026gt;}\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-31\" data-line-number=\"31\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-32\" data-line-number=\"32\"\u003e  \u003cspan class=\"co\"\u003e% now we need to reserve PDF Objects so that there is no overlap\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-33\" data-line-number=\"33\"\u003e  \u003cspan class=\"fu\"\u003e\\newcount\\objcount\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-34\" data-line-number=\"34\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-35\" data-line-number=\"35\"\u003e  \u003cspan class=\"co\"\u003e% the host size (+3 for spare object slots) - 1\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-36\" data-line-number=\"36\"\u003e  \u003cspan class=\"co\"\u003e% putting a higher margin will just work, and XREF can have huge gaps\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-37\" data-line-number=\"37\"\u003e  \u003cspan class=\"fu\"\u003e\\objcount\u003c/span\u003e=25\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-38\" data-line-number=\"38\"\u003e  
\u003cspan class=\"fu\"\u003e\\loop\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-39\" data-line-number=\"39\"\u003e    \u003cspan class=\"fu\"\u003e\\message\u003c/span\u003e{\u003cspan class=\"fu\"\u003e\\the\\objcount\u003c/span\u003e}\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-40\" data-line-number=\"40\"\u003e    \u003cspan class=\"fu\"\u003e\\advance\u003c/span\u003e \u003cspan class=\"fu\"\u003e\\objcount\u003c/span\u003e -1\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-41\" data-line-number=\"41\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-42\" data-line-number=\"42\"\u003e  \u003cspan class=\"fu\"\u003e\\immediate\\pdfobj\u003c/span\u003e{\u0026lt;\u0026lt;\u0026gt;\u0026gt;} \u003cspan class=\"co\"\u003e% just an empty object\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-43\" data-line-number=\"43\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-44\" data-line-number=\"44\"\u003e  \u003cspan class=\"fu\"\u003e\\ifnum\u003c/span\u003e \u003cspan class=\"fu\"\u003e\\objcount\u003c/span\u003e\u0026gt;0\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-45\" data-line-number=\"45\"\u003e  \u003cspan class=\"fu\"\u003e\\repeat\u003c/span\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-46\" data-line-number=\"46\"\u003e\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb18-47\" data-line-number=\"47\"\u003e\u003cspan class=\"fu\"\u003e\\endgroup\u003c/span\u003e\u003c/a\u003e\u003c/code\u003e\u003c/pre\u003e\u003c/div\u003e\n\u003cp\u003eDon't forget to normalize PDFLaTeX output - with \u003ccode\u003emutool\u003c/code\u003e for example - if needed: PDFLaTeX is hard to get reproducible builds across distributions - you may even want to hook the time on execution to get the exact hash if required.\u003c/p\u003e\n\u003ch4 id=\"jpg-in-pdf\"\u003eJPG in PDF\u003c/h4\u003e\n\u003cp\u003eYou could expect JPG to be only images, but in a PDF and some PDF 
readers (non browsers, such as Evince and Adobe Reader), it can be used as page content just like any other embedded object, that is embedded in a JPEG image.\u003c/p\u003e\n\u003cp\u003eTo store the JPEG data losslessly, store it as grayscale 100%, then either use a picture of single row/column, or repeat the data line 8 times (since JPEG blocks are 8x8), and your data is stored losslessly and referenced by the PDF pages.\u003c/p\u003e\n\u003cp\u003eExamples of SHA-1 colliding two PDFs via JPEG page data (a grayscale picture rendering colors) as vector page content:\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"examples/jpgpage1.pdf\"\u003eIf\u003c/a\u003e ⟷ \u003ca href=\"examples/jpgpage2.pdf\"\u003eShattered - the movie\u003c/a\u003e\u003c/p\u003e\n\u003cimg alt='2 SHA-1 colliding PDFs with image data stored as JPG' src=pics/jpgpage.png width=700/\u003e\n\n\u003cp\u003e\u003cem\u003e2 SHA-1 colliding PDFs with image data stored as JPG\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eIt's possible to reference the colliding JPG twice: as a page content, losslessly, which also refers to itself as a lossy image to be displayed. 
Again, the image to be displayed is grayscale, but the page content can render some colors via PDF operators.\u003c/p\u003e\n\u003cp\u003eThe top of the image shows the page content repeated 8 times.\u003c/p\u003e\n\u003cp\u003eExamples of SHA-1 colliding two PDFs via JPEG used as page data and picture to be displayed:\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"examples/dualjpg1.pdf\"\u003eSkulls \u0026amp; Crossbones\u003c/a\u003e ⟷ \u003ca href=\"examples/dualjpg2.pdf\"\u003eGolden Axe\u003c/a\u003e\u003c/p\u003e\n\u003cimg alt='2 SHA-1 colliding PDFs with JPG used as image and page content' src=pics/dualjpg.png width=700/\u003e\n\n\u003cp\u003e\u003cem\u003e2 SHA-1 colliding PDFs with JPG used as image and page content\u003c/em\u003e\u003c/p\u003e\n\u003ch3 id=\"zip\"\u003eZIP\u003c/h3\u003e\n\u003cp\u003e\u003cstrong\u003eTL;DR\u003c/strong\u003e There's no generic re-usable collision for ZIP, but there is for ZIP-based formats. It should be possible to collide two files in 2h.core (36 times faster than chosen-prefix)\u003c/p\u003e\n\u003cimg alt='a ZIP file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/ZIP.png width=600/\u003e\n\n\u003cp\u003eZIP archives are a sandwich of 3 layers (at least). First comes the files' content (sequence of \u003ccode\u003eLocal File Header\u003c/code\u003e structures, one per archived file or directory), then some index (again, a sequence of \u003ccode\u003eCentral Directory\u003c/code\u003e), then a single structure that points to this index (\u003ccode\u003eEnd Of Central Directory\u003c/code\u003e).\u003c/p\u003e\n\u003cp\u003eThe order of these layers can't be moved around. 
Some parser only need the file content's structure, but that's not a correct way to parse and it can be abused.\u003c/p\u003e\n\u003cp\u003eBecause of this required order, there's no generic prefix that could help for any collision.\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003enon generic approach\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eAnother approach could be to just merge both archives, with their merged layers, and using UniColl - but with N=2, which introduces a difference on the 4th byte - to kill the magic signature of the \u003ccode\u003eEnd of Central Directory\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eThis means one could collide two arbitrary ZIP with a single UniColl and 24 bytes of set prefix.\u003c/p\u003e\n\u003cp\u003eA typical End of Central Directory, which is 22 bytes if the comment is empty:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e00: 504b 0506 0000 0000 0000 0000 0000 0000  PK..............\n10: 0000 0000 0000                           ......\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eIf we use this as prefix (pad the prefix to 16 bits) for UniColl and \u003ccode\u003eN=2\u003c/code\u003e, the difference is on the 4th byte, killing the magic \u003ccode\u003e.P .K 05 06\u003c/code\u003e by changing it predictably to \u003ccode\u003e.P .K 05 86\u003c/code\u003e\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e00: 504b 0506 0000 0000 0000 0000 0000 0000  PK..............\n10: 0000 0000 0000 2121 eb66 cf9d db01 83bb  ......!!.f......\n20: 2888 4c41 e345 7d07 1634 5d4a 3b61 89a0  (.LA.E}..4]J;a..\n30: 0029 94af 4168 2517 0bbc b841 cbf2 9587  .)..Ah%....A....\n40: e438 0043 6390 279d 7c9e a01e e476 4c36  .8.Cc.\u0026#39;.|....vL6\n50: 527f b1f4 653e d866 f98d 7278 5324 0bd5  R...e\u0026gt;.f..rxS$..\n60: b31d ef6d d5d6 1163 5a2e a8a5 21bf eab4  ...m...cZ...!...\n70: c59c 028e a913 f6b7 0036 c93f 5092 a628  .........6.?P..(\n\u003c/code\u003e\u003c/pre\u003e\n\u003cpre\u003e\u003ccode\u003e00: 504b 0586 0000 0000 0000 0000 0000 
0000  PK..............\n10: 0000 0000 0000 2121 eb66 cf1d db01 83bb  ......!!.f......\n20: 2888 4c41 e345 7d07 1634 5d4a 3b61 89a0  (.LA.E}..4]J;a..\n30: 0029 94af 4168 251f 0bbc b841 cbf2 9587  .)..Ah%....A....\n40: e438 00c3 6390 279d 7c9e a01e e476 4c36  .8..c.\u0026#39;.|....vL6\n50: 527f b1f4 653e d866 f98d 72f8 5324 0bd5  R...e\u0026gt;.f..r.S$..\n60: b31d ef6d d5d6 1163 5a2e a8a5 21bf eab4  ...m...cZ...!...\n70: c59c 028e a913 f6af 0036 c93f 5092 a628  .........6.?P..(\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eThis is not generic at all, but much faster than chosen-prefix collision:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003ereal 12m23.993s\nuser 112m24.072s\nsys 2m0.194s\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eA problem is that some parsers still parse ZIP files top-down even if they should be parsed bottom-up: a way to make sure that both files are properly parsed is to chain two UniColl blocks, to enable/disable each \u003ccode\u003eEnd of Central Directory\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eTo prevent ZIP parsers from complaining about unused space, one can abuse \u003ccode\u003eExtra Fields\u003c/code\u003e, file comments in \u003ccode\u003eCentral Directory\u003c/code\u003e and archive comments in \u003ccode\u003eEnd of Central Directory\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/zip.png\" alt=\"diagram of ZIP collision\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cstrong\u003eExample\u003c/strong\u003e: here is an \u003ca href=\"scripts/zip.asm\"\u003eassembly source\u003c/a\u003e that describes the structure of a dual ZIP, that can host two different archive files.\u003c/p\u003e\n\u003cp\u003eAfter two UniColl computations, it gives the two colliding files: \u003ca href=\"examples/collision1.zip\"\u003ecollision1.zip\u003c/a\u003e ⟷ \u003ca href=\"examples/collision2.zip\"\u003ecollision2.zip\u003c/a\u003e\u003c/p\u003e\n\u003ch4 id=\"zip-based-formats\"\u003eZip-based 
formats\u003c/h4\u003e\n\u003cp\u003eEven if the Zip format itself can't be generically exploited like Gzip, some formats relying on Zip \u003cem\u003ecan\u003c/em\u003e be generically exploited inside Zip archives with a pre-defined structure. Some precautions have to be taken to make the Zip collision generic.\u003c/p\u003e\n\u003cp\u003eSome formats are multi-files stored in a Zip archive, and rely on a root file with a fixed filename that points to other files in the archive. Many of them are using XML or text for the root file, and storing other files as-is.\u003c/p\u003e\n\u003cp\u003eIdea : make 2 files sets coexist in the same archive, and point to either set of files. A generic root can be stored first in the beginning of the file, but the collision blocks are stored outside of the file content, in the archive (since collisions have a very high entropy, it's impossible to exploit XML or ASCII-only files with collisions).\u003c/p\u003e\n\u003cp\u003eSteps:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003e\u003cp\u003ePut 2 sets of files from 2 origins in the same archive - i.e. 
in different subdirectories.\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eModify the root file to alternatively point to each set.\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eSince the timestamp, length and CRC of the root file are stored in both the \u003ccode\u003eLocal File Header\u003c/code\u003e - before the file's contents - and in the \u003ccode\u003eCentral Directory\u003c/code\u003e - after the file contents - these values shouldn't change between the two versions of the files.\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eIf the length varies, all the pointers afterwards will vary, so an identical suffix wouldn't be possible.\u003c/li\u003e\n\u003cli\u003eIf the CRC32 is incorrect in the \u003ccode\u003eCentral Directory\u003c/code\u003e, this copy of the value might be ignored by the parser, but forging a CRC32 to a constant value is helpful to avoid entirely the problem. Forging the CRC by appending 4 random bytes will likely not be enough, as these root files are typically in XML or text with strict syntaxes, so they would become invalid. \u003ca href=\"https://github.com/resilar/crchack\"\u003eCrcHack\u003c/a\u003e greatly helps with forging CRCs with arbitrary bits and no bruteforcing, making sure that the output file is ASCII, and that the modified bits are still in a comment.\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eUsing the \u003ccode\u003eextra field\u003c/code\u003e of an extra dummy file -- even empty -- in the archive after the root file is an elegant way to store Hashclash collision blocks: that way, the Zip archive maintains a standard structure and can be easily manipulated afterwards, even with standard tools.\u003c/p\u003e\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003e\u003ccode\u003eExtra Fields\u003c/code\u003e have no CRC32, and their 16 bits length is declared in the headers before. 
They have their own internal \u003ccode\u003eID:2 Size:2 Data\u003c/code\u003e format but it's usually ignored, and are in both the \u003ccode\u003eLocal File Header\u003c/code\u003e and in the \u003ccode\u003eCentral Directory\u003c/code\u003e, but it can be absent from the \u003ccode\u003eCentral Directory\u003c/code\u003e to keep the suffix identical after the collision blocks.\u003c/p\u003e\n\u003cp\u003eThe presence of the extra file that covers the collision blocks in its \u003ccode\u003eextra field\u003c/code\u003e may have to be declared in the format structure, such as in the \u003ccode\u003e[Content_Types].xml\u003c/code\u003e file in an OOXML document. Other XML files in the suffix may have to be modified, as some formats required the use of absolute paths.\u003c/p\u003e\n\u003cp\u003eHere's the overall structure of the generic exploit for a specific zip-based format:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e[Root file] (with constant CRC32)\n\n[Dummy file] (with collision blocks in the extra field)\n\n[...] 
\u0026lt;- rest of the archive, with 2 documents merged\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eSo by predefining the root file contents and forging ASCII CRC32s, one can compute a generic re-usable Hashclash collision for a specific zip-based format.\u003c/p\u003e\n\u003ch3 id=\"requirements-summary\"\u003eRequirements summary\u003c/h3\u003e\n\u003cul\u003e\n\u003cli\u003etwo or more prefixes\u003c/li\u003e\n\u003cli\u003eone or more file types (polyglots work without problems)\u003c/li\u003e\n\u003cli\u003ean XML root file with fixed filename, file length and CRC: this info is present twice, before and after the collision blocks\u003c/li\u003e\n\u003cli\u003econtents are arbitrary XML\u003c/li\u003e\n\u003cli\u003epadding is possible, even via XML comment, to reach the same length.\u003c/li\u003e\n\u003cli\u003eCRC can be set (via CrcHack) on each content.\u003c/li\u003e\n\u003cli\u003eboth set of files co-exist in the suffix, likely in different directories. Some tools hardcode the path, which may reduce compatibility.\u003c/li\u003e\n\u003cli\u003ea \u003cem\u003eContent type\u003c/em\u003e XML file may need to be merged to cover all files, supported and unsupported (collision blocks, and alternate document)\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch3 id=\"examples\"\u003eExamples\u003c/h3\u003e\n\u003ch4 id=\"crc32\"\u003eCRC32\u003c/h4\u003e\n\u003cp\u003eA minimal XML comment (ASCII-only) with a forged CRC32 (instant computation) with CrcHack.\u003c/p\u003e\n\u003cdiv class=\"sourceCode\" id=\"cb24\"\u003e\u003cpre class=\"sourceCode bash\"\u003e\u003ccode class=\"sourceCode bash\"\u003e\u003ca class=\"sourceLine\" id=\"cb24-1\" data-line-number=\"1\"\u003e\u003cspan class=\"bu\"\u003eecho\u003c/span\u003e \u003cspan class=\"st\"\u003e\u0026quot;\u0026lt;!--ABCDEF--\u0026gt;\u0026quot;\u003c/span\u003e \u003cspan class=\"kw\"\u003e|\u003c/span\u003e \u003cspan class=\"ex\"\u003ecrchack\u003c/span\u003e -b 4.0:+.8*6:1 -b 4.1:+.8*6:1 -b 4.2:+.8*6:1 -b 
4.3:+.8*6:1 -b 4.4:+.8*6:1 -b 4.5:+.8*5:1 - 0xdeadf00d\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb24-2\" data-line-number=\"2\"\u003e\u003cspan class=\"op\"\u003e\u0026lt;\u003c/span\u003e!\u003cspan class=\"ex\"\u003e--X\u003c/span\u003e{]EZF--\u003cspan class=\"op\"\u003e\u0026gt;\u003c/span\u003e\u003c/a\u003e\u003c/code\u003e\u003c/pre\u003e\u003c/div\u003e\n\u003cp\u003eAnother example where you adjust the CRC with the case of an alphabetical message.\u003c/p\u003e\n\u003cdiv class=\"sourceCode\" id=\"cb25\"\u003e\u003cpre class=\"sourceCode bash\"\u003e\u003ccode class=\"sourceCode bash\"\u003e\u003ca class=\"sourceLine\" id=\"cb25-1\" data-line-number=\"1\"\u003e\u003cspan class=\"bu\"\u003eecho\u003c/span\u003e \u003cspan class=\"st\"\u003e\u0026quot;\u0026lt;!--THISKINDOFCRCISREALLYIMPRESSIVEA--\u0026gt;\u0026quot;\u003c/span\u003e \u003cspan class=\"kw\"\u003e|\u003c/span\u003e \u003cspan class=\"ex\"\u003ecrchack.exe\u003c/span\u003e -b 4:+.8*32:.8 - 0xcafebabe\u003c/a\u003e\n\u003ca class=\"sourceLine\" id=\"cb25-2\" data-line-number=\"2\"\u003e\u003cspan class=\"op\"\u003e\u0026lt;\u003c/span\u003e!\u003cspan class=\"ex\"\u003e--THIskInDoFCRcIsrEALlyimpRESSIVea--\u003c/span\u003e\u003cspan class=\"op\"\u003e\u0026gt;\u003c/span\u003e\u003c/a\u003e\u003c/code\u003e\u003c/pre\u003e\u003c/div\u003e\n\u003ch4 id=\"collisions\"\u003eCollisions\u003c/h4\u003e\n\u003cp\u003e\u003ca href=\"scripts/zinsider.py\"\u003ezInsider\u003c/a\u003e is a script to instantly generate MD5 collisions of pairs of arbitrary documents using these ZIP+XML formats:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eOffice Open XML: docx / pptx / xlsx\u003c/li\u003e\n\u003cli\u003eOpen Container Format: epub\u003c/li\u003e\n\u003cli\u003eOpen Packaging Conventions:\n\u003cul\u003e\n\u003cli\u003e3D manufacturing format: 3mf\u003c/li\u003e\n\u003cli\u003eXML Paper Specification: xps / oxps\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eTo generate 
your own collision prefixes, \u003ca href=\"scripts/makezip.py\"\u003ehere is a script\u003c/a\u003e to generate a root zip pair. After computing collisions, use \u003ca href=\"scripts/extendzip.py\"\u003ethis other script\u003c/a\u003e to combine these roots pair with a common suffix.\u003c/p\u003e\n\u003cp\u003eA few collision PoCs:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eOffice Open XML: Excel (\u003ca href=\"examples/free/md5-1.xls\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/free/md5-2.xls\"\u003e2\u003c/a\u003e), Powerpoint (\u003ca href=\"examples/free/md5-1.pptx\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/free/md5-2.pptx\"\u003e2\u003c/a\u003e), Word (\u003ca href=\"examples/free/md5-1.docx\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/free/md5-2.docx\"\u003e2\u003c/a\u003e).\u003c/li\u003e\n\u003cli\u003eOpen Container Format: Epub (\u003ca href=\"examples/collision-1.epub\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/collision-2.epub\"\u003e2\u003c/a\u003e).\u003c/li\u003e\n\u003cli\u003eOpen Packaging Conventions: 3MF (\u003ca href=\"examples/collision-1.3mf\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/collision-2.3mf\"\u003e2\u003c/a\u003e), XPS (\u003ca href=\"examples/collision-1.xps\"\u003e1\u003c/a\u003e - \u003ca href=\"examples/collision-2.xps\"\u003e2\u003c/a\u003e).\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eSome formats with multiple files based on Zip can't be generically exploited:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003eQuake PK3: a zip of files with no specific root.\u003c/li\u003e\n\u003cli\u003eOpen Document Format: the \u003ccode\u003eMETA-INF/manifest.xml\u003c/code\u003e file has to mention every other file, so it can't be generic.\u003c/li\u003e\n\u003cli\u003eAPK, JAR, XPI: the \u003ccode\u003eMETA-INF/MANIFEST.mf\u003c/code\u003e file also has to mention every other file, with its hashes.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eThanks to \u003ca 
href=\"https://twitter.com/decalage2\"\u003ePhilippe Lagadec\u003c/a\u003e for his help on Office file formats!\u003c/p\u003e\n\u003ch3 id=\"others\"\u003eOthers\u003c/h3\u003e\n\u003cul\u003e\n\u003cli\u003eWasm, via a custom section: \u003ca href=\"scripts/wasm.py\"\u003escript\u003c/a\u003e, examples: \u003ca href=\"examples/free/md5-1.wasm\"\u003emd5-1.wasm\u003c/a\u003e ⟷ \u003ca href=\"examples/free/md5-2.wasm\"\u003emd5-2.wasm\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch2 id=\"uncommon-strategies\"\u003eUncommon strategies\u003c/h2\u003e\n\u003cp\u003eCollisions are usually about two valid files of the same type.\u003c/p\u003e\n\u003ch3 id=\"multicolls-multiple-collisions-chain\"\u003eMultiColls: multiple collisions chain\u003c/h3\u003e\n\u003cp\u003eNothing prevents chaining several collision blocks to get more than two contents with the same hash value. An example of that is \u003cem\u003ehashquines\u003c/em\u003e - files that show their own MD5 value. The \u003ca href=\"https://github.com/angea/pocorgtfo#0x14\"\u003ePoCGTFO 14\u003c/a\u003e file contains 609 FastColl collisions, to do that through two file types in the same file.\u003c/p\u003e\n\u003ch4 id=\"hashquines\"\u003eHashquines\u003c/h4\u003e\n\u003cp\u003eHashquines are files showing their own hash value. They are covered \u003ca href=\"hashquines/\"\u003ehere\u003c/a\u003e.\u003c/p\u003e\n\u003ch3 id=\"validity\"\u003eValidity\u003c/h3\u003e\n\u003cp\u003eA different strategy would be to kill the file type to bypass scanning as a corrupted file. Just overwriting the magic signature will be enough. Appending both files (as valid or invalid) with a format that doesn't need to be at offset 0 (archive, like ZIP/RAR/...) 
would reveal another file type.\u003c/p\u003e\n\u003cp\u003eThis enables polyglot collisions without using a chosen-prefix collision:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003euse UniColl to enable or disable a magic signature, for example a PNG:\u003c/li\u003e\n\u003cli\u003eappend a ZIP archive\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eWhile technically both files are a valid ZIP, since most parsers return the first file type found and they start scanning at offset 0, they will see a different file type.\u003c/p\u003e\n\u003cp\u003eExamples:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='valid image' src=examples/png-valid.png width=300/\u003e ⟷ \u003ca href=\"examples/png-invalid.png\"\u003einvalid\u003c/a\u003e\u003c/p\u003e\n\u003ch3 id=\"polycolls-collisions-of-different-file-types\"\u003ePolyColls: collisions of different file types\u003c/h3\u003e\n\u003cp\u003eIt's also possible to have both sides of a collision with different types to lower suspicion:\u003c/p\u003e\n\u003cp\u003eAttack scenario:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003esend \u003ccode\u003eholiday.jpg\u003c/code\u003e\u003c/li\u003e\n\u003cli\u003eget it whitelisted\u003c/li\u003e\n\u003cli\u003esend \u003ccode\u003eevil.exe\u003c/code\u003e, which has the same MD5.\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eIn these cases, a chosen-prefix collision is required if both file formats need to start at offset 0.\u003c/p\u003e\n\u003cp\u003eSome examples of polycoll layouts:\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/pdf-jpg.png\" alt=\"pdf-jpg polyglot collision\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003ePDF/JPG polycoll\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/pe-png.png\" alt=\"pe-png polyglot collision\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003ePE/PNG polycoll\u003c/em\u003e\u003c/p\u003e\n\u003ch4 id=\"pe---jpg\"\u003ePE - JPG\u003c/h4\u003e\n\u003cp\u003eSince a PE header is usually smaller than 0x500 bytes, it's a perfect fit for a 
JPG comment:\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003estart with DOS/JPG headers\u003c/li\u003e\n\u003cli\u003eJPEG-comment jumps over PE Header\u003c/li\u003e\n\u003cli\u003ePut the full JPG image\u003c/li\u003e\n\u003cli\u003ePut the whole PE specifications\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eOnce again, the collision is \u003ca href=\"scripts/jpgpe.py\"\u003einstant\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/jpg-pe.exe\"\u003efastcoll.exe\u003c/a\u003e ⟷ \u003ca href=\"examples/jpg-pe.jpg\"\u003eMarc.jpg\u003c/a\u003e\u003c/p\u003e\n\u003ch4 id=\"pdf---pe\"\u003ePDF - PE\u003c/h4\u003e\n\u003cp\u003eMerging a PDF with a dummy file with \u003ccode\u003emutool\u003c/code\u003e is a good generic way to reorder objects and then get the first two objects discardable (dummy page and content), which is a perfect fit for a hosting \u003ccode\u003estream\u003c/code\u003e object of unknown length as \u003ccode\u003e1 0\u003c/code\u003e, and its length referenced further (after collision blocks) in the second object.\u003c/p\u003e\n\u003cp\u003eThe only problem is that \u003ccode\u003emutool\u003c/code\u003e will always inline the length - and remove the length reference, so it has to be re-inserted in the PDF instead of the value, but most reference \u003ccode\u003e2 0 R\u003c/code\u003e will be smaller than hardcoded lengths. 
Thankfully this can be fixed without altering any object offset, so no need to patch the XREF.\u003c/p\u003e\n\u003cp\u003eHere's a \u003ca href=\"scripts/pdfpe.py\"\u003escript\u003c/a\u003e to, for example, instantly collide a PDF viewer (\u003ca href=\"https://www.sumatrapdfreader.org/free-pdf-reader.html\"\u003eSumatra\u003c/a\u003e is lightweight and standalone) and a PDF document:\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/pepdf.pdf\"\u003ePoster.pdf\u003c/a\u003e ⟷ \u003ca href=\"examples/pepdf.exe\"\u003eSumatra.exe\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/pdfpe.png\" alt=\"a PDF viewer showing a PDF (itself showing a PDF) with the same MD5\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003ea PDF viewer showing a PDF (itself showing a PDF) with the same MD5\u003c/em\u003e\u003c/p\u003e\n\u003ch4 id=\"pdf---png\"\u003ePDF - PNG\u003c/h4\u003e\n\u003cp\u003eSimilarly, it's possible to collide for example arbitrary PDF and PNG files with no restriction on either side. 
This is instant, re-usable and generic.\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/png-pdf.pdf\"\u003eHello.pdf\u003c/a\u003e ⟷ \u003ca href=\"examples/png-pdf.png\"\u003e1x1.png\u003c/a\u003e\u003c/p\u003e\n\u003ch3 id=\"pileups-multi-collision\"\u003ePileUps (multi-collision)\u003c/h3\u003e\n\u003cp\u003eCryptographic collisions are not limited to two files!\u003c/p\u003e\n\u003cp\u003eAs demonstrated in the \u003ca href=\"https://www.win.tue.nl/hashclash/Nostradamus/\"\u003eNostradamus\u003c/a\u003e experiment in 2008, chaining collisions makes it possible to collide more than two files.\u003c/p\u003e\n\u003cp\u003eThe first collisions can be identical or chosen-prefix, the next ones have to be chosen-prefix.\u003c/p\u003e\n\u003cp\u003eYou can call them multi-collisions, I prefer \u003cem\u003epileups\u003c/em\u003e - it's shorter :)\u003c/p\u003e\n\u003ch4 id=\"pe---png---mp4---pdf\"\u003ePE - PNG - MP4 - PDF\u003c/h4\u003e\n\u003cp\u003eCombining all previously acquired knowledge, I used 3 chosen-prefix collisions to craft 4 different prefixes for different file types: document (PDF), video (MP4), executable (PE) and image (PNG).\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/pileup-diagram.png\" alt=\"diagram of a PE/PNG/MP4/PDF pileup\" /\u003e\u003c/p\u003e\n\u003cp\u003e\u003cem\u003ediagram of a PE/PNG/MP4/PDF pileup\u003c/em\u003e\u003c/p\u003e\n\u003cp\u003eThis script is generic and instant:\u003c/p\u003e\n\u003cp\u003e\u003cimg src=\"pics/pileup.png\" alt=\"diagram of a PE/PNG/MP4/PDF pileup\" /\u003e\u003c/p\u003e\n\u003cp\u003eExamples: \u003ca href=\"examples/pileup.pdf\"\u003ecommodore.pdf\u003c/a\u003e ⟷ \u003ca href=\"examples/pileup.png\"\u003ediagram.png\u003c/a\u003e ⟷ \u003ca href=\"examples/pileup.mp4\"\u003ekidmo.mp4\u003c/a\u003e ⟷ \u003ca href=\"examples/pileup.exe\"\u003esumatra18.exe\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003eSince you may only distribute a single file and it's impossible to guess the other prefix 
values from it, a solution is to embed all prefixes of the collision in JavaScript code and insert it in your PoCs, turning your files into \u003ca href=\"examples/polyglot.html\"\u003eHTML polyglots\u003c/a\u003e to easily share the related colliding files.\u003c/p\u003e\n\u003cimg alt='HTML payload to generate extra colliding files' src=pics/polyglot.png width=600/\u003e\n\n\u003cp\u003eThe \u003ca href=\"https://github.com/angea/pocorgtfo#0x19\"\u003eissue 19\u003c/a\u003e of 'PoC or GTFO' is such a pileup \u003cstrong\u003eand\u003c/strong\u003e polyglot, combining a 80-page document generated with PDFLaTeX, a PDF viewer for Windows, a PNG diagram and a short 'collision' MP4 video by \u003ca href=\"https://www.kidmograph.com/\"\u003eKidMoGraph\u003c/a\u003e with an HTML payload to generate the other files from the PDF release (and a ZIP archive too):\u003c/p\u003e\n\u003cimg alt='Diagram of the issue 19 of PoC or GTFO, a polyglot and pileup.' src=pics/pocorgtfo19.png width=700/\u003e\n\n\u003cp\u003eThanks to Rafał Hirsz for his permanent help on JavaScript.\u003c/p\u003e\n\u003ch2 id=\"use-cases\"\u003eUse cases\u003c/h2\u003e\n\u003cp\u003eBetter discard MD5 altogether, because file introspection is just too time-consuming and too risky!\u003c/p\u003e\n\u003ch3 id=\"gotta-collide-em-all\"\u003eGotta collide 'em all!\u003c/h3\u003e\n\u003cp\u003eAnother use of instant, re-usable and generic collisions would be to hide any file of a given type - say PNG - behind dummy files (or the same file every time) - which is actually just by concatenating it to the same prefix after stripping the signature - you could even do that at library level!\u003c/p\u003e\n\u003cp\u003eFrom a strict parsing perspective, all your files will show the same content, and the evil images would be revealed as a file with the same MD5 as previously collected.\u003c/p\u003e\n\u003cp\u003eLet's take two files:\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='MS 08-067' src=pics/trinity.png 
width=300/\u003e ⟷ \u003cimg alt='MS 08-067' src=pics/javascript.png width=300/\u003e\u003c/p\u003e\n\u003cp\u003eand collide them with the same PNG.\u003c/p\u003e\n\u003cp\u003eThey now show the same dummy image, and they're absolutely identical until the 2nd image at file level!\u003c/p\u003e\n\u003cp\u003e\u003cimg alt='MS 08-067' src=examples/gcea1.png width=200/\u003e ⟷ \u003cimg alt='MS 08-067' src=examples/gcea2.png width=200/\u003e\u003c/p\u003e\n\u003cp\u003eTheir evil payload is hidden behind a file with the same MD5 respectively.\u003c/p\u003e\n\u003ch3 id=\"incriminating-files\"\u003eIncriminating files\u003c/h3\u003e\n\u003cp\u003eAnother use case for collisions is to hide something incriminating inside something innocent, but desirable: if the only thing to collect evidence is comparing weak hashes, then you can't deny that you don't have the other file (showing incriminating content but hiding innocent content).\u003c/p\u003e\n\u003cp\u003eSoftwares typically focus on (quick) parsing, not on detailed file analysis.\u003c/p\u003e\n\u003cimg alt='different previews under different tabs of EnCase Forensic' src=pics/encase.png width=400/\u003e\n\n\u003cp\u003e\u003cem\u003ean image showing different previews under different tabs of EnCase Forensic\u003c/em\u003e\u003c/p\u003e\n\u003ch2 id=\"failures\"\u003eFailures\u003c/h2\u003e\n\u003cp\u003eNot all formats can have generic prefixes that can be re-used: if some kind of data holder can't be inserted between the magic signature and the standard headers that are critical and specific to each file, then generic collisions are not possible.\u003c/p\u003e\n\u003cp\u003eOf course, one might still turn the old files into a new one, and even use code to branch out to two different payloads, but it's more like porting payloads than colliding file structure.\u003c/p\u003e\n\u003ch3 id=\"elf\"\u003eELF\u003c/h3\u003e\n\u003cimg alt='an ELF file' 
src=https://raw.githubusercontent.com/corkami/pics/master/binary/ELF.png width=600/\u003e\n\n\u003cp\u003eThe ELF header is required at offset 0 and contains critical information such as 32b/64b, endianness and ABI right from the beginning, so it's impossible to have a universal prefix then collision blocks before critical parameters that are specific to the original file.\u003c/p\u003e\n\u003ch3 id=\"mach-o\"\u003eMach-O\u003c/h3\u003e\n\u003cimg alt='a Mach-O file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/MachO.png width=600/\u003e\n\n\u003cp\u003eMach-O don't even start with the same magic for 32b (\u003ccode\u003efeedface\u003c/code\u003e) and 64b (\u003ccode\u003efeedfacf\u003c/code\u003e). Soon after, there is the number and size of commands (such as segment definition, symtab, version,...).\u003c/p\u003e\n\u003cp\u003eLike ELF, re-usable collisions are not possible.\u003c/p\u003e\n\u003ch3 id=\"java-class\"\u003eJava Class\u003c/h3\u003e\n\u003cimg alt='a Java Class file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/CLASS.png width=600/\u003e\n\n\u003cp\u003eRight from the start magic are located the versions (which can be troublesome) but the constant pool count which is quite specific to each file, so no universal collisions for all files.\u003c/p\u003e\n\u003cp\u003eHowever, many files still have a common version and we can pad the shortest constant pool to the longest count. 
First, insert a \u003cem\u003eUTF8 literal\u003c/em\u003e to align information, then declare another one with its length abused by a UniColl (the length is stored on 16 bits as big endian).\u003c/p\u003e\n\u003cp\u003eHowever this will require code manipulation since all pool indexes will be shifted.\u003c/p\u003e\n\u003cp\u003eInstant MD5 re-usable collisions of Java Class should be possible, but require code analysis and modification.\u003c/p\u003e\n\u003ch3 id=\"tar\"\u003eTAR\u003c/h3\u003e\n\u003cp\u003e\u003cstrong\u003eTL;DR\u003c/strong\u003e No re-usable collision for TAR files, no other strategy than chosen-prefix.\u003c/p\u003e\n\u003cimg alt='a TAR file' src=https://raw.githubusercontent.com/corkami/pics/master/binary/TAR.png width=600/\u003e\n\n\u003cp\u003eTape Archives are a sequence of concatenated header and file contents, all aligned to 512 bytes.\u003c/p\u003e\n\u003cp\u003eThere's no central structure to the whole file. So no global header or comment of any kind to abuse.\u003c/p\u003e\n\u003cp\u003eA trick would be to start a dummy file of variable length, but the length is always at the same offset, which is not compatible with UniColl, which means only chosen-prefix collisions are useful here.\u003c/p\u003e\n\u003ch2 id=\"exploitations-summary\"\u003eExploitations summary\u003c/h2\u003e\n\u003ctable\u003e\n\u003cthead\u003e\n\u003ctr class=\"header\"\u003e\n\u003cth\u003eFormat\u003c/th\u003e\n\u003cth\u003eGeneric?\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003eFastColl\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003eUniColl\u003c/th\u003e\n\u003cth\u003eShattered\u003c/th\u003e\n\u003cth style=\"text-align: center;\"\u003eHashClash / Shambles\u003c/th\u003e\n\u003c/tr\u003e\n\u003c/thead\u003e\n\u003ctbody\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003ePDF\u003c/td\u003e\n\u003ctd\u003eY\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: 
center;\"\u003ex\u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003eJPG\u003c/td\u003e\n\u003ctd\u003eY (1)\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003ctd\u003ex (2)\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eGZ\u003c/td\u003e\n\u003ctd\u003eY\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003ctd\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003ePNG\u003c/td\u003e\n\u003ctd\u003eY/N (3)\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eMP4\u003c/td\u003e\n\u003ctd\u003eY (4)\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003ctd\u003ex (5)\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003ePE\u003c/td\u003e\n\u003ctd\u003eY\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eZIP-based (6)\u003c/td\u003e\n\u003ctd\u003eY\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003c/td\u003e\n\u003ctd 
style=\"text-align: center;\"\u003e\u003c/td\u003e\n\u003ctd\u003e\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eGIF\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003eZIP\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex (7)\u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003eELF\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eTAR\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd 
style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"even\"\u003e\n\u003ctd\u003eMach-O\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003ctr class=\"odd\"\u003e\n\u003ctd\u003eClass\u003c/td\u003e\n\u003ctd\u003eN\u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003e \u003c/td\u003e\n\u003ctd\u003e \u003c/td\u003e\n\u003ctd style=\"text-align: center;\"\u003ex\u003c/td\u003e\n\u003c/tr\u003e\n\u003c/tbody\u003e\n\u003c/table\u003e\n\u003col\u003e\n\u003cli\u003eJPG has some limitations on data that can be improved to some extend by manipulating scans encoding.\u003c/li\u003e\n\u003cli\u003ePDF w/ JPG is the \u003ca href=\"http://shattered.io\"\u003einitial implementation\u003c/a\u003e of the Shattered attack, but it's just a pure JPG trick in a PDF document.\u003c/li\u003e\n\u003cli\u003ePNG: Safari/Preview requires PNG to have their \u003ccode\u003eIHDR\u003c/code\u003e chunk in first slot, before any collision block. Doing so prevents a generic prefix, in which case the collision is limited to specific dimensions, color space, BPP and interlacing.\u003c/li\u003e\n\u003cli\u003eAtom/Box formats like MP4 may work with the same prefix for different subformats. 
Some subformats like JPEG2000 or HEIF require extra grooming, but the exploit strategy is the same - it's just that the collision is not possible between sub-formats, only with a pair of prefixes for a specific sub-format.\u003c/li\u003e\n\u003cli\u003eAtom/Box is Shattered-compatible when using 64bit lengths.\u003c/li\u003e\n\u003cli\u003eSome Zip-based formats can be generically exploited.\u003c/li\u003e\n\u003cli\u003eFor better compatibility, ZIP needs two UniColl for a complete archive, and these collisions depend on both files' contents.\u003c/li\u003e\n\u003c/ol\u003e\n\u003ch2 id=\"test-files\"\u003eTest files\u003c/h2\u003e\n\u003cp\u003e\u003ca href=\"examples/free/README.md\"\u003eHere\u003c/a\u003e are free (copyright-free, PII-free) test colliding pairs.\u003c/p\u003e\n\u003ch1 id=\"detection\"\u003eDetection\u003c/h1\u003e\n\u003cp\u003eThere are different ways to detect hash collisions in files.\u003c/p\u003e\n\u003col\u003e\n\u003cli\u003eTwo files: if you have two or more files with different contents and the same hash, just diff them!\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eHowever, if you only have a single file, it can be difficult to tell if the file contains a hash collision.\u003c/p\u003e\n\u003col start=\"2\"\u003e\n\u003cli\u003e\u003cp\u003eFile structure: analyse the file at block boundaries, and if you notice high entropy blocks and maybe identical prefix/suffix, you might be able to tell which collision it's using, but it's very error-prone. 
In the case of a chosen-prefix collision, it might be impossible to spot as both files might be mostly different besides most of the collision blocks.\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003eHash computation: use an implementation (in \u003ca href=\"https://github.com/cr-marcstevens/hashclash/tree/collisiondetection/src/collisiondetection\"\u003eC\u003c/a\u003e or \u003ca href=\"https://github.com/therealmik/detectcoll\"\u003eGo\u003c/a\u003e) of Marc Stevens' DetectColl (cf his \u003ca href=\"https://marc-stevens.nl/research/papers/C13-S.pdf\"\u003eCounter-cryptanalysis\u003c/a\u003e paper). It only requires one file but it requires the collision to be in a working state (the exact prefix and its corresponding collision blocks) and it's slow.\u003c/p\u003e\u003c/li\u003e\n\u003c/ol\u003e\n\u003cp\u003eDetectColl gives technical information about the collision itself, and shows \u003ccode\u003e*coll*\u003c/code\u003e next to the collided hash.\u003c/p\u003e\n\u003ch2 id=\"example\"\u003eExample\u003c/h2\u003e\n\u003cp\u003eWith the Flame malware certificate:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll flame.der\nFound collision in block 11:\n   dm: dm4=80000000 dm11=ffff8000 dm14=80000000\n   ihv1=1ba33aac3a7f9ed70aec349b40390e85\n   ihv2=9ba33aac3c7f60ee8cebf69bc2391085\n*coll* c38a66643af816f8438b375b5f42ccbb flame.der\nba2499ba3dda9ef818f854b75a2bd1cd9f2b7bed flame.der\n\u003c/code\u003e\u003c/pre\u003e\n\u003ch2 id=\"safe-hashes\"\u003eSafe hashes\u003c/h2\u003e\n\u003cp\u003eSince Detectcoll can identify blocks used for a hash collision, it can mitigate the collision via \u003cem\u003esafe hashes\u003c/em\u003e: if a collision block is detected, it reprocesses it again to break the collision property. 
So DetectColl is able to tell different contents apart via the same hash function despite the collisions in the file.\u003c/p\u003e\n\u003cp\u003eIn short:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003efor files with no collision, a safe hash value is equal to the standard hash value.\u003c/li\u003e\n\u003cli\u003efor files with collision, the safe hash differs but will also differ on different file contents despite the collisions.\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eExample with Wang's original collision from 2005:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ md5sum wang*\n79054025255fb1a26e4bc422aef54eb4 *wang1.bin\n79054025255fb1a26e4bc422aef54eb4 *wang2.bin\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eSafe MD5 on these files:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll wang1.bin | grep coll\n*coll* ff531291d102a41aa131e0e09f64ca60 wang1.bin\n\u003c/code\u003e\u003c/pre\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll wang2.bin | grep coll\n*coll* 6a8e7124724d5c819401afc202a4fbd0 wang2.bin\n\u003c/code\u003e\u003c/pre\u003e\n\u003ch2 id=\"signatures\"\u003eSignatures\u003c/h2\u003e\n\u003cp\u003eFor simplicity, you can parse Detectcoll output with this \u003ca href=\"scripts/logparse.py\"\u003escript\u003c/a\u003e and have it match with \u003ca href=\"https://github.com/corkami/collisions/blob/7f7876c431614f33f765bfc1cb62506b476a2eb0/scripts/logparse.py#L15-L24\"\u003eknown signatures\u003c/a\u003e more easily:\u003c/p\u003e\n\u003cpre class=\"shell\"\u003e\u003ccode\u003e$ detectcoll_unsafe * | ./logparse.py\napop-1.bin\nblock: 2, collision: APop\ncpc1.bin\nblock: 9, collision: HashClashCPC\nfastcoll1.bin\nblock: 2, collision: FastColl\nsingle-cpc1.bin\nblock: 1, collision: SingleCPC\nsingle-ipc1.bin\nblock: 0, collision: SingleIPC\nwang1.bin\nblock: 1, collision: FastColl\npileup.exe\nblock: 10, collision: HashClashCPC\nblock: 20, collision: HashClashCPC\n04-unicoll-1.bin\nblock: 1, collision: Unicoll1\n05-uc-n2-1.bin\nblock: 
1, collision: Unicoll2\n05-uc-n3-1.bin\nblock: 1, collision: Unicoll3\n05-uc-n3-2.bin\nblock: 1, collision: Unicoll3\n12-shattered1.bin\nblock: 3, collision: SHAttered/Shambles\nblock: 4, collision: SHAttered/Shambles\n13-shambles1.bin\nblock: 9, collision: SHAttered/Shambles\n13-shambles2.bin\nblock: 9, collision: SHAttered/Shambles\nca-rogue.der\nblock: 10, collision: HashClashCPC\nflame.der\nblock: 11, collision: Flame\n\u003c/code\u003e\u003c/pre\u003e\n\u003ch3 id=\"multiple-collisions\"\u003eMultiple collisions\u003c/h3\u003e\n\u003cp\u003eA minor drawback of safe hashes is that they prevent the detection of multiple collisions in the same file, but DetectColl can still detect collisions with 'standard' hashes.\u003c/p\u003e\n\u003cp\u003eExamples with PoCorGTFO 0x14 (a NES+PDF hashquine with an alternate cover picture).\u003c/p\u003e\n\u003cp\u003eSafe hashes can only find one collision:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll_safe pocorgtfo14.pdf\nFound collision in block 135:\n   dm: dm4=80000000 dm11=ffff8000 dm14=80000000\n   ihv1=73b615bd01d5e48032d3d1a549d0f956\n   ihv2=f3b615bd83d5e480b4d3d1a5cbd0f956\n*coll* c4b085f9fa4b38669fa79d4c410538e9 pocorgtfo14.pdf\neb5d0fb7607c1262236a5a7f591bb510ee9afbbc pocorgtfo14.pdf   \n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eUnsafe hashes find all of them:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll_unsafe pocorgtfo14.pdf | grep Found | wc -l\n609\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eIf you check the last few collisions:\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003e$ detectcoll_unsafe pocorgtfo14.pdf | tail | grep Found\nFound collision in block 34169:\nFound collision in block 34250:\nFound collision in block 34324:\nFound collision in block 34389:\nFound collision in block 34456:\nFound collision in block 34523:\nFound collision in block 34585:\nFound collision in block 34738:\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eYou can notice that the last one is 
not so close to the previous ones: that's because the previous ones belong to the same image file for the hashquines, while the last one is for the alternate cover.\u003c/p\u003e\n\u003ch1 id=\"references\"\u003eReferences\u003c/h1\u003e\n\u003cp\u003ePapers (about file formats exploitation):\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e2004\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://eprint.iacr.org/2004/357.pdf\"\u003eMD5 To Be Considered Harmful Someday\u003c/a\u003e - Dan Kaminsky\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://eprint.iacr.org/2004/356.pdf\"\u003ePractical Attacks on Digital Signatures Using MD5 Message Digest\u003c/a\u003e - Ondrej Mikle\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e2005:\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"papers/Illies_NIST_05.pdf\"\u003eA Note on Practical Value of Single Hash Collisions for Special File Formats\u003c/a\u003e - Max Gebhardt, Georg Illies, Werner Schindler\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e2014:\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://malicioussha1.github.io/\"\u003eMalicious Hashing: Eve’s Variant of SHA-1\u003c/a\u003e - Ange Albertini, Jean-Philippe Aumasson, Maria Eichlseder, Florian Mendel, Martin Schläffer\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e2017:\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"http://shattered.io\"\u003eThe first collision for full SHA-1\u003c/a\u003e - Marc Stevens, Elie Bursztein, Pierre Karpman, Ange Albertini, Yarik Markov\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://archive.org/stream/pocorgtfo14#page/n45/mode/1up\"\u003ePostscript that shows its own MD5\u003c/a\u003e by Gregor \u0026quot;Greg\u0026quot; Kopf\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://archive.org/stream/pocorgtfo14#page/n49/mode/1up\"\u003eA PDF That Shows Its Own MD5\u003c/a\u003e by Mako\u003c/li\u003e\n\u003cli\u003e\u003ca 
href=\"https://archive.org/stream/pocorgtfo14#page/n52/mode/1up\"\u003eThis GIF shows its own MD5!\u003c/a\u003e by Kristoffer \u0026quot;spq\u0026quot; Janke\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://archive.org/stream/pocorgtfo14#page/n55/mode/1up\"\u003eThis PDF is an NES ROM that prints its own MD5 hash!\u003c/a\u003e by Evan Sultanik, Evan Teran\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e2018:\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://archive.org/stream/pocorgtfo18#page/n62/mode/1up\"\u003eEasy SHA-1 Colliding PDFs with PDFLaTeX.\u003c/a\u003e by Ange Albertini\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e2020:\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://eprint.iacr.org/2020/014.pdf\"\u003eSHA-1 is a Shambles\u003c/a\u003e by Gaëtan Leurent, Thomas Peyrin\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003ePresentations:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003e2017 Exploiting Hash Collisions at Black Alps:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"https://speakerdeck.com/ange/exploiting-hash-collisions\"\u003eslides\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://speakerdeck.com/ange/exploiting-hash-collisions\"\u003e\u003cimg alt='Black Alps 2017 slides' src=pics/blackalps17.png width=350/\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"https://www.youtube.com/watch?v=Y-oJWEYKVLA\"\u003evideo\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://www.youtube.com/watch?v=Y-oJWEYKVLA\"\u003e\u003cimg src=\"https://img.youtube.com/vi/Y-oJWEYKVLA/0.jpg\" alt=\"Exploiting hash collisions Youtube video\" /\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e2019 KILL MD5 at Pass the Salt:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003e\u003ca 
href=\"https://speakerdeck.com/ange/kill-md5\"\u003eslides\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://speakerdeck.com/ange/kill-md5\"\u003e\u003cimg alt='KILL MD5 slides' src=pics/KILLMD5.png width=350/\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"https://passthesalt.ubicast.tv/videos/kill-md5-demystifying-hash-collisions/\"\u003evideo\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://passthesalt.ubicast.tv/videos/kill-md5-demystifying-hash-collisions/\"\u003e\u003cimg src=\"https://img.youtube.com/vi/4-uXMKk9Ttg/0.jpg\" alt=\"Kill MD5 video\" /\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eWorkshop (CollTris):\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"https://speakerdeck.com/ange/colltris\"\u003eslides\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://speakerdeck.com/ange/colltris\"\u003e\u003cimg width=350 src=pics/CollTris.png /\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"https://www.youtube.com/watch?v=BcwrMnGVyBI\"\u003evideo\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href=\"https://www.youtube.com/watch?v=BcwrMnGVyBI\"\u003e\u003cimg width=350 src=pics/recording.jpg /\u003e\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003e\u003ca href=\"workshop/README.md\"\u003ematerials\u003c/a\u003e\u003c/p\u003e\u003c/li\u003e\n\u003cli\u003e\u003cp\u003esessions\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e2019/07/02 150p, Pass The Salt\u003c/li\u003e\n\u003cli\u003e2019/07/24 199p, Google\u003c/li\u003e\n\u003cli\u003e2019/08/19 208p, Google\u003c/li\u003e\n\u003cli\u003e2019/10/23 222p, Hack.lu\u003c/li\u003e\n\u003cli\u003e2019/11/07 225p, Black Alps\u003c/li\u003e\n\u003cli\u003e2019/12/03 229p, Google\u003c/li\u003e\n\u003c/ul\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp\u003eCTF 
tasks:\u003c/p\u003e\n\u003cul\u003e\n\u003cli\u003e\u003ca href=\"https://ctftime.org/task/3453\"\u003ePrudentialv2\u003c/a\u003e, from the \u003cem\u003eBoston Key Party CTF 2017\u003c/em\u003e.\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://ctftime.org/task/6965\"\u003eHREFIN\u003c/a\u003e, from the \u003cem\u003eGoogle CTF 2018\u003c/em\u003e.\u003c/li\u003e\n\u003cli\u003e\u003ca href=\"https://ctftime.org/task/9271\"\u003eLooking glass\u003c/a\u003e from the \u003cem\u003eDragon Sector Teaser CTF 2019\u003c/em\u003e.\u003c/li\u003e\n\u003c/ul\u003e\n\u003c!-- - [Not my digest](https://ctftime.org/task/4784) from *Hack.lu CTF 2017*: not related to collisions, but solved by Marc himself :p --\u003e\n\n\u003cp\u003eA common challenge for such CTF tasks is not to give too big an advantage or handicap based on the amount of computing power each player has access to.\u003c/p\u003e\n\u003ch1 id=\"credits\"\u003eCredits\u003c/h1\u003e\n\u003cp\u003eAll this was possible thanks to \u003ca href=\"https://marc-stevens.nl/research/\"\u003eMarc Stevens\u003c/a\u003e, not only for his cryptographic contributions, but also for his permanent help and suggestions!\u003c/p\u003e\n\u003cp\u003eThanks also to Philippe Teuwen for his extensive feedback for file formats in general.\u003c/p\u003e\n\u003ch1 id=\"conclusion\"\u003eConclusion\u003c/h1\u003e\n\u003cp\u003e\u003cstrong\u003eKill MD5!\u003c/strong\u003e\u003c/p\u003e\n\u003cp\u003eUnless you actively check for malformations or collision blocks in files, don't use MD5!\u003c/p\u003e\n\u003cp\u003eIt's not a cryptographic hash, it's a toy function!\u003c/p\u003e\n\u003c/body\u003e\n\u003c/html\u003e\n","funding_links":["https://github.com/sponsors/corkami","https://patreon.com/corkami","https://paypal.me/corkami"],"categories":["\u003ca 
id=\"683b645c2162a1fce5f24ac2abfa1973\"\u003e\u003c/a\u003e漏洞\u0026\u0026漏洞管理\u0026\u0026漏洞发现/挖掘\u0026\u0026漏洞开发\u0026\u0026漏洞利用\u0026\u0026Fuzzing","Python","逆向破解"],"sub_categories":["\u003ca id=\"41ae40ed61ab2b61f2971fea3ec26e7c\"\u003e\u003c/a\u003e漏洞利用"],"project_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcorkami%2Fcollisions","html_url":"https://awesome.ecosyste.ms/projects/github.com%2Fcorkami%2Fcollisions","lists_url":"https://awesome.ecosyste.ms/api/v1/projects/github.com%2Fcorkami%2Fcollisions/lists"}