lastin-ai-2/papers/2502_06786v1.json

{
  "title": "Matryoshka Quantization",
  "authors": [
    "Pranav Nair",
    "Puranjay Datta",
    "Jeff Dean",
    "Prateek Jain",
    "Aditya Kusupati"
  ],
  "abstract": "Quantizing model weights is critical for reducing the communication and inference costs of large models. However, quantizing models -- especially to low precisions like int4 or int2 -- requires a trade-off in model quality; int2, in particular, is known to severely degrade model quality. Consequently, practitioners are often forced to maintain multiple models with different quantization levels or serve a single model that best satisfies the quality-latency trade-off. On the other hand, integer data types, such as int8, inherently possess a nested (Matryoshka) structure where smaller bit-width integers, like int4 or int2, are nested within the most significant bits. This paper proposes Matryoshka Quantization (MatQuant), a novel multi-scale quantization technique that addresses the challenge of needing multiple quantized models. It allows training and maintaining just one model, which can then be served at different precision levels. Furthermore, due to the co-training and co-distillation regularization provided by MatQuant, the int2 precision models extracted by MatQuant can be up to $10\\%$ more accurate than standard int2 quantization (using techniques like QAT or OmniQuant). This represents significant progress in model quantization, demonstrated by the fact that, with the same recipe, an int2 FFN-quantized Gemma-2 9B model is more accurate than an int8 FFN-quantized Gemma-2 2B model.",
  "pdf_url": "http://arxiv.org/pdf/2502.06786v1",
  "entry_id": "http://arxiv.org/abs/2502.06786v1",
  "categories": [
    "cs.LG",
    "cs.AI"
  ]
}
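
The abstract's key observation is that smaller bit-width integers are nested in the most significant bits of an int8 code. Below is a minimal illustrative sketch (not from the paper, and not the authors' MatQuant training recipe) of what "slicing" int4 or int2 codes out of uint8 quantization codes could look like; the array values and the `slice_msb` helper are hypothetical.

```python
import numpy as np

def slice_msb(codes_uint8: np.ndarray, bits: int) -> np.ndarray:
    """Keep only the `bits` most significant bits of uint8 quantization codes.

    Assumption: weights have already been quantized to unsigned 8-bit codes
    in [0, 255]. MatQuant's co-training / co-distillation across precisions
    is not modeled here; this only shows the nested (Matryoshka) bit structure.
    """
    shift = 8 - bits
    return (codes_uint8 >> shift).astype(np.uint8)

# Hypothetical codes: 182 = 0b10110110 -> int4 code 0b1011 (11), int2 code 0b10 (2)
codes = np.array([182, 37, 255, 0], dtype=np.uint8)
print(slice_msb(codes, 4))  # [11  2 15  0]
print(slice_msb(codes, 2))  # [2 0 3 0]
```

Because the int4 and int2 codes are prefixes of the int8 code, a single stored model can, in principle, be served at any of these precisions by discarding low-order bits, which is the property the paper's multi-scale training exploits.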