lastin-ai-2/papers/2502_05174v1.json

{
  "title": "MELON: Indirect Prompt Injection Defense via Masked Re-execution and Tool Comparison",
  "authors": [
    "Kaijie Zhu",
    "Xianjun Yang",
    "Jindong Wang",
    "Wenbo Guo",
    "William Yang Wang"
  ],
  "abstract": "Recent research has explored that LLM agents are vulnerable to indirect\nprompt injection (IPI) attacks, where malicious tasks embedded in\ntool-retrieved information can redirect the agent to take unauthorized actions.\nExisting defenses against IPI have significant limitations: either require\nessential model training resources, lack effectiveness against sophisticated\nattacks, or harm the normal utilities. We present MELON (Masked re-Execution\nand TooL comparisON), a novel IPI defense. Our approach builds on the\nobservation that under a successful attack, the agent's next action becomes\nless dependent on user tasks and more on malicious tasks. Following this, we\ndesign MELON to detect attacks by re-executing the agent's trajectory with a\nmasked user prompt modified through a masking function. We identify an attack\nif the actions generated in the original and masked executions are similar. We\nalso include three key designs to reduce the potential false positives and\nfalse negatives. Extensive evaluation on the IPI benchmark AgentDojo\ndemonstrates that MELON outperforms SOTA defenses in both attack prevention and\nutility preservation. Moreover, we show that combining MELON with a SOTA prompt\naugmentation defense (denoted as MELON-Aug) further improves its performance.\nWe also conduct a detailed ablation study to validate our key designs.",
  "pdf_url": "http://arxiv.org/pdf/2502.05174v1",
  "entry_id": "http://arxiv.org/abs/2502.05174v1",
  "categories": [
    "cs.CR",
    "cs.AI"
  ]
}