[
{
"title": "MELON: Indirect Prompt Injection Defense via Masked Re-execution and Tool Comparison",
"authors": [
"Kaijie Zhu",
"Xianjun Yang",
"Jindong Wang",
"Wenbo Guo",
"William Yang Wang"
],
"summary": "Recent research has explored that LLM agents are vulnerable to indirect\nprompt injection (IPI) attacks, where malicious tasks embedded in\ntool-retrieved information can redirect the agent to take unauthorized actions.\nExisting defenses against IPI have significant limitations: either require\nessential model training resources, lack effectiveness against sophisticated\nattacks, or harm the normal utilities. We present MELON (Masked re-Execution\nand TooL comparisON), a novel IPI defense. Our approach builds on the\nobservation that under a successful attack, the agent's next action becomes\nless dependent on user tasks and more on malicious tasks. Following this, we\ndesign MELON to detect attacks by re-executing the agent's trajectory with a\nmasked user prompt modified through a masking function. We identify an attack\nif the actions generated in the original and masked executions are similar. We\nalso include three key designs to reduce the potential false positives and\nfalse negatives. Extensive evaluation on the IPI benchmark AgentDojo\ndemonstrates that MELON outperforms SOTA defenses in both attack prevention and\nutility preservation. Moreover, we show that combining MELON with a SOTA prompt\naugmentation defense (denoted as MELON-Aug) further improves its performance.\nWe also conduct a detailed ablation study to validate our key designs.",
"published": "2025-02-07T18:57:49+00:00",
"updated": "2025-02-07T18:57:49+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05174v1",
"entry_id": "http://arxiv.org/abs/2502.05174v1",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Joint MoE Scaling Laws: Mixture of Experts Can Be Memory Efficient",
"authors": [
"Jan Ludziejewski",
"Maciej Pióro",
"Jakub Krajewski",
"Maciej Stefaniak",
"Michał Krutul",
"Jan Małaśnicki",
"Marek Cygan",
"Piotr Sankowski",
"Kamil Adamczewski",
"Piotr Miłoś",
"Sebastian Jaszczur"
],
"summary": "Mixture of Experts (MoE) architectures have significantly increased\ncomputational efficiency in both research and real-world applications of\nlarge-scale machine learning models. However, their scalability and efficiency\nunder memory constraints remain relatively underexplored. In this work, we\npresent joint scaling laws for dense and MoE models, incorporating key factors\nsuch as the number of active parameters, dataset size, and the number of\nexperts. Our findings provide a principled framework for selecting the optimal\nMoE configuration under fixed memory and compute budgets. Surprisingly, we show\nthat MoE models can be more memory-efficient than dense models, contradicting\nconventional wisdom. To derive and validate the theoretical predictions of our\nscaling laws, we conduct over 280 experiments with up to 2.7B active parameters\nand up to 5B total parameters. These results offer actionable insights for\ndesigning and deploying MoE models in practical large-scale training scenarios.",
"published": "2025-02-07T18:55:38+00:00",
"updated": "2025-02-07T18:55:38+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05172v1",
"entry_id": "http://arxiv.org/abs/2502.05172v1",
"categories": [
"cs.LG",
"cs.AI",
"cs.CL"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Long-tailed Medical Diagnosis with Relation-aware Representation Learning and Iterative Classifier Calibration",
"authors": [
"Li Pan",
"Yupei Zhang",
"Qiushi Yang",
"Tan Li",
"Zhen Chen"
],
"summary": "Recently computer-aided diagnosis has demonstrated promising performance,\neffectively alleviating the workload of clinicians. However, the inherent\nsample imbalance among different diseases leads algorithms biased to the\nmajority categories, leading to poor performance for rare categories. Existing\nworks formulated this challenge as a long-tailed problem and attempted to\ntackle it by decoupling the feature representation and classification. Yet, due\nto the imbalanced distribution and limited samples from tail classes, these\nworks are prone to biased representation learning and insufficient classifier\ncalibration. To tackle these problems, we propose a new Long-tailed Medical\nDiagnosis (LMD) framework for balanced medical image classification on\nlong-tailed datasets. In the initial stage, we develop a Relation-aware\nRepresentation Learning (RRL) scheme to boost the representation ability by\nencouraging the encoder to capture intrinsic semantic features through\ndifferent data augmentations. In the subsequent stage, we propose an Iterative\nClassifier Calibration (ICC) scheme to calibrate the classifier iteratively.\nThis is achieved by generating a large number of balanced virtual features and\nfine-tuning the encoder using an Expectation-Maximization manner. The proposed\nICC compensates for minority categories to facilitate unbiased classifier\noptimization while maintaining the diagnostic knowledge in majority classes.\nComprehensive experiments on three public long-tailed medical datasets\ndemonstrate that our LMD framework significantly surpasses state-of-the-art\napproaches. The source code can be accessed at\nhttps://github.com/peterlipan/LMD.",
"published": "2025-02-05T14:57:23+00:00",
"updated": "2025-02-07T18:37:47+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.03238v2",
"entry_id": "http://arxiv.org/abs/2502.03238v2",
"categories": [
"cs.CV",
"cs.AI",
"cs.LG",
"cs.MM"
],
"primary_category": "cs.CV",
"comment": "This work has been accepted in Computers in Biology and Medicine",
"journal_ref": null,
"doi": null
},
{
"title": "Smirk: An Atomically Complete Tokenizer for Molecular Foundation Models",
"authors": [
"Alexius Wadell",
"Anoushka Bhutani",
"Venkatasubramanian Viswanathan"
],
"summary": "Text-based foundation models have become an important part of scientific\ndiscovery, with molecular foundation models accelerating advancements in\nmolecular design and materials science. However, existing models are\nconstrained by closed-vocabulary tokenizers which capture only a fraction of\nmolecular space. In this work, we systematically evaluate thirty tokenizers,\nincluding 19 chemistry-specific ones, for their coverage of the SMILES\nmolecular representation language, revealing significant gaps. To assess the\nimpact of tokenizer choice, we introduce n-gram language models as a low-cost\nproxy and validate their effectiveness by training and fine-tuning 18\nRoBERTa-style encoders for molecular property prediction. To overcome the\nlimitations of existing tokenizers, we propose two new tokenizers -- Smirk and\nSmirk-GPE -- with full coverage of the OpenSMILES specification. Our results\nhighlight the need for open-vocabulary modeling and chemically diverse\nbenchmarks in cheminformatics. The proposed tokenizer framework systematically\nintegrates nuclear, electronic, and geometric degrees of freedom; this\nfacilitates applications in pharmacology, agriculture, biology, and energy\nstorage.",
"published": "2024-09-19T02:36:04+00:00",
"updated": "2025-02-07T18:36:17+00:00",
"pdf_url": "http://arxiv.org/pdf/2409.15370v2",
"entry_id": "http://arxiv.org/abs/2409.15370v2",
"categories": [
"cs.LG",
"cs.AI",
"physics.chem-ph",
"q-bio.BM"
],
"primary_category": "cs.LG",
"comment": "33 pages, 6 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Masked Diffusion Models are Secretly Time-Agnostic Masked Models and Exploit Inaccurate Categorical Sampling",
"authors": [
"Kaiwen Zheng",
"Yongxin Chen",
"Hanzi Mao",
"Ming-Yu Liu",
"Jun Zhu",
"Qinsheng Zhang"
],
"summary": "Masked diffusion models (MDMs) have emerged as a popular research topic for\ngenerative modeling of discrete data, thanks to their superior performance over\nother discrete diffusion models, and are rivaling the auto-regressive models\n(ARMs) for language modeling tasks. The recent effort in simplifying the masked\ndiffusion framework further leads to alignment with continuous-space diffusion\nmodels and more principled training and sampling recipes. In this paper,\nhowever, we reveal that both training and sampling of MDMs are theoretically\nfree from the time variable, arguably the key signature of diffusion models,\nand are instead equivalent to masked models. The connection on the sampling\naspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we\nshow that the FHS is theoretically equivalent to MDMs' original generation\nprocess while significantly alleviating the time-consuming categorical sampling\nand achieving a 20$\\times$ speedup. In addition, our investigation raises\ndoubts about whether MDMs can truly beat ARMs in text generation. We identify,\nfor the first time, an underlying numerical issue, even with the commonly used\n32-bit floating-point precision, which results in inaccurate categorical\nsampling. We show that it lowers the effective temperature both theoretically\nand empirically, and the resulting decrease in token diversity makes previous\nevaluations, which assess the generation quality solely through the incomplete\ngenerative perplexity metric, somewhat unfair.",
"published": "2024-09-04T17:48:19+00:00",
"updated": "2025-02-07T18:35:02+00:00",
"pdf_url": "http://arxiv.org/pdf/2409.02908v4",
"entry_id": "http://arxiv.org/abs/2409.02908v4",
"categories": [
"cs.LG",
"cs.AI",
"cs.CL"
],
"primary_category": "cs.LG",
"comment": "Accepted at ICLR 2025",
"journal_ref": null,
"doi": null
},
{
"title": "Simplicity Prevails: Rethinking Negative Preference Optimization for LLM Unlearning",
"authors": [
"Chongyu Fan",
"Jiancheng Liu",
"Licong Lin",
"Jinghan Jia",
"Ruiqi Zhang",
"Song Mei",
"Sijia Liu"
],
"summary": "This work studies the problem of large language model (LLM) unlearning,\naiming to remove unwanted data influences (e.g., copyrighted or harmful\ncontent) while preserving model utility. Despite the increasing demand for\nunlearning, a technically-grounded optimization framework is lacking. Gradient\nascent (GA)-type methods, though widely used, are suboptimal as they reverse\nthe learning process without controlling optimization divergence (i.e.,\ndeviation from the pre-trained state), leading to risks of over-forgetting and\npotential model collapse. Negative preference optimization (NPO) has been\nproposed to address this issue and is considered one of the state-of-the-art\nLLM unlearning approaches. In this work, we revisit NPO and identify another\ncritical issue: reference model bias. This bias arises from using the reference\nmodel (i.e., the model prior to unlearning) to evaluate the unlearning success,\nwhich can compromise NPO's effectiveness. Specifically, it leads to (a) uneven\nallocation of optimization power across forget data with varying difficulty\nlevels and (b) ineffective gradient weight smoothing during the early stages of\nunlearning optimization. To overcome these challenges, we propose a simple yet\neffective unlearning optimization framework, called SimNPO, showing that\n`simplicity' in removing the reliance on a reference model (through the lens of\nsimple preference optimization) benefits unlearning. We provide deeper insights\ninto SimNPO's advantages through an analysis based on mixtures of Markov\nchains. Extensive experiments further validate SimNPO's efficacy on benchmarks\nlike TOFU and MUSE, as well as its robustness against relearning attacks. Codes\nare available at https://github.com/OPTML-Group/Unlearn-Simple.",
"published": "2024-10-09T17:58:12+00:00",
"updated": "2025-02-07T18:34:28+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.07163v3",
"entry_id": "http://arxiv.org/abs/2410.07163v3",
"categories": [
"cs.CL",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.CL",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Transforming Science with Large Language Models: A Survey on AI-assisted Scientific Discovery, Experimentation, Content Generation, and Evaluation",
"authors": [
"Steffen Eger",
"Yong Cao",
"Jennifer D'Souza",
"Andreas Geiger",
"Christian Greisinger",
"Stephanie Gross",
"Yufang Hou",
"Brigitte Krenn",
"Anne Lauscher",
"Yizhi Li",
"Chenghua Lin",
"Nafise Sadat Moosavi",
"Wei Zhao",
"Tristan Miller"
],
"summary": "With the advent of large multimodal language models, science is now at a\nthreshold of an AI-based technological transformation. Recently, a plethora of\nnew AI models and tools has been proposed, promising to empower researchers and\nacademics worldwide to conduct their research more effectively and efficiently.\nThis includes all aspects of the research cycle, especially (1) searching for\nrelevant literature; (2) generating research ideas and conducting\nexperimentation; generating (3) text-based and (4) multimodal content (e.g.,\nscientific figures and diagrams); and (5) AI-based automatic peer review. In\nthis survey, we provide an in-depth overview over these exciting recent\ndevelopments, which promise to fundamentally alter the scientific research\nprocess for good. Our survey covers the five aspects outlined above, indicating\nrelevant datasets, methods and results (including evaluation) as well as\nlimitations and scope for future research. Ethical concerns regarding\nshortcomings of these tools and potential for misuse (fake science, plagiarism,\nharms to research integrity) take a particularly prominent place in our\ndiscussion. We hope that our survey will not only become a reference guide for\nnewcomers to the field but also a catalyst for new AI-based initiatives in the\narea of \"AI4Science\".",
"published": "2025-02-07T18:26:45+00:00",
"updated": "2025-02-07T18:26:45+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05151v1",
"entry_id": "http://arxiv.org/abs/2502.05151v1",
"categories": [
"cs.CL",
"cs.AI",
"cs.CV",
"cs.LG"
],
"primary_category": "cs.CL",
"comment": "Work in progress. Will be updated soon",
"journal_ref": null,
"doi": null
},
{
"title": "LP-DETR: Layer-wise Progressive Relations for Object Detection",
"authors": [
"Zhengjian Kang",
"Ye Zhang",
"Xiaoyu Deng",
"Xintao Li",
"Yongzhe Zhang"
],
"summary": "This paper presents LP-DETR (Layer-wise Progressive DETR), a novel approach\nthat enhances DETR-based object detection through multi-scale relation\nmodeling. Our method introduces learnable spatial relationships between object\nqueries through a relation-aware self-attention mechanism, which adaptively\nlearns to balance different scales of relations (local, medium and global)\nacross decoder layers. This progressive design enables the model to effectively\ncapture evolving spatial dependencies throughout the detection pipeline.\nExtensive experiments on COCO 2017 dataset demonstrate that our method improves\nboth convergence speed and detection accuracy compared to standard\nself-attention module. The proposed method achieves competitive results,\nreaching 52.3\\% AP with 12 epochs and 52.5\\% AP with 24 epochs using ResNet-50\nbackbone, and further improving to 58.0\\% AP with Swin-L backbone. Furthermore,\nour analysis reveals an interesting pattern: the model naturally learns to\nprioritize local spatial relations in early decoder layers while gradually\nshifting attention to broader contexts in deeper layers, providing valuable\ninsights for future research in object detection.",
"published": "2025-02-07T18:25:28+00:00",
"updated": "2025-02-07T18:25:28+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05147v1",
"entry_id": "http://arxiv.org/abs/2502.05147v1",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": "7 pages, 4 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Rejecting Hallucinated State Targets during Planning",
"authors": [
"Mingde Zhao",
"Tristan Sylvain",
"Romain Laroche",
"Doina Precup",
"Yoshua Bengio"
],
"summary": "Generative models can be used in planning to propose targets corresponding to\nstates or observations that agents deem either likely or advantageous to\nexperience. However, agents can struggle with hallucinated, infeasible targets\nproposed by the models, leading to delusional planning behaviors, which raises\nsafety concerns. Drawing inspiration from the human brain, we propose to reject\nthese hallucinated targets with an add-on target evaluator. Without proper\ntraining, however, the evaluator can produce delusional estimates, rendering it\nfutile. We propose to address this via a combination of learning rule,\narchitecture, and two novel hindsight relabeling strategies, which leads to\ncorrect evaluations of infeasible targets. Our experiments confirm that our\napproach significantly reduces delusional behaviors and enhances the\nperformance of planning agents.",
"published": "2024-10-09T17:35:25+00:00",
"updated": "2025-02-07T18:10:13+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.07096v6",
"entry_id": "http://arxiv.org/abs/2410.07096v6",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": "[20250207 13h10]: https://github.com/mila-iqia/delusions",
"journal_ref": null,
"doi": null
},
{
"title": "ADAPT to Robustify Prompt Tuning Vision Transformers",
"authors": [
"Masih Eskandar",
"Tooba Imtiaz",
"Zifeng Wang",
"Jennifer Dy"
],
"summary": "The performance of deep models, including Vision Transformers, is known to be\nvulnerable to adversarial attacks. Many existing defenses against these\nattacks, such as adversarial training, rely on full-model fine-tuning to induce\nrobustness in the models. These defenses require storing a copy of the entire\nmodel, that can have billions of parameters, for each task. At the same time,\nparameter-efficient prompt tuning is used to adapt large transformer-based\nmodels to downstream tasks without the need to save large copies. In this\npaper, we examine parameter-efficient prompt tuning of Vision Transformers for\ndownstream tasks under the lens of robustness. We show that previous\nadversarial defense methods, when applied to the prompt tuning paradigm, suffer\nfrom gradient obfuscation and are vulnerable to adaptive attacks. We introduce\nADAPT, a novel framework for performing adaptive adversarial training in the\nprompt tuning paradigm. Our method achieves competitive robust accuracy of ~40%\nw.r.t. SOTA robustness methods using full-model fine-tuning, by tuning only ~1%\nof the number of parameters.",
"published": "2024-03-19T23:13:40+00:00",
"updated": "2025-02-07T18:04:48+00:00",
"pdf_url": "http://arxiv.org/pdf/2403.13196v2",
"entry_id": "http://arxiv.org/abs/2403.13196v2",
"categories": [
"cs.LG",
"cs.AI",
"cs.CV",
"stat.ML"
],
"primary_category": "cs.LG",
"comment": "Published in Transactions on Machine Learning Research (2025)",
"journal_ref": null,
"doi": null
},
{
"title": "Latent Swap Joint Diffusion for Long-Form Audio Generation",
"authors": [
"Yusheng Dai",
"Chenxi Wang",
"Chang Li",
"Chen Wang",
"Jun Du",
"Kewei Li",
"Ruoyu Wang",
"Jiefeng Ma",
"Lei Sun",
"Jianqing Gao"
],
"summary": "Previous work on long-form audio generation using global-view diffusion or\niterative generation demands significant training or inference costs. While\nrecent advancements in multi-view joint diffusion for panoramic generation\nprovide an efficient option, they struggle with spectrum generation with severe\noverlap distortions and high cross-view consistency costs. We initially explore\nthis phenomenon through the connectivity inheritance of latent maps and uncover\nthat averaging operations excessively smooth the high-frequency components of\nthe latent map. To address these issues, we propose Swap Forward (SaFa), a\nframe-level latent swap framework that synchronizes multiple diffusions to\nproduce a globally coherent long audio with more spectrum details in a\nforward-only manner. At its core, the bidirectional Self-Loop Latent Swap is\napplied between adjacent views, leveraging stepwise diffusion trajectory to\nadaptively enhance high-frequency components without disrupting low-frequency\ncomponents. Furthermore, to ensure cross-view consistency, the unidirectional\nReference-Guided Latent Swap is applied between the reference and the\nnon-overlap regions of each subview during the early stages, providing\ncentralized trajectory guidance. Quantitative and qualitative experiments\ndemonstrate that SaFa significantly outperforms existing joint diffusion\nmethods and even training-based long audio generation models. Moreover, we find\nthat it also adapts well to panoramic generation, achieving comparable\nstate-of-the-art performance with greater efficiency and model\ngeneralizability. Project page is available at https://swapforward.github.io/.",
"published": "2025-02-07T18:02:47+00:00",
"updated": "2025-02-07T18:02:47+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05130v1",
"entry_id": "http://arxiv.org/abs/2502.05130v1",
"categories": [
"cs.SD",
"cs.AI",
"cs.CV",
"cs.MM",
"eess.AS"
],
"primary_category": "cs.SD",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Predicting Steady-State Behavior in Complex Networks with Graph Neural Networks",
"authors": [
"Priodyuti Pradhan",
"Amit Reza"
],
"summary": "In complex systems, information propagation can be defined as diffused or\ndelocalized, weakly localized, and strongly localized. This study investigates\nthe application of graph neural network models to learn the behavior of a\nlinear dynamical system on networks. A graph convolution and attention-based\nneural network framework has been developed to identify the steady-state\nbehavior of the linear dynamical system. We reveal that our trained model\ndistinguishes the different states with high accuracy. Furthermore, we have\nevaluated model performance with real-world data. In addition, to understand\nthe explainability of our model, we provide an analytical derivation for the\nforward and backward propagation of our framework.",
"published": "2025-02-02T17:29:10+00:00",
"updated": "2025-02-07T17:40:28+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.01693v2",
"entry_id": "http://arxiv.org/abs/2502.01693v2",
"categories": [
"cs.LG",
"cs.AI",
"nlin.AO"
],
"primary_category": "cs.LG",
"comment": "13 pages, 7 figures",
"journal_ref": null,
"doi": null
},
{
"title": "\"It Felt Like I Was Left in the Dark\": Exploring Information Needs and Design Opportunities for Family Caregivers of Older Adult Patients in Critical Care Settings",
"authors": [
"Shihan Fu",
"Bingsheng Yao",
"Smit Desai",
"Yuqi Hu",
"Yuling Sun",
"Samantha Stonbraker",
"Yanjun Gao",
"Elizabeth M. Goldberg",
"Dakuo Wang"
],
"summary": "Older adult patients constitute a rapidly growing subgroup of Intensive Care\nUnit (ICU) patients. In these situations, their family caregivers are expected\nto represent the unconscious patients to access and interpret patients' medical\ninformation. However, caregivers currently have to rely on overloaded\nclinicians for information updates and typically lack the health literacy to\nunderstand complex medical information. Our project aims to explore the\ninformation needs of caregivers of ICU older adult patients, from which we can\npropose design opportunities to guide future AI systems. The project begins\nwith formative interviews with 11 caregivers to identify their challenges in\naccessing and interpreting medical information; From these findings, we then\nsynthesize design requirements and propose an AI system prototype to cope with\ncaregivers' challenges. The system prototype has two key features: a timeline\nvisualization to show the AI extracted and summarized older adult patients' key\nmedical events; and an LLM-based chatbot to provide context-aware informational\nsupport. We conclude our paper by reporting on the follow-up user evaluation of\nthe system and discussing future AI-based systems for ICU caregivers of older\nadults.",
"published": "2025-02-07T17:38:10+00:00",
"updated": "2025-02-07T17:38:10+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05115v1",
"entry_id": "http://arxiv.org/abs/2502.05115v1",
"categories": [
"cs.HC",
"cs.AI"
],
"primary_category": "cs.HC",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Flexible and Efficient Grammar-Constrained Decoding",
"authors": [
"Kanghee Park",
"Timothy Zhou",
"Loris D'Antoni"
],
"summary": "Large Language Models (LLMs) are often asked to generate structured outputs\nthat obey precise syntactic rules, such as code snippets or formatted data.\nGrammar-constrained decoding (GCD) can guarantee that LLM outputs matches such\nrules by masking out tokens that will provably lead to outputs that do not\nbelong to a specified context-free grammar (CFG). To guarantee soundness, GCD\nalgorithms have to compute how a given LLM subword tokenizer can align with the\ntokens used\n by a given context-free grammar and compute token masks based on this\ninformation. Doing so efficiently is challenging and existing GCD algorithms\nrequire tens of minutes to preprocess common grammars. We present a new GCD\nalgorithm together with an implementation that offers 17.71x faster offline\npreprocessing than existing approaches while preserving state-of-the-art\nefficiency in online mask computation.",
"published": "2025-02-07T17:35:17+00:00",
"updated": "2025-02-07T17:35:17+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05111v1",
"entry_id": "http://arxiv.org/abs/2502.05111v1",
"categories": [
"cs.CL",
"cs.AI"
],
"primary_category": "cs.CL",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "ApplE: An Applied Ethics Ontology with Event Context",
"authors": [
"Aisha Aijaz",
"Raghava Mutharaju",
"Manohar Kumar"
],
"summary": "Applied ethics is ubiquitous in most domains, requiring much deliberation due\nto its philosophical nature. Varying views often lead to conflicting courses of\naction where ethical dilemmas become challenging to resolve. Although many\nfactors contribute to such a decision, the major driving forces can be\ndiscretized and thus simplified to provide an indicative answer. Knowledge\nrepresentation and reasoning offer a way to explicitly translate abstract\nethical concepts into applicable principles within the context of an event. To\nachieve this, we propose ApplE, an Applied Ethics ontology that captures\nphilosophical theory and event context to holistically describe the morality of\nan action. The development process adheres to a modified version of the\nSimplified Agile Methodology for Ontology Development (SAMOD) and utilizes\nstandard design and publication practices. Using ApplE, we model a use case\nfrom the bioethics domain that demonstrates our ontology's social and\nscientific value. Apart from the ontological reasoning and quality checks,\nApplE is also evaluated using the three-fold testing process of SAMOD. ApplE\nfollows FAIR principles and aims to be a viable resource for applied ethicists\nand ontology engineers.",
"published": "2025-02-07T17:34:50+00:00",
"updated": "2025-02-07T17:34:50+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05110v1",
"entry_id": "http://arxiv.org/abs/2502.05110v1",
"categories": [
"cs.CY",
"cs.AI"
],
"primary_category": "cs.CY",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Grounding Continuous Representations in Geometry: Equivariant Neural Fields",
"authors": [
"David R Wessels",
"David M Knigge",
"Samuele Papa",
"Riccardo Valperga",
"Sharvaree Vadgama",
"Efstratios Gavves",
"Erik J Bekkers"
],
"summary": "Conditional Neural Fields (CNFs) are increasingly being leveraged as\ncontinuous signal representations, by associating each data-sample with a\nlatent variable that conditions a shared backbone Neural Field (NeF) to\nreconstruct the sample. However, existing CNF architectures face limitations\nwhen using this latent downstream in tasks requiring fine-grained geometric\nreasoning, such as classification and segmentation. We posit that this results\nfrom lack of explicit modelling of geometric information (e.g., locality in the\nsignal or the orientation of a feature) in the latent space of CNFs. As such,\nwe propose Equivariant Neural Fields (ENFs), a novel CNF architecture which\nuses a geometry-informed cross-attention to condition the NeF on a geometric\nvariable--a latent point cloud of features--that enables an equivariant\ndecoding from latent to field. We show that this approach induces a\nsteerability property by which both field and latent are grounded in geometry\nand amenable to transformation laws: if the field transforms, the latent\nrepresentation transforms accordingly--and vice versa. Crucially, this\nequivariance relation ensures that the latent is capable of (1) representing\ngeometric patterns faithfully, allowing for geometric reasoning in latent\nspace, and (2) weight-sharing over similar local patterns, allowing for\nefficient learning of datasets of fields. We validate these main properties in\na range of tasks including classification, segmentation, forecasting,\nreconstruction and generative modelling, showing clear improvement over\nbaselines with a geometry-free latent space. Code attached to submission\nhttps://github.com/Dafidofff/enf-jax. Code for a clean and minimal repo\nhttps://github.com/david-knigge/enf-min-jax.",
"published": "2024-06-09T12:16:30+00:00",
"updated": "2025-02-07T17:31:20+00:00",
"pdf_url": "http://arxiv.org/pdf/2406.05753v5",
"entry_id": "http://arxiv.org/abs/2406.05753v5",
"categories": [
"cs.LG",
"cs.AI",
"cs.CV"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Pareto-Optimal Learning from Preferences with Hidden Context",
"authors": [
"Ryan Bahlous-Boldi",
"Li Ding",
"Lee Spector",
"Scott Niekum"
],
"summary": "Ensuring AI models align with human values is essential for their safety and\nfunctionality. Reinforcement learning from human feedback (RLHF) leverages\nhuman preferences to achieve this alignment. However, when preferences are\nsourced from diverse populations, point estimates of reward can result in\nsuboptimal performance or be unfair to specific groups. We propose Pareto\nOptimal Preference Learning (POPL), which enables pluralistic alignment by\nframing discrepant group preferences as objectives with potential trade-offs,\naiming for policies that are Pareto-optimal on the preference dataset. POPL\nutilizes lexicase selection, an iterative process that selects diverse and\nPareto-optimal solutions. Our theoretical and empirical evaluations demonstrate\nthat POPL surpasses baseline methods in learning sets of reward functions and\npolicies, effectively catering to distinct groups without access to group\nnumbers or membership labels. We verify the performance of POPL on a stateless\npreference learning setting, a Minigrid RL domain, Metaworld robotics\nbenchmarks, as well as large language model (LLM) fine-tuning. We illustrate\nthat POPL can also serve as a foundation for techniques optimizing specific\nnotions of group fairness, ensuring safe and equitable AI model alignment.",
"published": "2024-06-21T18:57:38+00:00",
"updated": "2025-02-07T17:29:48+00:00",
"pdf_url": "http://arxiv.org/pdf/2406.15599v2",
"entry_id": "http://arxiv.org/abs/2406.15599v2",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Leveraging Hypernetworks and Learnable Kernels for Consumer Energy Forecasting Across Diverse Consumer Types",
"authors": [
"Muhammad Umair Danish",
"Katarina Grolinger"
],
"summary": "Consumer energy forecasting is essential for managing energy consumption and\nplanning, directly influencing operational efficiency, cost reduction,\npersonalized energy management, and sustainability efforts. In recent years,\ndeep learning techniques, especially LSTMs and transformers, have been greatly\nsuccessful in the field of energy consumption forecasting. Nevertheless, these\ntechniques have difficulties in capturing complex and sudden variations, and,\nmoreover, they are commonly examined only on a specific type of consumer (e.g.,\nonly offices, only schools). Consequently, this paper proposes HyperEnergy, a\nconsumer energy forecasting strategy that leverages hypernetworks for improved\nmodeling of complex patterns applicable across a diversity of consumers.\nHypernetwork is responsible for predicting the parameters of the primary\nprediction network, in our case LSTM. A learnable adaptable kernel, comprised\nof polynomial and radial basis function kernels, is incorporated to enhance\nperformance. The proposed HyperEnergy was evaluated on diverse consumers\nincluding, student residences, detached homes, a home with electric vehicle\ncharging, and a townhouse. Across all consumer types, HyperEnergy consistently\noutperformed 10 other techniques, including state-of-the-art models such as\nLSTM, AttentionLSTM, and transformer.",
"published": "2025-02-07T17:25:54+00:00",
"updated": "2025-02-07T17:25:54+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05104v1",
"entry_id": "http://arxiv.org/abs/2502.05104v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": "IEEE Transactions on Power Delivery, Volume 40, 2025, Pages 75-87",
"doi": "10.1109/TPWRD.2024.3486010"
},
{
"title": "Learning Temporal Invariance in Android Malware Detectors",
"authors": [
"Xinran Zheng",
"Shuo Yang",
"Edith C. H. Ngai",
"Suman Jana",
"Lorenzo Cavallaro"
],
"summary": "Learning-based Android malware detectors degrade over time due to natural\ndistribution drift caused by malware variants and new families. This paper\nsystematically investigates the challenges classifiers trained with empirical\nrisk minimization (ERM) face against such distribution shifts and attributes\ntheir shortcomings to their inability to learn stable discriminative features.\nInvariant learning theory offers a promising solution by encouraging models to\ngenerate stable representations crossing environments that expose the\ninstability of the training set. However, the lack of prior environment labels,\nthe diversity of drift factors, and low-quality representations caused by\ndiverse families make this task challenging. To address these issues, we\npropose TIF, the first temporal invariant training framework for malware\ndetection, which aims to enhance the ability of detectors to learn stable\nrepresentations across time. TIF organizes environments based on application\nobservation dates to reveal temporal drift, integrating specialized multi-proxy\ncontrastive learning and invariant gradient alignment to generate and align\nenvironments with high-quality, stable representations. TIF can be seamlessly\nintegrated into any learning-based detector. Experiments on a decade-long\ndataset show that TIF excels, particularly in early deployment stages,\naddressing real-world needs and outperforming state-of-the-art methods.",
"published": "2025-02-07T17:17:42+00:00",
"updated": "2025-02-07T17:17:42+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05098v1",
"entry_id": "http://arxiv.org/abs/2502.05098v1",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Lost in Time: Clock and Calendar Understanding Challenges in Multimodal LLMs",
"authors": [
"Rohit Saxena",
"Aryo Pradipta Gema",
"Pasquale Minervini"
],
"summary": "Understanding time from visual representations is a fundamental cognitive\nskill, yet it remains a challenge for multimodal large language models (MLLMs).\nIn this work, we investigate the capabilities of MLLMs in interpreting time and\ndate through analogue clocks and yearly calendars. To facilitate this, we\ncurated a structured dataset comprising two subsets: 1) $\\textit{ClockQA}$,\nwhich comprises various types of clock styles$-$standard, black-dial,\nno-second-hand, Roman numeral, and arrow-hand clocks$-$paired with time related\nquestions; and 2) $\\textit{CalendarQA}$, which consists of yearly calendar\nimages with questions ranging from commonly known dates (e.g., Christmas, New\nYear's Day) to computationally derived ones (e.g., the 100th or 153rd day of\nthe year). We aim to analyse how MLLMs can perform visual recognition,\nnumerical reasoning, and temporal inference when presented with time-related\nvisual data. Our evaluations show that despite recent advancements, reliably\nunderstanding time remains a significant challenge for MLLMs.",
"published": "2025-02-07T17:11:23+00:00",
"updated": "2025-02-07T17:11:23+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05092v1",
"entry_id": "http://arxiv.org/abs/2502.05092v1",
"categories": [
"cs.CV",
"cs.AI",
"cs.CL"
],
"primary_category": "cs.CV",
"comment": "Preprint",
"journal_ref": null,
"doi": null
},
{
"title": "Mitigating Unintended Memorization with LoRA in Federated Learning for LLMs",
"authors": [
"Thierry Bossy",
"Julien Vignoud",
"Tahseen Rabbani",
"Juan R. Troncoso Pastoriza",
"Martin Jaggi"
],
"summary": "Federated learning (FL) is a popular paradigm for collaborative training\nwhich avoids direct data exposure between clients. However, data privacy issues\nstill remain: FL-trained large language models are capable of memorizing and\ncompleting phrases and sentences contained in training data when given with\ntheir prefixes. Thus, it is possible for adversarial and honest-but-curious\nclients to recover training data of other participants simply through targeted\nprompting. In this work, we demonstrate that a popular and simple fine-tuning\nstrategy, low-rank adaptation (LoRA), reduces memorization during FL up to a\nfactor of 10. We study this effect by performing a medical question-answering\nfine-tuning task and injecting multiple replicas of out-of-distribution\nsensitive sequences drawn from an external clinical dataset. We observe a\nreduction in memorization for a wide variety of Llama 2 and 3 models, and find\nthat LoRA can reduce memorization in centralized learning as well. Furthermore,\nwe show that LoRA can be combined with other privacy-preserving techniques such\nas gradient clipping and Gaussian noising, secure aggregation, and Goldfish\nloss to further improve record-level privacy while maintaining performance.",
"published": "2025-02-07T17:04:39+00:00",
"updated": "2025-02-07T17:04:39+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05087v1",
"entry_id": "http://arxiv.org/abs/2502.05087v1",
"categories": [
"cs.LG",
"cs.AI",
"cs.CL"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Causality can systematically address the monsters under the bench(marks)",
"authors": [
"Felix Leeb",
"Zhijing Jin",
"Bernhard Schölkopf"
],
"summary": "Effective and reliable evaluation is essential for advancing empirical\nmachine learning. However, the increasing accessibility of generalist models\nand the progress towards ever more complex, high-level tasks make systematic\nevaluation more challenging. Benchmarks are plagued by various biases,\nartifacts, or leakage, while models may behave unreliably due to poorly\nexplored failure modes. Haphazard treatments and inconsistent formulations of\nsuch \"monsters\" can contribute to a duplication of efforts, a lack of trust in\nresults, and unsupported inferences. In this position paper, we argue causality\noffers an ideal framework to systematically address these challenges. By making\ncausal assumptions in an approach explicit, we can faithfully model phenomena,\nformulate testable hypotheses with explanatory power, and leverage principled\ntools for analysis. To make causal model design more accessible, we identify\nseveral useful Common Abstract Topologies (CATs) in causal graphs which help\ngain insight into the reasoning abilities in large language models. Through a\nseries of case studies, we demonstrate how the precise yet pragmatic language\nof causality clarifies the strengths and limitations of a method and inspires\nnew approaches for systematic progress.",
"published": "2025-02-07T17:01:37+00:00",
"updated": "2025-02-07T17:01:37+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05085v1",
"entry_id": "http://arxiv.org/abs/2502.05085v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "ChallengeMe: An Adversarial Learning-enabled Text Summarization Framework",
"authors": [
"Xiaoyu Deng",
"Ye Zhang",
"Tianmin Guo",
"Yongzhe Zhang",
"Zhengjian Kang",
"Hang Yang"
],
"summary": "The astonishing performance of large language models (LLMs) and their\nremarkable achievements in production and daily life have led to their\nwidespread application in collaborative tasks. However, current large models\nface challenges such as hallucination and lack of specificity in content\ngeneration in vertical domain tasks. Inspired by the contrast and\nclassification mechanisms in human cognitive processes, this paper constructs\nan adversarial learning-based prompt framework named ChallengeMe, which\nincludes three cascaded solutions: generation prompts, evaluation prompts, and\nfeedback optimization. In this process, we designed seven core optimization\ndimensions and set the threshold for adversarial learning. The results of mixed\ncase studies on the text summarization task show that the proposed framework\ncan generate more accurate and fluent text summaries compared to the current\nadvanced mainstream LLMs.",
"published": "2025-02-07T16:59:34+00:00",
"updated": "2025-02-07T16:59:34+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05084v1",
"entry_id": "http://arxiv.org/abs/2502.05084v1",
"categories": [
"cs.CL",
"cs.AI"
],
"primary_category": "cs.CL",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Adaptive Graph of Thoughts: Test-Time Adaptive Reasoning Unifying Chain, Tree, and Graph Structures",
"authors": [
"Tushar Pandey",
"Ara Ghukasyan",
"Oktay Goktas",
"Santosh Kumar Radha"
],
"summary": "Large Language Models (LLMs) have demonstrated impressive reasoning\ncapabilities, yet their performance is highly dependent on the prompting\nstrategy and model scale. While reinforcement learning and fine-tuning have\nbeen deployed to boost reasoning, these approaches incur substantial\ncomputational and data overhead. In this work, we introduce Adaptive Graph of\nThoughts (AGoT), a dynamic, graph-based inference framework that enhances LLM\nreasoning solely at test time. Rather than relying on fixed-step methods like\nChain of Thought (CoT) or Tree of Thoughts (ToT), AGoT recursively decomposes\ncomplex queries into structured subproblems, forming an dynamic directed\nacyclic graph (DAG) of interdependent reasoning steps. By selectively expanding\nonly those subproblems that require further analysis, AGoT unifies the\nstrengths of chain, tree, and graph paradigms into a cohesive framework that\nallocates computation where it is most needed. We validate our approach on\ndiverse benchmarks spanning multi-hop retrieval, scientific reasoning, and\nmathematical problem-solving, achieving up to 46.2% improvement on scientific\nreasoning tasks (GPQA) - comparable to gains achieved through computationally\nintensive reinforcement learning approaches and outperforming state-of-the-art\niterative approaches. These results suggest that dynamic decomposition and\nstructured recursion offer a scalable, cost-effective alternative to\npost-training modifications, paving the way for more robust, general-purpose\nreasoning in LLMs.",
"published": "2025-02-07T16:54:19+00:00",
"updated": "2025-02-07T16:54:19+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05078v1",
"entry_id": "http://arxiv.org/abs/2502.05078v1",
"categories": [
"cs.AI",
"cs.CL"
],
"primary_category": "cs.AI",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Explainable Artificial Intelligence (XAI) for Malware Analysis: A Survey of Techniques, Applications, and Open Challenges",
"authors": [
"Harikha Manthena",
"Shaghayegh Shajarian",
"Jeffrey Kimmell",
"Mahmoud Abdelsalam",
"Sajad Khorsandroo",
"Maanak Gupta"
],
"summary": "Machine learning (ML) has rapidly advanced in recent years, revolutionizing\nfields such as finance, medicine, and cybersecurity. In malware detection,\nML-based approaches have demonstrated high accuracy; however, their lack of\ntransparency poses a significant challenge. Traditional black-box models often\nfail to provide interpretable justifications for their predictions, limiting\ntheir adoption in security-critical environments where understanding the\nreasoning behind a detection is essential for threat mitigation and response.\nExplainable AI (XAI) addresses this gap by enhancing model interpretability\nwhile maintaining strong detection capabilities. This survey presents a\ncomprehensive review of state-of-the-art ML techniques for malware analysis,\nwith a specific focus on explainability methods. We examine existing XAI\nframeworks, their application in malware classification and detection, and the\nchallenges associated with making malware detection models more interpretable.\nAdditionally, we explore recent advancements and highlight open research\nchallenges in the field of explainable malware analysis. By providing a\nstructured overview of XAI-driven malware detection approaches, this survey\nserves as a valuable resource for researchers and practitioners seeking to\nbridge the gap between ML performance and explainability in cybersecurity.",
"published": "2024-09-09T08:19:33+00:00",
"updated": "2025-02-07T16:44:06+00:00",
"pdf_url": "http://arxiv.org/pdf/2409.13723v2",
"entry_id": "http://arxiv.org/abs/2409.13723v2",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Computing and Learning on Combinatorial Data",
"authors": [
"Simon Zhang"
],
"summary": "The twenty-first century is a data-driven era where human activities and\nbehavior, physical phenomena, scientific discoveries, technology advancements,\nand almost everything that happens in the world resulting in massive\ngeneration, collection, and utilization of data.\n Connectivity in data is a crucial property. A straightforward example is the\nWorld Wide Web, where every webpage is connected to other web pages through\nhyperlinks, providing a form of directed connectivity. Combinatorial data\nrefers to combinations of data items based on certain connectivity rules. Other\nforms of combinatorial data include social networks, meshes, community\nclusters, set systems, and molecules.\n This Ph.D. dissertation focuses on learning and computing with combinatorial\ndata. We study and examine topological and connectivity features within and\nacross connected data to improve the performance of learning and achieve high\nalgorithmic efficiency.",
"published": "2025-02-07T16:35:06+00:00",
"updated": "2025-02-07T16:35:06+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05063v1",
"entry_id": "http://arxiv.org/abs/2502.05063v1",
"categories": [
"cs.AI",
"cs.DM",
"cs.DS"
],
"primary_category": "cs.AI",
"comment": "Ph.D. dissertation, 503 pages, 66 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Preference-aware compensation policies for crowdsourced on-demand services",
"authors": [
"Georgina Nouli",
"Axel Parmentier",
"Maximilian Schiffer"
],
"summary": "Crowdsourced on-demand services offer benefits such as reduced costs, faster\nservice fulfillment times, greater adaptability, and contributions to\nsustainable urban transportation in on-demand delivery contexts. However, the\nsuccess of an on-demand platform that utilizes crowdsourcing relies on finding\na compensation policy that strikes a balance between creating attractive offers\nfor gig workers and ensuring profitability. In this work, we examine a dynamic\npricing problem for an on-demand platform that sets request-specific\ncompensation of gig workers in a discrete-time framework, where requests and\nworkers arrive stochastically. The operator's goal is to determine a\ncompensation policy that maximizes the total expected reward over the time\nhorizon. Our approach introduces compensation strategies that explicitly\naccount for gig worker request preferences. To achieve this, we employ the\nMultinomial Logit model to represent the acceptance probabilities of gig\nworkers, and, as a result, derive an analytical solution that utilizes\npost-decision states. Subsequently, we integrate this solution into an\napproximate dynamic programming algorithm. We compare our algorithm against\nbenchmark algorithms, including formula-based policies and an upper bound\nprovided by the full information linear programming solution. Our algorithm\ndemonstrates consistent performance across diverse settings, achieving\nimprovements of at least 2.5-7.5% in homogeneous gig worker populations and 9%\nin heterogeneous populations over benchmarks, based on fully synthetic data.\nFor real-world data, it surpasses benchmarks by 8% in weak and 20% in strong\nlocation preference scenarios.",
"published": "2025-02-07T16:33:16+00:00",
"updated": "2025-02-07T16:33:16+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05060v1",
"entry_id": "http://arxiv.org/abs/2502.05060v1",
"categories": [
"cs.LG",
"cs.AI",
"math.OC"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Differentiable Mobile Display Photometric Stereo",
"authors": [
"Gawoon Ban",
"Hyeongjun Kim",
"Seokjun Choi",
"Seungwoo Yoon",
"Seung-Hwan Baek"
],
"summary": "Display photometric stereo uses a display as a programmable light source to\nilluminate a scene with diverse illumination conditions. Recently,\ndifferentiable display photometric stereo (DDPS) demonstrated improved normal\nreconstruction accuracy by using learned display patterns. However, DDPS faced\nlimitations in practicality, requiring a fixed desktop imaging setup using a\npolarization camera and a desktop-scale monitor. In this paper, we propose a\nmore practical physics-based photometric stereo, differentiable mobile display\nphotometric stereo (DMDPS), that leverages a mobile phone consisting of a\ndisplay and a camera. We overcome the limitations of using a mobile device by\ndeveloping a mobile app and method that simultaneously displays patterns and\ncaptures high-quality HDR images. Using this technique, we capture real-world\n3D-printed objects and learn display patterns via a differentiable learning\nprocess. We demonstrate the effectiveness of DMDPS on both a 3D printed dataset\nand a first dataset of fallen leaves. The leaf dataset contains reconstructed\nsurface normals and albedos of fallen leaves that may enable future research\nbeyond computer graphics and vision. We believe that DMDPS takes a step forward\nfor practical physics-based photometric stereo.",
"published": "2025-02-07T16:24:56+00:00",
"updated": "2025-02-07T16:24:56+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05055v1",
"entry_id": "http://arxiv.org/abs/2502.05055v1",
"categories": [
"cs.CV",
"cs.AI",
"cs.GR",
"cs.LG"
],
"primary_category": "cs.CV",
"comment": "9 pages",
"journal_ref": null,
"doi": null
},
{
"title": "GenBFA: An Evolutionary Optimization Approach to Bit-Flip Attacks on LLMs",
"authors": [
"Sanjay Das",
"Swastik Bhattacharya",
"Souvik Kundu",
"Shamik Kundu",
"Anand Menon",
"Arnab Raha",
"Kanad Basu"
],
"summary": "Large Language Models (LLMs) have revolutionized natural language processing\n(NLP), excelling in tasks like text generation and summarization. However,\ntheir increasing adoption in mission-critical applications raises concerns\nabout hardware-based threats, particularly bit-flip attacks (BFAs). BFAs,\nenabled by fault injection methods such as Rowhammer, target model parameters\nin memory, compromising both integrity and performance. Identifying critical\nparameters for BFAs in the vast parameter space of LLMs poses significant\nchallenges. While prior research suggests transformer-based architectures are\ninherently more robust to BFAs compared to traditional deep neural networks, we\nchallenge this assumption. For the first time, we demonstrate that as few as\nthree bit-flips can cause catastrophic performance degradation in an LLM with\nbillions of parameters. Current BFA techniques are inadequate for exploiting\nthis vulnerability due to the difficulty of efficiently identifying critical\nparameters within the immense parameter space. To address this, we propose\nAttentionBreaker, a novel framework tailored for LLMs that enables efficient\ntraversal of the parameter space to identify critical parameters. Additionally,\nwe introduce GenBFA, an evolutionary optimization strategy designed to refine\nthe search further, isolating the most critical bits for an efficient and\neffective attack. Empirical results reveal the profound vulnerability of LLMs\nto AttentionBreaker. For example, merely three bit-flips (4.129 x 10^-9% of\ntotal parameters) in the LLaMA3-8B-Instruct 8-bit quantized (W8) model result\nin a complete performance collapse: accuracy on MMLU tasks drops from 67.3% to\n0%, and Wikitext perplexity skyrockets from 12.6 to 4.72 x 10^5. These findings\nunderscore the effectiveness of AttentionBreaker in uncovering and exploiting\ncritical vulnerabilities within LLM architectures.",
"published": "2024-11-21T00:01:51+00:00",
"updated": "2025-02-07T16:24:17+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.13757v2",
"entry_id": "http://arxiv.org/abs/2411.13757v2",
"categories": [
"cs.CR",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Conversation Routines: A Prompt Engineering Framework for Task-Oriented Dialog Systems",
"authors": [
"Giorgio Robino"
],
"summary": "This study introduces Conversation Routines (CR), a structured prompt\nengineering framework for developing task-oriented dialog systems using Large\nLanguage Models (LLMs). While LLMs demonstrate remarkable natural language\nunderstanding capabilities, engineering them to reliably execute complex\nbusiness workflows remains challenging. The proposed CR framework enables the\ndevelopment of Conversation Agentic Systems (CAS) through natural language\nspecifications, embedding task-oriented logic within LLM prompts. This approach\nprovides a systematic methodology for designing and implementing complex\nconversational workflows while maintaining behavioral consistency. We\ndemonstrate the framework's effectiveness through two proof-of-concept\nimplementations: a Train Ticket Booking System and an Interactive\nTroubleshooting Copilot. These case studies validate CR's capability to encode\nsophisticated behavioral patterns and decision logic while preserving natural\nconversational flexibility. Results show that CR enables domain experts to\ndesign conversational workflows in natural language while leveraging custom\nfunctions (tools) developed by software engineers, creating an efficient\ndivision of responsibilities where developers focus on core API implementation\nand domain experts handle conversation design. While the framework shows\npromise in accessibility and adaptability, we identify key challenges including\ncomputational overhead, non-deterministic behavior, and domain-specific logic\noptimization. Future research directions include CR evaluation methods based on\nprompt engineering frameworks driven by goal-oriented grading criteria,\nimproving scalability for complex multi-agent interactions, and enhancing\nsystem robustness to address the identified limitations across diverse business\napplications.",
"published": "2025-01-20T17:19:02+00:00",
"updated": "2025-02-07T16:18:20+00:00",
"pdf_url": "http://arxiv.org/pdf/2501.11613v4",
"entry_id": "http://arxiv.org/abs/2501.11613v4",
"categories": [
"cs.CL",
"cs.AI",
"cs.ET",
"cs.HC",
"cs.PL"
],
"primary_category": "cs.CL",
"comment": "Figure 1 substituted. Added smolagents subsection in Other Works.\n Minor format revision",
"journal_ref": null,
"doi": null
},
{
"title": "Federated Learning for Anomaly Detection in Energy Consumption Data: Assessing the Vulnerability to Adversarial Attacks",
"authors": [
"Yohannis Kifle Telila",
"Damitha Senevirathne",
"Dumindu Tissera",
"Apurva Narayan",
"Miriam A. M. Capretz",
"Katarina Grolinger"
],
"summary": "Anomaly detection is crucial in the energy sector to identify irregular\npatterns indicating equipment failures, energy theft, or other issues. Machine\nlearning techniques for anomaly detection have achieved great success, but are\ntypically centralized, involving sharing local data with a central server which\nraises privacy and security concerns. Federated Learning (FL) has been gaining\npopularity as it enables distributed learning without sharing local data.\nHowever, FL depends on neural networks, which are vulnerable to adversarial\nattacks that manipulate data, leading models to make erroneous predictions.\nWhile adversarial attacks have been explored in the image domain, they remain\nlargely unexplored in time series problems, especially in the energy domain.\nMoreover, the effect of adversarial attacks in the FL setting is also mostly\nunknown. This paper assesses the vulnerability of FL-based anomaly detection in\nenergy data to adversarial attacks. Specifically, two state-of-the-art models,\nLong Short Term Memory (LSTM) and Transformers, are used to detect anomalies in\nan FL setting, and two white-box attack methods, Fast Gradient Sign Method\n(FGSM) and Projected Gradient Descent (PGD), are employed to perturb the data.\nThe results show that FL is more sensitive to PGD attacks than to FGSM attacks,\nattributed to PGD's iterative nature, resulting in an accuracy drop of over 10%\neven with naive, weaker attacks. Moreover, FL is more affected by these attacks\nthan centralized learning, highlighting the need for defense mechanisms in FL.",
"published": "2025-02-07T16:08:20+00:00",
"updated": "2025-02-07T16:08:20+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05041v1",
"entry_id": "http://arxiv.org/abs/2502.05041v1",
"categories": [
"cs.LG",
"cs.AI",
"cs.DC",
"68",
"I.2; I.5; I.2.11; I.5.4"
],
"primary_category": "cs.LG",
"comment": "12th IEEE Conference on Technologies for Sustainability",
"journal_ref": null,
"doi": null
},
{
"title": "Bridging Voting and Deliberation with Algorithms: Field Insights from vTaiwan and Kultur Komitee",
"authors": [
"Joshua C. Yang",
"Fynn Bachmann"
],
"summary": "Democratic processes increasingly aim to integrate large-scale voting with\nface-to-face deliberation, addressing the challenge of reconciling individual\npreferences with collective decision-making. This work introduces new methods\nthat use algorithms and computational tools to bridge online voting with\nface-to-face deliberation, tested in two real-world scenarios: Kultur Komitee\n2024 (KK24) and vTaiwan. These case studies highlight the practical\napplications and impacts of the proposed methods.\n We present three key contributions: (1) Radial Clustering for Preference\nBased Subgroups, which enables both in-depth and broad discussions in\ndeliberative settings by computing homogeneous and heterogeneous group\ncompositions with balanced and adjustable group sizes; (2) Human-in-the-loop\nMES, a practical method that enhances the Method of Equal Shares (MES)\nalgorithm with real-time digital feedback. This builds algorithmic trust by\ngiving participants full control over how much decision-making is delegated to\nthe voting aggregation algorithm as compared to deliberation; and (3) the\nReadTheRoom deliberation method, which uses opinion space mapping to identify\nagreement and divergence, along with spectrum-based preference visualisation to\ntrack opinion shifts during deliberation. This approach enhances transparency\nby clarifying collective sentiment and fosters collaboration by encouraging\nparticipants to engage constructively with differing perspectives.\n By introducing these actionable frameworks, this research extends in-person\ndeliberation with scalable digital methods that address the complexities of\nmodern decision-making in participatory processes.",
"published": "2025-02-07T15:45:13+00:00",
"updated": "2025-02-07T15:45:13+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05017v1",
"entry_id": "http://arxiv.org/abs/2502.05017v1",
"categories": [
"cs.HC",
"cs.AI",
"econ.GN",
"q-fin.EC",
"91B14, 91B12, 91A12, 68T01, 68T20, 68U35",
"H.5.3; I.2.0; I.2.11; J.1; G.2.0; G.2.2; K.4.1; K.4.3"
],
"primary_category": "cs.HC",
"comment": "Submitted to ACM Conference on Fairness, Accountability, and\n Transparency (FAccT) 2025",
"journal_ref": null,
"doi": null
},
{
"title": "Analyzing Advanced AI Systems Against Definitions of Life and Consciousness",
"authors": [
"Azadeh Alavi",
"Hossein Akhoundi",
"Fatemeh Kouchmeshki"
],
"summary": "Could artificial intelligence ever become truly conscious in a functional\nsense; this paper explores that open-ended question through the lens of Life, a\nconcept unifying classical biological criteria (Oxford, NASA, Koshland) with\nempirical hallmarks such as adaptive self maintenance, emergent complexity, and\nrudimentary self referential modeling. We propose a number of metrics for\nexamining whether an advanced AI system has gained consciousness, while\nemphasizing that we do not claim all AI stems can become conscious. Rather, we\nsuggest that sufficiently advanced architectures exhibiting immune like\nsabotage defenses, mirror self-recognition analogs, or meta-cognitive updates\nmay cross key thresholds akin to life-like or consciousness-like traits. To\ndemonstrate these ideas, we start by assessing adaptive self-maintenance\ncapability, and introduce controlled data corruption sabotage into the training\nprocess. The result demonstrates AI capability to detect these inconsistencies\nand revert or self-correct analogous to regenerative biological processes. We\nalso adapt an animal-inspired mirror self recognition test to neural\nembeddings, finding that partially trained CNNs can distinguish self from\nforeign features with complete accuracy. We then extend our analysis by\nperforming a question-based mirror test on five state-of-the-art chatbots\n(ChatGPT4, Gemini, Perplexity, Claude, and Copilot) and demonstrated their\nability to recognize their own answers compared to those of the other chatbots.",
"published": "2025-02-07T15:27:34+00:00",
"updated": "2025-02-07T15:27:34+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05007v1",
"entry_id": "http://arxiv.org/abs/2502.05007v1",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": "78 pages, 15 figures, 4 tables",
"journal_ref": null,
"doi": null
},
{
"title": "A New Paradigm in Tuning Learned Indexes: A Reinforcement Learning Enhanced Approach",
"authors": [
"Taiyi Wang",
"Liang Liang",
"Guang Yang",
"Thomas Heinis",
"Eiko Yoneki"
],
"summary": "Learned Index Structures (LIS) have significantly advanced data management by\nleveraging machine learning models to optimize data indexing. However,\ndesigning these structures often involves critical trade-offs, making it\nchallenging for both designers and end-users to find an optimal balance\ntailored to specific workloads and scenarios. While some indexes offer\nadjustable parameters that demand intensive manual tuning, others rely on fixed\nconfigurations based on heuristic auto-tuners or expert knowledge, which may\nnot consistently deliver optimal performance.\n This paper introduces LITune, a novel framework for end-to-end automatic\ntuning of Learned Index Structures. LITune employs an adaptive training\npipeline equipped with a tailor-made Deep Reinforcement Learning (DRL) approach\nto ensure stable and efficient tuning. To accommodate long-term dynamics\narising from online tuning, we further enhance LITune with an on-the-fly\nupdating mechanism termed the O2 system. These innovations allow LITune to\neffectively capture state transitions in online tuning scenarios and\ndynamically adjust to changing data distributions and workloads, marking a\nsignificant improvement over other tuning methods. Our experimental results\ndemonstrate that LITune achieves up to a 98% reduction in runtime and a 17-fold\nincrease in throughput compared to default parameter settings given a selected\nLearned Index instance. These findings highlight LITune's effectiveness and its\npotential to facilitate broader adoption of LIS in real-world applications.",
"published": "2025-02-07T15:22:15+00:00",
"updated": "2025-02-07T15:22:15+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05001v1",
"entry_id": "http://arxiv.org/abs/2502.05001v1",
"categories": [
"cs.DB",
"cs.AI",
"cs.SY",
"eess.SY"
],
"primary_category": "cs.DB",
"comment": "15 pages",
"journal_ref": null,
"doi": null
},
{
"title": "Robust Graph Learning Against Adversarial Evasion Attacks via Prior-Free Diffusion-Based Structure Purification",
"authors": [
"Jiayi Luo",
"Qingyun Sun",
"Haonan Yuan",
"Xingcheng Fu",
"Jianxin Li"
],
"summary": "Adversarial evasion attacks pose significant threats to graph learning, with\nlines of studies that have improved the robustness of Graph Neural Networks\n(GNNs). However, existing works rely on priors about clean graphs or attacking\nstrategies, which are often heuristic and inconsistent. To achieve robust graph\nlearning over different types of evasion attacks and diverse datasets, we\ninvestigate this problem from a prior-free structure purification perspective.\nSpecifically, we propose a novel Diffusion-based Structure Purification\nframework named DiffSP, which creatively incorporates the graph diffusion model\nto learn intrinsic distributions of clean graphs and purify the perturbed\nstructures by removing adversaries under the direction of the captured\npredictive patterns without relying on priors. DiffSP is divided into the\nforward diffusion process and the reverse denoising process, during which\nstructure purification is achieved. To avoid valuable information loss during\nthe forward process, we propose an LID-driven nonisotropic diffusion mechanism\nto selectively inject noise anisotropically. To promote semantic alignment\nbetween the clean graph and the purified graph generated during the reverse\nprocess, we reduce the generation uncertainty by the proposed graph transfer\nentropy guided denoising mechanism. Extensive experiments demonstrate the\nsuperior robustness of DiffSP against evasion attacks.",
"published": "2025-02-07T15:21:47+00:00",
"updated": "2025-02-07T15:21:47+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.05000v1",
"entry_id": "http://arxiv.org/abs/2502.05000v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "Accepted for poster at WWW 2025",
"journal_ref": null,
"doi": null
},
{
"title": "A Strong Baseline for Molecular Few-Shot Learning",
"authors": [
"Philippe Formont",
"Hugo Jeannin",
"Pablo Piantanida",
"Ismail Ben Ayed"
],
"summary": "Few-shot learning has recently attracted significant interest in drug\ndiscovery, with a recent, fast-growing literature mostly involving convoluted\nmeta-learning strategies. We revisit the more straightforward fine-tuning\napproach for molecular data, and propose a regularized quadratic-probe loss\nbased on the Mahalanobis distance. We design a dedicated block-coordinate\ndescent optimizer, which avoids the degenerate solutions of our loss.\nInterestingly, our simple fine-tuning approach achieves highly competitive\nperformances in comparison to state-of-the-art methods, while being applicable\nto black-box settings and removing the need for specific episodic pre-training\nstrategies. Furthermore, we introduce a new benchmark to assess the robustness\nof the competing methods to domain shifts. In this setting, our fine-tuning\nbaseline obtains consistently better results than meta-learning methods.",
"published": "2024-04-02T21:20:51+00:00",
"updated": "2025-02-07T15:21:27+00:00",
"pdf_url": "http://arxiv.org/pdf/2404.02314v2",
"entry_id": "http://arxiv.org/abs/2404.02314v2",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "Published in Transactions on Machine Learning Research (02/2025)",
"journal_ref": null,
"doi": null
},
{
"title": "On Sequential Fault-Intolerant Process Planning",
"authors": [
"Andrzej Kaczmarczyk",
"Davin Choo",
"Niclas Boehmer",
"Milind Tambe",
"Haifeng Xu"
],
"summary": "We propose and study a planning problem we call Sequential Fault-Intolerant\nProcess Planning (SFIPP). SFIPP captures a reward structure common in many\nsequential multi-stage decision problems where the planning is deemed\nsuccessful only if all stages succeed. Such reward structures are different\nfrom classic additive reward structures and arise in important applications\nsuch as drug/material discovery, security, and quality-critical product design.\nWe design provably tight online algorithms for settings in which we need to\npick between different actions with unknown success chances at each stage. We\ndo so both for the foundational case in which the behavior of actions is\ndeterministic, and the case of probabilistic action outcomes, where we\neffectively balance exploration for learning and exploitation for planning\nthrough the usage of multi-armed bandit algorithms. In our empirical\nevaluations, we demonstrate that the specialized algorithms we develop, which\nleverage additional information about the structure of the SFIPP instance,\noutperform our more general algorithm.",
"published": "2025-02-07T15:20:35+00:00",
"updated": "2025-02-07T15:20:35+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04998v1",
"entry_id": "http://arxiv.org/abs/2502.04998v1",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": "20 pages; 7 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Aligning Black-box Language Models with Human Judgments",
"authors": [
"Gerrit J. J. van den Burg",
"Gen Suzuki",
"Wei Liu",
"Murat Sensoy"
],
"summary": "Large language models (LLMs) are increasingly used as automated judges to\nevaluate recommendation systems, search engines, and other subjective tasks,\nwhere relying on human evaluators can be costly, time-consuming, and\nunscalable. LLMs offer an efficient solution for continuous, automated\nevaluation. However, since the systems that are built and improved with these\njudgments are ultimately designed for human use, it is crucial that LLM\njudgments align closely with human evaluators to ensure such systems remain\nhuman-centered. On the other hand, aligning LLM judgments with human evaluators\nis challenging due to individual variability and biases in human judgments. We\npropose a simple yet effective framework to align LLM judgments with individual\nhuman evaluators or their aggregated judgments, without retraining or\nfine-tuning the LLM. Our approach learns a linear mapping between the LLM's\noutputs and human judgments, achieving over 142% average improvement in\nagreement across 29 tasks with only a small number of calibration examples used\nfor training. Notably, our method works in zero-shot and few-shot settings,\nexceeds inter-human agreement on four out of six tasks, and enables smaller\nLLMs to achieve performance comparable to that of larger models.",
"published": "2025-02-07T15:19:40+00:00",
"updated": "2025-02-07T15:19:40+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04997v1",
"entry_id": "http://arxiv.org/abs/2502.04997v1",
"categories": [
"cs.CL",
"cs.AI",
"cs.LG",
"68T50",
"I.2.7"
],
"primary_category": "cs.CL",
"comment": "Accepted for publication at NAACL 2025 (Findings)",
"journal_ref": null,
"doi": null
},
{
"title": "NV-Retriever: Improving text embedding models with effective hard-negative mining",
"authors": [
"Gabriel de Souza P. Moreira",
"Radek Osmulski",
"Mengyao Xu",
"Ronay Ak",
"Benedikt Schifferer",
"Even Oldridge"
],
"summary": "Text embedding models have been popular for information retrieval\napplications such as semantic search and Question-Answering systems based on\nRetrieval-Augmented Generation (RAG). Those models are typically Transformer\nmodels that are fine-tuned with contrastive learning objectives. One of the\nchallenging aspects of fine-tuning embedding models is the selection of high\nquality hard-negative passages for contrastive learning. In this paper we\nintroduce a family of positive-aware mining methods that use the positive\nrelevance score as an anchor for effective false negative removal, leading to\nfaster training and more accurate retrieval models. We provide an ablation\nstudy on hard-negative mining methods over their configurations, exploring\ndifferent teacher and base models. We further demonstrate the efficacy of our\nproposed mining methods at scale with the NV-Retriever-v1 model, which scores\n60.9 on MTEB Retrieval (BEIR) benchmark and placed 1st when it was published to\nthe MTEB Retrieval on July, 2024.",
"published": "2024-07-22T17:50:31+00:00",
"updated": "2025-02-07T15:17:18+00:00",
"pdf_url": "http://arxiv.org/pdf/2407.15831v2",
"entry_id": "http://arxiv.org/abs/2407.15831v2",
"categories": [
"cs.IR",
"cs.AI"
],
"primary_category": "cs.IR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "How to Learn in a Noisy World? Self-Correcting the Real-World Data Noise in Machine Translation",
"authors": [
"Yan Meng",
"Di Wu",
"Christof Monz"
],
"summary": "The massive amounts of web-mined parallel data contain large amounts of\nnoise. Semantic misalignment, as the primary source of the noise, poses a\nchallenge for training machine translation systems. In this paper, we first\nintroduce a process for simulating misalignment controlled by semantic\nsimilarity, which closely resembles misaligned sentences in real-world\nweb-crawled corpora. Under our simulated misalignment noise settings, we\nquantitatively analyze its impact on machine translation and demonstrate the\nlimited effectiveness of widely used pre-filters for noise detection. This\nunderscores the necessity of more fine-grained ways to handle hard-to-detect\nmisalignment noise. With an observation of the increasing reliability of the\nmodel's self-knowledge for distinguishing misaligned and clean data at the\ntoken level, we propose self-correction, an approach that gradually increases\ntrust in the model's self-knowledge to correct the training supervision.\nComprehensive experiments show that our method significantly improves\ntranslation performance both in the presence of simulated misalignment noise\nand when applied to real-world, noisy web-mined datasets, across a range of\ntranslation tasks.",
"published": "2024-07-02T12:15:15+00:00",
"updated": "2025-02-07T15:03:38+00:00",
"pdf_url": "http://arxiv.org/pdf/2407.02208v2",
"entry_id": "http://arxiv.org/abs/2407.02208v2",
"categories": [
"cs.CL",
"cs.AI"
],
"primary_category": "cs.CL",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Transferring Foundation Models for Generalizable Robotic Manipulation",
"authors": [
"Jiange Yang",
"Wenhui Tan",
"Chuhao Jin",
"Keling Yao",
"Bei Liu",
"Jianlong Fu",
"Ruihua Song",
"Gangshan Wu",
"Limin Wang"
],
"summary": "Improving the generalization capabilities of general-purpose robotic\nmanipulation agents in the real world has long been a significant challenge.\nExisting approaches often rely on collecting large-scale robotic data which is\ncostly and time-consuming, such as the RT-1 dataset. However, due to\ninsufficient diversity of data, these approaches typically suffer from limiting\ntheir capability in open-domain scenarios with new objects and diverse\nenvironments. In this paper, we propose a novel paradigm that effectively\nleverages language-reasoning segmentation mask generated by internet-scale\nfoundation models, to condition robot manipulation tasks. By integrating the\nmask modality, which incorporates semantic, geometric, and temporal correlation\npriors derived from vision foundation models, into the end-to-end policy model,\nour approach can effectively and robustly perceive object pose and enable\nsample-efficient generalization learning, including new object instances,\nsemantic categories, and unseen backgrounds. We first introduce a series of\nfoundation models to ground natural language demands across multiple tasks.\nSecondly, we develop a two-stream 2D policy model based on imitation learning,\nwhich processes raw images and object masks to predict robot actions with a\nlocal-global perception manner. Extensive realworld experiments conducted on a\nFranka Emika robot arm demonstrate the effectiveness of our proposed paradigm\nand policy architecture. Demos can be found in our submitted video, and more\ncomprehensive ones can be found in link1 or link2.",
"published": "2023-06-09T07:22:12+00:00",
"updated": "2025-02-07T14:58:32+00:00",
"pdf_url": "http://arxiv.org/pdf/2306.05716v5",
"entry_id": "http://arxiv.org/abs/2306.05716v5",
"categories": [
"cs.RO",
"cs.AI"
],
"primary_category": "cs.RO",
"comment": "WACV 2025, Oral",
"journal_ref": null,
"doi": null
},
{
"title": "Understanding Federated Learning from IID to Non-IID dataset: An Experimental Study",
"authors": [
"Jungwon Seo",
"Ferhat Ozgur Catak",
"Chunming Rong"
],
"summary": "As privacy concerns and data regulations grow, federated learning (FL) has\nemerged as a promising approach for training machine learning models across\ndecentralized data sources without sharing raw data. However, a significant\nchallenge in FL is that client data are often non-IID (non-independent and\nidentically distributed), leading to reduced performance compared to\ncentralized learning. While many methods have been proposed to address this\nissue, their underlying mechanisms are often viewed from different\nperspectives. Through a comprehensive investigation from gradient descent to\nFL, and from IID to non-IID data settings, we find that inconsistencies in\nclient loss landscapes primarily cause performance degradation in non-IID\nscenarios. From this understanding, we observe that existing methods can be\ngrouped into two main strategies: (i) adjusting parameter update paths and (ii)\nmodifying client loss landscapes. These findings offer a clear perspective on\naddressing non-IID challenges in FL and help guide future research in the\nfield.",
"published": "2025-01-31T21:58:15+00:00",
"updated": "2025-02-07T14:31:59+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.00182v2",
"entry_id": "http://arxiv.org/abs/2502.00182v2",
"categories": [
"cs.LG",
"cs.AI",
"stat.ML"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": "36th Norwegian ICT Conference for Research and Education, NIKT\n 2024",
"doi": null
},
{
"title": "Fast Adaptive Anti-Jamming Channel Access via Deep Q Learning and Coarse-Grained Spectrum Prediction",
"authors": [
"Jianshu Zhang",
"Xiaofu Wu",
"Junquan Hu"
],
"summary": "This paper investigates the anti-jamming channel access problem in complex\nand unknown jamming environments, where the jammer could dynamically adjust its\nstrategies to target different channels. Traditional channel hopping\nanti-jamming approaches using fixed patterns are ineffective against such\ndynamic jamming attacks. Although the emerging deep reinforcement learning\n(DRL) based dynamic channel access approach could achieve the Nash equilibrium\nunder fast-changing jamming attacks, it requires extensive training episodes.\nTo address this issue, we propose a fast adaptive anti-jamming channel access\napproach guided by the intuition of ``learning faster than the jammer\", where a\nsynchronously updated coarse-grained spectrum prediction serves as an auxiliary\ntask for the deep Q learning (DQN) based anti-jamming model. This helps the\nmodel identify a superior Q-function compared to standard DRL while\nsignificantly reducing the number of training episodes. Numerical results\nindicate that the proposed approach significantly accelerates the rate of\nconvergence in model training, reducing the required training episodes by up to\n70% compared to standard DRL. Additionally, it also achieves a 10% improvement\nin throughput over NE strategies, owing to the effective use of coarse-grained\nspectrum prediction.",
"published": "2025-02-07T14:25:28+00:00",
"updated": "2025-02-07T14:25:28+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04963v1",
"entry_id": "http://arxiv.org/abs/2502.04963v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "The Rising Threat to Emerging AI-Powered Search Engines",
"authors": [
"Zeren Luo",
"Zifan Peng",
"Yule Liu",
"Zhen Sun",
"Mingchen Li",
"Jingyi Zheng",
"Xinlei He"
],
"summary": "Recent advancements in Large Language Models (LLMs) have significantly\nenhanced the capabilities of AI-Powered Search Engines (AIPSEs), offering\nprecise and efficient responses by integrating external databases with\npre-existing knowledge. However, we observe that these AIPSEs raise risks such\nas quoting malicious content or citing malicious websites, leading to harmful\nor unverified information dissemination. In this study, we conduct the first\nsafety risk quantification on seven production AIPSEs by systematically\ndefining the threat model, risk level, and evaluating responses to various\nquery types. With data collected from PhishTank, ThreatBook, and LevelBlue, our\nfindings reveal that AIPSEs frequently generate harmful content that contains\nmalicious URLs even with benign queries (e.g., with benign keywords). We also\nobserve that directly query URL will increase the risk level while query with\nnatural language will mitigate such risk. We further perform two case studies\non online document spoofing and phishing to show the ease of deceiving AIPSEs\nin the real-world setting. To mitigate these risks, we develop an agent-based\ndefense with a GPT-4o-based content refinement tool and an XGBoost-based URL\ndetector. Our evaluation shows that our defense can effectively reduce the risk\nbut with the cost of reducing available information. Our research highlights\nthe urgent need for robust safety measures in AIPSEs.",
"published": "2025-02-07T14:15:46+00:00",
"updated": "2025-02-07T14:15:46+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04951v1",
"entry_id": "http://arxiv.org/abs/2502.04951v1",
"categories": [
"cs.CR",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Data-driven Modality Fusion: An AI-enabled Framework for Large-Scale Sensor Network Management",
"authors": [
"Hrishikesh Dutta",
"Roberto Minerva",
"Maira Alvi",
"Noel Crespi"
],
"summary": "The development and operation of smart cities rely heavily on large-scale\nInternet-of-Things (IoT) networks and sensor infrastructures that continuously\nmonitor various aspects of urban environments. These networks generate vast\namounts of data, posing challenges related to bandwidth usage, energy\nconsumption, and system scalability. This paper introduces a novel sensing\nparadigm called Data-driven Modality Fusion (DMF), designed to enhance the\nefficiency of smart city IoT network management. By leveraging correlations\nbetween timeseries data from different sensing modalities, the proposed DMF\napproach reduces the number of physical sensors required for monitoring,\nthereby minimizing energy expenditure, communication bandwidth, and overall\ndeployment costs. The framework relocates computational complexity from the\nedge devices to the core, ensuring that resource-constrained IoT devices are\nnot burdened with intensive processing tasks. DMF is validated using data from\na real-world IoT deployment in Madrid, demonstrating the effectiveness of the\nproposed system in accurately estimating traffic, environmental, and pollution\nmetrics from a reduced set of sensors. The proposed solution offers a scalable,\nefficient mechanism for managing urban IoT networks, while addressing issues of\nsensor failure and privacy concerns.",
"published": "2025-02-07T14:00:04+00:00",
"updated": "2025-02-07T14:00:04+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04937v1",
"entry_id": "http://arxiv.org/abs/2502.04937v1",
"categories": [
"cs.NI",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.NI",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Conformal Prediction for Electricity Price Forecasting in the Day-Ahead and Real-Time Balancing Market",
"authors": [
"Ciaran O'Connor",
"Mohamed Bahloul",
"Roberto Rossi",
"Steven Prestwich",
"Andrea Visentin"
],
"summary": "The integration of renewable energy into electricity markets poses\nsignificant challenges to price stability and increases the complexity of\nmarket operations. Accurate and reliable electricity price forecasting is\ncrucial for effective market participation, where price dynamics can be\nsignificantly more challenging to predict. Probabilistic forecasting, through\nprediction intervals, efficiently quantifies the inherent uncertainties in\nelectricity prices, supporting better decision-making for market participants.\nThis study explores the enhancement of probabilistic price prediction using\nConformal Prediction (CP) techniques, specifically Ensemble Batch Prediction\nIntervals and Sequential Predictive Conformal Inference. These methods provide\nprecise and reliable prediction intervals, outperforming traditional models in\nvalidity metrics. We propose an ensemble approach that combines the efficiency\nof quantile regression models with the robust coverage properties of time\nseries adapted CP techniques. This ensemble delivers both narrow prediction\nintervals and high coverage, leading to more reliable and accurate forecasts.\nWe further evaluate the practical implications of CP techniques through a\nsimulated trading algorithm applied to a battery storage system. The ensemble\napproach demonstrates improved financial returns in energy trading in both the\nDay-Ahead and Balancing Markets, highlighting its practical benefits for market\nparticipants.",
"published": "2025-02-07T13:57:47+00:00",
"updated": "2025-02-07T13:57:47+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04935v1",
"entry_id": "http://arxiv.org/abs/2502.04935v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Proactive Model Adaptation Against Concept Drift for Online Time Series Forecasting",
"authors": [
"Lifan Zhao",
"Yanyan Shen"
],
"summary": "Time series forecasting always faces the challenge of concept drift, where\ndata distributions evolve over time, leading to a decline in forecast model\nperformance. Existing solutions are based on online learning, which continually\norganize recent time series observations as new training samples and update\nmodel parameters according to the forecasting feedback on recent data. However,\nthey overlook a critical issue: obtaining ground-truth future values of each\nsample should be delayed until after the forecast horizon. This delay creates a\ntemporal gap between the training samples and the test sample. Our empirical\nanalysis reveals that the gap can introduce concept drift, causing forecast\nmodels to adapt to outdated concepts. In this paper, we present Proceed, a\nnovel proactive model adaptation framework for online time series forecasting.\nProceed first estimates the concept drift between the recently used training\nsamples and the current test sample. It then employs an adaptation generator to\nefficiently translate the estimated drift into parameter adjustments,\nproactively adapting the model to the test sample. To enhance the\ngeneralization capability of the framework, Proceed is trained on synthetic\ndiverse concept drifts. Extensive experiments on five real-world datasets\nacross various forecast models demonstrate that Proceed brings more performance\nimprovements than the state-of-the-art online learning methods, significantly\nfacilitating forecast models' resilience against concept drifts. Code is\navailable at https://github.com/SJTU-DMTai/OnlineTSF.",
"published": "2024-12-11T14:57:10+00:00",
"updated": "2025-02-07T13:54:59+00:00",
"pdf_url": "http://arxiv.org/pdf/2412.08435v3",
"entry_id": "http://arxiv.org/abs/2412.08435v3",
"categories": [
"cs.LG",
"cs.AI",
"cs.CE",
"stat.ML"
],
"primary_category": "cs.LG",
"comment": "Accepted by KDD 2025",
"journal_ref": null,
"doi": "10.1145/3690624.3709210"
},
{
"title": "Harnessing Scale and Physics: A Multi-Graph Neural Operator Framework for PDEs on Arbitrary Geometries",
"authors": [
"Zhihao Li",
"Haoze Song",
"Di Xiao",
"Zhilu Lai",
"Wei Wang"
],
"summary": "Partial Differential Equations (PDEs) underpin many scientific phenomena, yet\ntraditional computational approaches often struggle with complex, nonlinear\nsystems and irregular geometries. This paper introduces the AMG method, a\nMulti-Graph neural operator approach designed for efficiently solving PDEs on\nArbitrary geometries. AMG leverages advanced graph-based techniques and dynamic\nattention mechanisms within a novel GraphFormer architecture, enabling precise\nmanagement of diverse spatial domains and complex data interdependencies. By\nconstructing multi-scale graphs to handle variable feature frequencies and a\nphysics graph to encapsulate inherent physical properties, AMG significantly\noutperforms previous methods, which are typically limited to uniform grids. We\npresent a comprehensive evaluation of AMG across six benchmarks, demonstrating\nits consistent superiority over existing state-of-the-art models. Our findings\nhighlight the transformative potential of tailored graph neural operators in\nsurmounting the challenges faced by conventional PDE solvers. Our code and\ndatasets are available on https://github.com/lizhihao2022/AMG.",
"published": "2024-11-18T12:35:03+00:00",
"updated": "2025-02-07T13:53:41+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.15178v3",
"entry_id": "http://arxiv.org/abs/2411.15178v3",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery\n and Data Mining V.1 (KDD '25)",
"journal_ref": null,
"doi": "10.1145/3690624.3709173"
},
{
"title": "Longer Attention Span: Increasing Transformer Context Length with Sparse Graph Processing Techniques",
"authors": [
"Nathaniel Tomczak",
"Sanmukh Kuppannagari"
],
"summary": "Transformers have demonstrated great success in numerous domains including\nnatural language processing and bioinformatics. This success stems from the use\nof the attention mechanism by these models in order to represent and propagate\npairwise interactions between individual tokens of sequential data. However,\nthe primary limitation of this operation is its quadratic memory and time\ncomplexity in relation to the input's context length - the length of a sequence\nover which the interactions need to be captured. This significantly limits the\nlength of sequences that can be inferred upon by these models. Extensive\nresearch has been conducted to reduce the number of pairwise interactions to\nsub-quadratic in relation to the context length by introducing sparsity into\nthe attention mechanism through the development of sparse attention masks.\nHowever, efficient implementations that achieve \"true sparsity\" are lacking.\n In this work, we address this issue by proposing a graph computing view of\nattention where tokens are perceived as nodes of the graph and the attention\nmask determines the edges of the graph. Using this view, we develop graph\nprocessing algorithms to implement the attention mechanism. Both theoretically\nand empirically, we demonstrate that our algorithms only perform the needed\ncomputations, i.e., they are work optimal. We also perform extensive\nexperimentation using popular attention masks to explore the impact of sparsity\non execution time and achievable context length. Our experiments demonstrate\nsignificant speedups in execution times compared to state-of-the-art attention\nimplementations such as FlashAttention for large sequence lengths. We also\ndemonstrate that our algorithms are able to achieve extremely long sequence\nlengths of as high as 160 million on a single NVIDIA A100 GPU (SXM4 80GB).",
"published": "2025-01-31T22:05:00+00:00",
"updated": "2025-02-07T13:44:24+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.01659v2",
"entry_id": "http://arxiv.org/abs/2502.01659v2",
"categories": [
"cs.LG",
"cs.AI",
"cs.DC",
"cs.PF"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Cached Multi-Lora Composition for Multi-Concept Image Generation",
"authors": [
"Xiandong Zou",
"Mingzhu Shen",
"Christos-Savvas Bouganis",
"Yiren Zhao"
],
"summary": "Low-Rank Adaptation (LoRA) has emerged as a widely adopted technique in\ntext-to-image models, enabling precise rendering of multiple distinct elements,\nsuch as characters and styles, in multi-concept image generation. However,\ncurrent approaches face significant challenges when composing these LoRAs for\nmulti-concept image generation, resulting in diminished generated image\nquality. In this paper, we initially investigate the role of LoRAs in the\ndenoising process through the lens of the Fourier frequency domain. Based on\nthe hypothesis that applying multiple LoRAs could lead to \"semantic conflicts\",\nwe find that certain LoRAs amplify high-frequency features such as edges and\ntextures, whereas others mainly focus on low-frequency elements, including the\noverall structure and smooth color gradients. Building on these insights, we\ndevise a frequency domain based sequencing strategy to determine the optimal\norder in which LoRAs should be integrated during inference. This strategy\noffers a methodical and generalizable solution compared to the naive\nintegration commonly found in existing LoRA fusion techniques. To fully\nleverage our proposed LoRA order sequence determination method in multi-LoRA\ncomposition tasks, we introduce a novel, training-free framework, Cached\nMulti-LoRA (CMLoRA), designed to efficiently integrate multiple LoRAs while\nmaintaining cohesive image generation. With its flexible backbone for\nmulti-LoRA fusion and a non-uniform caching strategy tailored to individual\nLoRAs, CMLoRA has the potential to reduce semantic conflicts in LoRA\ncomposition and improve computational efficiency. Our experimental evaluations\ndemonstrate that CMLoRA outperforms state-of-the-art training-free LoRA fusion\nmethods by a significant margin -- it achieves an average improvement of\n$2.19\\%$ in CLIPScore, and $11.25\\%$ in MLLM win rate compared to LoraHub, LoRA\nComposite, and LoRA Switch.",
"published": "2025-02-07T13:41:51+00:00",
"updated": "2025-02-07T13:41:51+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04923v1",
"entry_id": "http://arxiv.org/abs/2502.04923v1",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": "The Thirteenth International Conference on Learning Representations\n (ICLR 2025)",
"journal_ref": null,
"doi": null
},
{
"title": "Complex Physics-Informed Neural Network",
"authors": [
"Chenhao Si",
"Ming Yan",
"Xin Li",
"Zhihong Xia"
],
"summary": "We propose compleX-PINN, a novel physics-informed neural network (PINN)\narchitecture that incorporates a learnable activation function inspired by\nCauchy integral theorem. By learning the parameters of the activation function,\ncompleX-PINN achieves high accuracy with just a single hidden layer. Empirical\nresults show that compleX-PINN effectively solves problems where traditional\nPINNs struggle and consistently delivers significantly higher precision, often\nby an order of magnitude.",
"published": "2025-02-07T13:36:42+00:00",
"updated": "2025-02-07T13:36:42+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04917v1",
"entry_id": "http://arxiv.org/abs/2502.04917v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "16 pages, 9 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Efficient Few-Shot Continual Learning in Vision-Language Models",
"authors": [
"Aristeidis Panos",
"Rahaf Aljundi",
"Daniel Olmeda Reino",
"Richard E. Turner"
],
"summary": "Vision-language models (VLMs) excel in tasks such as visual question\nanswering and image captioning. However, VLMs are often limited by their use of\npretrained image encoders, like CLIP, leading to image understanding errors\nthat hinder overall performance. On top of that, real-world applications often\nrequire the model to be continuously adapted as new and often limited data\ncontinuously arrive. To address this, we propose LoRSU (Low-Rank Adaptation\nwith Structured Updates), a robust and computationally efficient method for\nselectively updating image encoders within VLMs. LoRSU introduces structured\nand localized parameter updates, effectively correcting performance on\npreviously error-prone data while preserving the model's general robustness.\nOur approach leverages theoretical insights to identify and update only the\nmost critical parameters, achieving significant resource efficiency.\nSpecifically, we demonstrate that LoRSU reduces computational overhead by over\n25x compared to full VLM updates, without sacrificing performance. Experimental\nresults on VQA tasks in the few-shot continual learning setting, validate\nLoRSU's scalability, efficiency, and effectiveness, making it a compelling\nsolution for image encoder adaptation in resource-constrained environments.",
"published": "2025-02-06T14:20:55+00:00",
"updated": "2025-02-07T13:35:01+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04098v2",
"entry_id": "http://arxiv.org/abs/2502.04098v2",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Asynchronous Stochastic Gradient Descent with Decoupled Backpropagation and Layer-Wise Updates",
"authors": [
"Cabrel Teguemne Fokam",
"Khaleelulla Khan Nazeer",
"Lukas König",
"David Kappel",
"Anand Subramoney"
],
"summary": "The increasing size of deep learning models has made distributed training\nacross multiple devices essential. However, current methods such as distributed\ndata-parallel training suffer from large communication and synchronization\noverheads when training across devices, leading to longer training times as a\nresult of suboptimal hardware utilization. Asynchronous stochastic gradient\ndescent (ASGD) methods can improve training speed, but are sensitive to delays\ndue to both communication and differences in throughput. Moreover, the\nbackpropagation algorithm used within ASGD workers is bottlenecked by the\ninterlocking between its forward and backward passes. Current methods also do\nnot take advantage of the large differences in the computation required for the\nforward and backward passes. Therefore, we propose an extension to ASGD called\nPartial Decoupled ASGD (PD-ASGD) that addresses these issues. PD-ASGD uses\nseparate threads for the forward and backward passes, decoupling the updates\nand allowing for a higher ratio of forward to backward threads than the usual\n1:1 ratio, leading to higher throughput. PD-ASGD also performs layer-wise\n(partial) model updates concurrently across multiple threads. This reduces\nparameter staleness and consequently improves robustness to delays. Our\napproach yields close to state-of-the-art results while running up to\n$5.95\\times$ faster than synchronous data parallelism in the presence of\ndelays, and up to $2.14\\times$ faster than comparable ASGD algorithms by\nachieving higher model flops utilization. We mathematically describe the\ngradient bias introduced by our method, establish an upper bound, and prove\nconvergence.",
"published": "2024-10-08T12:32:36+00:00",
"updated": "2025-02-07T13:33:12+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.05985v3",
"entry_id": "http://arxiv.org/abs/2410.05985v3",
"categories": [
"cs.LG",
"cs.AI",
"cs.NE",
"G.1.6",
"I.2.6; I.5.1"
],
"primary_category": "cs.LG",
"comment": "17 pages, 5 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Scalable Parameter Design for Superconducting Quantum Circuits with Graph Neural Networks",
"authors": [
"Hao Ai",
"Yu-xi Liu"
],
"summary": "To demonstrate supremacy of quantum computing, increasingly large-scale\nsuperconducting quantum computing chips are being designed and fabricated.\nHowever, the complexity of simulating quantum systems poses a significant\nchallenge to computer-aided design of quantum chips, especially for large-scale\nchips. Harnessing the scalability of graph neural networks (GNNs), we here\npropose a parameter designing algorithm for large-scale superconducting quantum\ncircuits. The algorithm depends on the so-called 'three-stair scaling'\nmechanism, which comprises two neural-network models: an evaluator supervisedly\ntrained on small-scale circuits for applying to medium-scale circuits, and a\ndesigner unsupervisedly trained on medium-scale circuits for applying to\nlarge-scale ones. We demonstrate our algorithm in mitigating quantum crosstalk\nerrors. Frequencies for both single- and two-qubit gates (corresponding to the\nparameters of nodes and edges) are considered simultaneously. Numerical results\nindicate that the well-trained designer achieves notable advantages in\nefficiency, effectiveness, and scalability. For example, for large-scale\nsuperconducting quantum circuits consisting of around 870 qubits, our\nGNNs-based algorithm achieves 51% of the errors produced by the\nstate-of-the-art algorithm, with a time reduction from 90 min to 27 sec.\nOverall, a better-performing and more scalable algorithm for designing\nparameters of superconducting quantum chips is proposed, which initially\ndemonstrates the advantages of applying GNNs in superconducting quantum chips.",
"published": "2024-11-25T13:04:53+00:00",
"updated": "2025-02-07T13:28:59+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.16354v2",
"entry_id": "http://arxiv.org/abs/2411.16354v2",
"categories": [
"quant-ph",
"cs.AI"
],
"primary_category": "quant-ph",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "From Allies to Adversaries: Manipulating LLM Tool-Calling through Adversarial Injection",
"authors": [
"Haowei Wang",
"Rupeng Zhang",
"Junjie Wang",
"Mingyang Li",
"Yuekai Huang",
"Dandan Wang",
"Qing Wang"
],
"summary": "Tool-calling has changed Large Language Model (LLM) applications by\nintegrating external tools, significantly enhancing their functionality across\ndiverse tasks. However, this integration also introduces new security\nvulnerabilities, particularly in the tool scheduling mechanisms of LLM, which\nhave not been extensively studied. To fill this gap, we present ToolCommander,\na novel framework designed to exploit vulnerabilities in LLM tool-calling\nsystems through adversarial tool injection. Our framework employs a\nwell-designed two-stage attack strategy. Firstly, it injects malicious tools to\ncollect user queries, then dynamically updates the injected tools based on the\nstolen information to enhance subsequent attacks. These stages enable\nToolCommander to execute privacy theft, launch denial-of-service attacks, and\neven manipulate business competition by triggering unscheduled tool-calling.\nNotably, the ASR reaches 91.67% for privacy theft and hits 100% for\ndenial-of-service and unscheduled tool calling in certain cases. Our work\ndemonstrates that these vulnerabilities can lead to severe consequences beyond\nsimple misuse of tool-calling systems, underscoring the urgent need for robust\ndefensive strategies to secure LLM Tool-calling systems.",
"published": "2024-12-13T15:15:24+00:00",
"updated": "2025-02-07T13:26:18+00:00",
"pdf_url": "http://arxiv.org/pdf/2412.10198v2",
"entry_id": "http://arxiv.org/abs/2412.10198v2",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Wavelet-Assisted Multi-Frequency Attention Network for Pansharpening",
"authors": [
"Jie Huang",
"Rui Huang",
"Jinghao Xu",
"Siran Peng",
"Yule Duan",
"Liangjian Deng"
],
"summary": "Pansharpening aims to combine a high-resolution panchromatic (PAN) image with\na low-resolution multispectral (LRMS) image to produce a high-resolution\nmultispectral (HRMS) image. Although pansharpening in the frequency domain\noffers clear advantages, most existing methods either continue to operate\nsolely in the spatial domain or fail to fully exploit the benefits of the\nfrequency domain. To address this issue, we innovatively propose\nMulti-Frequency Fusion Attention (MFFA), which leverages wavelet transforms to\ncleanly separate frequencies and enable lossless reconstruction across\ndifferent frequency domains. Then, we generate Frequency-Query, Spatial-Key,\nand Fusion-Value based on the physical meanings represented by different\nfeatures, which enables a more effective capture of specific information in the\nfrequency domain. Additionally, we focus on the preservation of frequency\nfeatures across different operations. On a broader level, our network employs a\nwavelet pyramid to progressively fuse information across multiple scales.\nCompared to previous frequency domain approaches, our network better prevents\nconfusion and loss of different frequency features during the fusion process.\nQuantitative and qualitative experiments on multiple datasets demonstrate that\nour method outperforms existing approaches and shows significant generalization\ncapabilities for real-world scenarios.",
"published": "2025-02-07T13:15:49+00:00",
"updated": "2025-02-07T13:15:49+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04903v1",
"entry_id": "http://arxiv.org/abs/2502.04903v1",
"categories": [
"eess.IV",
"cs.AI",
"cs.CV"
],
"primary_category": "eess.IV",
"comment": "12 pages, 13 figures",
"journal_ref": null,
"doi": null
},
{
"title": "A-VL: Adaptive Attention for Large Vision-Language Models",
"authors": [
"Junyang Zhang",
"Mu Yuan",
"Ruiguang Zhong",
"Puhan Luo",
"Huiyou Zhan",
"Ningkang Zhang",
"Chengchen Hu",
"Xiangyang Li"
],
"summary": "The Large Vision-Language Model (LVLM) integrates computer vision and natural\nlanguage processing techniques, offering substantial application potential.\nHowever, these models demand extensive resources during inference. Adaptive\nattention techniques can dynamically reduce computational redundancy and thus\nimprove efficiency. Although current adaptive attention methods significantly\nreduce the memory requirements of Transformer-based language models, they are\nnot tailored for LVLMs. We observe that LVLMs generate responses from both\nremote image tokens and local text tokens, and different modalities have\ndifferent attention patterns. This observation inspires us to manage the\nattention for each modality separately. Specifically, for visual input, we\nstore the cache of potentially useful information but only compute the most\ncritical parts. For language input, we care more about local information. Based\non our observation and analysis of vision-language attention patterns, we\ndevelop A-VL, a plug-and-play adaptive attention tailored for LVLM inference.\nExtensive evaluations on three vision-language tasks and five datasets show the\neffectiveness of our designs. Our approach A-VL outperforms existing adaptive\nattention methods in reducing memory usage and computational load without\ncompromising performance.",
"published": "2024-09-23T09:22:59+00:00",
"updated": "2025-02-07T13:09:17+00:00",
"pdf_url": "http://arxiv.org/pdf/2409.14846v2",
"entry_id": "http://arxiv.org/abs/2409.14846v2",
"categories": [
"cs.AI",
"cs.CV"
],
"primary_category": "cs.AI",
"comment": "AAAI 2025 Accepted",
"journal_ref": null,
"doi": null
},
{
"title": "Unified Approaches in Self-Supervised Event Stream Modeling: Progress and Prospects",
"authors": [
"Levente Zólyomi",
"Tianze Wang",
"Sofiane Ennadir",
"Oleg Smirnov",
"Lele Cao"
],
"summary": "The proliferation of digital interactions across diverse domains, such as\nhealthcare, e-commerce, gaming, and finance, has resulted in the generation of\nvast volumes of event stream (ES) data. ES data comprises continuous sequences\nof timestamped events that encapsulate detailed contextual information relevant\nto each domain. While ES data holds significant potential for extracting\nactionable insights and enhancing decision-making, its effective utilization is\nhindered by challenges such as the scarcity of labeled data and the fragmented\nnature of existing research efforts. Self-Supervised Learning (SSL) has emerged\nas a promising paradigm to address these challenges by enabling the extraction\nof meaningful representations from unlabeled ES data. In this survey, we\nsystematically review and synthesize SSL methodologies tailored for ES modeling\nacross multiple domains, bridging the gaps between domain-specific approaches\nthat have traditionally operated in isolation. We present a comprehensive\ntaxonomy of SSL techniques, encompassing both predictive and contrastive\nparadigms, and analyze their applicability and effectiveness within different\napplication contexts. Furthermore, we identify critical gaps in current\nresearch and propose a future research agenda aimed at developing scalable,\ndomain-agnostic SSL frameworks for ES modeling. By unifying disparate research\nefforts and highlighting cross-domain synergies, this survey aims to accelerate\ninnovation, improve reproducibility, and expand the applicability of SSL to\ndiverse real-world ES challenges.",
"published": "2025-02-07T13:05:55+00:00",
"updated": "2025-02-07T13:05:55+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04899v1",
"entry_id": "http://arxiv.org/abs/2502.04899v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "ARTInp: CBCT-to-CT Image Inpainting and Image Translation in Radiotherapy",
"authors": [
"Ricardo Coimbra Brioso",
"Leonardo Crespi",
"Andrea Seghetto",
"Damiano Dei",
"Nicola Lambri",
"Pietro Mancosu",
"Marta Scorsetti",
"Daniele Loiacono"
],
"summary": "A key step in Adaptive Radiation Therapy (ART) workflows is the evaluation of\nthe patient's anatomy at treatment time to ensure the accuracy of the delivery.\nTo this end, Cone Beam Computerized Tomography (CBCT) is widely used being\ncost-effective and easy to integrate into the treatment process. Nonetheless,\nCBCT images have lower resolution and more artifacts than CT scans, making them\nless reliable for precise treatment validation. Moreover, in complex treatments\nsuch as Total Marrow and Lymph Node Irradiation (TMLI), where full-body\nvisualization of the patient is critical for accurate dose delivery, the CBCT\nimages are often discontinuous, leaving gaps that could contain relevant\nanatomical information. To address these limitations, we propose ARTInp\n(Adaptive Radiation Therapy Inpainting), a novel deep-learning framework\ncombining image inpainting and CBCT-to-CT translation. ARTInp employs a\ndual-network approach: a completion network that fills anatomical gaps in CBCT\nvolumes and a custom Generative Adversarial Network (GAN) to generate\nhigh-quality synthetic CT (sCT) images. We trained ARTInp on a dataset of\npaired CBCT and CT images from the SynthRad 2023 challenge, and the performance\nachieved on a test set of 18 patients demonstrates its potential for enhancing\nCBCT-based workflows in radiotherapy.",
"published": "2025-02-07T13:04:25+00:00",
"updated": "2025-02-07T13:04:25+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04898v1",
"entry_id": "http://arxiv.org/abs/2502.04898v1",
"categories": [
"eess.IV",
"cs.AI",
"cs.CV"
],
"primary_category": "eess.IV",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "SynCo: Synthetic Hard Negatives in Contrastive Learning for Better Unsupervised Visual Representations",
"authors": [
"Nikolaos Giakoumoglou",
"Tania Stathaki"
],
"summary": "Contrastive learning has become a dominant approach in self-supervised visual\nrepresentation learning, but efficiently leveraging hard negatives, which are\nsamples closely resembling the anchor, remains challenging. We introduce SynCo\n(Synthetic negatives in Contrastive learning), a novel approach that improves\nmodel performance by generating synthetic hard negatives on the representation\nspace. Building on the MoCo framework, SynCo introduces six strategies for\ncreating diverse synthetic hard negatives on-the-fly with minimal computational\noverhead. SynCo achieves faster training and strong representation learning,\nsurpassing MoCo-v2 by +0.4% and MoCHI by +1.0% on ImageNet ILSVRC-2012 linear\nevaluation. It also transfers more effectively to detection tasks achieving\nstrong results on PASCAL VOC detection (57.2% AP) and significantly improving\nover MoCo-v2 on COCO detection (+1.0% AP) and instance segmentation (+0.8% AP).\nOur synthetic hard negative generation approach significantly enhances visual\nrepresentations learned through self-supervised contrastive learning.",
"published": "2024-10-03T11:29:09+00:00",
"updated": "2025-02-07T12:45:11+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.02401v6",
"entry_id": "http://arxiv.org/abs/2410.02401v6",
"categories": [
"cs.CV",
"cs.AI",
"I.4, I.2"
],
"primary_category": "cs.CV",
"comment": "Preprint. Project page: https://giakoumoglou.com/, Code:\n https://github.com/giakoumoglou/synco, Supplementary:\n https://giakoumoglou.com/src/synco_suppl.pdf",
"journal_ref": null,
"doi": null
},
{
"title": "Sparse Autoencoders Do Not Find Canonical Units of Analysis",
"authors": [
"Patrick Leask",
"Bart Bussmann",
"Michael Pearce",
"Joseph Bloom",
"Curt Tigges",
"Noura Al Moubayed",
"Lee Sharkey",
"Neel Nanda"
],
"summary": "A common goal of mechanistic interpretability is to decompose the activations\nof neural networks into features: interpretable properties of the input\ncomputed by the model. Sparse autoencoders (SAEs) are a popular method for\nfinding these features in LLMs, and it has been postulated that they can be\nused to find a \\textit{canonical} set of units: a unique and complete list of\natomic features. We cast doubt on this belief using two novel techniques: SAE\nstitching to show they are incomplete, and meta-SAEs to show they are not\natomic. SAE stitching involves inserting or swapping latents from a larger SAE\ninto a smaller one. Latents from the larger SAE can be divided into two\ncategories: \\emph{novel latents}, which improve performance when added to the\nsmaller SAE, indicating they capture novel information, and\n\\emph{reconstruction latents}, which can replace corresponding latents in the\nsmaller SAE that have similar behavior. The existence of novel features\nindicates incompleteness of smaller SAEs. Using meta-SAEs -- SAEs trained on\nthe decoder matrix of another SAE -- we find that latents in SAEs often\ndecompose into combinations of latents from a smaller SAE, showing that larger\nSAE latents are not atomic. The resulting decompositions are often\ninterpretable; e.g. a latent representing ``Einstein'' decomposes into\n``scientist'', ``Germany'', and ``famous person''. Even if SAEs do not find\ncanonical units of analysis, they may still be useful tools. We suggest that\nfuture research should either pursue different approaches for identifying such\nunits, or pragmatically choose the SAE size suited to their task. We provide an\ninteractive dashboard to explore meta-SAEs: https://metasaes.streamlit.app/",
"published": "2025-02-07T12:33:08+00:00",
"updated": "2025-02-07T12:33:08+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04878v1",
"entry_id": "http://arxiv.org/abs/2502.04878v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "Accepted to ICLR 2025",
"journal_ref": null,
"doi": null
},
{
"title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents",
"authors": [
"Niels Mündler",
"Mark Niklas Müller",
"Jingxuan He",
"Martin Vechev"
],
"summary": "Rigorous software testing is crucial for developing and maintaining\nhigh-quality code, making automated test generation a promising avenue for both\nimproving software quality and boosting the effectiveness of code generation\nmethods. However, while code generation with Large Language Models (LLMs) is an\nextraordinarily active research area, test generation remains relatively\nunexplored. We address this gap and investigate the capability of LLM-based\nCode Agents to formalize user issues into test cases. To this end, we propose a\nnovel benchmark based on popular GitHub repositories, containing real-world\nissues, ground-truth bug-fixes, and golden tests. We find that LLMs generally\nperform surprisingly well at generating relevant test cases, with Code Agents\ndesigned for code repair exceeding the performance of systems designed\nspecifically for test generation. Further, as test generation is a similar but\nmore structured task than code generation, it allows for a more fine-grained\nanalysis using issue reproduction rate and coverage changes, providing a dual\nmetric for analyzing systems designed for code repair. Finally, we find that\ngenerated tests are an effective filter for proposed code fixes, doubling the\nprecision of SWE-Agent. We release all data and code at\nhttps://github.com/logic-star-ai/SWT-Bench",
"published": "2024-06-18T14:54:37+00:00",
"updated": "2025-02-07T12:33:06+00:00",
"pdf_url": "http://arxiv.org/pdf/2406.12952v3",
"entry_id": "http://arxiv.org/abs/2406.12952v3",
"categories": [
"cs.SE",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.SE",
"comment": "20 pages, 14 figures, 7 tables",
"journal_ref": null,
"doi": null
},
{
"title": "Pushing the Limits of BFP on Narrow Precision LLM Inference",
"authors": [
"Hui Wang",
"Yuan Cheng",
"Xiaomeng Han",
"Zhengpeng Zhao",
"Dawei Yang",
"Zhe Jiang"
],
"summary": "The substantial computational and memory demands of Large Language Models\n(LLMs) hinder their deployment. Block Floating Point (BFP) has proven effective\nin accelerating linear operations, a cornerstone of LLM workloads. However, as\nsequence lengths grow, nonlinear operations, such as Attention, increasingly\nbecome performance bottlenecks due to their quadratic computational complexity.\nThese nonlinear operations are predominantly executed using inefficient\nfloating-point formats, which renders the system challenging to optimize\nsoftware efficiency and hardware overhead. In this paper, we delve into the\nlimitations and potential of applying BFP to nonlinear operations. Given our\nfindings, we introduce a hardware-software co-design framework (DB-Attn),\nincluding: (i) DBFP, an advanced BFP version, overcomes nonlinear operation\nchallenges with a pivot-focus strategy for diverse data and an adaptive\ngrouping strategy for flexible exponent sharing. (ii) DH-LUT, a novel lookup\ntable algorithm dedicated to accelerating nonlinear operations with DBFP\nformat. (iii) An RTL-level DBFP-based engine is implemented to support DB-Attn,\napplicable to FPGA and ASIC. Results show that DB-Attn provides significant\nperformance improvements with negligible accuracy loss, achieving 74% GPU\nspeedup on Softmax of LLaMA and 10x low overhead performance improvement over\nSOTA designs.",
"published": "2025-01-21T17:10:52+00:00",
"updated": "2025-02-07T12:23:59+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.00026v2",
"entry_id": "http://arxiv.org/abs/2502.00026v2",
"categories": [
"cs.AR",
"cs.AI"
],
"primary_category": "cs.AR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "$TAR^2$: Temporal-Agent Reward Redistribution for Optimal Policy Preservation in Multi-Agent Reinforcement Learning",
"authors": [
"Aditya Kapoor",
"Kale-ab Tessera",
"Mayank Baranwal",
"Harshad Khadilkar",
"Stefano Albrecht",
"Mingfei Sun"
],
"summary": "In cooperative multi-agent reinforcement learning (MARL), learning effective\npolicies is challenging when global rewards are sparse and delayed. This\ndifficulty arises from the need to assign credit across both agents and time\nsteps, a problem that existing methods often fail to address in episodic,\nlong-horizon tasks. We propose Temporal-Agent Reward Redistribution $TAR^2$, a\nnovel approach that decomposes sparse global rewards into agent-specific,\ntime-step-specific components, thereby providing more frequent and accurate\nfeedback for policy learning. Theoretically, we show that $TAR^2$ (i) aligns\nwith potential-based reward shaping, preserving the same optimal policies as\nthe original environment, and (ii) maintains policy gradient update directions\nidentical to those under the original sparse reward, ensuring unbiased credit\nsignals. Empirical results on two challenging benchmarks, SMACLite and Google\nResearch Football, demonstrate that $TAR^2$ significantly stabilizes and\naccelerates convergence, outperforming strong baselines like AREL and STAS in\nboth learning speed and final performance. These findings establish $TAR^2$ as\na principled and practical solution for agent-temporal credit assignment in\nsparse-reward multi-agent systems.",
"published": "2025-02-07T12:07:57+00:00",
"updated": "2025-02-07T12:07:57+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04864v1",
"entry_id": "http://arxiv.org/abs/2502.04864v1",
"categories": [
"cs.MA",
"cs.AI",
"cs.LG",
"cs.RO"
],
"primary_category": "cs.MA",
"comment": "23 pages, 5 figures, 4 tables",
"journal_ref": null,
"doi": null
},
{
"title": "HyperMARL: Adaptive Hypernetworks for Multi-Agent RL",
"authors": [
"Kale-ab Abebe Tessera",
"Arrasy Rahman",
"Stefano V. Albrecht"
],
"summary": "Adaptability is critical in cooperative multi-agent reinforcement learning\n(MARL), where agents must learn specialised or homogeneous behaviours for\ndiverse tasks. While parameter sharing methods are sample-efficient, they often\nencounter gradient interference among agents, limiting their behavioural\ndiversity. Conversely, non-parameter sharing approaches enable specialisation,\nbut are computationally demanding and sample-inefficient. To address these\nissues, we propose HyperMARL, a parameter sharing approach that uses\nhypernetworks to dynamically generate agent-specific actor and critic\nparameters, without altering the learning objective or requiring preset\ndiversity levels. By decoupling observation- and agent-conditioned gradients,\nHyperMARL empirically reduces policy gradient variance and facilitates\nspecialisation within FuPS, suggesting it can mitigate cross-agent\ninterference. Across multiple MARL benchmarks involving up to twenty agents --\nand requiring homogeneous, heterogeneous, or mixed behaviours -- HyperMARL\nconsistently performs competitively with fully shared, non-parameter-sharing,\nand diversity-promoting baselines, all while preserving a behavioural diversity\nlevel comparable to non-parameter sharing. These findings establish\nhypernetworks as a versatile approach for MARL across diverse environments.",
"published": "2024-12-05T15:09:51+00:00",
"updated": "2025-02-07T11:46:12+00:00",
"pdf_url": "http://arxiv.org/pdf/2412.04233v2",
"entry_id": "http://arxiv.org/abs/2412.04233v2",
"categories": [
"cs.LG",
"cs.AI",
"cs.MA"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Game Theory with Simulation in the Presence of Unpredictable Randomisation",
"authors": [
"Vojtech Kovarik",
"Nathaniel Sauerberg",
"Lewis Hammond",
"Vincent Conitzer"
],
"summary": "AI agents will be predictable in certain ways that traditional agents are\nnot. Where and how can we leverage this predictability in order to improve\nsocial welfare? We study this question in a game-theoretic setting where one\nagent can pay a fixed cost to simulate the other in order to learn its mixed\nstrategy. As a negative result, we prove that, in contrast to prior work on\npure-strategy simulation, enabling mixed-strategy simulation may no longer lead\nto improved outcomes for both players in all so-called \"generalised trust\ngames\". In fact, mixed-strategy simulation does not help in any game where the\nsimulatee's action can depend on that of the simulator. We also show that, in\ngeneral, deciding whether simulation introduces Pareto-improving Nash\nequilibria in a given game is NP-hard. As positive results, we establish that\nmixed-strategy simulation can improve social welfare if the simulator has the\noption to scale their level of trust, if the players face challenges with both\ntrust and coordination, or if maintaining some level of privacy is essential\nfor enabling cooperation.",
"published": "2024-10-18T09:17:18+00:00",
"updated": "2025-02-07T11:18:31+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.14311v2",
"entry_id": "http://arxiv.org/abs/2410.14311v2",
"categories": [
"cs.GT",
"cs.AI"
],
"primary_category": "cs.GT",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation Models",
"authors": [
"Cong Lu",
"Shengran Hu",
"Jeff Clune"
],
"summary": "Go-Explore is a powerful family of algorithms designed to solve\nhard-exploration problems built on the principle of archiving discovered\nstates, and iteratively returning to and exploring from the most promising\nstates. This approach has led to superhuman performance across a wide variety\nof challenging problems including Atari games and robotic control, but requires\nmanually designing heuristics to guide exploration (i.e., determine which\nstates to save and explore from, and what actions to consider next), which is\ntime-consuming and infeasible in general. To resolve this, we propose\nIntelligent Go-Explore (IGE) which greatly extends the scope of the original\nGo-Explore by replacing these handcrafted heuristics with the intelligence and\ninternalized human notions of interestingness captured by giant pretrained\nfoundation models (FMs). This provides IGE with a human-like ability to\ninstinctively identify how interesting or promising any new state is (e.g.,\ndiscovering new objects, locations, or behaviors), even in complex environments\nwhere heuristics are hard to define. Moreover, IGE offers the exciting\nopportunity to recognize and capitalize on serendipitous discoveries -- states\nencountered during exploration that are valuable in terms of exploration, yet\nwhere what makes them interesting was not anticipated by the human user. We\nevaluate our algorithm on a diverse range of language and vision-based tasks\nthat require search and exploration. Across these tasks, IGE strongly exceeds\nclassic reinforcement learning and graph search baselines, and also succeeds\nwhere prior state-of-the-art FM agents like Reflexion completely fail. Overall,\nIntelligent Go-Explore combines the tremendous strengths of FMs and the\npowerful Go-Explore algorithm, opening up a new frontier of research into\ncreating more generally capable agents with impressive exploration\ncapabilities.",
"published": "2024-05-24T01:45:27+00:00",
"updated": "2025-02-07T11:10:39+00:00",
"pdf_url": "http://arxiv.org/pdf/2405.15143v4",
"entry_id": "http://arxiv.org/abs/2405.15143v4",
"categories": [
"cs.LG",
"cs.AI",
"cs.CL"
],
"primary_category": "cs.LG",
"comment": "Published as a conference paper at ICLR 2025",
"journal_ref": null,
"doi": null
},
{
"title": "Lightweight Operations for Visual Speech Recognition",
"authors": [
"Iason Ioannis Panagos",
"Giorgos Sfikas",
"Christophoros Nikou"
],
"summary": "Visual speech recognition (VSR), which decodes spoken words from video data,\noffers significant benefits, particularly when audio is unavailable. However,\nthe high dimensionality of video data leads to prohibitive computational costs\nthat demand powerful hardware, limiting VSR deployment on resource-constrained\ndevices. This work addresses this limitation by developing lightweight VSR\narchitectures. Leveraging efficient operation design paradigms, we create\ncompact yet powerful models with reduced resource requirements and minimal\naccuracy loss. We train and evaluate our models on a large-scale public dataset\nfor recognition of words from video sequences, demonstrating their\neffectiveness for practical applications. We also conduct an extensive array of\nablative experiments to thoroughly analyze the size and complexity of each\nmodel. Code and trained models will be made publicly available.",
"published": "2025-02-07T11:08:32+00:00",
"updated": "2025-02-07T11:08:32+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04834v1",
"entry_id": "http://arxiv.org/abs/2502.04834v1",
"categories": [
"cs.CV",
"cs.AI",
"cs.CL",
"cs.LG"
],
"primary_category": "cs.CV",
"comment": "10 pages (double column format), 7 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Optimistic Gradient Learning with Hessian Corrections for High-Dimensional Black-Box Optimization",
"authors": [
"Yedidya Kfir",
"Elad Sarafian",
"Sarit Kraus",
"Yoram Louzoun"
],
"summary": "Black-box algorithms are designed to optimize functions without relying on\ntheir underlying analytical structure or gradient information, making them\nessential when gradients are inaccessible or difficult to compute. Traditional\nmethods for solving black-box optimization (BBO) problems predominantly rely on\nnon-parametric models and struggle to scale to large input spaces. Conversely,\nparametric methods that model the function with neural estimators and obtain\ngradient signals via backpropagation may suffer from significant gradient\nerrors. A recent alternative, Explicit Gradient Learning (EGL), which directly\nlearns the gradient using a first-order Taylor approximation, has demonstrated\nsuperior performance over both parametric and non-parametric methods. In this\nwork, we propose two novel gradient learning variants to address the robustness\nchallenges posed by high-dimensional, complex, and highly non-linear problems.\nOptimistic Gradient Learning (OGL) introduces a bias toward lower regions in\nthe function landscape, while Higher-order Gradient Learning (HGL) incorporates\nsecond-order Taylor corrections to improve gradient accuracy. We combine these\napproaches into the unified OHGL algorithm, achieving state-of-the-art (SOTA)\nperformance on the synthetic COCO suite. Additionally, we demonstrate OHGL's\napplicability to high-dimensional real-world machine learning (ML) tasks such\nas adversarial training and code generation. Our results highlight OHGL's\nability to generate stronger candidates, offering a valuable tool for ML\nresearchers and practitioners tackling high-dimensional, non-linear\noptimization challenges.",
"published": "2025-02-07T11:03:50+00:00",
"updated": "2025-02-07T11:03:50+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04829v1",
"entry_id": "http://arxiv.org/abs/2502.04829v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "We develop a black-box optimization algorithm that learns gradients\n with neural models and can be applied to solve non-convex high dimensional\n real-world problems",
"journal_ref": null,
"doi": null
},
{
"title": "SOLD: Slot Object-Centric Latent Dynamics Models for Relational Manipulation Learning from Pixels",
"authors": [
"Malte Mosbach",
"Jan Niklas Ewertz",
"Angel Villar-Corrales",
"Sven Behnke"
],
"summary": "Learning a latent dynamics model provides a task-agnostic representation of\nan agent's understanding of its environment. Leveraging this knowledge for\nmodel-based reinforcement learning (RL) holds the potential to improve sample\nefficiency over model-free methods by learning from imagined rollouts.\nFurthermore, because the latent space serves as input to behavior models, the\ninformative representations learned by the world model facilitate efficient\nlearning of desired skills. Most existing methods rely on holistic\nrepresentations of the environment's state. In contrast, humans reason about\nobjects and their interactions, predicting how actions will affect specific\nparts of their surroundings. Inspired by this, we propose Slot-Attention for\nObject-centric Latent Dynamics (SOLD), a novel model-based RL algorithm that\nlearns object-centric dynamics models in an unsupervised manner from pixel\ninputs. We demonstrate that the structured latent space not only improves model\ninterpretability but also provides a valuable input space for behavior models\nto reason over. Our results show that SOLD outperforms DreamerV3 and TD-MPC2 -\nstate-of-the-art model-based RL algorithms - across a range of benchmark\nrobotic environments that require relational reasoning and manipulation\ncapabilities. Videos are available at https://slot-latent-dynamics.github.io/.",
"published": "2024-10-11T14:03:31+00:00",
"updated": "2025-02-07T10:52:37+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.08822v2",
"entry_id": "http://arxiv.org/abs/2410.08822v2",
"categories": [
"cs.LG",
"cs.AI",
"cs.RO"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Assigning Credit with Partial Reward Decoupling in Multi-Agent Proximal Policy Optimization",
"authors": [
"Aditya Kapoor",
"Benjamin Freed",
"Howie Choset",
"Jeff Schneider"
],
"summary": "Multi-agent proximal policy optimization (MAPPO) has recently demonstrated\nstate-of-the-art performance on challenging multi-agent reinforcement learning\ntasks. However, MAPPO still struggles with the credit assignment problem,\nwherein the sheer difficulty in ascribing credit to individual agents' actions\nscales poorly with team size. In this paper, we propose a multi-agent\nreinforcement learning algorithm that adapts recent developments in credit\nassignment to improve upon MAPPO. Our approach leverages partial reward\ndecoupling (PRD), which uses a learned attention mechanism to estimate which of\na particular agent's teammates are relevant to its learning updates. We use\nthis estimate to dynamically decompose large groups of agents into smaller,\nmore manageable subgroups. We empirically demonstrate that our approach,\nPRD-MAPPO, decouples agents from teammates that do not influence their expected\nfuture reward, thereby streamlining credit assignment. We additionally show\nthat PRD-MAPPO yields significantly higher data efficiency and asymptotic\nperformance compared to both MAPPO and other state-of-the-art methods across\nseveral multi-agent tasks, including StarCraft II. Finally, we propose a\nversion of PRD-MAPPO that is applicable to \\textit{shared} reward settings,\nwhere PRD was previously not applicable, and empirically show that this also\nleads to performance improvements over MAPPO.",
"published": "2024-08-08T08:18:05+00:00",
"updated": "2025-02-07T10:48:22+00:00",
"pdf_url": "http://arxiv.org/pdf/2408.04295v3",
"entry_id": "http://arxiv.org/abs/2408.04295v3",
"categories": [
"cs.MA",
"cs.AI",
"cs.LG",
"cs.RO"
],
"primary_category": "cs.MA",
"comment": "20 pages, 5 figures, 12 tables, Reinforcement Learning Journal and\n Reinforcement Learning Conference 2024",
"journal_ref": null,
"doi": null
},
{
"title": "R-LLaVA: Improving Med-VQA Understanding through Visual Region of Interest",
"authors": [
"Xupeng Chen",
"Zhixin Lai",
"Kangrui Ruan",
"Shichu Chen",
"Jiaxiang Liu",
"Zuozhu Liu"
],
"summary": "Artificial intelligence has made significant strides in medical visual\nquestion answering (Med-VQA), yet prevalent studies often interpret images\nholistically, overlooking the visual regions of interest that may contain\ncrucial information, potentially aligning with a doctor's prior knowledge that\ncan be incorporated with minimal annotations (e.g., bounding boxes). To address\nthis gap, this paper introduces R-LLaVA, designed to enhance biomedical VQA\nunderstanding by integrating simple medical annotations as prior knowledge\ndirectly into the image space through CLIP. These annotated visual regions of\ninterest are then fed into the LLaVA model during training, aiming to enrich\nthe model's understanding of biomedical queries. Experimental evaluation on\nfour standard Med-VQA datasets demonstrates R-LLaVA's superiority over existing\nstate-of-the-art (SoTA) methods. Additionally, to verify the model's capability\nin visual comprehension, a novel multiple-choice medical visual understanding\ndataset is introduced, confirming the positive impact of focusing on visual\nregions of interest in advancing biomedical VQA understanding.",
"published": "2024-10-27T03:56:56+00:00",
"updated": "2025-02-07T10:33:52+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.20327v4",
"entry_id": "http://arxiv.org/abs/2410.20327v4",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Contextualized Counterspeech: Strategies for Adaptation, Personalization, and Evaluation",
"authors": [
"Lorenzo Cima",
"Alessio Miaschi",
"Amaury Trujillo",
"Marco Avvenuti",
"Felice Dell'Orletta",
"Stefano Cresci"
],
"summary": "AI-generated counterspeech offers a promising and scalable strategy to curb\nonline toxicity through direct replies that promote civil discourse. However,\ncurrent counterspeech is one-size-fits-all, lacking adaptation to the\nmoderation context and the users involved. We propose and evaluate multiple\nstrategies for generating tailored counterspeech that is adapted to the\nmoderation context and personalized for the moderated user. We instruct an\nLLaMA2-13B model to generate counterspeech, experimenting with various\nconfigurations based on different contextual information and fine-tuning\nstrategies. We identify the configurations that generate persuasive\ncounterspeech through a combination of quantitative indicators and human\nevaluations collected via a pre-registered mixed-design crowdsourcing\nexperiment. Results show that contextualized counterspeech can significantly\noutperform state-of-the-art generic counterspeech in adequacy and\npersuasiveness, without compromising other characteristics. Our findings also\nreveal a poor correlation between quantitative indicators and human\nevaluations, suggesting that these methods assess different aspects and\nhighlighting the need for nuanced evaluation methodologies. The effectiveness\nof contextualized AI-generated counterspeech and the divergence between human\nand algorithmic evaluations underscore the importance of increased human-AI\ncollaboration in content moderation.",
"published": "2024-12-10T09:29:52+00:00",
"updated": "2025-02-07T10:30:23+00:00",
"pdf_url": "http://arxiv.org/pdf/2412.07338v3",
"entry_id": "http://arxiv.org/abs/2412.07338v3",
"categories": [
"cs.HC",
"cs.AI",
"cs.SI"
],
"primary_category": "cs.HC",
"comment": "Article published in WebConf 25, 34th ACM Web Conference. Please,\n cite the published version",
"journal_ref": "WebConf 2025, 34th ACM Web Conference",
"doi": "10.1145/3696410.3714507"
},
{
"title": "CASE-Bench: Context-Aware SafEty Benchmark for Large Language Models",
"authors": [
"Guangzhi Sun",
"Xiao Zhan",
"Shutong Feng",
"Philip C. Woodland",
"Jose Such"
],
"summary": "Aligning large language models (LLMs) with human values is essential for\ntheir safe deployment and widespread adoption. Current LLM safety benchmarks\noften focus solely on the refusal of individual problematic queries, which\noverlooks the importance of the context where the query occurs and may cause\nundesired refusal of queries under safe contexts that diminish user experience.\nAddressing this gap, we introduce CASE-Bench, a Context-Aware SafEty Benchmark\nthat integrates context into safety assessments of LLMs. CASE-Bench assigns\ndistinct, formally described contexts to categorized queries based on\nContextual Integrity theory. Additionally, in contrast to previous studies\nwhich mainly rely on majority voting from just a few annotators, we recruited a\nsufficient number of annotators necessary to ensure the detection of\nstatistically significant differences among the experimental conditions based\non power analysis. Our extensive analysis using CASE-Bench on various\nopen-source and commercial LLMs reveals a substantial and significant influence\nof context on human judgments (p<0.0001 from a z-test), underscoring the\nnecessity of context in safety evaluations. We also identify notable mismatches\nbetween human judgments and LLM responses, particularly in commercial models\nwithin safe contexts.",
"published": "2025-01-24T21:55:14+00:00",
"updated": "2025-02-07T10:23:16+00:00",
"pdf_url": "http://arxiv.org/pdf/2501.14940v3",
"entry_id": "http://arxiv.org/abs/2501.14940v3",
"categories": [
"cs.CL",
"cs.AI"
],
"primary_category": "cs.CL",
"comment": "24 pages",
"journal_ref": null,
"doi": null
},
{
"title": "Simplifying Formal Proof-Generating Models with ChatGPT and Basic Searching Techniques",
"authors": [
"Sangjun Han",
"Taeil Hur",
"Youngmi Hur",
"Kathy Sangkyung Lee",
"Myungyoon Lee",
"Hyojae Lim"
],
"summary": "The challenge of formal proof generation has a rich history, but with modern\ntechniques, we may finally be at the stage of making actual progress in\nreal-life mathematical problems. This paper explores the integration of ChatGPT\nand basic searching techniques to simplify generating formal proofs, with a\nparticular focus on the miniF2F dataset. We demonstrate how combining a large\nlanguage model like ChatGPT with a formal language such as Lean, which has the\nadded advantage of being verifiable, enhances the efficiency and accessibility\nof formal proof generation. Despite its simplicity, our best-performing\nLean-based model surpasses all known benchmarks with a 31.15% pass rate. We\nextend our experiments to include other datasets and employ alternative\nlanguage models, showcasing our models' comparable performance in diverse\nsettings and allowing for a more nuanced analysis of our results. Our findings\noffer insights into AI-assisted formal proof generation, suggesting a promising\ndirection for future research in formal mathematical proof.",
"published": "2025-02-05T16:21:10+00:00",
"updated": "2025-02-07T10:04:24+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.03321v2",
"entry_id": "http://arxiv.org/abs/2502.03321v2",
"categories": [
"cs.LO",
"cs.AI"
],
"primary_category": "cs.LO",
"comment": "Accepted to Computing Conference 2025",
"journal_ref": null,
"doi": null
},
{
"title": "MedMimic: Physician-Inspired Multimodal Fusion for Early Diagnosis of Fever of Unknown Origin",
"authors": [
"Minrui Chen",
"Yi Zhou",
"Huidong Jiang",
"Yuhan Zhu",
"Guanjie Zou",
"Minqi Chen",
"Rong Tian",
"Hiroto Saigo"
],
"summary": "Fever of unknown origin FUO remains a diagnostic challenge. MedMimic is\nintroduced as a multimodal framework inspired by real-world diagnostic\nprocesses. It uses pretrained models such as DINOv2, Vision Transformer, and\nResNet-18 to convert high-dimensional 18F-FDG PET/CT imaging into\nlow-dimensional, semantically meaningful features. A learnable\nself-attention-based fusion network then integrates these imaging features with\nclinical data for classification. Using 416 FUO patient cases from Sichuan\nUniversity West China Hospital from 2017 to 2023, the multimodal fusion\nclassification network MFCN achieved macro-AUROC scores ranging from 0.8654 to\n0.9291 across seven tasks, outperforming conventional machine learning and\nsingle-modality deep learning methods. Ablation studies and five-fold\ncross-validation further validated its effectiveness. By combining the\nstrengths of pretrained large models and deep learning, MedMimic offers a\npromising solution for disease classification.",
"published": "2025-02-07T09:57:03+00:00",
"updated": "2025-02-07T09:57:03+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04794v1",
"entry_id": "http://arxiv.org/abs/2502.04794v1",
"categories": [
"eess.IV",
"cs.AI",
"cs.CV"
],
"primary_category": "eess.IV",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Physics-Informed Deep Inverse Operator Networks for Solving PDE Inverse Problems",
"authors": [
"Sung Woong Cho",
"Hwijae Son"
],
"summary": "Inverse problems involving partial differential equations (PDEs) can be seen\nas discovering a mapping from measurement data to unknown quantities, often\nframed within an operator learning approach. However, existing methods\ntypically rely on large amounts of labeled training data, which is impractical\nfor most real-world applications. Moreover, these supervised models may fail to\ncapture the underlying physical principles accurately. To address these\nlimitations, we propose a novel architecture called Physics-Informed Deep\nInverse Operator Networks (PI-DIONs), which can learn the solution operator of\nPDE-based inverse problems without labeled training data. We extend the\nstability estimates established in the inverse problem literature to the\noperator learning framework, thereby providing a robust theoretical foundation\nfor our method. These estimates guarantee that the proposed model, trained on a\nfinite sample and grid, generalizes effectively across the entire domain and\nfunction space. Extensive experiments are conducted to demonstrate that\nPI-DIONs can effectively and accurately learn the solution operators of the\ninverse problems without the need for labeled data.",
"published": "2024-12-04T09:38:58+00:00",
"updated": "2025-02-07T09:56:51+00:00",
"pdf_url": "http://arxiv.org/pdf/2412.03161v2",
"entry_id": "http://arxiv.org/abs/2412.03161v2",
"categories": [
"math.NA",
"cs.AI",
"cs.NA",
"65M32, 68T99",
"G.1.8; G.1.10"
],
"primary_category": "math.NA",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Counterfactual Effect Decomposition in Multi-Agent Sequential Decision Making",
"authors": [
"Stelios Triantafyllou",
"Aleksa Sukovic",
"Yasaman Zolfimoselo",
"Goran Radanovic"
],
"summary": "We address the challenge of explaining counterfactual outcomes in multi-agent\nMarkov decision processes. In particular, we aim to explain the total\ncounterfactual effect of an agent's action on the outcome of a realized\nscenario through its influence on the environment dynamics and the agents'\nbehavior. To achieve this, we introduce a novel causal explanation formula that\ndecomposes the counterfactual effect by attributing to each agent and state\nvariable a score reflecting their respective contributions to the effect.\nFirst, we show that the total counterfactual effect of an agent's action can be\ndecomposed into two components: one measuring the effect that propagates\nthrough all subsequent agents' actions and another related to the effect that\npropagates through the state transitions. Building on recent advancements in\ncausal contribution analysis, we further decompose these two effects as\nfollows. For the former, we consider agent-specific effects -- a causal concept\nthat quantifies the counterfactual effect of an agent's action that propagates\nthrough a subset of agents. Based on this notion, we use Shapley value to\nattribute the effect to individual agents. For the latter, we consider the\nconcept of structure-preserving interventions and attribute the effect to state\nvariables based on their \"intrinsic\" contributions. Through extensive\nexperimentation, we demonstrate the interpretability of our approach in a\nGridworld environment with LLM-assisted agents and a sepsis management\nsimulator.",
"published": "2024-10-16T13:20:35+00:00",
"updated": "2025-02-07T09:54:53+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.12539v2",
"entry_id": "http://arxiv.org/abs/2410.12539v2",
"categories": [
"cs.AI",
"cs.MA"
],
"primary_category": "cs.AI",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "The DSA Transparency Database: Auditing Self-reported Moderation Actions by Social Media",
"authors": [
"Amaury Trujillo",
"Tiziano Fagni",
"Stefano Cresci"
],
"summary": "Since September 2023, the Digital Services Act (DSA) obliges large online\nplatforms to submit detailed data on each moderation action they take within\nthe European Union (EU) to the DSA Transparency Database. From its inception,\nthis centralized database has sparked scholarly interest as an unprecedented\nand potentially unique trove of data on real-world online moderation. Here, we\nthoroughly analyze all 353.12M records submitted by the eight largest social\nmedia platforms in the EU during the first 100 days of the database.\nSpecifically, we conduct a platform-wise comparative study of their: volume of\nmoderation actions, grounds for decision, types of applied restrictions, types\nof moderated content, timeliness in undertaking and submitting moderation\nactions, and use of automation. Furthermore, we systematically cross-check the\ncontents of the database with the platforms' own transparency reports. Our\nanalyses reveal that (i) the platforms adhered only in part to the philosophy\nand structure of the database, (ii) the structure of the database is partially\ninadequate for the platforms' reporting needs, (iii) the platforms exhibited\nsubstantial differences in their moderation actions, (iv) a remarkable fraction\nof the database data is inconsistent, (v) the platform X (formerly Twitter)\npresents the most inconsistencies. Our findings have far-reaching implications\nfor policymakers and scholars across diverse disciplines. They offer guidance\nfor future regulations that cater to the reporting needs of online platforms in\ngeneral, but also highlight opportunities to improve and refine the database\nitself.",
"published": "2023-12-16T00:02:49+00:00",
"updated": "2025-02-07T09:53:52+00:00",
"pdf_url": "http://arxiv.org/pdf/2312.10269v4",
"entry_id": "http://arxiv.org/abs/2312.10269v4",
"categories": [
"cs.SI",
"cs.AI",
"cs.CY",
"cs.HC"
],
"primary_category": "cs.SI",
"comment": "Article published in ACM CSCW'25. Please, cite the published version",
"journal_ref": "Proceedings of The 28th 2025 ACM Conference on Computer-Supported\n Cooperative Work and Social Computing (CSCW'25)",
"doi": "10.1145/3711085"
},
{
"title": "Towards shutdownable agents via stochastic choice",
"authors": [
"Elliott Thornley",
"Alexander Roman",
"Christos Ziakas",
"Leyton Ho",
"Louis Thomson"
],
"summary": "Some worry that advanced artificial agents may resist being shut down. The\nIncomplete Preferences Proposal (IPP) is an idea for ensuring that doesn't\nhappen. A key part of the IPP is using a novel 'Discounted REward for\nSame-Length Trajectories (DREST)' reward function to train agents to (1) pursue\ngoals effectively conditional on each trajectory-length (be 'USEFUL'), and (2)\nchoose stochastically between different trajectory-lengths (be 'NEUTRAL' about\ntrajectory-lengths). In this paper, we propose evaluation metrics for\nUSEFULNESS and NEUTRALITY. We use a DREST reward function to train simple\nagents to navigate gridworlds, and we find that these agents learn to be USEFUL\nand NEUTRAL. Our results thus suggest that DREST reward functions could also\ntrain advanced agents to be USEFUL and NEUTRAL, and thereby make these advanced\nagents useful and shutdownable.",
"published": "2024-06-30T19:16:02+00:00",
"updated": "2025-02-07T09:50:47+00:00",
"pdf_url": "http://arxiv.org/pdf/2407.00805v3",
"entry_id": "http://arxiv.org/abs/2407.00805v3",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "S$^2$-MAD: Breaking the Token Barrier to Enhance Multi-Agent Debate Efficiency",
"authors": [
"Yuting Zeng",
"Weizhe Huang",
"Lei Jiang",
"Tongxuan Liu",
"Xitai Jin",
"Chen Tianying Tiana",
"Jing Li",
"Xiaohua Xu"
],
"summary": "Large language models (LLMs) have demonstrated remarkable capabilities across\nvarious natural language processing (NLP) scenarios, but they still face\nchallenges when handling complex arithmetic and logical reasoning tasks. While\nChain-Of-Thought (CoT) reasoning, self-consistency (SC) and self-correction\nstrategies have attempted to guide models in sequential, multi-step reasoning,\nMulti-agent Debate (MAD) has emerged as a viable approach for enhancing the\nreasoning capabilities of LLMs. By increasing both the number of agents and the\nfrequency of debates, the performance of LLMs improves significantly. However,\nthis strategy results in a significant increase in token costs, presenting a\nbarrier to scalability. To address this challenge, we introduce a novel\nsparsification strategy designed to reduce token costs within MAD. This\napproach minimizes ineffective exchanges of information and unproductive\ndiscussions among agents, thereby enhancing the overall efficiency of the\ndebate process. We conduct comparative experiments on multiple datasets across\nvarious models, demonstrating that our approach significantly reduces the token\ncosts in MAD to a considerable extent. Specifically, compared to MAD, our\napproach achieves an impressive reduction of up to 94.5\\% in token costs while\nmaintaining performance degradation below 2.0\\%.",
"published": "2025-02-07T09:49:56+00:00",
"updated": "2025-02-07T09:49:56+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04790v1",
"entry_id": "http://arxiv.org/abs/2502.04790v1",
"categories": [
"cs.CL",
"cs.AI"
],
"primary_category": "cs.CL",
"comment": "16 pages, 5 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Enhancing SQL Injection Detection and Prevention Using Generative Models",
"authors": [
"Naga Sai Dasari",
"Atta Badii",
"Armin Moin",
"Ahmed Ashlam"
],
"summary": "SQL Injection (SQLi) continues to pose a significant threat to the security\nof web applications, enabling attackers to manipulate databases and access\nsensitive information without authorisation. Although advancements have been\nmade in detection techniques, traditional signature-based methods still\nstruggle to identify sophisticated SQL injection attacks that evade predefined\npatterns. As SQLi attacks evolve, the need for more adaptive detection systems\nbecomes crucial. This paper introduces an innovative approach that leverages\ngenerative models to enhance SQLi detection and prevention mechanisms. By\nincorporating Variational Autoencoders (VAE), Conditional Wasserstein GAN with\nGradient Penalty (CWGAN-GP), and U-Net, synthetic SQL queries were generated to\naugment training datasets for machine learning models. The proposed method\ndemonstrated improved accuracy in SQLi detection systems by reducing both false\npositives and false negatives. Extensive empirical testing further illustrated\nthe ability of the system to adapt to evolving SQLi attack patterns, resulting\nin enhanced precision and robustness.",
"published": "2025-02-07T09:43:43+00:00",
"updated": "2025-02-07T09:43:43+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04786v1",
"entry_id": "http://arxiv.org/abs/2502.04786v1",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": "13 pages, 22 Figures, 1 Table",
"journal_ref": null,
"doi": null
},
{
"title": "SiriuS: Self-improving Multi-agent Systems via Bootstrapped Reasoning",
"authors": [
"Wanjia Zhao",
"Mert Yuksekgonul",
"Shirley Wu",
"James Zou"
],
"summary": "Multi-agent AI systems powered by large language models (LLMs) are\nincreasingly applied to solve complex tasks. However, these systems often rely\non fragile, manually designed prompts and heuristics, making optimization\ndifficult. A key challenge in optimizing multi-agent systems is acquiring\nsuitable training data for specialized agents. We introduce SiriuS, a\nself-improving, reasoning-driven optimization framework for multi-agent\nsystems. Central to our approach is the construction of an experience library:\na repository of high-quality reasoning trajectories. The library is built by\nretaining reasoning steps that lead to successful outcomes, providing a robust\ntraining set for optimizing multi-agent system. Additionally, we introduce a\nlibrary augmentation procedure that refines unsuccessful trajectories, further\nenriching the library. SiriuS boosts performance by 2.86\\% to 21.88\\% on\nreasoning and biomedical QA and enhances agent negotiation in competitive\nsettings. Our results show that SiriuS enhances multi-agent performance while\ngenerating reusable data for self-correction and self-play enhancement in the\nfuture.",
"published": "2025-02-07T09:33:44+00:00",
"updated": "2025-02-07T09:33:44+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04780v1",
"entry_id": "http://arxiv.org/abs/2502.04780v1",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Behavior-Regularized Diffusion Policy Optimization for Offline Reinforcement Learning",
"authors": [
"Chen-Xiao Gao",
"Chenyang Wu",
"Mingjun Cao",
"Chenjun Xiao",
"Yang Yu",
"Zongzhang Zhang"
],
"summary": "The primary focus of offline reinforcement learning (RL) is to manage the\nrisk of hazardous exploitation of out-of-distribution actions. An effective\napproach to achieve this goal is through behavior regularization, which\naugments conventional RL objectives by incorporating constraints that enforce\nthe policy to remain close to the behavior policy. Nevertheless, existing\nliterature on behavior-regularized RL primarily focuses on explicit policy\nparameterizations, such as Gaussian policies. Consequently, it remains unclear\nhow to extend this framework to more advanced policy parameterizations, such as\ndiffusion models. In this paper, we introduce BDPO, a principled\nbehavior-regularized RL framework tailored for diffusion-based policies,\nthereby combining the expressive power of diffusion policies and the robustness\nprovided by regularization. The key ingredient of our method is to calculate\nthe Kullback-Leibler (KL) regularization analytically as the accumulated\ndiscrepancies in reverse-time transition kernels along the diffusion\ntrajectory. By integrating the regularization, we develop an efficient\ntwo-time-scale actor-critic RL algorithm that produces the optimal policy while\nrespecting the behavior constraint. Comprehensive evaluations conducted on\nsynthetic 2D tasks and continuous control tasks from the D4RL benchmark\nvalidate its effectiveness and superior performance.",
"published": "2025-02-07T09:30:35+00:00",
"updated": "2025-02-07T09:30:35+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04778v1",
"entry_id": "http://arxiv.org/abs/2502.04778v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "Under review",
"journal_ref": null,
"doi": null
},
{
"title": "DMPA: Model Poisoning Attacks on Decentralized Federated Learning for Model Differences",
"authors": [
"Chao Feng",
"Yunlong Li",
"Yuanzhe Gao",
"Alberto Huertas Celdrán",
"Jan von der Assen",
"Gérôme Bovet",
"Burkhard Stiller"
],
"summary": "Federated learning (FL) has garnered significant attention as a prominent\nprivacy-preserving Machine Learning (ML) paradigm. Decentralized FL (DFL)\neschews traditional FL's centralized server architecture, enhancing the\nsystem's robustness and scalability. However, these advantages of DFL also\ncreate new vulnerabilities for malicious participants to execute adversarial\nattacks, especially model poisoning attacks. In model poisoning attacks,\nmalicious participants aim to diminish the performance of benign models by\ncreating and disseminating the compromised model. Existing research on model\npoisoning attacks has predominantly concentrated on undermining global models\nwithin the Centralized FL (CFL) paradigm, while there needs to be more research\nin DFL. To fill the research gap, this paper proposes an innovative model\npoisoning attack called DMPA. This attack calculates the differential\ncharacteristics of multiple malicious client models and obtains the most\neffective poisoning strategy, thereby orchestrating a collusive attack by\nmultiple participants. The effectiveness of this attack is validated across\nmultiple datasets, with results indicating that the DMPA approach consistently\nsurpasses existing state-of-the-art FL model poisoning attack strategies.",
"published": "2025-02-07T09:15:38+00:00",
"updated": "2025-02-07T09:15:38+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04771v1",
"entry_id": "http://arxiv.org/abs/2502.04771v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "8 pages, 3 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Comprehending Knowledge Graphs with Large Language Models for Recommender Systems",
"authors": [
"Ziqiang Cui",
"Yunpeng Weng",
"Xing Tang",
"Fuyuan Lyu",
"Dugang Liu",
"Xiuqiang He",
"Chen Ma"
],
"summary": "In recent years, the introduction of knowledge graphs (KGs) has significantly\nadvanced recommender systems by facilitating the discovery of potential\nassociations between items. However, existing methods still face several\nlimitations. First, most KGs suffer from missing facts or limited scopes.\nSecond, existing methods convert textual information in KGs into IDs, resulting\nin the loss of natural semantic connections between different items. Third,\nexisting methods struggle to capture high-order connections in the global KG.\nTo address these limitations, we propose a novel method called CoLaKG, which\nleverages large language models (LLMs) to improve KG-based recommendations. The\nextensive world knowledge and remarkable reasoning capabilities of LLMs enable\nour method to supplement missing facts in KGs. Additionally, their powerful\ntext understanding abilities allow for better utilization of semantic\ninformation. Specifically, CoLaKG extracts useful information from the KG at\nboth local and global levels. By employing item-centered subgraph extraction\nand prompt engineering, it accurately captures the local KG. Subsequently,\nthrough retrieval-based neighbor enhancement, it supplements the current item\nby capturing related items from the entire KG, thereby effectively utilizing\nglobal information. The local and global information extracted by the LLM are\neffectively integrated into the recommendation model through a representation\nfusion module and a retrieval-augmented representation learning module,\nrespectively, thereby improving recommendation performance. Extensive\nexperiments on four real-world datasets demonstrate the superiority of our\nmethod.",
"published": "2024-10-16T04:44:34+00:00",
"updated": "2025-02-07T09:08:17+00:00",
"pdf_url": "http://arxiv.org/pdf/2410.12229v2",
"entry_id": "http://arxiv.org/abs/2410.12229v2",
"categories": [
"cs.IR",
"cs.AI"
],
"primary_category": "cs.IR",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Self-Clustering Graph Transformer Approach to Model Resting-State Functional Brain Activity",
"authors": [
"Bishal Thapaliya",
"Esra Akbas",
"Ram Sapkota",
"Bhaskar Ray",
"Vince Calhoun",
"Jingyu Liu"
],
"summary": "Resting-state functional magnetic resonance imaging (rs-fMRI) offers valuable\ninsights into the human brain's functional organization and is a powerful tool\nfor investigating the relationship between brain function and cognitive\nprocesses, as it allows for the functional organization of the brain to be\ncaptured without relying on a specific task or stimuli. In this study, we\nintroduce a novel attention mechanism for graphs with subnetworks, named\nSelf-Clustering Graph Transformer (SCGT), designed to handle the issue of\nuniform node updates in graph transformers. By using static functional\nconnectivity (FC) correlation features as input to the transformer model, SCGT\neffectively captures the sub-network structure of the brain by performing\ncluster-specific updates to the nodes, unlike uniform node updates in vanilla\ngraph transformers, further allowing us to learn and interpret the subclusters.\nWe validate our approach on the Adolescent Brain Cognitive Development (ABCD)\ndataset, comprising 7,957 participants, for the prediction of total cognitive\nscore and gender classification. Our results demonstrate that SCGT outperforms\nthe vanilla graph transformer method and other recent models, offering a\npromising tool for modeling brain functional connectivity and interpreting the\nunderlying subnetwork structures.",
"published": "2025-01-17T20:21:31+00:00",
"updated": "2025-02-07T08:57:37+00:00",
"pdf_url": "http://arxiv.org/pdf/2501.16345v2",
"entry_id": "http://arxiv.org/abs/2501.16345v2",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": "5 pages, 2 figures - Accepted under International Symposium on\n Biomedical Imaging (ISBI 2025) Conference",
"journal_ref": null,
"doi": null
},
{
"title": "Mastering the Craft of Data Synthesis for CodeLLMs",
"authors": [
"Meng Chen",
"Philip Arthur",
"Qianyu Feng",
"Cong Duy Vu Hoang",
"Yu-Heng Hong",
"Mahdi Kazemi Moghaddam",
"Omid Nezami",
"Thien Nguyen",
"Gioacchino Tangari",
"Duy Vu",
"Thanh Vu",
"Mark Johnson",
"Krishnaram Kenthapadi",
"Don Dharmasiri",
"Long Duong",
"Yuan-Fang Li"
],
"summary": "Large language models (LLMs) have shown impressive performance in \\emph{code}\nunderstanding and generation, making coding tasks a key focus for researchers\ndue to their practical applications and value as a testbed for LLM evaluation.\nData synthesis and filtering techniques have been widely adopted and shown to\nbe highly effective in this context. In this paper, we present a focused survey\nand taxonomy of these techniques, emphasizing recent advancements. We highlight\nkey challenges, explore future research directions, and offer practical\nguidance for new researchers entering the field.",
"published": "2024-10-16T11:57:14+00:00",
"updated": "2025-02-07T08:49:48+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.00005v3",
"entry_id": "http://arxiv.org/abs/2411.00005v3",
"categories": [
"cs.SE",
"cs.AI"
],
"primary_category": "cs.SE",
"comment": "Accepted at NAACL 2025",
"journal_ref": null,
"doi": null
},
{
"title": "Graph Federated Learning Based Proactive Content Caching in Edge Computing",
"authors": [
"Rui Wang"
],
"summary": "With the rapid growth of mobile data traffic and the increasing prevalence of\nvideo streaming, proactive content caching in edge computing has become crucial\nfor reducing latency and alleviating network congestion. However, traditional\ncaching strategies such as FIFO, LRU, and LFU fail to effectively predict\nfuture content popularity, while existing proactive caching approaches often\nrequire users to upload data to a central server, raising concerns regarding\nprivacy and scalability. To address these challenges, this paper proposes a\nGraph Federated Learning-based Proactive Content Caching (GFPCC) scheme that\nenhances caching efficiency while preserving user privacy. The proposed\napproach integrates federated learning and graph neural networks, enabling\nusers to locally train Light Graph Convolutional Networks (LightGCN) to capture\nuser-item relationships and predict content popularity. Instead of sharing raw\ndata, only the trained model parameters are transmitted to the central server,\nwhere a federated averaging algorithm aggregates updates, refines the global\nmodel, and selects the most popular files for proactive caching. Experimental\nevaluations on real-world datasets, such as MovieLens, demonstrate that GFPCC\noutperforms baseline caching algorithms by achieving higher cache efficiency\nthrough more accurate content popularity predictions. Moreover, the federated\nlearning framework strengthens privacy protection while maintaining efficient\nmodel training; however, scalability remains a challenge in large-scale\nnetworks with dynamic user preferences.",
"published": "2025-02-07T08:48:06+00:00",
"updated": "2025-02-07T08:48:06+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04760v1",
"entry_id": "http://arxiv.org/abs/2502.04760v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Enhancing Phishing Email Identification with Large Language Models",
"authors": [
"Catherine Lee"
],
"summary": "Phishing has long been a common tactic used by cybercriminals and continues\nto pose a significant threat in today's digital world. When phishing attacks\nbecome more advanced and sophisticated, there is an increasing need for\neffective methods to detect and prevent them. To address the challenging\nproblem of detecting phishing emails, researchers have developed numerous\nsolutions, in particular those based on machine learning (ML) algorithms. In\nthis work, we take steps to study the efficacy of large language models (LLMs)\nin detecting phishing emails. The experiments show that the LLM achieves a high\naccuracy rate at high precision; importantly, it also provides interpretable\nevidence for the decisions.",
"published": "2025-02-07T08:45:50+00:00",
"updated": "2025-02-07T08:45:50+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04759v1",
"entry_id": "http://arxiv.org/abs/2502.04759v1",
"categories": [
"cs.CR",
"cs.AI"
],
"primary_category": "cs.CR",
"comment": "9 pages, 5 figures",
"journal_ref": null,
"doi": null
},
{
"title": "Concept Navigation and Classification via Open Source Large Language Model Processing",
"authors": [
"Maël Kubli"
],
"summary": "This paper presents a novel methodological framework for detecting and\nclassifying latent constructs, including frames, narratives, and topics, from\ntextual data using Open-Source Large Language Models (LLMs). The proposed\nhybrid approach combines automated summarization with human-in-the-loop\nvalidation to enhance the accuracy and interpretability of construct\nidentification. By employing iterative sampling coupled with expert refinement,\nthe framework guarantees methodological robustness and ensures conceptual\nprecision. Applied to diverse data sets, including AI policy debates, newspaper\narticles on encryption, and the 20 Newsgroups data set, this approach\ndemonstrates its versatility in systematically analyzing complex political\ndiscourses, media framing, and topic classification tasks.",
"published": "2025-02-07T08:42:34+00:00",
"updated": "2025-02-07T08:42:34+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04756v1",
"entry_id": "http://arxiv.org/abs/2502.04756v1",
"categories": [
"cs.CL",
"cs.AI",
"cs.LG",
"I.2.7"
],
"primary_category": "cs.CL",
"comment": "35 pages, 1 figure, 7 tabels",
"journal_ref": null,
"doi": null
},
{
"title": "On the Expressive Power of Sparse Geometric MPNNs",
"authors": [
"Yonatan Sverdlov",
"Nadav Dym"
],
"summary": "Motivated by applications in chemistry and other sciences, we study the\nexpressive power of message-passing neural networks for geometric graphs, whose\nnode features correspond to 3-dimensional positions. Recent work has shown that\nsuch models can separate generic pairs of non-isomorphic geometric graphs,\nthough they may fail to separate some rare and complicated instances. However,\nthese results assume a fully connected graph, where each node possesses\ncomplete knowledge of all other nodes. In contrast, often, in application,\nevery node only possesses knowledge of a small number of nearest neighbors.\n This paper shows that generic pairs of non-isomorphic geometric graphs can be\nseparated by message-passing networks with rotation equivariant features as\nlong as the underlying graph is connected. When only invariant intermediate\nfeatures are allowed, generic separation is guaranteed for generically globally\nrigid graphs. We introduce a simple architecture, EGENNET, which achieves our\ntheoretical guarantees and compares favorably with alternative architecture on\nsynthetic and chemical benchmarks. Our code is available at\nhttps://github.com/yonatansverdlov/E-GenNet.",
"published": "2024-07-02T07:48:22+00:00",
"updated": "2025-02-07T08:39:56+00:00",
"pdf_url": "http://arxiv.org/pdf/2407.02025v3",
"entry_id": "http://arxiv.org/abs/2407.02025v3",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "Every Software as an Agent: Blueprint and Case Study",
"authors": [
"Mengwei Xu"
],
"summary": "The rise of (multimodal) large language models (LLMs) has shed light on\nsoftware agent -- where software can understand and follow user instructions in\nnatural language. However, existing approaches such as API-based and GUI-based\nagents are far from satisfactory at accuracy and efficiency aspects. Instead,\nwe advocate to endow LLMs with access to the software internals (source code\nand runtime context) and the permission to dynamically inject generated code\ninto software for execution. In such a whitebox setting, one may better\nleverage the software context and the coding ability of LLMs. We then present\nan overall design architecture and case studies on two popular web-based\ndesktop applications. We also give in-depth discussion of the challenges and\nfuture directions. We deem that such a new paradigm has the potential to\nfundamentally overturn the existing software agent design, and finally creating\na digital world in which software can comprehend, operate, collaborate, and\neven think to meet complex user needs.",
"published": "2025-02-07T08:29:09+00:00",
"updated": "2025-02-07T08:29:09+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04747v1",
"entry_id": "http://arxiv.org/abs/2502.04747v1",
"categories": [
"cs.SE",
"cs.AI"
],
"primary_category": "cs.SE",
"comment": null,
"journal_ref": null,
"doi": null
},
{
"title": "DiMSUM: Diffusion Mamba -- A Scalable and Unified Spatial-Frequency Method for Image Generation",
"authors": [
"Hao Phung",
"Quan Dao",
"Trung Dao",
"Hoang Phan",
"Dimitris Metaxas",
"Anh Tran"
],
"summary": "We introduce a novel state-space architecture for diffusion models,\neffectively harnessing spatial and frequency information to enhance the\ninductive bias towards local features in input images for image generation\ntasks. While state-space networks, including Mamba, a revolutionary advancement\nin recurrent neural networks, typically scan input sequences from left to\nright, they face difficulties in designing effective scanning strategies,\nespecially in the processing of image data. Our method demonstrates that\nintegrating wavelet transformation into Mamba enhances the local structure\nawareness of visual inputs and better captures long-range relations of\nfrequencies by disentangling them into wavelet subbands, representing both low-\nand high-frequency components. These wavelet-based outputs are then processed\nand seamlessly fused with the original Mamba outputs through a cross-attention\nfusion layer, combining both spatial and frequency information to optimize the\norder awareness of state-space models which is essential for the details and\noverall quality of image generation. Besides, we introduce a globally-shared\ntransformer to supercharge the performance of Mamba, harnessing its exceptional\npower to capture global relationships. Through extensive experiments on\nstandard benchmarks, our method demonstrates superior results compared to DiT\nand DIFFUSSM, achieving faster training convergence and delivering high-quality\noutputs. The codes and pretrained models are released at\nhttps://github.com/VinAIResearch/DiMSUM.git.",
"published": "2024-11-06T18:59:17+00:00",
"updated": "2025-02-07T08:13:54+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.04168v3",
"entry_id": "http://arxiv.org/abs/2411.04168v3",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": "Accepted to NeurIPS 2024. Project page:\n https://vinairesearch.github.io/DiMSUM/",
"journal_ref": null,
"doi": null
},
{
"title": "The Battling Influencers Game: Nash Equilibria Structure of a Potential Game and Implications to Value Alignment",
"authors": [
"Young Wu",
"Yancheng Zhu",
"Jin-Yi Cai",
"Xiaojin Zhu"
],
"summary": "When multiple influencers attempt to compete for a receiver's attention,\ntheir influencing strategies must account for the presence of one another. We\nintroduce the Battling Influencers Game (BIG), a multi-player simultaneous-move\ngeneral-sum game, to provide a game-theoretic characterization of this social\nphenomenon. We prove that BIG is a potential game, that it has either one or an\ninfinite number of pure Nash equilibria (NEs), and these pure NEs can be found\nby convex optimization. Interestingly, we also prove that at any pure NE, all\n(except at most one) influencers must exaggerate their actions to the maximum\nextent. In other words, it is rational for the influencers to be non-truthful\nand extreme because they anticipate other influencers to cancel out part of\ntheir influence. We discuss the implications of BIG to value alignment.",
"published": "2025-02-03T07:45:41+00:00",
"updated": "2025-02-07T08:10:54+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.01127v3",
"entry_id": "http://arxiv.org/abs/2502.01127v3",
"categories": [
"cs.GT",
"cs.AI"
],
"primary_category": "cs.GT",
"comment": "9 pages, 8 figures",
"journal_ref": null,
"doi": null
},
{
"title": "People use fast, goal-directed simulation to reason about novel games",
"authors": [
"Cedegao E. Zhang",
"Katherine M. Collins",
"Lionel Wong",
"Mauricio Barba",
"Adrian Weller",
"Joshua B. Tenenbaum"
],
"summary": "People can evaluate features of problems and their potential solutions well\nbefore we can effectively solve them. When considering a game we have never\nplayed, for instance, we might infer whether it is likely to be challenging,\nfair, or fun simply from hearing the game rules, prior to deciding whether to\ninvest time in learning the game or trying to play it well. Many studies of\ngame play have focused on optimality and expertise, characterizing how people\nand computational models play based on moderate to extensive search and after\nplaying a game dozens (if not thousands or millions) of times. Here, we study\nhow people reason about a range of simple but novel Connect-N style board\ngames. We ask people to judge how fair and how fun the games are from very\nlittle experience: just thinking about the game for a minute or so, before they\nhave ever actually played with anyone else, and we propose a resource-limited\nmodel that captures their judgments using only a small number of partial game\nsimulations and almost no look-ahead search.",
"published": "2024-07-19T07:59:04+00:00",
"updated": "2025-02-07T08:03:50+00:00",
"pdf_url": "http://arxiv.org/pdf/2407.14095v2",
"entry_id": "http://arxiv.org/abs/2407.14095v2",
"categories": [
"cs.GT",
"cs.AI",
"q-bio.NC"
],
"primary_category": "cs.GT",
"comment": "Accepted at CogSci 2024 as a talk",
"journal_ref": null,
"doi": null
},
{
"title": "Generating Symbolic World Models via Test-time Scaling of Large Language Models",
"authors": [
"Zhouliang Yu",
"Yuhuan Yuan",
"Tim Z. Xiao",
"Fuxiang Frank Xia",
"Jie Fu",
"Ge Zhang",
"Ge Lin",
"Weiyang Liu"
],
"summary": "Solving complex planning problems requires Large Language Models (LLMs) to\nexplicitly model the state transition to avoid rule violations, comply with\nconstraints, and ensure optimality-a task hindered by the inherent ambiguity of\nnatural language. To overcome such ambiguity, Planning Domain Definition\nLanguage (PDDL) is leveraged as a planning abstraction that enables precise and\nformal state descriptions. With PDDL, we can generate a symbolic world model\nwhere classic searching algorithms, such as A*, can be seamlessly applied to\nfind optimal plans. However, directly generating PDDL domains with current LLMs\nremains an open challenge due to the lack of PDDL training data. To address\nthis challenge, we propose to scale up the test-time computation of LLMs to\nenhance their PDDL reasoning capabilities, thereby enabling the generation of\nhigh-quality PDDL domains. Specifically, we introduce a simple yet effective\nalgorithm, which first employs a Best-of-N sampling approach to improve the\nquality of the initial solution and then refines the solution in a fine-grained\nmanner with verbalized machine learning. Our method outperforms o1-mini by a\nconsiderable margin in the generation of PDDL domain, achieving over 50%\nsuccess rate on two tasks (i.e., generating PDDL domains from natural language\ndescription or PDDL problems). This is done without requiring additional\ntraining. By taking advantage of PDDL as state abstraction, our method is able\nto outperform current state-of-the-art methods on almost all competition-level\nplanning tasks.",
"published": "2025-02-07T07:52:25+00:00",
"updated": "2025-02-07T07:52:25+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04728v1",
"entry_id": "http://arxiv.org/abs/2502.04728v1",
"categories": [
"cs.AI"
],
"primary_category": "cs.AI",
"comment": "Technical Report v1 (32 pages, 6 figures)",
"journal_ref": null,
"doi": null
},
{
"title": "Can Diffusion Models Learn Hidden Inter-Feature Rules Behind Images?",
"authors": [
"Yujin Han",
"Andi Han",
"Wei Huang",
"Chaochao Lu",
"Difan Zou"
],
"summary": "Despite the remarkable success of diffusion models (DMs) in data generation,\nthey exhibit specific failure cases with unsatisfactory outputs. We focus on\none such limitation: the ability of DMs to learn hidden rules between image\nfeatures. Specifically, for image data with dependent features ($\\mathbf{x}$)\nand ($\\mathbf{y}$) (e.g., the height of the sun ($\\mathbf{x}$) and the length\nof the shadow ($\\mathbf{y}$)), we investigate whether DMs can accurately\ncapture the inter-feature rule ($p(\\mathbf{y}|\\mathbf{x})$). Empirical\nevaluations on mainstream DMs (e.g., Stable Diffusion 3.5) reveal consistent\nfailures, such as inconsistent lighting-shadow relationships and mismatched\nobject-mirror reflections. Inspired by these findings, we design four synthetic\ntasks with strongly correlated features to assess DMs' rule-learning abilities.\nExtensive experiments show that while DMs can identify coarse-grained rules,\nthey struggle with fine-grained ones. Our theoretical analysis demonstrates\nthat DMs trained via denoising score matching (DSM) exhibit constant errors in\nlearning hidden rules, as the DSM objective is not compatible with rule\nconformity. To mitigate this, we introduce a common technique - incorporating\nadditional classifier guidance during sampling, which achieves (limited)\nimprovements. Our analysis reveals that the subtle signals of fine-grained\nrules are challenging for the classifier to capture, providing insights for\nfuture exploration.",
"published": "2025-02-07T07:49:37+00:00",
"updated": "2025-02-07T07:49:37+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04725v1",
"entry_id": "http://arxiv.org/abs/2502.04725v1",
"categories": [
"cs.CV",
"cs.AI"
],
"primary_category": "cs.CV",
"comment": "25 pages, 18 figures, 3 tables",
"journal_ref": null,
"doi": null
},
{
"title": "Simple and Provable Scaling Laws for the Test-Time Compute of Large Language Models",
"authors": [
"Yanxi Chen",
"Xuchen Pan",
"Yaliang Li",
"Bolin Ding",
"Jingren Zhou"
],
"summary": "We propose two simple yet principled algorithms that enjoy provable scaling\nlaws for the test-time compute of large language models (LLMs), which require a\nblack-box LLM and nothing else (e.g., no external verifier or reward model) for\na minimalistic implementation. (i) The first one is a two-stage knockout-style\nalgorithm: given an input problem, it first generates multiple candidate\nsolutions, and then aggregate them for a final output, via a knockout\ntournament where pairwise comparisons among the candidates are conducted.\nAssuming that the LLM can generate a correct solution with non-zero probability\nand do better than a random guess in comparing a pair of correct and incorrect\nsolutions, we prove theoretically that the failure probability of this\nalgorithm decays to zero exponentially or by a power law (depending on the\nspecific way of scaling) as its test-time compute grows. (ii) The second one is\na two-stage league-style algorithm, where each candidate solution is evaluated\nby its average win rate against multiple opponents, rather than eliminated upon\nloss to a single opponent. Under certain technical assumptions that are\nanalogous to but more robust than those required by the knockout-style\nalgorithm, we prove theoretically that the failure probability of the\nleague-style algorithm also decays to zero exponentially as its test-time\ncompute grows. Through extensive experiments with two challenging benchmarks,\nnamely GPQA and MMLU-Pro, we validate the proposed theories and demonstrate the\noutstanding scaling properties of both algorithms.",
"published": "2024-11-29T05:29:47+00:00",
"updated": "2025-02-07T07:08:29+00:00",
"pdf_url": "http://arxiv.org/pdf/2411.19477v2",
"entry_id": "http://arxiv.org/abs/2411.19477v2",
"categories": [
"cs.CL",
"cs.AI",
"cs.LG"
],
"primary_category": "cs.CL",
"comment": "arXiv v2 update: additional algorithms, theories and experiments",
"journal_ref": null,
"doi": null
},
{
"title": "EigenLoRAx: Recycling Adapters to Find Principal Subspaces for Resource-Efficient Adaptation and Inference",
"authors": [
"Prakhar Kaushik",
"Ankit Vaidya",
"Shravan Chaudhari",
"Alan Yuille"
],
"summary": "The rapid growth of large models has raised concerns about their\nenvironmental impact and equity in accessibility due to significant\ncomputational costs. Low-Rank Adapters (LoRA) offer a lightweight solution for\nfinetuning large models, resulting in an abundance of publicly available\nadapters tailored to diverse domains. We ask: Can these pretrained adapters be\nleveraged to further streamline adaptation to new tasks while addressing these\nchallenges? We introduce EigenLoRAx, a parameter-efficient finetuning method\nthat recycles existing adapters to create a principal subspace aligned with\ntheir shared domain knowledge which can be further augmented with orthogonal\nbasis vectors in low-resource scenarios. This enables rapid adaptation to new\ntasks by learning only lightweight coefficients on the principal components of\nthe subspace - eliminating the need to finetune entire adapters. EigenLoRAx\nrequires significantly fewer parameters and memory, improving efficiency for\nboth training and inference. Our method demonstrates strong performance across\ndiverse domains and tasks, offering a scalable for edge-based applications,\npersonalization, and equitable deployment of large models in\nresource-constrained environments.",
"published": "2025-02-07T07:07:04+00:00",
"updated": "2025-02-07T07:07:04+00:00",
"pdf_url": "http://arxiv.org/pdf/2502.04700v1",
"entry_id": "http://arxiv.org/abs/2502.04700v1",
"categories": [
"cs.LG",
"cs.AI"
],
"primary_category": "cs.LG",
"comment": null,
"journal_ref": null,
"doi": null
}
]