@inproceedings{ben-kish-etal-2024-mitigating,
title = "Mitigating Open-Vocabulary Caption Hallucinations",
author = "Ben-Kish, Assaf and
Yanuka, Moran and
Alper, Morris and
Giryes, Raja and
Averbuch-Elor, Hadar",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1263/",
doi = "10.18653/v1/2024.emnlp-main.1263",
pages = "22680--22698",
abstract = "While recent years have seen rapid progress in image-conditioned text generation, image captioning still suffers from the fundamental issue of hallucinations, namely, the generation of spurious details that cannot be inferred from the given image. Existing methods largely use closed-vocabulary object lists to mitigate or evaluate hallucinations in image captioning, ignoring the long-tailed nature of hallucinations that occur in practice. To this end, we propose a framework for addressing hallucinations in image captioning in the open-vocabulary setting. Our framework includes a new benchmark, OpenCHAIR, that leverages generative foundation models to evaluate open-vocabulary object hallucinations for image captioning, surpassing the popular and similarly-sized CHAIR benchmark in both diversity and accuracy. Furthermore, to mitigate open-vocabulary hallucinations without using a closed object list, we propose MOCHa, an approach harnessing advancements in reinforcement learning. Our multi-objective reward function explicitly targets the trade-off between fidelity and adequacy in generations without requiring any strong supervision. MOCHa improves a large variety of image captioning models, as captured by our OpenCHAIR benchmark and other existing metrics. We will release our code and models."
}
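For programmatic use, the BibTeX record above parses cleanly with standard tooling. A minimal sketch, assuming the `bibtexparser` package (v1 API) and a hypothetical file `ben-kish-etal-2024-mitigating.bib` holding the entry:

```python
# Parse the BibTeX entry above and read a few fields (bibtexparser v1 API assumed).
import bibtexparser
from bibtexparser.bparser import BibTexParser

# common_strings=True resolves month abbreviations such as `nov`.
parser = BibTexParser(common_strings=True)
with open("ben-kish-etal-2024-mitigating.bib") as f:  # hypothetical file name
    db = bibtexparser.load(f, parser=parser)

entry = db.entries[0]
print(entry["ID"])     # ben-kish-etal-2024-mitigating
print(entry["title"])  # Mitigating Open-Vocabulary Caption Hallucinations
print(entry["pages"])  # 22680--22698
```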
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ben-kish-etal-2024-mitigating">
    <titleInfo>
      <title>Mitigating Open-Vocabulary Caption Hallucinations</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Assaf</namePart>
      <namePart type="family">Ben-Kish</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Moran</namePart>
      <namePart type="family">Yanuka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Morris</namePart>
      <namePart type="family">Alper</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Raja</namePart>
      <namePart type="family">Giryes</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hadar</namePart>
      <namePart type="family">Averbuch-Elor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>While recent years have seen rapid progress in image-conditioned text generation, image captioning still suffers from the fundamental issue of hallucinations, namely, the generation of spurious details that cannot be inferred from the given image. Existing methods largely use closed-vocabulary object lists to mitigate or evaluate hallucinations in image captioning, ignoring the long-tailed nature of hallucinations that occur in practice. To this end, we propose a framework for addressing hallucinations in image captioning in the open-vocabulary setting. Our framework includes a new benchmark, OpenCHAIR, that leverages generative foundation models to evaluate open-vocabulary object hallucinations for image captioning, surpassing the popular and similarly-sized CHAIR benchmark in both diversity and accuracy. Furthermore, to mitigate open-vocabulary hallucinations without using a closed object list, we propose MOCHa, an approach harnessing advancements in reinforcement learning. Our multi-objective reward function explicitly targets the trade-off between fidelity and adequacy in generations without requiring any strong supervision. MOCHa improves a large variety of image captioning models, as captured by our OpenCHAIR benchmark and other existing metrics. We will release our code and models.</abstract>
    <identifier type="citekey">ben-kish-etal-2024-mitigating</identifier>
    <identifier type="doi">10.18653/v1/2024.emnlp-main.1263</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-main.1263/</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>22680</start>
        <end>22698</end>
      </extent>
    </part>
  </mods>
</modsCollection>
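The MODS record above can be read with the Python standard library alone. A minimal sketch, assuming the XML is saved to a hypothetical file `ben-kish-etal-2024-mitigating.xml`:

```python
# Extract the title and author names from the MODS XML record above.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}
tree = ET.parse("ben-kish-etal-2024-mitigating.xml")  # hypothetical file name
mods = tree.getroot().find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text
# Direct <name> children of <mods> are the authors; the editors sit under
# <relatedItem> and are therefore not matched by this path.
authors = [
    " ".join(part.text for part in name.findall("m:namePart", NS))
    for name in mods.findall("m:name", NS)
]
print(title)    # Mitigating Open-Vocabulary Caption Hallucinations
print(authors)  # ['Assaf Ben-Kish', 'Moran Yanuka', 'Morris Alper', ...]
```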
%0 Conference Proceedings
%T Mitigating Open-Vocabulary Caption Hallucinations
%A Ben-Kish, Assaf
%A Yanuka, Moran
%A Alper, Morris
%A Giryes, Raja
%A Averbuch-Elor, Hadar
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ben-kish-etal-2024-mitigating
%X While recent years have seen rapid progress in image-conditioned text generation, image captioning still suffers from the fundamental issue of hallucinations, namely, the generation of spurious details that cannot be inferred from the given image. Existing methods largely use closed-vocabulary object lists to mitigate or evaluate hallucinations in image captioning, ignoring the long-tailed nature of hallucinations that occur in practice. To this end, we propose a framework for addressing hallucinations in image captioning in the open-vocabulary setting. Our framework includes a new benchmark, OpenCHAIR, that leverages generative foundation models to evaluate open-vocabulary object hallucinations for image captioning, surpassing the popular and similarly-sized CHAIR benchmark in both diversity and accuracy. Furthermore, to mitigate open-vocabulary hallucinations without using a closed object list, we propose MOCHa, an approach harnessing advancements in reinforcement learning. Our multi-objective reward function explicitly targets the trade-off between fidelity and adequacy in generations without requiring any strong supervision. MOCHa improves a large variety of image captioning models, as captured by our OpenCHAIR benchmark and other existing metrics. We will release our code and models.
%R 10.18653/v1/2024.emnlp-main.1263
%U https://aclanthology.org/2024.emnlp-main.1263/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1263
%P 22680-22698
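The EndNote record above is a simple line-tagged format: each line begins with a `%` tag followed by a value, and repeatable tags (`%A`, `%Y`, `%U`) appear once per value. A minimal sketch of reading it into a dict of value lists, assuming a hypothetical file `ben-kish-etal-2024-mitigating.enw`:

```python
# Read the tagged EndNote record above into a dict mapping tags to value lists.
from collections import defaultdict

record = defaultdict(list)
with open("ben-kish-etal-2024-mitigating.enw") as f:  # hypothetical file name
    for line in f:
        if line.startswith("%"):
            tag, _, value = line.partition(" ")
            record[tag].append(value.strip())

print(record["%T"][0])  # Mitigating Open-Vocabulary Caption Hallucinations
print(record["%A"])     # all five author names, in order
print(record["%U"])     # both URLs: anthology page and DOI resolver
```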
Markdown (Informal)
[Mitigating Open-Vocabulary Caption Hallucinations](https://aclanthology.org/2024.emnlp-main.1263/) (Ben-Kish et al., EMNLP 2024)

ACL
Assaf Ben-Kish, Moran Yanuka, Morris Alper, Raja Giryes, and Hadar Averbuch-Elor. 2024. Mitigating Open-Vocabulary Caption Hallucinations. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 22680–22698, Miami, Florida, USA. Association for Computational Linguistics.