@inproceedings{zhang-etal-2024-mar,
title = "{MAR}: Matching-Augmented Reasoning for Enhancing Visual-based Entity Question Answering",
author = "Zhang, Zhengxuan and
Wu, Yin and
Luo, Yuyu and
Tang, Nan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.91/",
doi = "10.18653/v1/2024.emnlp-main.91",
pages = "1520--1530",
abstract = "Multimodal large language models (MLLMs) may struggle with answering visual-based (personal) entity questions (VEQA), such as {\textquotedblleft}who is A?{\textquotedblright} or {\textquotedblleft}who is A that B is talking to?{\textquotedblright} for various reasons, e.g., the absence of the name of A in the caption or the inability of MLLMs to recognize A, particularly for less common entities. Furthermore, even if the MLLMs can identify A, they may refrain from answering due to privacy concerns. In this paper, we introduce a novel method called Matching-Augmented Reasoning (MAR) to enhance VEQA. Given a collection of visual objects with captions, MAR preprocesses each object individually, identifying faces, names, and their alignments within the object. It encodes this information and stores their vector representations in vector databases. When handling VEQA, MAR retrieves matching faces and names and organizes these entities into a matching graph. MAR then derives the answer to the query by reasoning over this matching graph. Extensive experiments show that MAR significantly improves VEQA compared with the state-of-the-art methods using MLLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2024-mar">
<titleInfo>
<title>MAR: Matching-Augmented Reasoning for Enhancing Visual-based Entity Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhengxuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuyu</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal large language models (MLLMs) may struggle with answering visual-based (personal) entity questions (VEQA), such as “who is A?” or “who is A that B is talking to?” for various reasons, e.g., the absence of the name of A in the caption or the inability of MLLMs to recognize A, particularly for less common entities. Furthermore, even if the MLLMs can identify A, they may refrain from answering due to privacy concerns. In this paper, we introduce a novel method called Matching-Augmented Reasoning (MAR) to enhance VEQA. Given a collection of visual objects with captions, MAR preprocesses each object individually, identifying faces, names, and their alignments within the object. It encodes this information and stores their vector representations in vector databases. When handling VEQA, MAR retrieves matching faces and names and organizes these entities into a matching graph. MAR then derives the answer to the query by reasoning over this matching graph. Extensive experiments show that MAR significantly improves VEQA compared with the state-of-the-art methods using MLLMs.</abstract>
<identifier type="citekey">zhang-etal-2024-mar</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.91</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.91/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1520</start>
<end>1530</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MAR: Matching-Augmented Reasoning for Enhancing Visual-based Entity Question Answering
%A Zhang, Zhengxuan
%A Wu, Yin
%A Luo, Yuyu
%A Tang, Nan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F zhang-etal-2024-mar
%X Multimodal large language models (MLLMs) may struggle with answering visual-based (personal) entity questions (VEQA), such as “who is A?” or “who is A that B is talking to?” for various reasons, e.g., the absence of the name of A in the caption or the inability of MLLMs to recognize A, particularly for less common entities. Furthermore, even if the MLLMs can identify A, they may refrain from answering due to privacy concerns. In this paper, we introduce a novel method called Matching-Augmented Reasoning (MAR) to enhance VEQA. Given a collection of visual objects with captions, MAR preprocesses each object individually, identifying faces, names, and their alignments within the object. It encodes this information and stores their vector representations in vector databases. When handling VEQA, MAR retrieves matching faces and names and organizes these entities into a matching graph. MAR then derives the answer to the query by reasoning over this matching graph. Extensive experiments show that MAR significantly improves VEQA compared with the state-of-the-art methods using MLLMs.
%R 10.18653/v1/2024.emnlp-main.91
%U https://aclanthology.org/2024.emnlp-main.91/
%U https://doi.org/10.18653/v1/2024.emnlp-main.91
%P 1520-1530
Markdown (Informal)
[MAR: Matching-Augmented Reasoning for Enhancing Visual-based Entity Question Answering](https://aclanthology.org/2024.emnlp-main.91/) (Zhang et al., EMNLP 2024)
ACL