@inproceedings{kaur-etal-2024-evaluating,
title = "Evaluating Large Language Models for Health-related Queries with Presuppositions",
author = "Kaur, Navreet and
Choudhury, Monojit and
Pruthi, Danish",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.850/",
doi = "10.18653/v1/2024.findings-acl.850",
pages = "14308--14331",
abstract = "As corporations rush to integrate large language models (LLMs) it is critical that they provide factually accurate information, that is robust to any presuppositions that a user may express. In this work, we introduce UPHILL, a dataset consisting of health-related queries with varying degrees of presuppositions. Using UPHILL, we evaluate the factual accuracy and consistency of InstructGPT, ChatGPT, GPT-4 and Bing Copilot models. We find that while model responses rarely contradict true health claims (posed as questions), all investigated models fail to challenge false claims. Alarmingly, responses from these models agree with 23-32{\%} of the existing false claims, and 49-55{\%} with novel fabricated claims. As we increase the extent of presupposition in input queries, responses from all models except Bing Copilot agree with the claim considerably more often, regardless of its veracity. Given the moderate factual accuracy, and the inability of models to challenge false assumptions, our work calls for a careful assessment of current LLMs for use in high-stakes scenarios."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kaur-etal-2024-evaluating">
    <titleInfo>
      <title>Evaluating Large Language Models for Health-related Queries with Presuppositions</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Navreet</namePart>
      <namePart type="family">Kaur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Monojit</namePart>
      <namePart type="family">Choudhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Danish</namePart>
      <namePart type="family">Pruthi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>As corporations rush to integrate large language models (LLMs), it is critical that these models provide factually accurate information that is robust to any presuppositions that a user may express. In this work, we introduce UPHILL, a dataset consisting of health-related queries with varying degrees of presupposition. Using UPHILL, we evaluate the factual accuracy and consistency of InstructGPT, ChatGPT, GPT-4, and Bing Copilot. We find that while model responses rarely contradict true health claims (posed as questions), all investigated models fail to challenge false claims. Alarmingly, responses from these models agree with 23-32% of existing false claims and with 49-55% of novel fabricated claims. As we increase the extent of presupposition in input queries, responses from all models except Bing Copilot agree with the claim considerably more often, regardless of its veracity. Given the models' moderate factual accuracy and their inability to challenge false assumptions, our work calls for a careful assessment of current LLMs for use in high-stakes scenarios.</abstract>
<identifier type="citekey">kaur-etal-2024-evaluating</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.850</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.850/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>14308</start>
<end>14331</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Large Language Models for Health-related Queries with Presuppositions
%A Kaur, Navreet
%A Choudhury, Monojit
%A Pruthi, Danish
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F kaur-etal-2024-evaluating
%X As corporations rush to integrate large language models (LLMs), it is critical that these models provide factually accurate information that is robust to any presuppositions that a user may express. In this work, we introduce UPHILL, a dataset consisting of health-related queries with varying degrees of presupposition. Using UPHILL, we evaluate the factual accuracy and consistency of InstructGPT, ChatGPT, GPT-4, and Bing Copilot. We find that while model responses rarely contradict true health claims (posed as questions), all investigated models fail to challenge false claims. Alarmingly, responses from these models agree with 23-32% of existing false claims and with 49-55% of novel fabricated claims. As we increase the extent of presupposition in input queries, responses from all models except Bing Copilot agree with the claim considerably more often, regardless of its veracity. Given the models' moderate factual accuracy and their inability to challenge false assumptions, our work calls for a careful assessment of current LLMs for use in high-stakes scenarios.
%R 10.18653/v1/2024.findings-acl.850
%U https://aclanthology.org/2024.findings-acl.850/
%U https://doi.org/10.18653/v1/2024.findings-acl.850
%P 14308-14331
Markdown (Informal)
[Evaluating Large Language Models for Health-related Queries with Presuppositions](https://aclanthology.org/2024.findings-acl.850/) (Kaur et al., Findings 2024)