@inproceedings{melamed-etal-2024-prompts,
title = "Prompts have evil twins",
author = "Melamed, Rimon and
McCabe, Lucas Hurley and
Wakhare, Tanay and
Kim, Yejin and
Huang, H. Howie and
Boix-Adser{\`a}, Enric",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.4/",
doi = "10.18653/v1/2024.emnlp-main.4",
pages = "46--74",
abstract = "We discover that many natural-language prompts can be replaced by corresponding prompts that are unintelligible to humans but that provably elicit similar behavior in language models. We call these prompts {\textquotedblleft}evil twins{\textquotedblright} because they are obfuscated and uninterpretable (evil), but at the same time mimic the functionality of the original natural-language prompts (twins). Remarkably, evil twins transfer between models. We find these prompts by solving a maximum-likelihood problem which has applications of independent interest."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="melamed-etal-2024-prompts">
<titleInfo>
<title>Prompts have evil twins</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rimon</namePart>
<namePart type="family">Melamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="given">Hurley</namePart>
<namePart type="family">McCabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanay</namePart>
<namePart type="family">Wakhare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yejin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">H</namePart>
<namePart type="given">Howie</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enric</namePart>
<namePart type="family">Boix-Adserà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We discover that many natural-language prompts can be replaced by corresponding prompts that are unintelligible to humans but that provably elicit similar behavior in language models. We call these prompts “evil twins” because they are obfuscated and uninterpretable (evil), but at the same time mimic the functionality of the original natural-language prompts (twins). Remarkably, evil twins transfer between models. We find these prompts by solving a maximum-likelihood problem which has applications of independent interest.</abstract>
<identifier type="citekey">melamed-etal-2024-prompts</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.4</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.4/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>46</start>
<end>74</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Prompts have evil twins
%A Melamed, Rimon
%A McCabe, Lucas Hurley
%A Wakhare, Tanay
%A Kim, Yejin
%A Huang, H. Howie
%A Boix-Adserà, Enric
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F melamed-etal-2024-prompts
%X We discover that many natural-language prompts can be replaced by corresponding prompts that are unintelligible to humans but that provably elicit similar behavior in language models. We call these prompts “evil twins” because they are obfuscated and uninterpretable (evil), but at the same time mimic the functionality of the original natural-language prompts (twins). Remarkably, evil twins transfer between models. We find these prompts by solving a maximum-likelihood problem which has applications of independent interest.
%R 10.18653/v1/2024.emnlp-main.4
%U https://aclanthology.org/2024.emnlp-main.4/
%U https://doi.org/10.18653/v1/2024.emnlp-main.4
%P 46-74
Markdown (Informal)
[Prompts have evil twins](https://aclanthology.org/2024.emnlp-main.4/) (Melamed et al., EMNLP 2024)
ACL
- Rimon Melamed, Lucas Hurley McCabe, Tanay Wakhare, Yejin Kim, H. Howie Huang, and Enric Boix-Adserà. 2024. Prompts have evil twins. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 46–74, Miami, Florida, USA. Association for Computational Linguistics.