Output formatting: preserve quoting for string categories (#61891)

jorisvandenbossche · web-flow · commit c849d39c4c95 · 2025-07-25T15:26:36.000+02:00
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2233,8 +2233,16 @@ def _repr_categories(self) -> list[str]:
         )
         from pandas.io.formats import format as fmt
 
+        formatter = None
+        if self.categories.dtype == "str":
+            # format_array defaults to the extension array's boxed=True formatter;
+            # override it with boxed=False to be consistent with QUOTE_NONNUMERIC.
+            formatter = cast(ExtensionArray, self.categories._values)._formatter(
+                boxed=False
+            )
+
         format_array = partial(
-            fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
+            fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC
         )
         if len(self.categories) > max_categories:
             num = max_categories // 2
diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
@@ -16,16 +16,11 @@
 class TestCategoricalReprWithFactor:
     def test_print(self, using_infer_string):
         factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
-        if using_infer_string:
-            expected = [
-                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-                "Categories (3, str): [a < b < c]",
-            ]
-        else:
-            expected = [
-                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-                "Categories (3, object): ['a' < 'b' < 'c']",
-            ]
+        dtype = "str" if using_infer_string else "object"
+        expected = [
+            "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+            f"Categories (3, {dtype}): ['a' < 'b' < 'c']",
+        ]
         expected = "\n".join(expected)
         actual = repr(factor)
         assert actual == expected
@@ -82,10 +77,7 @@ def test_unicode_print(self, using_infer_string):
 Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
 
         if using_infer_string:
-            expected = expected.replace(
-                "(3, object): ['aaaaa', 'bb', 'cccc']",
-                "(3, str): [aaaaa, bb, cccc]",
-            )
+            expected = expected.replace("object", "str")
 
         assert repr(c) == expected
 
@@ -96,10 +88,7 @@ def test_unicode_print(self, using_infer_string):
 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
 
         if using_infer_string:
-            expected = expected.replace(
-                "(3, object): ['ああああ', 'いいいいい', 'ううううううう']",
-                "(3, str): [ああああ, いいいいい, ううううううう]",
-            )
+            expected = expected.replace("object", "str")
 
         assert repr(c) == expected
 
@@ -112,12 +101,9 @@ def test_unicode_print(self, using_infer_string):
 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
 
         if using_infer_string:
-            expected = expected.replace(
-                "(3, object): ['ああああ', 'いいいいい', 'ううううううう']",
-                "(3, str): [ああああ, いいいいい, ううううううう]",
-            )
+            expected = expected.replace("object", "str")
 
-            assert repr(c) == expected
+        assert repr(c) == expected
 
     def test_categorical_repr(self):
         c = Categorical([1, 2, 3])
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import index as libindex
 from pandas._libs.arrays import NDArrayBacked
 
@@ -199,8 +197,6 @@ def test_unique(self, data, categories, expected_data, ordered):
         expected = CategoricalIndex(expected_data, dtype=dtype)
         tm.assert_index_equal(idx.unique(), expected)
 
-    # TODO(3.0): remove this test once using_string_dtype() is always True
-    @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip")
     def test_repr_roundtrip(self):
         ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
         str(ci)
diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py
@@ -8,125 +8,78 @@
 
 
 class TestCategoricalIndexReprStringCategories:
-    def test_string_categorical_index_repr(self, using_infer_string):
+    def test_string_categorical_index_repr(self):
         # short
         idx = CategoricalIndex(["a", "bb", "ccc"])
         expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['a', 'bb', 'ccc']",
-                "categories=[a, bb, ccc]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_multiline(self, using_infer_string):
+    def test_categorical_index_repr_multiline(self):
         # multiple lines
         idx = CategoricalIndex(["a", "bb", "ccc"] * 10)
         expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                   'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
                   'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                  categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['a', 'bb', 'ccc']",
-                "categories=[a, bb, ccc]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_truncated(self, using_infer_string):
+    def test_categorical_index_repr_truncated(self):
         # truncated
         idx = CategoricalIndex(["a", "bb", "ccc"] * 100)
         expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                   ...
                   'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                  categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['a', 'bb', 'ccc']",
-                "categories=[a, bb, ccc]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_many_categories(self, using_infer_string):
+    def test_categorical_index_repr_many_categories(self):
         # larger categories
         idx = CategoricalIndex(list("abcdefghijklmmo"))
         expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                   'm', 'm', 'o'],
                  categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o']",
-                "categories=[a, b, c, d, ..., k, l, m, o]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_unicode(self, using_infer_string):
+    def test_categorical_index_repr_unicode(self):
         # short
         idx = CategoricalIndex(["あ", "いい", "ううう"])
         expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['あ', 'いい', 'ううう']",
-                "categories=[あ, いい, ううう]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_unicode_multiline(self, using_infer_string):
+    def test_categorical_index_repr_unicode_multiline(self):
         # multiple lines
         idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
         expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                   'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                   'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                  categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['あ', 'いい', 'ううう']",
-                "categories=[あ, いい, ううう]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_unicode_truncated(self, using_infer_string):
+    def test_categorical_index_repr_unicode_truncated(self):
         # truncated
         idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
         expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                   ...
                   'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                  categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['あ', 'いい', 'ううう']",
-                "categories=[あ, いい, ううう]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_unicode_many_categories(self, using_infer_string):
+    def test_categorical_index_repr_unicode_many_categories(self):
         # larger categories
         idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
         expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
                   'す', 'せ', 'そ'],
                  categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')"""  # noqa: E501
-        if using_infer_string:
-            expected = expected.replace(
-                "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']",
-                "categories=[あ, い, う, え, ..., し, す, せ, そ]",
-            )
         assert repr(idx) == expected
 
-    def test_categorical_index_repr_east_asian_width(self, using_infer_string):
+    def test_categorical_index_repr_east_asian_width(self):
         with cf.option_context("display.unicode.east_asian_width", True):
             # short
             idx = CategoricalIndex(["あ", "いい", "ううう"])
             expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa: E501
-            if using_infer_string:
-                expected = expected.replace(
-                    "categories=['あ', 'いい', 'ううう']",
-                    "categories=[あ, いい, ううう]",
-                )
             assert repr(idx) == expected
 
-    def test_categorical_index_repr_east_asian_width_multiline(
-        self, using_infer_string
-    ):
+    def test_categorical_index_repr_east_asian_width_multiline(self):
         with cf.option_context("display.unicode.east_asian_width", True):
             # multiple lines
             idx = CategoricalIndex(["あ", "いい", "ううう"] * 10)
@@ -136,16 +89,9 @@ def test_categorical_index_repr_east_asian_width_multiline(
                   'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                  categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa: E501
 
-            if using_infer_string:
-                expected = expected.replace(
-                    "categories=['あ', 'いい', 'ううう']",
-                    "categories=[あ, いい, ううう]",
-                )
             assert repr(idx) == expected
 
-    def test_categorical_index_repr_east_asian_width_truncated(
-        self, using_infer_string
-    ):
+    def test_categorical_index_repr_east_asian_width_truncated(self):
         with cf.option_context("display.unicode.east_asian_width", True):
             # truncated
             idx = CategoricalIndex(["あ", "いい", "ううう"] * 100)
@@ -156,25 +102,13 @@ def test_categorical_index_repr_east_asian_width_truncated(
                   'あ', 'いい', 'ううう'],
                  categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa: E501
 
-            if using_infer_string:
-                expected = expected.replace(
-                    "categories=['あ', 'いい', 'ううう']",
-                    "categories=[あ, いい, ううう]",
-                )
             assert repr(idx) == expected
 
-    def test_categorical_index_repr_east_asian_width_many_categories(
-        self, using_infer_string
-    ):
+    def test_categorical_index_repr_east_asian_width_many_categories(self):
         with cf.option_context("display.unicode.east_asian_width", True):
             idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
             expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
                   'さ', 'し', 'す', 'せ', 'そ'],
                  categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')"""  # noqa: E501
 
-            if using_infer_string:
-                expected = expected.replace(
-                    "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']",
-                    "categories=[あ, い, う, え, ..., し, す, せ, そ]",
-                )
             assert repr(idx) == expected
diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py
@@ -309,38 +309,27 @@ def test_categorical_repr(self, using_infer_string):
         assert exp == a.__str__()
 
         a = Series(Categorical(["a", "b"] * 25))
+        exp = (
+            "0     a\n1     b\n"
+            "     ..\n"
+            "48    a\n49    b\n"
+            "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']"
+        )
         if using_infer_string:
-            exp = (
-                "0     a\n1     b\n"
-                "     ..\n"
-                "48    a\n49    b\n"
-                "Length: 50, dtype: category\nCategories (2, str): [a, b]"
-            )
-        else:
-            exp = (
-                "0     a\n1     b\n"
-                "     ..\n"
-                "48    a\n49    b\n"
-                "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']"
-            )
+            exp = exp.replace("object", "str")
         with option_context("display.max_rows", 5):
             assert exp == repr(a)
 
         levs = list("abcdefghijklmnopqrstuvwxyz")
         a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
+        exp = (
+            "0    a\n1    b\n"
+            "dtype: category\n"
+            "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... "
+            "'w' < 'x' < 'y' < 'z']"
+        )
         if using_infer_string:
-            exp = (
-                "0    a\n1    b\n"
-                "dtype: category\n"
-                "Categories (26, str): [a < b < c < d ... w < x < y < z]"
-            )
-        else:
-            exp = (
-                "0    a\n1    b\n"
-                "dtype: category\n"
-                "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... "
-                "'w' < 'x' < 'y' < 'z']"
-            )
+            exp = exp.replace("object", "str")
         assert exp == a.__str__()
 
     def test_categorical_series_repr(self):
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py
@@ -214,24 +214,15 @@ def test_series_equal_numeric_values_mismatch(rtol):
 
 
 def test_series_equal_categorical_values_mismatch(rtol, using_infer_string):
-    if using_infer_string:
-        msg = """Series are different
-
-Series values are different \\(66\\.66667 %\\)
-\\[index\\]: \\[0, 1, 2\\]
-\\[left\\]:  \\['a', 'b', 'c'\\]
-Categories \\(3, str\\): \\[a, b, c\\]
-\\[right\\]: \\['a', 'c', 'b'\\]
-Categories \\(3, str\\): \\[a, b, c\\]"""
-    else:
-        msg = """Series are different
+    dtype = "str" if using_infer_string else "object"
+    msg = f"""Series are different
 
 Series values are different \\(66\\.66667 %\\)
 \\[index\\]: \\[0, 1, 2\\]
 \\[left\\]:  \\['a', 'b', 'c'\\]
-Categories \\(3, object\\): \\['a', 'b', 'c'\\]
+Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\]
 \\[right\\]: \\['a', 'c', 'b'\\]
-Categories \\(3, object\\): \\['a', 'b', 'c'\\]"""
+Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\]"""
 
     s1 = Series(Categorical(["a", "b", "c"]))
     s2 = Series(Categorical(["a", "c", "b"]))