fix issue with missing value in column labels

jorisvandenbossche · Dec 8, 2024 · 2cb15e5 · 2cb15e5
1 parent 0c5f862
commit 2cb15e5
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 12 deletions.
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -174,7 +174,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
         }
         string_dtype = 'object'
 
-    if name is not None and not isinstance(name, str):
+    if (
+        name is not None
+        and not (isinstance(name, float) and np.isnan(name))
+        and not isinstance(name, str)
+    ):
         raise TypeError(
             'Column name must be a string. Got column {} of type {}'.format(
                 name, type(name).__name__
@@ -331,8 +335,8 @@ def _column_name_to_strings(name):
         return str(tuple(map(_column_name_to_strings, name)))
     elif isinstance(name, Sequence):
         raise TypeError("Unsupported type for MultiIndex level")
-    elif name is None:
-        return None
+    elif name is None or (isinstance(name, float) and np.isnan(name)):
+        return name
     return str(name)
 
 
@@ -1068,9 +1072,9 @@ def get_pandas_logical_type_map():
             'date': 'datetime64[D]',
             'datetime': 'datetime64[ns]',
             'datetimetz': 'datetime64[ns]',
-            'unicode': np.str_,
+            'unicode': 'str',
             'bytes': np.bytes_,
-            'string': np.str_,
+            'string': 'str',
             'integer': np.int64,
             'floating': np.float64,
             'decimal': np.object_,

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -349,10 +349,18 @@ def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
         _check_pandas_roundtrip(df, preserve_index=True)
 
-    def test_index_metadata_field_name(self, request):
-        if _pandas_api.uses_string_dtype():
-            # https://github.com/pandas-dev/pandas/issues/59879
-            request.applymarker(pytest.mark.xfail(reason="bug in pandas string dtype"))
+    def test_float_column_index_with_missing(self):
+        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=[1.5, np.nan])
+        _check_pandas_roundtrip(df, preserve_index=True)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The DataFrame has column names of mixed type:UserWarning"
+    )
+    def test_string_column_index_with_missing(self):
+        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=["A", None])
+        _check_pandas_roundtrip(df, preserve_index=True)
+
+    def test_index_metadata_field_name(self):
         # test None case, and strangely named non-index columns
         df = pd.DataFrame(
             [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
@@ -362,17 +370,24 @@ def test_index_metadata_field_name(self, request):
             ),
             columns=['a', None, '__index_level_0__'],
         )
-        with pytest.warns(UserWarning):
+        if _pandas_api.uses_string_dtype():
             t = pa.Table.from_pandas(df, preserve_index=True)
+        else:
+            with pytest.warns(UserWarning):
+                t = pa.Table.from_pandas(df, preserve_index=True)
         js = t.schema.pandas_metadata
 
         col1, col2, col3, idx0, foo = js['columns']
 
         assert col1['name'] == 'a'
         assert col1['name'] == col1['field_name']
 
-        assert col2['name'] is None
-        assert col2['field_name'] == 'None'
+        if _pandas_api.uses_string_dtype():
+            assert np.isnan(col2['name'])
+            assert col2['field_name'] == 'nan'
+        else:
+            assert col2['name'] is None
+            assert col2['field_name'] == 'None'
 
         assert col3['name'] == '__index_level_0__'
         assert col3['name'] == col3['field_name']