From a2aa47717b03557be211036a91ed18ead69cbd6a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Tue, 4 Jan 2022 01:26:02 +0100
Subject: [PATCH] TYP: type read_xml and deprecate passing positional arguments
(#45133)
---
doc/source/whatsnew/v1.4.0.rst | 1 +
pandas/_typing.py | 1 +
pandas/io/xml.py | 73 +++++++++++++++++++--------------
pandas/tests/io/xml/test_xml.py | 39 ++++++++++++++++++
4 files changed, 83 insertions(+), 31 deletions(-)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 6f724ec221c83..a940a1a1c0a08 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -611,6 +611,7 @@ Other Deprecations
- Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
- Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`)
- Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`):
+- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`):
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
- Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`)
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 159d57fb27c89..5c8f0be39f712 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -246,6 +246,7 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
+XMLParsers = Literal["lxml", "etree"]
# types in DataFrameFormatter
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index d72f02fa817ce..ad87b18bd1683 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -5,19 +5,24 @@
from __future__ import annotations
import io
+from typing import Sequence
from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
+ XMLParsers,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
AbstractMethodError,
ParserError,
)
-from pandas.util._decorators import doc
+from pandas.util._decorators import (
+ deprecate_nonkeyword_arguments,
+ doc,
+)
from pandas.core.dtypes.common import is_list_like
@@ -98,17 +103,17 @@ class _XMLFrameParser:
def __init__(
self,
- path_or_buffer,
- xpath,
- namespaces,
- elems_only,
- attrs_only,
- names,
- encoding,
- stylesheet,
+ path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+ xpath: str,
+ namespaces: dict[str, str] | None,
+ elems_only: bool,
+ attrs_only: bool,
+ names: Sequence[str] | None,
+ encoding: str | None,
+ stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
- ) -> None:
+ ):
self.path_or_buffer = path_or_buffer
self.xpath = xpath
self.namespaces = namespaces
@@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser):
XPath 1.0 and XSLT 1.0.
"""
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
-
def parse_data(self) -> list[dict[str, str | None]]:
"""
Parse xml data.
@@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes:
curr_parser = XMLParser(encoding=self.encoding)
if isinstance(xml_data, io.StringIO):
+ if self.encoding is None:
+ raise TypeError(
+ "Can not pass encoding None when input is StringIO."
+ )
+
doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
@@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes:
def get_data_from_filepath(
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
- encoding,
+ encoding: str | None,
compression: CompressionOptions,
storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
@@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes.
def _parse(
- path_or_buffer,
- xpath,
- namespaces,
- elems_only,
- attrs_only,
- names,
- encoding,
- parser,
- stylesheet,
+ path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+ xpath: str,
+ namespaces: dict[str, str] | None,
+ elems_only: bool,
+ attrs_only: bool,
+ names: Sequence[str] | None,
+ encoding: str | None,
+ parser: XMLParsers,
+ stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
**kwargs,
@@ -686,11 +693,11 @@ def _parse(
* If parser is not lxml or etree.
"""
- lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
p: _EtreeFrameParser | _LxmlFrameParser
if parser == "lxml":
+ lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
if lxml is not None:
p = _LxmlFrameParser(
path_or_buffer,
@@ -728,19 +735,23 @@ def _parse(
return _data_to_frame(data=data_dicts, **kwargs)
+@deprecate_nonkeyword_arguments(
+ version=None, allowed_args=["path_or_buffer"], stacklevel=2
+)
@doc(
storage_options=_shared_docs["storage_options"],
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- xpath: str | None = "./*",
- namespaces: dict | list[dict] | None = None,
- elems_only: bool | None = False,
- attrs_only: bool | None = False,
- names: list[str] | None = None,
+ xpath: str = "./*",
+ namespaces: dict[str, str] | None = None,
+ elems_only: bool = False,
+ attrs_only: bool = False,
+ names: Sequence[str] | None = None,
+ # encoding can not be None for lxml and StringIO input
encoding: str | None = "utf-8",
- parser: str | None = "lxml",
+ parser: XMLParsers = "lxml",
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index b72111ec6cf1e..8809c423a29ba 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath):
tm.assert_frame_equal(df_lxml, df_etree)
+@td.skip_if_no("lxml")
+def test_wrong_encoding_for_lxml():
+ # GH#45133
+ data = """
+
+ c
+
+
+"""
+ with pytest.raises(TypeError, match="encoding None"):
+ read_xml(StringIO(data), parser="lxml", encoding=None)
+
+
+def test_none_encoding_etree():
+ # GH#45133
+ data = """
+
+ c
+
+
+"""
+ result = read_xml(StringIO(data), parser="etree", encoding=None)
+ expected = DataFrame({"a": ["c"]})
+ tm.assert_frame_equal(result, expected)
+
+
# PARSER
@@ -769,6 +795,19 @@ def test_stylesheet_file(datapath):
tm.assert_frame_equal(df_kml, df_style)
+def test_read_xml_passing_as_positional_deprecated(datapath, parser):
+ # GH#45133
+ kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
+
+ with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
+ read_xml(
+ kml,
+ ".//k:Placemark",
+ namespaces={"k": "http://www.opengis.net/kml/2.2"},
+ parser=parser,
+ )
+
+
@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")