From a2aa47717b03557be211036a91ed18ead69cbd6a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 4 Jan 2022 01:26:02 +0100 Subject: [PATCH] TYP: type read_xml and deprecate passing positional arguments (#45133) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_typing.py | 1 + pandas/io/xml.py | 73 +++++++++++++++++++-------------- pandas/tests/io/xml/test_xml.py | 39 ++++++++++++++++++ 4 files changed, 83 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6f724ec221c83..a940a1a1c0a08 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -611,6 +611,7 @@ Other Deprecations - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`) - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`) - Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`): +- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`): - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`) - Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`) - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`) diff --git a/pandas/_typing.py b/pandas/_typing.py index 159d57fb27c89..5c8f0be39f712 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -246,6 +246,7 @@ def closed(self) -> bool: CompressionOptions = Optional[ Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict] ] +XMLParsers = Literal["lxml", "etree"] # types in DataFrameFormatter diff --git a/pandas/io/xml.py b/pandas/io/xml.py index d72f02fa817ce..ad87b18bd1683 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,19 +5,24 @@ from __future__ import annotations import io +from typing import Sequence from pandas._typing import ( CompressionOptions, FilePath, ReadBuffer, StorageOptions, + XMLParsers, ) from pandas.compat._optional import import_optional_dependency from pandas.errors import ( AbstractMethodError, ParserError, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + deprecate_nonkeyword_arguments, + doc, +) from pandas.core.dtypes.common import is_list_like @@ -98,17 +103,17 @@ class _XMLFrameParser: def __init__( self, - path_or_buffer, - xpath, - namespaces, - elems_only, - attrs_only, - names, - encoding, - stylesheet, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict[str, str] | None, + elems_only: bool, + attrs_only: bool, + names: Sequence[str] | None, + encoding: str | None, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, storage_options: StorageOptions, - ) -> None: + ): self.path_or_buffer = path_or_buffer self.xpath = xpath self.namespaces = namespaces @@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser): XPath 1.0 and XSLT 1.0. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - def parse_data(self) -> list[dict[str, str | None]]: """ Parse xml data. @@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): + if self.encoding is None: + raise TypeError( + "Can not pass encoding None when input is StringIO." + ) + doc = fromstring( xml_data.getvalue().encode(self.encoding), parser=curr_parser ) @@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes: def get_data_from_filepath( filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], - encoding, + encoding: str | None, compression: CompressionOptions, storage_options: StorageOptions, ) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: @@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes. def _parse( - path_or_buffer, - xpath, - namespaces, - elems_only, - attrs_only, - names, - encoding, - parser, - stylesheet, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict[str, str] | None, + elems_only: bool, + attrs_only: bool, + names: Sequence[str] | None, + encoding: str | None, + parser: XMLParsers, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, storage_options: StorageOptions, **kwargs, @@ -686,11 +693,11 @@ def _parse( * If parser is not lxml or etree. """ - lxml = import_optional_dependency("lxml.etree", errors="ignore") - p: _EtreeFrameParser | _LxmlFrameParser if parser == "lxml": + lxml = import_optional_dependency("lxml.etree", errors="ignore") + if lxml is not None: p = _LxmlFrameParser( path_or_buffer, @@ -728,19 +735,23 @@ def _parse( return _data_to_frame(data=data_dicts, **kwargs) +@deprecate_nonkeyword_arguments( + version=None, allowed_args=["path_or_buffer"], stacklevel=2 +) @doc( storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", ) def read_xml( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], - xpath: str | None = "./*", - namespaces: dict | list[dict] | None = None, - elems_only: bool | None = False, - attrs_only: bool | None = False, - names: list[str] | None = None, + xpath: str = "./*", + namespaces: dict[str, str] | None = None, + elems_only: bool = False, + attrs_only: bool = False, + names: Sequence[str] | None = None, + # encoding can not be None for lxml and StringIO input encoding: str | None = "utf-8", - parser: str | None = "lxml", + parser: XMLParsers = "lxml", stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b72111ec6cf1e..8809c423a29ba 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath): tm.assert_frame_equal(df_lxml, df_etree) +@td.skip_if_no("lxml") +def test_wrong_encoding_for_lxml(): + # GH#45133 + data = """ + + c + + +""" + with pytest.raises(TypeError, match="encoding None"): + read_xml(StringIO(data), parser="lxml", encoding=None) + + +def test_none_encoding_etree(): + # GH#45133 + data = """ + + c + + +""" + result = read_xml(StringIO(data), parser="etree", encoding=None) + expected = DataFrame({"a": ["c"]}) + tm.assert_frame_equal(result, expected) + + # PARSER @@ -769,6 +795,19 @@ def test_stylesheet_file(datapath): tm.assert_frame_equal(df_kml, df_style) +def test_read_xml_passing_as_positional_deprecated(datapath, parser): + # GH#45133 + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with tm.assert_produces_warning(FutureWarning, match="keyword-only"): + read_xml( + kml, + ".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + parser=parser, + ) + + @td.skip_if_no("lxml") def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml")