Skip to content

Commit

Permalink
TYP: type read_xml and deprecate passing positional arguments (pandas…
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Jan 4, 2022
1 parent 490b189 commit a2aa477
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 31 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ Other Deprecations
- Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
- Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`)
- Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`):
- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`):
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
- Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`)
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
XMLParsers = Literal["lxml", "etree"]


# types in DataFrameFormatter
Expand Down
73 changes: 42 additions & 31 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,24 @@
from __future__ import annotations

import io
from typing import Sequence

from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
XMLParsers,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
AbstractMethodError,
ParserError,
)
from pandas.util._decorators import doc
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)

from pandas.core.dtypes.common import is_list_like

Expand Down Expand Up @@ -98,17 +103,17 @@ class _XMLFrameParser:

def __init__(
self,
path_or_buffer,
xpath,
namespaces,
elems_only,
attrs_only,
names,
encoding,
stylesheet,
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str,
namespaces: dict[str, str] | None,
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
) -> None:
):
self.path_or_buffer = path_or_buffer
self.xpath = xpath
self.namespaces = namespaces
Expand Down Expand Up @@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser):
XPath 1.0 and XSLT 1.0.
"""

def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

def parse_data(self) -> list[dict[str, str | None]]:
"""
Parse xml data.
Expand Down Expand Up @@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes:
curr_parser = XMLParser(encoding=self.encoding)

if isinstance(xml_data, io.StringIO):
if self.encoding is None:
raise TypeError(
"Can not pass encoding None when input is StringIO."
)

doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
Expand All @@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes:

def get_data_from_filepath(
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
encoding,
encoding: str | None,
compression: CompressionOptions,
storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
Expand Down Expand Up @@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes.


def _parse(
path_or_buffer,
xpath,
namespaces,
elems_only,
attrs_only,
names,
encoding,
parser,
stylesheet,
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str,
namespaces: dict[str, str] | None,
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
**kwargs,
Expand All @@ -686,11 +693,11 @@ def _parse(
* If parser is not lxml or etree.
"""

lxml = import_optional_dependency("lxml.etree", errors="ignore")

p: _EtreeFrameParser | _LxmlFrameParser

if parser == "lxml":
lxml = import_optional_dependency("lxml.etree", errors="ignore")

if lxml is not None:
p = _LxmlFrameParser(
path_or_buffer,
Expand Down Expand Up @@ -728,19 +735,23 @@ def _parse(
return _data_to_frame(data=data_dicts, **kwargs)


@deprecate_nonkeyword_arguments(
version=None, allowed_args=["path_or_buffer"], stacklevel=2
)
@doc(
storage_options=_shared_docs["storage_options"],
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str | None = "./*",
namespaces: dict | list[dict] | None = None,
elems_only: bool | None = False,
attrs_only: bool | None = False,
names: list[str] | None = None,
xpath: str = "./*",
namespaces: dict[str, str] | None = None,
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
# encoding can not be None for lxml and StringIO input
encoding: str | None = "utf-8",
parser: str | None = "lxml",
parser: XMLParsers = "lxml",
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath):
tm.assert_frame_equal(df_lxml, df_etree)


@td.skip_if_no("lxml")
def test_wrong_encoding_for_lxml():
# GH#45133
data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
with pytest.raises(TypeError, match="encoding None"):
read_xml(StringIO(data), parser="lxml", encoding=None)


def test_none_encoding_etree():
# GH#45133
data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
result = read_xml(StringIO(data), parser="etree", encoding=None)
expected = DataFrame({"a": ["c"]})
tm.assert_frame_equal(result, expected)


# PARSER


Expand Down Expand Up @@ -769,6 +795,19 @@ def test_stylesheet_file(datapath):
tm.assert_frame_equal(df_kml, df_style)


def test_read_xml_passing_as_positional_deprecated(datapath, parser):
# GH#45133
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
read_xml(
kml,
".//k:Placemark",
namespaces={"k": "http://www.opengis.net/kml/2.2"},
parser=parser,
)


@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
Expand Down

0 comments on commit a2aa477

Please sign in to comment.