Skip to content

Commit

Permalink
Fix issue pandas-dev#36271 - pd.read_json() fails for strings that lo…
Browse files Browse the repository at this point in the history
…ok similar to fsspec_url (pandas-dev#44619)
  • Loading branch information
mntss authored Dec 27, 2021
1 parent aff0694 commit eeff2b0
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,7 @@ I/O
- Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`)
- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse JSON strings containing "://" (:issue:`36271`)
-

Period
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import mmap
import os
from pathlib import Path
import re
from typing import (
IO,
Any,
Expand Down Expand Up @@ -59,6 +60,7 @@

# URL schemes known to urllib.parse; the empty-string placeholder those
# lists contain is not a real scheme, so drop it.
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# Matches a string that *starts* with an RFC 3986 scheme followed by "://".
# RFC 3986 section 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
# Anchoring at the start keeps text that merely contains "://" somewhere
# (e.g. a JSON document with a URL value) from being treated as a URL.
_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-.]*://")

BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)

Expand Down Expand Up @@ -244,7 +246,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
"""
return (
isinstance(url, str)
and "://" in url
and bool(_RFC_3986_PATTERN.match(url))
and not url.startswith(("http://", "https://"))
)

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,21 @@ def test_read_timezone_information(self):
expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC"))
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "url",
    [
        "s3://example-fsspec/",
        "gcs://another-fsspec/file.json",
        "https://example-site.com/data",
        "some-protocol://data.txt",
    ],
)
def test_read_json_with_url_value(self, url):
    """Check that a JSON literal holding a URL-like value round-trips as data."""
    # GH 36271
    # The JSON text itself is passed to read_json; the URL only appears as
    # a cell value inside it.
    json_text = f'{{"url":{{"0":"{url}"}}}}'
    want = DataFrame({"url": [url]})
    got = read_json(json_text)
    tm.assert_frame_equal(got, want)

@pytest.mark.parametrize(
"date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")]
)
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,11 @@ def test_is_fsspec_url():
assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
assert not icom.is_fsspec_url("/local/path")
assert not icom.is_fsspec_url("relative/local/path")
# an fsspec URL embedded inside a longer string should not be recognized
assert not icom.is_fsspec_url("this is not fsspec://url")
assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}")
# accept everything that conforms to the RFC 3986 scheme syntax
assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")


@pytest.mark.parametrize("encoding", [None, "utf-8"])
Expand Down

0 comments on commit eeff2b0

Please sign in to comment.