From eeff2b0c3bbaab646bfe019810931975fff7cdab Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Mon, 27 Dec 2021 18:16:00 +0100 Subject: [PATCH] Fix issue #36271 - pd.read_json() fails for strings that look similar to fsspec_url (#44619) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/common.py | 4 +++- pandas/tests/io/json/test_pandas.py | 15 +++++++++++++++ pandas/tests/io/test_common.py | 5 +++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 58f3fee1805d7..c069c1ce2413c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -785,6 +785,7 @@ I/O - Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`) - Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) +- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse JSON strings containing "://" (:issue:`36271`) - Period diff --git a/pandas/io/common.py b/pandas/io/common.py index eddcd06a4d645..e54230c06d9b3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ import mmap import os from pathlib import Path +import re from typing import ( IO, Any, @@ -59,6 +60,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") +_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -244,7 +246,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and "://" in url + and bool(_RFC_3986_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1cfda5c096fba..0b8548f98b03b 
100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1527,6 +1527,21 @@ def test_read_timezone_information(self): expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "url", + [ + "s3://example-fsspec/", + "gcs://another-fsspec/file.json", + "https://example-site.com/data", + "some-protocol://data.txt", + ], + ) + def test_read_json_with_url_value(self, url): + # GH 36271 + result = read_json(f'{{"url":{{"0":"{url}"}}}}') + expected = DataFrame({"url": [url]}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f718a52a8a96b..43a5a33a0fdd4 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -494,6 +494,11 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + # fsspec URL in string should not be recognized + assert not icom.is_fsspec_url("this is not fsspec://url") + assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}") + # accept everything that conforms to RFC 3986 scheme + assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") @pytest.mark.parametrize("encoding", [None, "utf-8"])