diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e76b466ff2d87..94aed4aa295e1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -463,6 +463,8 @@ I/O - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) +- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip leading/trailing whitespace (:issue:`43713`) +- Period ^^^^^^ diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49797eea59ddc..6785bf628919a 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1784,6 +1784,8 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; + // Skip leading whitespace. + while (isspace_ascii(*p)) p++; // Copy Leading sign if (*p == '+' || *p == '-') { *dst++ = *p++; @@ -1798,10 +1800,25 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, *dst++ = '.'; p++; } - // Copy the remainder of the string as is. 
- strncpy(dst, p, length + 1 - (p - s)); + // Copy fractional part after decimal (if any) + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + // Copy exponent if any + if (toupper_ascii(*p) == toupper_ascii('E')) { + *dst++ = *p++; + // Copy leading exponent sign (if any) + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy exponent digits + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + } + *dst++ = '\0'; // terminate if (endpos != NULL) - *endpos = (char *)(s + length); + *endpos = (char *)p; return s_copy; } @@ -1839,6 +1856,11 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, PyGILState_Release(gstate); free(pc); + if (skip_trailing && q != NULL && *q != p) { + while (isspace_ascii(**q)) { + (*q)++; + } + } return r; } diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ae5ddb83f7052..726cd64c6dc23 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -204,6 +204,11 @@ def test_1000_sep_decimal_float_precision( # test decimal and thousand sep handling in across 'float_precision' # parsers decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision) + text, value = numeric_decimal + text = " " + text + " " + if isinstance(value, str): # the negative cases (parse as text) + value = " " + value + " " + decimal_number_check(c_parser_only, (text, value), thousands, float_precision) def decimal_number_check(parser, numeric_decimal, thousands, float_precision): @@ -222,6 +227,24 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision): assert val == numeric_decimal[1] +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +def test_skip_whitespace(c_parser_only, float_precision): + DATA = """id\tnum\t +1\t1.2 \t +1\t 2.1\t +2\t 1\t +2\t 1.2 \t +""" + df = c_parser_only.read_csv( + StringIO(DATA), + 
float_precision=float_precision, + sep="\t", + header=0, + dtype={1: np.float64}, + ) + tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num")) + + def test_true_values_cast_to_bool(all_parsers): # GH#34655 text = """a,b