Skip to content

Commit

Permalink
BUG: round_trip parser initial/trailing whitespace (pandas-dev#43714)
Browse files Browse the repository at this point in the history
  • Loading branch information
ales-erjavec authored Oct 2, 2021
1 parent 4f9b3ea commit bd94bb1
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 3 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,8 @@ I/O
- Bug in unpickling an :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raised an uncontrolled ``IndexError`` (:issue:`43102`)
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
-

Period
^^^^^^
Expand Down
28 changes: 25 additions & 3 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1784,6 +1784,8 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
size_t length = strlen(s);
char *s_copy = malloc(length + 1);
char *dst = s_copy;
// Skip leading whitespace.
while (isspace_ascii(*p)) p++;
// Copy Leading sign
if (*p == '+' || *p == '-') {
*dst++ = *p++;
Expand All @@ -1798,10 +1800,25 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
*dst++ = '.';
p++;
}
// Copy the remainder of the string as is.
strncpy(dst, p, length + 1 - (p - s));
// Copy fractional part after decimal (if any)
while (isdigit_ascii(*p)) {
*dst++ = *p++;
}
// Copy exponent if any
if (toupper_ascii(*p) == toupper_ascii('E')) {
*dst++ = *p++;
// Copy leading exponent sign (if any)
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Copy exponent digits
while (isdigit_ascii(*p)) {
*dst++ = *p++;
}
}
*dst++ = '\0'; // terminate
if (endpos != NULL)
*endpos = (char *)(s + length);
*endpos = (char *)p;
return s_copy;
}

Expand Down Expand Up @@ -1839,6 +1856,11 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,

PyGILState_Release(gstate);
free(pc);
if (skip_trailing && q != NULL && *q != p) {
while (isspace_ascii(**q)) {
(*q)++;
}
}
return r;
}

Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ def test_1000_sep_decimal_float_precision(
# test decimal and thousands separator handling across the
# 'float_precision' parsers
decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision)
text, value = numeric_decimal
text = " " + text + " "
if isinstance(value, str): # the negative cases (parse as text)
value = " " + value + " "
decimal_number_check(c_parser_only, (text, value), thousands, float_precision)


def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
Expand All @@ -222,6 +227,24 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision):
assert val == numeric_decimal[1]


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
    # Numeric fields padded with blanks before and/or after the value should
    # parse to the same floats under every float_precision setting.
    raw = (
        "id\tnum\t\n"
        "1\t1.2 \t\n"
        "1\t 2.1\t\n"
        "2\t 1\t\n"
        "2\t 1.2 \t\n"
    )
    parsed = c_parser_only.read_csv(
        StringIO(raw),
        float_precision=float_precision,
        sep="\t",
        header=0,
        dtype={1: np.float64},
    )
    expected = pd.Series([1.2, 2.1, 1.0, 1.2], name="num")
    tm.assert_series_equal(parsed.iloc[:, 1], expected)


def test_true_values_cast_to_bool(all_parsers):
# GH#34655
text = """a,b
Expand Down

0 comments on commit bd94bb1

Please sign in to comment.