diff --git a/capa/main.py b/capa/main.py
index 474a435ab..f1ca77609 100644
--- a/capa/main.py
+++ b/capa/main.py
@@ -446,7 +446,17 @@ def main(argv=None):
     parser = argparse.ArgumentParser(
         description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter
     )
-    parser.add_argument("sample", type=str, help="path to sample to analyze")
+    parser.add_argument(
+        # in #328 we noticed that the sample path is not handled correctly if it contains non-ASCII characters
+        # https://stackoverflow.com/a/22947334/ offers a solution and decoding using getfilesystemencoding works
+        # in our testing, however other sources suggest `sys.stdin.encoding` (https://stackoverflow.com/q/4012571/)
+        #
+        # only byte strings are decoded: py3 arguments (and py2 `unicode` instances) are already text and
+        # calling `.decode` on them fails with an AttributeError (py3) or an implicit ASCII encode error (py2)
+        "sample",
+        type=lambda s: s.decode(sys.getfilesystemencoding()) if isinstance(s, bytes) else s,
+        help="path to sample to analyze",
+    )
     parser.add_argument("--version", action="version", version="%(prog)s {:s}".format(capa.version.__version__))
     parser.add_argument(
         "-r",
@@ -493,7 +503,15 @@ def main(argv=None):
     try:
         taste = get_file_taste(args.sample)
     except IOError as e:
-        logger.error("%s", str(e))
+        # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we
+        # handle the IOError separately and reach into the args
+        #
+        # errors raised by the OS are constructed as (errno, strerror[, filename]), so args[0] would only be
+        # the error number; report the description and path for those, and the message itself otherwise
+        if e.errno is not None:
+            logger.error("%s: %s", e.strerror, e.filename)
+        else:
+            logger.error("%s", e.args[0])
         return -1
 
     # py2 doesn't know about cp65001, which is a variant of utf-8 on windows
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 1a1c28665..3ff40f6c9 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -146,6 +147,8 @@ def get_data_path_by_name(name):
         return os.path.join(CD, "data", "64d9f7d96b99467f36e22fada623c3bb.dll_")
     elif name.startswith("82bf6"):
         return os.path.join(CD, "data", "82BF6347ACF15E5D883715DC289D8A2B.exe_")
+    elif name.startswith("pingtaest"):
+        return os.path.join(CD, "data", "ping_täst.exe_")
     else:
         raise ValueError("unexpected sample fixture")
 
@@ -542,3 +545,8 @@ def z499c2_extractor():
 @pytest.fixture
 def al_khaser_x86_extractor():
     return get_extractor(get_data_path_by_name("al-khaser x86"))
+
+
+@pytest.fixture
+def pingtaest_extractor():
+    return get_extractor(get_data_path_by_name("pingtaest"))
diff --git a/tests/test_main.py b/tests/test_main.py
index 7eb84883c..6ceae34a7 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -57,6 +58,30 @@ def test_main_single_rule(z9324d_extractor, tmpdir):
     )
 
 
+@pytest.mark.xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2")
+def test_main_non_ascii_filename(pingtaest_extractor, tmpdir, capsys):
+    # on py2.7, we need to be careful about str (which can hold bytes)
+    # vs unicode (which holds only unicode characters).
+    # on py3, this should not be needed.
+    #
+    # here we pass main a sample path with non-ASCII characters in it
+    # (specifically, a byte string with utf-8 bytes in it, see the file encoding)
+    assert capa.main.main(["-q", pingtaest_extractor.path]) == 0
+
+    std = capsys.readouterr()
+    # but here we have to compare against a unicode instance,
+    # because capsys has already decoded the captured output for us.
+    assert pingtaest_extractor.path.decode("utf-8") in std.out
+
+
+@pytest.mark.xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2")
+def test_main_non_ascii_filename_nonexistent(tmpdir, caplog):
+    NON_ASCII_FILENAME = "täst_not_there.exe"
+    assert capa.main.main(["-q", NON_ASCII_FILENAME]) == -1
+
+    assert NON_ASCII_FILENAME.decode("utf-8") in caplog.text
+
+
 @pytest.mark.xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2")
 def test_main_shellcode(z499c2_extractor):
     path = z499c2_extractor.path