From 09fdc258baf244b74bb72fd585a187b0450a29f1 Mon Sep 17 00:00:00 2001 From: cooki35 <33752572+cooki35@users.noreply.github.com> Date: Tue, 29 Nov 2022 14:39:46 +0100 Subject: [PATCH 1/4] Add support for CSV like input and do some minor refactoring. Add the input format InputFormat.CSV in parse_lines. In parse_line call the function parse_line_csv if the input format is InputFormat.CSV. The parse_line_csv function extracts the instruction from the line and generates an Instruction instance. Create an abstract base class TargetInfo for the X86 and ARM target infos. Make the class InputFormat inherit from Enum. Move the classes to the top of the file. Add some comments to document the code. --- src/asm2cfg/asm2cfg.py | 302 +++++++++++++++++++++++++---------------- 1 file changed, 188 insertions(+), 114 deletions(-) diff --git a/src/asm2cfg/asm2cfg.py b/src/asm2cfg/asm2cfg.py index 2c426ae..87db83e 100644 --- a/src/asm2cfg/asm2cfg.py +++ b/src/asm2cfg/asm2cfg.py @@ -2,29 +2,70 @@ Module containing main building blocks to parse assembly and draw CFGs. """ +from abc import ABC, abstractmethod import re import sys import tempfile - +from enum import Enum from graphviz import Digraph # TODO: make this a command-line flag VERBOSE = 0 +# Common regexes +HEX_PATTERN = r'[0-9a-fA-F]+' +HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN -def escape(instruction): + +class InputFormat(Enum): """ - Escape used dot graph characters in given instruction so they will be - displayed correctly. 
+ An enum which represents various supported input formats """ - instruction = instruction.replace('<', r'\<') - instruction = instruction.replace('>', r'\>') - instruction = instruction.replace('|', r'\|') - instruction = instruction.replace('{', r'\{') - instruction = instruction.replace('}', r'\}') - instruction = instruction.replace(' ', ' ') - return instruction + GDB = 'GDB' + OBJDUMP = 'OBJDUMP' + CSV = 'CSV' + + +class JumpTable: + """ + Holds info about branch sources and destinations in asm function. + """ + + def __init__(self, instructions): + # Address where the jump begins and value which address + # to jump to. This also includes calls. + self.abs_sources = {} + self.rel_sources = {} + + # Addresses where jumps end inside the current function. + self.abs_destinations = set() + self.rel_destinations = set() + + # Iterate over the lines and collect jump targets and branching points. + for inst in instructions: + if inst is None or not inst.is_direct_jump(): + continue + + self.abs_sources[inst.address.abs] = inst.target + self.abs_destinations.add(inst.target.abs) + + self.rel_sources[inst.address.offset] = inst.target + self.rel_destinations.add(inst.target.offset) + + def is_destination(self, address): + if address.abs is not None: + return address.abs in self.abs_destinations + if address.offset is not None: + return address.offset in self.rel_destinations + return False + + def get_target(self, address): + if address.abs is not None: + return self.abs_sources.get(address.abs) + if address.offset is not None: + return self.rel_sources.get(address.offset) + return None class BasicBlock: @@ -85,69 +126,11 @@ def __repr__(self): return '\n'.join([i.text for i in self.instructions]) -def print_assembly(basic_blocks): - """ - Debug function to print the assembly. - """ - for basic_block in basic_blocks.values(): - print(basic_block) - - -def read_lines(file_path): - """ Read lines from the file and return then as a list. 
""" - lines = [] - with open(file_path, 'r', encoding='utf8') as asm_file: - lines = asm_file.readlines() - return lines - - -# Common regexes -HEX_PATTERN = r'[0-9a-fA-F]+' -HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN - - -class InputFormat: # pylint: disable=too-few-public-methods - """ - An enum which represents various supported input formats - """ - GDB = 'GDB' - OBJDUMP = 'OBJDUMP' - - -def parse_function_header(line): - """ - Return function name of memory range from the given string line. - - Match lines for non-stripped binaries: - 'Dump of assembler code for function test_function:' - lines for stripped binaries: - 'Dump of assembler code from 0x555555555faf to 0x555555557008:' - and lines for obdjdump disassembly: - '0000000000016bb0 <_obstack_allocated_p@@Base>:' - """ - - objdump_name_pattern = re.compile(fr'{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:') - function_name = objdump_name_pattern.search(line) - if function_name is not None: - return InputFormat.OBJDUMP, function_name[1] - - function_name_pattern = re.compile(r'function (\w+):$') - function_name = function_name_pattern.search(line) - if function_name is not None: - return InputFormat.GDB, function_name[1] - - memory_range_pattern = re.compile(fr'(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$') - memory_range = memory_range_pattern.search(line) - if memory_range is not None: - return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}' - - return None, None - - class Address: """ Represents location in program which may be absolute or relative """ + def __init__(self, abs_addr, base=None, offset=None): self.abs = abs_addr self.base = base @@ -182,6 +165,7 @@ class Encoding: e.g. 
the '31 c0' in '16bd3: 31 c0 xor %eax,%eax' """ + def __init__(self, bites): self.bites = bites @@ -192,7 +176,46 @@ def __str__(self): return ' '.join(map(lambda b: f'{b:#x}', self.bites)) -class X86TargetInfo: +class TargetInfo(ABC): + """ + Abstract class, contains instruction info for the targets. + """ + + def __init__(self): + pass + + @abstractmethod + def comment(self): + """ + Returns the comment symbol for the target. + """ + + @abstractmethod + def is_call(self, instruction): + """ + Returns True if the instruction is of type call. + """ + + @abstractmethod + def is_jump(self, instruction): + """ + Returns True if the instruction is of type jump. + """ + + @abstractmethod + def is_unconditional_jump(self, instruction): + """ + Returns True if the instruction is an unconditional jump. + """ + + @abstractmethod + def is_sink(self, instruction): + """ + Is this an instruction which terminates function execution e.g. return? + """ + + +class X86TargetInfo(TargetInfo): """ Contains instruction info for X86-compatible targets. """ @@ -223,7 +246,7 @@ def is_sink(self, instruction): return instruction.opcode.startswith('ret') -class ARMTargetInfo: +class ARMTargetInfo(TargetInfo): """ Contains instruction info for ARM-compatible targets. """ @@ -266,6 +289,7 @@ class Instruction: Represents a single assembly instruction with it operands, location and optional branch target """ + def __init__(self, body, text, lineno, address, opcode, ops, target, imm, target_info): # noqa self.body = body self.text = text @@ -303,6 +327,69 @@ def __str__(self): return result +def escape(instruction): + """ + Escape used dot graph characters in given instruction so they will be + displayed correctly. 
+ """ + instruction = instruction.replace('<', r'\<') + instruction = instruction.replace('>', r'\>') + instruction = instruction.replace('|', r'\|') + instruction = instruction.replace('{', r'\{') + instruction = instruction.replace('}', r'\}') + instruction = instruction.replace(' ', ' ') + return instruction + + +def print_assembly(basic_blocks): + """ + Debug function to print the assembly. + """ + for basic_block in basic_blocks.values(): + print(basic_block) + + +def read_lines(file_path): + """ Read lines from the file and return then as a list. """ + lines = [] + with open(file_path, 'r', encoding='utf8') as asm_file: + lines = asm_file.readlines() + return lines + + +def parse_function_header(line): + """ + Return function name of memory range from the given string line. + + Match lines for non-stripped binaries: + 'Dump of assembler code for function test_function:' + lines for stripped binaries: + 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + and lines for obdjdump disassembly: + '0000000000016bb0 <_obstack_allocated_p@@Base>:' + """ + + objdump_name_pattern = re.compile(fr'{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:') + function_name = objdump_name_pattern.search(line) + if function_name is not None: + return InputFormat.OBJDUMP, function_name[1] + + function_name_pattern = re.compile(r'function (\w+):$') + function_name = function_name_pattern.search(line) + if function_name is not None: + return InputFormat.GDB, function_name[1] + + memory_range_pattern = re.compile(fr'(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$') + memory_range = memory_range_pattern.search(line) + if memory_range is not None: + return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}' + + if line.strip() == 'address;bytes;operator;operand': + return InputFormat.CSV, None + + return None, None + + def parse_address(line): """ Parses leading address of instruction @@ -397,10 +484,30 @@ def parse_comment(line, target_info): return target, 
imm_match[3] +def parse_line_csv(line: str, lineno, target_info): + """ + Parse a single line of assembly to create an Instruction instance. + """ + original_line = line + elements: list[str] = line.split(";") + addr: Address = Address(int(elements[0])) + operands: str = elements[3] + target: Address | None = None + match = re.match(r"^[\d]+$", operands) + if match: + target = Address(int(operands)) + txt = original_line.strip() + return Instruction(body=None, text=txt, lineno=lineno, address=addr, opcode=elements[2], ops=operands, target=target, imm=None, target_info=target_info) + + def parse_line(line, lineno, function_name, fmt, target_info): """ Parses a single line of assembly to create Instruction instance """ + original_line = line + + if fmt == InputFormat.CSV: + return parse_line_csv(line, lineno, target_info) # Strip GDB prefix and leading whites if line.startswith('=> '): @@ -417,7 +524,6 @@ def parse_line(line, lineno, function_name, fmt, target_info): if not line: return encoding - original_line = line body, opcode, ops, line = parse_body(line, target_info) if opcode is None: return None @@ -438,47 +544,6 @@ def parse_line(line, lineno, function_name, fmt, target_info): return Instruction(body, original_line.strip(), lineno, address, opcode, ops, target, imm, target_info) -class JumpTable: - """ - Holds info about branch sources and destinations in asm function. - """ - - def __init__(self, instructions): - # Address where the jump begins and value which address - # to jump to. This also includes calls. - self.abs_sources = {} - self.rel_sources = {} - - # Addresses where jumps end inside the current function. - self.abs_destinations = set() - self.rel_destinations = set() - - # Iterate over the lines and collect jump targets and branching points. 
- for inst in instructions: - if inst is None or not inst.is_direct_jump(): - continue - - self.abs_sources[inst.address.abs] = inst.target - self.abs_destinations.add(inst.target.abs) - - self.rel_sources[inst.address.offset] = inst.target - self.rel_destinations.add(inst.target.offset) - - def is_destination(self, address): - if address.abs is not None: - return address.abs in self.abs_destinations - if address.offset is not None: - return address.offset in self.rel_destinations - return False - - def get_target(self, address): - if address.abs is not None: - return self.abs_sources.get(address.abs) - if address.offset is not None: - return self.rel_sources.get(address.offset) - return None - - def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused-argument if target_name == 'x86': target_info = X86TargetInfo() @@ -492,6 +557,8 @@ def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused- current_function_name = current_format = None for num, line in enumerate(lines, 1): fmt, function_name = parse_function_header(line) + if fmt == InputFormat.CSV: + function_name = "CSV" if function_name is not None: assert current_function_name is None, 'we handle only one function for now' if VERBOSE: @@ -501,6 +568,7 @@ def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused- continue instruction_or_encoding = parse_line(line, num, current_function_name, current_format, target_info) + # print(instruction_or_encoding) if isinstance(instruction_or_encoding, Encoding): # Partial encoding for previous instruction, skip it continue @@ -508,12 +576,18 @@ def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused- instructions.append(instruction_or_encoding) continue + # Ignore the last line of gdb informing about the end of the dump if line.startswith('End of assembler dump') or not line: continue + # Ignore empty lines if line.strip() == '': continue + # Ignore the header of the CSV file + if 
line.strip() == 'address;bytes;operator;operand': + continue + print(f'Unexpected assembly at line {num}:\n {line}') sys.exit(1) From fe4aad5616e8e514c8ef718b60e599df40dd6d98 Mon Sep 17 00:00:00 2001 From: cooki35 <33752572+cooki35@users.noreply.github.com> Date: Tue, 29 Nov 2022 15:36:45 +0100 Subject: [PATCH 2/4] Fix some linting problems. --- src/asm2cfg/asm2cfg.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/asm2cfg/asm2cfg.py b/src/asm2cfg/asm2cfg.py index 87db83e..fbcae1e 100644 --- a/src/asm2cfg/asm2cfg.py +++ b/src/asm2cfg/asm2cfg.py @@ -22,6 +22,7 @@ class InputFormat(Enum): """ An enum which represents various supported input formats """ + GDB = 'GDB' OBJDUMP = 'OBJDUMP' CSV = 'CSV' @@ -489,15 +490,25 @@ def parse_line_csv(line: str, lineno, target_info): Parse a single line of assembly to create an Instruction instance. """ original_line = line - elements: list[str] = line.split(";") + elements: list[str] = line.split(';') addr: Address = Address(int(elements[0])) operands: str = elements[3] target: Address | None = None - match = re.match(r"^[\d]+$", operands) + match = re.match(r'^[\d]+$', operands) if match: target = Address(int(operands)) txt = original_line.strip() - return Instruction(body=None, text=txt, lineno=lineno, address=addr, opcode=elements[2], ops=operands, target=target, imm=None, target_info=target_info) + return Instruction( + body=None, + text=txt, + lineno=lineno, + address=addr, + opcode=elements[2], + ops=operands, + target=target, + imm=None, + target_info=target_info, + ) def parse_line(line, lineno, function_name, fmt, target_info): @@ -558,7 +569,7 @@ def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused- for num, line in enumerate(lines, 1): fmt, function_name = parse_function_header(line) if fmt == InputFormat.CSV: - function_name = "CSV" + function_name = 'CSV' if function_name is not None: assert current_function_name is None, 'we handle only one 
function for now' if VERBOSE: From 3186bcb54795f9953eddcfbc72f79798e82bd293 Mon Sep 17 00:00:00 2001 From: cooki35 <33752572+cooki35@users.noreply.github.com> Date: Tue, 29 Nov 2022 15:39:20 +0100 Subject: [PATCH 3/4] Update the README. Add that asm2cfg also supports CSV input. Rewrite some text. --- README.md | 234 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 120 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index c52e514..13bb21d 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,20 @@ [![Total alerts](https://img.shields.io/lgtm/alerts/g/Kazhuu/asm2cfg.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Kazhuu/asm2cfg/alerts/) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Kazhuu/asm2cfg.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Kazhuu/asm2cfg/context:python) -Python command-line tool and GDB extension to view and save x86, ARM and objdump -assembly files as control-flow graph (CFG) pdf files. From GDB debugging session -use `viewcfg` command to view CFG and use `savecfg` command to save it to the -pdf file. +Asm2cfg is a python command-line tool and GDB extension to view and save x86 and +ARM assembly files from GDB, objdump or CSV files as control-flow graph (CFG) +pdf files. +From a GDB debugging session use the `viewcfg` command to view CFG and use +the `savecfg` command to save it to a pdf file.
-Program has been developed to support X86, ARM and objdump assembly outputs. -Program is mostly tested with x86 assembly. ARM and objdump formats might not be -fully supported. If you have any suggestions or find bugs, please open an issue -or create a pull request. If you want to contribute, check +Asm2cfg has been developed to support X86, ARM, objdump, GDB and CSV assembly +outputs. The program is mostly tested with x86 assembly. ARM, objdump and CSV +formats might not be fully supported. If you have any suggestions or find bugs, +please open an issue or create a pull request. If you want to contribute, check [Development](#development) how to get started. ## Table of Content @@ -26,10 +27,10 @@ or create a pull request. If you want to contribute, check * [Install](#install) * [Usage From GDB](#usage-from-gdb) -* [Usage as Standalone](#usage-as-standalone) - * [Knowing Function Name](#knowing-function-name) +* [Standalone Usage](#standalone-usage) + * [Get Function Names](#get-function-names) * [Disassemble Function](#disassemble-function) - * [Draw CFG](#draw-cfg) + * [Draw CFG](#draw-cfgs) * [Examples](#examples) * [Development](#development) * [Python Environment](#python-environment) @@ -43,102 +44,107 @@ or create a pull request. If you want to contribute, check ## Install -Project can be installed with pip +The project can be installed using pip: ``` pip install asm2cfg ``` -To be able to view the dot files from GDB. External dot viewer is required. For -this purpose [xdot](https://pypi.org/project/xdot/) can be used for example. Any -other dot viewer will also do. To install this on Debian based distro run +To be able to view the dot files from GDB an external dot viewer is required. +For this purpose use e.g., [xdot](https://pypi.org/project/xdot/), but any +other dot viewer will also do. 
+To install xdot on Debian-based distros run: ``` sudo apt install xdot ``` -Or Arch based +On Arch-based systems run: ``` sudo pacman -S xdot ``` -To add extension to GDB you need to source the pip installed plugin to it. To -find where pip placed GDB extension run `which gdb_asm2cfg` or in case if you +To add the extension to GDB you need to source the pip-installed plugin. To +find where pip placed the GDB extension run `which gdb_asm2cfg` or if you use pyenv use `pyenv which gdb_asm2cfg`. Copy the path to the clipboard. -Then in you home directory if not already add `.gdbinit` file -and place following line in it and replace path from the earlier step. +Then in your home directory, if not already there, add the `.gdbinit` file +and place the following line in it, replacing the path with the result from the +previous step. ``` source