From 698838270c2f78819b1acb791c8a7304cdb5f1b2 Mon Sep 17 00:00:00 2001 From: Henrique Lorenzi Date: Sat, 18 Mar 2017 18:08:45 -0300 Subject: [PATCH] add string literal directives --- Cargo.toml | 2 +- doc/src.md | 43 +++++++++++++++ src/assembler.rs | 52 ++++++++++++++++++ src/tests/full.rs | 125 ++++++++++++++++++++++++++++++++++++++++++ src/util/bigint.rs | 11 +++- src/util/tokenizer.rs | 57 +++++++++++++++++-- 6 files changed, 282 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 085aad84..7ee9d193 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "customasm" -version = "0.2.0" +version = "0.3.0" authors = ["Henrique Lorenzi "] [lib] diff --git a/doc/src.md b/doc/src.md index eb84efcd..4abbfd9e 100644 --- a/doc/src.md +++ b/doc/src.md @@ -266,6 +266,49 @@ lda 0x77 Note that the `#d32` directive's arguments, `0x1234, 0x5678`, were extended with zeroes to match the directive's bit-size. +### String Directive + +This directive copies the UTF-8 representation of a string to +the output. The representation is extended with zeroes at the end +until it matches the machine's alignment. Escape sequences and +Unicode characters are available. For example: + +``` +#str "abcd" +#str "\n\r\0" +#str "\x12\x34" +#str "木" +``` + +...would be assembled into: + +``` +0x61 0x62 0x63 0x64 +0x0a 0x0d 0x00 +0x12 0x34 +0xe6 0x9c 0xa8 +``` + +### String with Length Directive + +Works like the previous directive, but prepends the output with +the string length in bytes, expressed in the given number of bits. +For example: + +``` +#strl 8, "abcd" +#strl 16, "abcd" +#strl 32, "abcd" +``` + +...would be assembled into: + +``` +0x04 0x61 0x62 0x63 0x64 +0x00 0x04 0x61 0x62 0x63 0x64 +0x00 0x00 0x00 0x04 0x61 0x62 0x63 0x64 +``` + ### Reserve Directive This directive advances the instruction *and* output addresses by diff --git a/src/assembler.rs b/src/assembler.rs index 4e2499a4..cf6eaefb 100644 --- a/src/assembler.rs +++ b/src/assembler.rs @@ -206,6 +206,12 @@ impl<'def> Assembler<'def> self.advance_address(bits); } + "str" => + return self.parse_str_directive(parser), + + "strl" => + return self.parse_strl_directive(parser), + "include" => { let (new_path, span) = try!(self.parse_relative_filename(parser, cur_path)); @@ -296,6 +302,52 @@ impl<'def> Assembler<'def> } + fn parse_str_directive(&mut self, parser: &mut Parser) -> Result<(), Error> + { + let (string, _) = try!(parser.expect_string()); + let mut bitvec = BitVec::new(); + + for c in string.bytes() + { bitvec.push(8, &BigInt::from_u8(c)); } + + while bitvec.len() % self.def.align_bits != 0 + { bitvec.push_bit(false); } + + self.output_bitvec(&bitvec); + try!(parser.expect_linebreak_or_end()); + Ok(()) + } + + + fn parse_strl_directive(&mut self, parser: &mut Parser) -> Result<(), Error> + { + let size_span = parser.current().span.clone(); + let size = try!(self.parse_integer(parser)); + if size % self.def.align_bits != 0 + { return Err(Error::new_with_span(format!("string length is not aligned"), size_span)); } + + try!(parser.expect_operator(",")); + + let (string, _) = try!(parser.expect_string()); + let mut bitvec = BitVec::new(); + + for c in string.bytes() + { bitvec.push(8, &BigInt::from_u8(c)); } + + while bitvec.len() % self.def.align_bits != 0 + { bitvec.push_bit(false); } + + let strlen = BigInt::from_usize(string.len()); + if strlen.width() > size + { return Err(Error::new_with_span(format!("string length (`{}`) does not fit given width", string.len()), size_span)); } + self.output_integer(size, &strlen); + + self.output_bitvec(&bitvec); + try!(parser.expect_linebreak_or_end()); + Ok(()) + } + + fn parse_global_constant(&mut self, parser: &mut Parser) -> Result<(), Error> { let (label, label_span) = try!(parser.expect_identifier()); diff --git a/src/tests/full.rs b/src/tests/full.rs index 98be3656..7d20ee94 100644 --- a/src/tests/full.rs +++ b/src/tests/full.rs @@ -416,6 +416,131 @@ fn test_data_directive_with_variables() } +#[test] +fn test_str_directive_simple() +{ + pass("#align 1", "#str \"abcd\"", 16, "61626364"); + pass("#align 2", "#str \"abcd\"", 16, "61626364"); + pass("#align 4", "#str \"abcd\"", 16, "61626364"); + pass("#align 8", "#str \"abcd\"", 16, "61626364"); + pass("#align 16", "#str \"abcd\"", 16, "61626364"); + pass("#align 32", "#str \"abcd\"", 16, "61626364"); + pass("#align 64", "#str \"abcd\"", 16, "6162636400000000"); + + pass("#align 1", "#str \"hello\"", 16, "68656c6c6f"); + pass("#align 2", "#str \"hello\"", 16, "68656c6c6f"); + pass("#align 4", "#str \"hello\"", 16, "68656c6c6f"); + pass("#align 8", "#str \"hello\"", 16, "68656c6c6f"); + pass("#align 16", "#str \"hello\"", 16, "68656c6c6f00"); + pass("#align 32", "#str \"hello\"", 16, "68656c6c6f000000"); + pass("#align 64", "#str \"hello\"", 16, "68656c6c6f000000"); + + pass("#align 3", "#str \"abcd\"", 2, "011000010110001001100011011001000"); + pass("#align 5", "#str \"abcd\"", 2, "01100001011000100110001101100100000"); + pass("#align 7", "#str \"abcd\"", 2, "01100001011000100110001101100100000"); + pass("#align 9", "#str \"abcd\"", 2, "011000010110001001100011011001000000"); +} + + +#[test] +fn test_str_directive_utf8() +{ + pass("#align 1", "#str \"木\"", 16, "e69ca8"); + pass("#align 2", "#str \"木\"", 16, "e69ca8"); + pass("#align 4", "#str \"木\"", 16, "e69ca8"); + pass("#align 8", "#str \"木\"", 16, "e69ca8"); + pass("#align 16", "#str \"木\"", 16, "e69ca800"); + pass("#align 32", "#str \"木\"", 16, "e69ca800"); + pass("#align 64", "#str \"木\"", 16, "e69ca80000000000"); + + pass("#align 1", "#str \"ab木cd\"", 16, "6162e69ca86364"); + pass("#align 2", "#str \"ab木cd\"", 16, "6162e69ca86364"); + pass("#align 4", "#str \"ab木cd\"", 16, "6162e69ca86364"); + pass("#align 8", "#str \"ab木cd\"", 16, "6162e69ca86364"); + pass("#align 16", "#str \"ab木cd\"", 16, "6162e69ca8636400"); + pass("#align 32", "#str \"ab木cd\"", 16, "6162e69ca8636400"); + pass("#align 64", "#str \"ab木cd\"", 16, "6162e69ca8636400"); +} + + +#[test] +fn test_str_directive_escape() +{ + pass("#align 8", "#str \"\0\"", 16, "00"); + pass("#align 8", "#str \"\t\"", 16, "09"); + pass("#align 8", "#str \"\n\"", 16, "0a"); + pass("#align 8", "#str \"\r\"", 16, "0d"); + pass("#align 8", "#str \"\\\\\"", 16, "5c"); + pass("#align 8", "#str \"\\\"\"", 16, "22"); + + pass("#align 8", "#str \"\\\"", 16, "5c"); + + pass("#align 8", "#str \"\\x00\"", 16, "00"); + pass("#align 8", "#str \"\\x12\"", 16, "12"); + pass("#align 8", "#str \"\\x7f\"", 16, "7f"); + pass("#align 8", "#str \"\\x80\"", 16, "c280"); + pass("#align 8", "#str \"\\xab\"", 16, "c2ab"); + pass("#align 8", "#str \"\\xAB\"", 16, "c2ab"); + pass("#align 8", "#str \"\\xabcd\"", 16, "c2ab6364"); + pass("#align 8", "#str \"ab\\xcd\"", 16, "6162c38d"); + + pass("#align 8", "#str \"\\x\"", 16, "5c"); + pass("#align 8", "#str \"\\x0\"", 16, "5c30"); + + fail("#align 8", "#str 0", 1, "string"); + fail("#align 8", "#str \"0", 1, "line break"); + fail("#align 8", "#str \"\\", 1, "line break"); +} + + +#[test] +fn test_strl_directive_simple() +{ + pass("#align 8", "#strl 8, \"abcd\"", 16, "0461626364"); + pass("#align 8", "#strl 16, \"abcd\"", 16, "000461626364"); + pass("#align 8", "#strl 24, \"abcd\"", 16, "00000461626364"); + pass("#align 8", "#strl 32, \"abcd\"", 16, "0000000461626364"); + pass("#align 8", "#strl 64, \"abcd\"", 16, "000000000000000461626364"); + pass("#align 16", "#strl 16, \"abcd\"", 16, "000461626364"); + pass("#align 16", "#strl 32, \"abcd\"", 16, "0000000461626364"); + pass("#align 16", "#strl 64, \"abcd\"", 16, "000000000000000461626364"); + pass("#align 32", "#strl 32, \"abcd\"", 16, "0000000461626364"); + pass("#align 32", "#strl 64, \"abcd\"", 16, "000000000000000461626364"); + pass("#align 64", "#strl 64, \"abcd\"", 16, "00000000000000046162636400000000"); + + pass("#align 8", "#strl 8, \"\"", 16, "00"); + pass("#align 8", "#strl 8, \"a\"", 16, "0161"); + pass("#align 8", "#strl 8, \"ab\"", 16, "026162"); + pass("#align 8", "#strl 8, \"abc\"", 16, "03616263"); + pass("#align 8", "#strl 8, \"abcde\"", 16, "056162636465"); + pass("#align 8", "#strl 8, \"abcdef\"", 16, "06616263646566"); + pass("#align 8", "#strl 8, \"abcdefg\"", 16, "0761626364656667"); + + fail("#align 8", "#strl \"abcd\"", 1, "expected"); + fail("#align 8", "#strl 8 \"abcd\"", 1, "expected"); + + pass("#align 4", "#strl 4, \"0123456789abcde\"", 16, "f303132333435363738396162636465"); + fail("#align 4", "#strl 4, \"0123456789abcdef\"", 1, "does not fit"); + + fail("#align 8", "#strl 1, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 2, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 3, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 4, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 5, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 6, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 7, \"abcd\"", 1, "string length"); + fail("#align 8", "#strl 9, \"abcd\"", 1, "string length"); +} + + +#[test] +fn test_strl_directive_utf8() +{ + pass("#align 8", "#strl 8, \"木\"", 16, "03e69ca8"); + pass("#align 8", "#strl 8, \"ab木cd\"", 16, "076162e69ca86364"); +} + + #[test] fn test_reserve_directive() { diff --git a/src/util/bigint.rs b/src/util/bigint.rs index 371e4379..e9dd22ed 100644 --- a/src/util/bigint.rs +++ b/src/util/bigint.rs @@ -7,7 +7,7 @@ pub struct BigInt impl BigInt -{ +{ pub fn from_i64(value: i64) -> BigInt { BigInt @@ -17,6 +17,15 @@ impl BigInt } + pub fn from_u8(value: u8) -> BigInt + { + BigInt + { + value: value as i64 + } + } + + pub fn from_usize(value: usize) -> BigInt { BigInt diff --git a/src/util/tokenizer.rs b/src/util/tokenizer.rs index 42cb9e42..0bfbf2ce 100644 --- a/src/util/tokenizer.rs +++ b/src/util/tokenizer.rs @@ -378,14 +378,59 @@ fn try_read_string(file: &Rc, src: &[char], index: &mut CharIndex) -> Op index.advance(); let mut s = String::new(); - while index.linear < src.len() && src[index.linear] != '\"' // " + while index.linear + 1 < src.len() && src[index.linear] != '\"' // " { - s.push(src[index.linear]); - - if src[index.linear] == '\n' - { index.advance_line(); } + // Parse escape sequences. + if src[index.linear] == '\\' && index.linear + 2 < src.len() + { + index.advance(); + + match src[index.linear] + { + '\\' => { s.push('\\'); index.advance(); } + '\"' => { s.push('\"'); index.advance(); } // " + '0' => { s.push('\0'); index.advance(); } + 't' => { s.push('\t'); index.advance(); } + 'n' => { s.push('\n'); index.advance(); } + 'r' => { s.push('\r'); index.advance(); } + 'x' => + { + index.advance(); + + if index.linear + 2 < src.len() + { + let hex1 = src[index.linear + 0].to_digit(16); + let hex2 = src[index.linear + 1].to_digit(16); + + if hex1.is_some() && hex2.is_some() + { + index.advance(); + index.advance(); + + s.push(((hex1.unwrap() << 4) | hex2.unwrap()) as u8 as char); + } + // FIXME: Should return an error. + else + { s.push('\\'); } + } + // FIXME: Should return an error. + else + { s.push('\\'); } + } + + // FIXME: Should return an error. + _ => { s.push('\\'); } + } + } else - { index.advance(); } + { + s.push(src[index.linear]); + + if src[index.linear] == '\n' + { index.advance_line(); } + else + { index.advance(); } + } } if src[index.linear] == '\"' // "