From 698838270c2f78819b1acb791c8a7304cdb5f1b2 Mon Sep 17 00:00:00 2001
From: Henrique Lorenzi <hlorenzi12@gmail.com>
Date: Sat, 18 Mar 2017 18:08:45 -0300
Subject: [PATCH] add string literal directives

---
 Cargo.toml            |   2 +-
 doc/src.md            |  43 +++++++++++++++
 src/assembler.rs      |  52 ++++++++++++++++++
 src/tests/full.rs     | 125 ++++++++++++++++++++++++++++++++++++++++++
 src/util/bigint.rs    |  11 +++-
 src/util/tokenizer.rs |  57 +++++++++++++++++--
 6 files changed, 282 insertions(+), 8 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 085aad84..7ee9d193 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "customasm"
-version = "0.2.0"
+version = "0.3.0"
 authors = ["Henrique Lorenzi <hlorenzi12@gmail.com>"]
 
 [lib]
diff --git a/doc/src.md b/doc/src.md
index eb84efcd..4abbfd9e 100644
--- a/doc/src.md
+++ b/doc/src.md
@@ -266,6 +266,49 @@ lda 0x77
 Note that the `#d32` directive's arguments, `0x1234, 0x5678`, were
 extended with zeroes to match the directive's bit-size.
 
+### String Directive
+
+This directive copies the UTF-8 encoding of a string to the output,
+padding it with zero bits at the end until its length is a multiple of
+the machine's alignment. The escape sequences `\\`, `\"`, `\0`, `\t`,
+`\n`, `\r` and `\xNN`, as well as Unicode characters, are supported. For example:
+
+```
+#str "abcd"
+#str "\n\r\0"
+#str "\x12\x34"
+#str "木"
+```
+
+...would be assembled into:
+
+```
+0x61 0x62 0x63 0x64
+0x0a 0x0d 0x00
+0x12 0x34
+0xe6 0x9c 0xa8
+```
+
+### String with Length Directive
+
+Works like the previous directive, but prepends the output with the
+string's length in bytes (not counting padding), written using the given
+number of bits, which must be a multiple of the alignment. For example:
+
+```
+#strl 8,  "abcd"
+#strl 16, "abcd"
+#strl 32, "abcd"
+```
+
+...would be assembled into:
+
+```
+0x04 0x61 0x62 0x63 0x64
+0x00 0x04 0x61 0x62 0x63 0x64
+0x00 0x00 0x00 0x04 0x61 0x62 0x63 0x64
+```
+
 ### Reserve Directive
 
 This directive advances the instruction *and* output addresses by
diff --git a/src/assembler.rs b/src/assembler.rs
index 4e2499a4..cf6eaefb 100644
--- a/src/assembler.rs
+++ b/src/assembler.rs
@@ -206,6 +206,12 @@ impl<'def> Assembler<'def>
 				self.advance_address(bits);
 			}
 			
+			"str" =>
+				return self.parse_str_directive(parser),
+			
+			"strl" =>
+				return self.parse_strl_directive(parser),
+			
 			"include" =>
 			{
 				let (new_path, span) = try!(self.parse_relative_filename(parser, cur_path));
@@ -296,6 +302,52 @@ impl<'def> Assembler<'def>
 	}
 
 
+	fn parse_str_directive(&mut self, parser: &mut Parser) -> Result<(), Error>
+	{ // Handles `#str "<text>"`: outputs the literal's UTF-8 bytes, zero-padded to the alignment.
+		let (string, _) = try!(parser.expect_string());
+		let mut bitvec = BitVec::new();
+		// Append every UTF-8 byte of the literal as 8 bits.
+		for c in string.bytes()
+			{ bitvec.push(8, &BigInt::from_u8(c)); }
+		// Pad with zero bits until the length is a multiple of `align_bits`.
+		while bitvec.len() % self.def.align_bits != 0
+			{ bitvec.push_bit(false); }
+		
+		self.output_bitvec(&bitvec);		
+		try!(parser.expect_linebreak_or_end()); // checked only after the data has been output
+		Ok(())
+	}
+
+
+	fn parse_strl_directive(&mut self, parser: &mut Parser) -> Result<(), Error>
+	{ // Handles `#strl <bits>, "<text>"`: like `#str`, but preceded by the byte length in `<bits>` bits.
+		let size_span = parser.current().span.clone();
+		let size = try!(self.parse_integer(parser));
+		if size % self.def.align_bits != 0
+			{ return Err(Error::new_with_span(format!("string length is not aligned"), size_span)); }
+			// Above: the length field's bit width must be a multiple of `align_bits`.
+		try!(parser.expect_operator(","));
+		
+		let (string, _) = try!(parser.expect_string());
+		let mut bitvec = BitVec::new();
+		// Append every UTF-8 byte of the literal as 8 bits.
+		for c in string.bytes()
+			{ bitvec.push(8, &BigInt::from_u8(c)); }
+		// Pad with zero bits until the length is a multiple of `align_bits`.
+		while bitvec.len() % self.def.align_bits != 0
+			{ bitvec.push_bit(false); }
+			// The prefix is the string's byte count (padding excluded) and must fit in `size` bits.
+		let strlen = BigInt::from_usize(string.len());
+		if strlen.width() > size
+			{ return Err(Error::new_with_span(format!("string length (`{}`) does not fit given width", string.len()), size_span)); }
+		self.output_integer(size, &strlen);
+		// The padded string data follows the length prefix.
+		self.output_bitvec(&bitvec);		
+		try!(parser.expect_linebreak_or_end()); // checked only after the data has been output
+		Ok(())
+	}
+
+
 	fn parse_global_constant(&mut self, parser: &mut Parser) -> Result<(), Error>
 	{
 		let (label, label_span) = try!(parser.expect_identifier());
diff --git a/src/tests/full.rs b/src/tests/full.rs
index 98be3656..7d20ee94 100644
--- a/src/tests/full.rs
+++ b/src/tests/full.rs
@@ -416,6 +416,131 @@ fn test_data_directive_with_variables()
 }
 
 
+#[test]
+fn test_str_directive_simple()
+{ // `#str` with ASCII text under several `#align` values; the third argument looks like the output radix (TODO confirm).
+	pass("#align 1",  "#str \"abcd\"", 16, "61626364");
+	pass("#align 2",  "#str \"abcd\"", 16, "61626364");
+	pass("#align 4",  "#str \"abcd\"", 16, "61626364");
+	pass("#align 8",  "#str \"abcd\"", 16, "61626364");
+	pass("#align 16", "#str \"abcd\"", 16, "61626364");
+	pass("#align 32", "#str \"abcd\"", 16, "61626364");
+	pass("#align 64", "#str \"abcd\"", 16, "6162636400000000"); // 32 bits padded up to 64
+	// "hello" is 5 bytes (40 bits), so padding appears from 16-bit alignment upwards.
+	pass("#align 1",  "#str \"hello\"", 16, "68656c6c6f");
+	pass("#align 2",  "#str \"hello\"", 16, "68656c6c6f");
+	pass("#align 4",  "#str \"hello\"", 16, "68656c6c6f");
+	pass("#align 8",  "#str \"hello\"", 16, "68656c6c6f");
+	pass("#align 16", "#str \"hello\"", 16, "68656c6c6f00");
+	pass("#align 32", "#str \"hello\"", 16, "68656c6c6f000000");
+	pass("#align 64", "#str \"hello\"", 16, "68656c6c6f000000");
+	// Alignments that do not divide 8: the 32 data bits are padded to 33, 35, 35 and 36 bits.
+	pass("#align 3", "#str \"abcd\"", 2, "011000010110001001100011011001000");
+	pass("#align 5", "#str \"abcd\"", 2, "01100001011000100110001101100100000");
+	pass("#align 7", "#str \"abcd\"", 2, "01100001011000100110001101100100000");
+	pass("#align 9", "#str \"abcd\"", 2, "011000010110001001100011011001000000");
+}
+
+
+#[test]
+fn test_str_directive_utf8()
+{ // Non-ASCII text is output as its UTF-8 bytes: 木 becomes e6 9c a8 (24 bits).
+	pass("#align 1",  "#str \"木\"", 16, "e69ca8");
+	pass("#align 2",  "#str \"木\"", 16, "e69ca8");
+	pass("#align 4",  "#str \"木\"", 16, "e69ca8");
+	pass("#align 8",  "#str \"木\"", 16, "e69ca8");
+	pass("#align 16", "#str \"木\"", 16, "e69ca800");
+	pass("#align 32", "#str \"木\"", 16, "e69ca800");
+	pass("#align 64", "#str \"木\"", 16, "e69ca80000000000");
+	// Mixed ASCII and multi-byte text: 7 bytes (56 bits) before padding.
+	pass("#align 1",  "#str \"ab木cd\"", 16, "6162e69ca86364");
+	pass("#align 2",  "#str \"ab木cd\"", 16, "6162e69ca86364");
+	pass("#align 4",  "#str \"ab木cd\"", 16, "6162e69ca86364");
+	pass("#align 8",  "#str \"ab木cd\"", 16, "6162e69ca86364");
+	pass("#align 16", "#str \"ab木cd\"", 16, "6162e69ca8636400");
+	pass("#align 32", "#str \"ab木cd\"", 16, "6162e69ca8636400");
+	pass("#align 64", "#str \"ab木cd\"", 16, "6162e69ca8636400");
+}
+
+
+#[test]
+fn test_str_directive_escape()
+{ // The first four inputs hold raw control characters (Rust resolves the escape); later ones pass a backslash to the tokenizer.
+	pass("#align 8", "#str \"\0\"",   16, "00");
+	pass("#align 8", "#str \"\t\"",   16, "09");
+	pass("#align 8", "#str \"\n\"",   16, "0a");
+	pass("#align 8", "#str \"\r\"",   16, "0d");
+	pass("#align 8", "#str \"\\\\\"", 16, "5c");
+	pass("#align 8", "#str \"\\\"\"", 16, "22");
+	// A backslash directly before the final quote of the input is kept as a literal backslash.
+	pass("#align 8", "#str \"\\\"",   16, "5c");
+	// `\xNN` yields code point NN; values 0x80 and above are output as two UTF-8 bytes.
+	pass("#align 8", "#str \"\\x00\"",   16, "00");
+	pass("#align 8", "#str \"\\x12\"",   16, "12");
+	pass("#align 8", "#str \"\\x7f\"",   16, "7f");
+	pass("#align 8", "#str \"\\x80\"",   16, "c280");
+	pass("#align 8", "#str \"\\xab\"",   16, "c2ab");
+	pass("#align 8", "#str \"\\xAB\"",   16, "c2ab");
+	pass("#align 8", "#str \"\\xabcd\"", 16, "c2ab6364");
+	pass("#align 8", "#str \"ab\\xcd\"", 16, "6162c38d");
+	// Incomplete `\x` escapes currently yield a backslash and drop the `x` instead of failing.
+	pass("#align 8", "#str \"\\x\"",  16, "5c");
+	pass("#align 8", "#str \"\\x0\"", 16, "5c30");
+	// Failures: a non-string argument, then string literals missing their closing quote.
+	fail("#align 8", "#str 0",    1, "string");
+	fail("#align 8", "#str \"0",  1, "line break");
+	fail("#align 8", "#str \"\\", 1, "line break");
+}
+
+
+#[test]
+fn test_strl_directive_simple()
+{ // `#strl` writes the string's byte length in the requested number of bits, then the string data.
+	pass("#align 8",  "#strl 8,  \"abcd\"", 16, "0461626364");
+	pass("#align 8",  "#strl 16, \"abcd\"", 16, "000461626364");
+	pass("#align 8",  "#strl 24, \"abcd\"", 16, "00000461626364");
+	pass("#align 8",  "#strl 32, \"abcd\"", 16, "0000000461626364");
+	pass("#align 8",  "#strl 64, \"abcd\"", 16, "000000000000000461626364");
+	pass("#align 16", "#strl 16, \"abcd\"", 16, "000461626364");
+	pass("#align 16", "#strl 32, \"abcd\"", 16, "0000000461626364");
+	pass("#align 16", "#strl 64, \"abcd\"", 16, "000000000000000461626364");
+	pass("#align 32", "#strl 32, \"abcd\"", 16, "0000000461626364");
+	pass("#align 32", "#strl 64, \"abcd\"", 16, "000000000000000461626364");
+	pass("#align 64", "#strl 64, \"abcd\"", 16, "00000000000000046162636400000000");
+	// Length prefix for strings of 0 to 7 bytes.
+	pass("#align 8",  "#strl 8,  \"\"",        16, "00");
+	pass("#align 8",  "#strl 8,  \"a\"",       16, "0161");
+	pass("#align 8",  "#strl 8,  \"ab\"",      16, "026162");
+	pass("#align 8",  "#strl 8,  \"abc\"",     16, "03616263");
+	pass("#align 8",  "#strl 8,  \"abcde\"",   16, "056162636465");
+	pass("#align 8",  "#strl 8,  \"abcdef\"",  16, "06616263646566");
+	pass("#align 8",  "#strl 8,  \"abcdefg\"", 16, "0761626364656667");
+	// A missing length argument or a missing comma is rejected.
+	fail("#align 8",  "#strl \"abcd\"",   1, "expected");
+	fail("#align 8",  "#strl 8 \"abcd\"", 1, "expected");
+	// A 4-bit length field holds at most 15, so a 16-byte string does not fit.
+	pass("#align 4",  "#strl 4, \"0123456789abcde\"",  16, "f303132333435363738396162636465");
+	fail("#align 4",  "#strl 4, \"0123456789abcdef\"", 1,  "does not fit");
+	// The length width must be a multiple of the alignment.
+	fail("#align 8",  "#strl 1, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 2, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 3, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 4, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 5, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 6, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 7, \"abcd\"", 1, "string length");
+	fail("#align 8",  "#strl 9, \"abcd\"", 1, "string length");
+}
+
+
+#[test]
+fn test_strl_directive_utf8()
+{ // The length prefix counts UTF-8 bytes, not characters: 木 alone counts as 3.
+	pass("#align 8", "#strl 8, \"木\"", 16, "03e69ca8");
+	pass("#align 8", "#strl 8, \"ab木cd\"", 16, "076162e69ca86364");
+}
+
+
 #[test]
 fn test_reserve_directive()
 {
diff --git a/src/util/bigint.rs b/src/util/bigint.rs
index 371e4379..e9dd22ed 100644
--- a/src/util/bigint.rs
+++ b/src/util/bigint.rs
@@ -7,7 +7,7 @@ pub struct BigInt
 
 
 impl BigInt
-{
+{	
 	pub fn from_i64(value: i64) -> BigInt
 	{
 		BigInt
@@ -17,6 +17,15 @@ impl BigInt
 	}
 	
 	
+	pub fn from_u8(value: u8) -> BigInt
+	{ // Builds a BigInt from one unsigned byte; the `as i64` cast below is lossless for every u8.
+		BigInt
+		{
+			value: value as i64
+		}
+	}
+	
+	
 	pub fn from_usize(value: usize) -> BigInt
 	{
 		BigInt
diff --git a/src/util/tokenizer.rs b/src/util/tokenizer.rs
index 42cb9e42..0bfbf2ce 100644
--- a/src/util/tokenizer.rs
+++ b/src/util/tokenizer.rs
@@ -378,14 +378,59 @@ fn try_read_string(file: &Rc<String>, src: &[char], index: &mut CharIndex) -> Op
 	index.advance();
 
 	let mut s = String::new();
-	while index.linear < src.len() && src[index.linear] != '\"' // "
+	while index.linear + 1 < src.len() && src[index.linear] != '\"' // "
 	{
-		s.push(src[index.linear]);
-		
-		if src[index.linear] == '\n'
-			{ index.advance_line(); }
+		// Parse escape sequences.
+		if src[index.linear] == '\\' && index.linear + 2 < src.len()
+		{
+			index.advance();
+			
+			match src[index.linear]
+			{
+				'\\' => { s.push('\\'); index.advance(); }
+				'\"' => { s.push('\"'); index.advance(); } // "
+				'0'  => { s.push('\0'); index.advance(); }
+				't'  => { s.push('\t'); index.advance(); }
+				'n'  => { s.push('\n'); index.advance(); }
+				'r'  => { s.push('\r'); index.advance(); }
+				'x'  =>
+				{
+					index.advance();
+					
+					if index.linear + 2 < src.len()
+					{
+						let hex1 = src[index.linear + 0].to_digit(16);
+						let hex2 = src[index.linear + 1].to_digit(16);
+						
+						if hex1.is_some() && hex2.is_some()
+						{
+							index.advance();
+							index.advance();
+							
+							s.push(((hex1.unwrap() << 4) | hex2.unwrap()) as u8 as char);
+						}
+						// FIXME: Should return an error.
+						else
+							{ s.push('\\'); }
+					}
+					// FIXME: Should return an error.
+					else
+						{ s.push('\\'); }
+				}
+				
+				// FIXME: Should return an error.
+				_ => { s.push('\\'); }
+			}
+		}
 		else
-			{ index.advance(); }
+		{
+			s.push(src[index.linear]);
+			
+			if src[index.linear] == '\n'
+				{ index.advance_line(); }
+			else
+				{ index.advance(); }
+		}
 	}
 	
 	if src[index.linear] == '\"' // "