Change Position to a cached byte offset, not an always-recomputed line & column pair #744

jiribenes · 2024-12-12T12:48:31Z

We currently use Position which is a (line, column) pair.
This means we spend a lot of time on Source.offsetToPosition when parsing.

Instead, we should just use byte offsets to mark positions, computing the line & column on demand (and caching it).

I tried this a long time ago (almost 6 months), but I'm attaching the code that somewhat worked back then, it just needs a clean up (refactoring :)) and some benchmarking (used to be something like 20-30% faster parsing!). It also needs some proper LSP testing: it worked on my machine for at least a few minutes, but that's probably not good enough -- I'm a bit scared that there could be bugs that make the LSP much slower to use.
Note that this change is across Effekt and our fork of Kiama.

Here's the code I used for this change in the Effekt codebase:

diff --git i/effekt/js/src/main/scala/effekt/LanguageServer.scala w/effekt/js/src/main/scala/effekt/LanguageServer.scala
index ecbcbd73..9cc157fc 100644
--- i/effekt/js/src/main/scala/effekt/LanguageServer.scala
+++ w/effekt/js/src/main/scala/effekt/LanguageServer.scala
@@ -175,7 +175,7 @@ class LanguageServer extends Intelligence {
     new lsp.Position(p.line - 1, p.column - 1)
 
   private def fromLSPPosition(p: lsp.Position, source: Source): Position =
-    Position(p.line + 1, p.character + 1, source)
+    source.lineAndColumnToPosition(p.line, p.character)
 
   private def toLSPLocation(from: Position, to: Position): Option[lsp.Location] = from.source match {
     case VirtualFileSource(path) => Some(new lsp.Location(from.source.name, new lsp.Range(toLSPPosition(from), toLSPPosition(to))))
diff --git i/effekt/shared/src/main/scala/effekt/RecursiveDescent.scala w/effekt/shared/src/main/scala/effekt/RecursiveDescent.scala
index 4c117084..46b052d0 100644
--- i/effekt/shared/src/main/scala/effekt/RecursiveDescent.scala
+++ w/effekt/shared/src/main/scala/effekt/RecursiveDescent.scala
@@ -39,7 +39,7 @@ class RecursiveDescent(positions: Positions, tokens: Seq[Token], source: Source)
             val to = source.offsetToPosition(value.end + 1)
             Some(Range(from, to))
           case None =>
-            val pos = Position(0, 0, source)
+            val pos = Position(0, source)
             Some(Range(pos, pos))
         }

and in Kiama:

diff --git i/jvm/src/main/scala/kiama/util/Server.scala w/jvm/src/main/scala/kiama/util/Server.scala
index 23e39b0..32f7574 100644
--- i/jvm/src/main/scala/kiama/util/Server.scala
+++ w/jvm/src/main/scala/kiama/util/Server.scala
@@ -322,12 +322,7 @@ trait Server[N, C <: Config, M <: Message] extends Compiler[C, M] with LanguageS
   def positionOfStartFinish(optStart: Option[Position], optFinish: Option[Position]): Option[(Int, Int)] =
     (optStart, optFinish) match {
       case (Some(start), Some(finish)) =>
-        (start.optOffset, finish.optOffset) match {
-          case (Some(s), Some(f)) =>
-            Some((s, f))
-          case _ =>
-            None
-        }
+        Some((start.offset, finish.offset))
       case _ =>
         None
     }
@@ -512,7 +507,7 @@ class Services[N, C <: Config, M <: Message](
 
   def positionOfNotification(document: TextDocumentIdentifier, position: LSPPosition): Option[Position] =
     server.sources.get(document.getUri).map(source => {
-      Position(position.getLine + 1, position.getCharacter + 1, source)
+      source.lineAndColumnToPosition(position.getLine, position.getCharacter)
     })
 
   @JsonNotification("textDocument/codeAction")
diff --git i/shared/src/main/scala/kiama/util/Positions.scala w/shared/src/main/scala/kiama/util/Positions.scala
index 64e580d..ba9bd83 100644
--- i/shared/src/main/scala/kiama/util/Positions.scala
+++ w/shared/src/main/scala/kiama/util/Positions.scala
@@ -15,7 +15,13 @@ package util
  * Record of a source position at a particular line and column relative to
  * a given source. The line and column numbers are one-indexed.
  */
-case class Position(line: Int, column: Int, source: Source) {
+case class Position(offset: Int, source: Source) {
+  lazy val (line, column) = source.lineStarts.lastIndexWhere(offset >= _) match {
+    case -1 =>
+      (0, 0)
+    case line =>
+      (line + 1, offset - source.lineStarts(line) + 1)
+  }
 
   /**
    * Format this position. The result is of the form `/foo/bar.txt:2:10:` if
@@ -37,24 +43,12 @@ case class Position(line: Int, column: Int, source: Source) {
   lazy val optContext: Option[String] =
     source.optLineContents(line).map(s => s"$s\n${" " * (column - 1)}^")
 
-  /**
-   * Return the offset that this position refers to in its source. `None`
-   * is returned if the position is not valid for its source.
-   */
-  lazy val optOffset: Option[Int] =
-    source.positionToOffset(this)
-
   /**
    * Apply a binary Boolean operation to this position and another one,
    * as offsets, return the result.
    */
   def op2(op: (Int, Int) => Boolean, p: Position): Boolean =
-    (optOffset, p.optOffset) match {
-      case (Some(l), Some(r)) =>
-        op(l, r)
-      case (l, r) =>
-        false
-    }
+    op(this.offset, p.offset)
 
   /**
    * Does this position occur no later than `p`? The two positions
@@ -84,7 +78,7 @@ case class Position(line: Int, column: Int, source: Source) {
 case class Range(from: Position, to: Position)
 
 object Range {
-  def empty(source: Source): Range = Range(Position(1, 1, source), Position(1, 1, source))
+  def empty(source: Source): Range = Range(Position(0, source), Position(0, source))
 }
 
 /**
@@ -205,13 +199,8 @@ class Positions {
    * positions doesn't refer to a valid offset in the source then
    * `None` is returned.
    */
-  def substring(s: Position, f: Position): Option[String] =
-    (s.optOffset, f.optOffset) match {
-      case (Some(soffset), Some(foffset)) =>
-        Some(s.source.content.substring(soffset, foffset))
-      case _ =>
-        None
-    }
+  def substring(s: Position, f: Position): String =
+    s.source.content.substring(s.offset, f.offset)
 
   /**
    * If `t` has valid start and finish positions, return the source text
@@ -222,7 +211,7 @@ class Positions {
   def textOf[T](t: T): Option[String] =
     (getStart(t), getFinish(t)) match {
       case (Some(start), Some(finish)) =>
-        substring(start, finish)
+        Some(substring(start, finish))
       case _ =>
         None
     }
diff --git i/shared/src/main/scala/kiama/util/Source.scala w/shared/src/main/scala/kiama/util/Source.scala
index 58d300d..814157b 100644
--- i/shared/src/main/scala/kiama/util/Source.scala
+++ w/shared/src/main/scala/kiama/util/Source.scala
@@ -65,29 +65,23 @@ trait Source {
    * Convert an offset into the content into a position.
    */
   def offsetToPosition(offset: Int): Position =
-    lineStarts.lastIndexWhere(offset >= _) match {
+    Position(offset, source = this)
+    /*lineStarts.lastIndexWhere(offset >= _) match {
       case -1 =>
-        Position(0, 0, this)
+        Position(0, this)
       case line =>
         Position(line + 1, offset - lineStarts(line) + 1, this)
-    }
+    }*/
 
   /**
    * If the position is valid for this source, return the corresponding
    * offset into the content, otherwise return `None`.
    */
-  def positionToOffset(position: Position): Option[Int] = {
-    val line = position.line
-    if ((line >= 1) && (line <= lineCount)) {
-      val lineStart = lineStarts(line - 1)
-      val column = position.column
-      if ((column >= 1) && (column <= lineFinish(line) - lineStart + 1))
-        Some(lineStart + column - 1)
-      else
-        None
-    } else
-      None
-  }
+  def positionToOffset(position: Position): Int =
+    position.offset
+
+  def lineAndColumnToPosition(line: Int, column: Int): Position =
+    Position(lineFinish(line) + column, this)
 }
 
 /**

The text was updated successfully, but these errors were encountered:

jiribenes added the feature New feature or request label Dec 12, 2024

jiribenes changed the title ~~Source positions should be cached byte offsets, not always recomputed line & character pair~~ Change Position to a cached byte offset, not an always-recomputed line & column pair Dec 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Change Position to a cached byte offset, not an always-recomputed line & column pair #744

Change Position to a cached byte offset, not an always-recomputed line & column pair #744

jiribenes commented Dec 12, 2024 •

edited

Loading

Change Position to a cached byte offset, not an always-recomputed line & column pair #744

Change Position to a cached byte offset, not an always-recomputed line & column pair #744

Comments

jiribenes commented Dec 12, 2024 • edited Loading

jiribenes commented Dec 12, 2024 •

edited

Loading