From c27854afd524d34bf1979233eea9911284e542fd Mon Sep 17 00:00:00 2001 From: Nick Budak Date: Mon, 17 Jun 2024 15:07:34 -0700 Subject: [PATCH] Improve CSV output This adds transcriptions to the CSV output and relabels the scores to more clearly indicate graphic vs. phonetic similarity of matches. Closes #362 --- dphon/cli.py | 2 +- dphon/console.py | 7 ------- dphon/match.py | 53 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/dphon/cli.py b/dphon/cli.py index 50d89ee..f711760 100644 --- a/dphon/cli.py +++ b/dphon/cli.py @@ -141,7 +141,7 @@ def run() -> None: for match in results: writer.write(match.as_dict()) elif args["--output-format"] == "csv": - fieldnames = Match("", "", "", "").as_dict().keys() + fieldnames = results[0].as_dict().keys() writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames) writer.writeheader() for match in results: diff --git a/dphon/console.py b/dphon/console.py index 269c7cf..c481558 100644 --- a/dphon/console.py +++ b/dphon/console.py @@ -126,10 +126,3 @@ def _add_context(self, match: Match) -> Tuple[str, str, str, str]: cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]" cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]" return (cul, cur, cvl, cvr) - - def transcription(self, match: Match) -> Tuple[str, str]: - """Get the phonemic transcription for the match for display.""" - return ( - "*" + " ".join(match.utxt._.syllables), - "*" + " ".join(match.vtxt._.syllables), - ) diff --git a/dphon/match.py b/dphon/match.py index 2475cc5..a513829 100644 --- a/dphon/match.py +++ b/dphon/match.py @@ -3,7 +3,7 @@ """The Match class for encoding text reuse relationships.""" import math -from typing import Dict, List, NamedTuple +from typing import Dict, List, NamedTuple, Tuple import Levenshtein as Lev from rich.padding import Padding @@ -32,7 +32,7 @@ def __rich_console__( """Format the match for display in console.""" # get colorized match text and transcription su, sv = console.highlighter.format_match(self) # type: ignore - pu, pv = console.highlighter.transcription(self) # type: ignore + pu, pv = self.transcription # add left-padding to align with match numbers, and bottom-padding # so that there's a space between matches in output @@ -49,16 +49,55 @@ def __rich_console__( pv, ) + @property + def u_transcription(self) -> str: + return "*" + " ".join(self.utxt._.syllables) + + @property + def v_transcription(self) -> str: + return "*" + " ".join(self.vtxt._.syllables) + @property def weighted_score(self) -> float: """Ratio of phonemic similarity to graphic similarity.""" try: - return self.weight / Lev.seqratio(self.au, self.av) + return self.phonetic_similarity() / self.graphic_similarity() except ZeroDivisionError: return math.inf + @property + def transcription(self) -> Tuple[str, str]: + """Return the phonemic transcription of the match.""" + return (self.u_transcription, self.v_transcription) + + def graphic_similarity(self) -> float: + """Levenshtein ratio of the aligned sequences.""" + return Lev.seqratio(self.au, self.av) + + def phonetic_similarity(self) -> float: + """Similarity score of the phonetic content of the sequences.""" + return self.weight + + def context(self, chars: int) -> Tuple[str, str, str, str]: + """Return up to `chars` characters of context around the match. + + Return value is a tuple of four strings: + - left context of u + - right context of u + - left context of v + - right context of v + """ + u, v = self.utxt.doc, self.vtxt.doc + u_start, u_end = self.utxt.start, self.utxt.end + v_start, v_end = self.vtxt.start, self.vtxt.end + u_context_left = u[max(u_start - chars, 0) : u_start] + v_context_left = v[max(v_start - chars, 0) : v_start] + u_context_right = u[u_end : min(u_end + chars, len(u))] + v_context_right = v[v_end : min(v_end + chars, len(v))] + return (u_context_left, u_context_right, v_context_left, v_context_right) + def as_dict(self) -> Dict[str, str]: - """Match with prettier field names for serialization.""" + """Dict form for structured output formats.""" return { "u_id": self.u, "v_id": self.v, @@ -66,10 +105,12 @@ def as_dict(self) -> Dict[str, str]: "v_text": self.vtxt.text, "u_text_aligned": "".join(self.au), "v_text_aligned": "".join(self.av), + "u_transcription": self.u_transcription, + "v_transcription": self.v_transcription, "u_start": self.utxt.start, "u_end": self.utxt.end, "v_start": self.vtxt.start, "v_end": self.vtxt.end, - "score": str(self.weight), - "weighted_score": str(self.weighted_score), + "phonetic_similarity": self.phonetic_similarity(), + "graphic_similarity": self.graphic_similarity(), }