Merge pull request #92 from twitter/detokenizer_bug_fix
bug fixed
hohyon-ryu committed Dec 19, 2015
2 parents 6c196b9 + 36c80eb commit 822da20
Showing 2 changed files with 28 additions and 1 deletion.
src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanDetokenizer.scala

@@ -43,7 +43,7 @@ object KoreanDetokenizer {
   private def collapseTokens(tokenized: Seq[KoreanToken]): List[String] = {
     val (output, isPrefix) = tokenized.foldLeft((List[String](), false)) {
       case ((output: List[String], isPrefix: Boolean), token: KoreanToken) =>
-        if (isPrefix || SuffixPos.contains(token.pos)) {
+        if (output.nonEmpty && (isPrefix || SuffixPos.contains(token.pos))) {
           val attached = output.lastOption.getOrElse("") + token.text
           (output.init :+ attached, false)
         } else if (PrefixPos.contains(token.pos)) {
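
Why the added output.nonEmpty guard matters: with an empty accumulator, the old condition still took the "attach to the previous word" branch, and List.init on an empty list throws UnsupportedOperationException, so a token stream beginning with a suffix-position token (for example a lone josa) would blow up inside collapseTokens. The sketch below is a self-contained illustration of that failure mode and of the guarded path; the object name, the token text, and the reduced branching are invented for the example and are not the project's code.

// Standalone sketch of the effect of the output.nonEmpty guard.
// Everything here is illustrative: it only mimics the fold's two relevant branches.
object NonEmptyGuardSketch extends App {
  val output = List.empty[String] // accumulator state before the first token
  val tokenText = "이"            // pretend the first token's POS is in SuffixPos

  // Pre-fix: this expression was evaluated even with an empty accumulator,
  // and List.empty.init throws UnsupportedOperationException.
  // output.init :+ (output.lastOption.getOrElse("") + tokenText)

  // Post-fix: attach to the previous word only when one already exists;
  // otherwise the token simply starts a new word.
  val collapsed =
    if (output.nonEmpty) output.init :+ (output.lastOption.getOrElse("") + tokenText)
    else output :+ tokenText

  println(collapsed) // List(이)
}

With the guard in place, a leading suffix-position token becomes the first output word instead of throwing, which is what the new edge-case tests below exercise.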
src/test/scala/com/twitter/penguin/korean/tokenizer/KoreanDetokenizerTest.scala

@@ -38,4 +38,31 @@ class KoreanDetokenizerTest extends TestBase {
         === "뭐 완벽하진 않지만 그럭저럭 쓸 만하군..."
     )
   }
+
+  test("detokenizer should correctly detokenize the edge cases") {
+    assert(
+      detokenize(List(""))
+        === ""
+    )
+
+    assert(
+      detokenize(List())
+        === ""
+    )
+
+    assert(
+      detokenize(List("완벽"))
+        === "완벽"
+    )
+
+    assert(
+      detokenize(List("이"))
+        === "이"
+    )
+
+    assert(
+      detokenize(List("이", "제품을", "사용하겠습니다"))
+        === "이 제품을 사용하겠습니다"
+    )
+  }
 }

1 comment on commit 822da20

@laeubli

Great, thanks for that!
