From ecde0adfcac572fd677dfa622a3227d00e7257f5 Mon Sep 17 00:00:00 2001 From: Billy Seol Date: Mon, 19 Oct 2015 21:42:07 +0000 Subject: [PATCH 1/5] slf4j-nop needs a non-default scope --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index 167c7d0..aef2268 100644 --- a/pom.xml +++ b/pom.xml @@ -91,6 +91,7 @@ org.slf4j slf4j-nop 1.5.8 + provided org.scalatest From 939842e03ae7955955e66775c33fa3c53e14e550 Mon Sep 17 00:00:00 2001 From: Will Hohyon Ryu Date: Mon, 26 Oct 2015 17:20:15 -0700 Subject: [PATCH 2/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 07cf60c..3472d3a 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Opensourced here: [twitter-korean-tokenizer-api](https://github.com/gunjaag/twit ## API [scaladoc](http://twitter.github.io/twitter-korean-text/scaladocs/#com.twitter.penguin.korean.TwitterKoreanProcessor$) -[Auto-generated Documentation](http://twitter.github.io/twitter-korean-text) +[Machine-generated Documentation](http://twitter.github.io/twitter-korean-text) ## Maven From 97006e5543e850c556ea68369020b58acce14bc4 Mon Sep 17 00:00:00 2001 From: Will Hohyon Ryu Date: Mon, 26 Oct 2015 17:20:50 -0700 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3472d3a..afab1d0 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Opensourced here: [twitter-korean-tokenizer-api](https://github.com/gunjaag/twit ## API [scaladoc](http://twitter.github.io/twitter-korean-text/scaladocs/#com.twitter.penguin.korean.TwitterKoreanProcessor$) -[Machine-generated Documentation](http://twitter.github.io/twitter-korean-text) +[mavendoc](http://twitter.github.io/twitter-korean-text) ## Maven From 4d1e36aaa4133210618de9979a6a091af623bdee Mon Sep 17 00:00:00 2001 From: Will Hohyon Ryu Date: Tue, 27 Oct 2015 10:38:29 -0700 Subject: [PATCH 4/5] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index afab1d0..f018083 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,9 @@ twitter-korean-text는 normalization, tokenization, stemming, phrase extraction * 한국어를 처리하는 예시입니다 ㅋㅋ -> 한국어, 처리, 예시, 처리하는 예시 +Introductory Presentation: [Google Slides](https://docs.google.com/presentation/d/10CZj8ry03oCk_Jqw879HFELzOLjJZ0EOi4KJbtRSIeU) + + ## Try it here Gunja Agrawal kindly created a test API webpage for this project: [http://gunjaagrawal.com/langhack/](http://gunjaagrawal.com/langhack/) From ed7521797a3e3b04f3dfa51fe013c14ca22e095b Mon Sep 17 00:00:00 2001 From: Will Hohyon Ryu Date: Thu, 5 Nov 2015 12:10:01 -0800 Subject: [PATCH 5/5] tokenization error message --- .../korean/tokenizer/KoreanTokenizer.scala | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala b/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala index 877263e..0fe6679 100644 --- a/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala +++ b/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala @@ -251,14 +251,20 @@ object KoreanTokenizer { * @return sequence of KoreanTokens */ def tokenize(text: CharSequence): Seq[KoreanToken] = { - chunk(text).flatMap { - case token: KoreanToken if token.pos == Korean => - // Get the best parse of each chunk - val parsed = parseKoreanChunk(token) - - // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*) - collapseNouns(parsed) - case token: KoreanToken => Seq(token) + try { + chunk(text).flatMap { + case token: KoreanToken if token.pos == Korean => + // Get the best parse of each chunk + val parsed = parseKoreanChunk(token) + + // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*) + collapseNouns(parsed) + case token: KoreanToken => Seq(token) + } + } catch { + case e: Exception => + System.err.println(s"Error tokenizing a chunk: $text") + throw e } } } \ No newline at end of file