diff --git a/README.md b/README.md
index 07cf60c..f018083 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,9 @@ twitter-korean-text는 normalization, tokenization, stemming, phrase extraction
 
 * 한국어를 처리하는 예시입니다 ㅋㅋ -> 한국어, 처리, 예시, 처리하는 예시
 
+Introductory Presentation: [Google Slides](https://docs.google.com/presentation/d/10CZj8ry03oCk_Jqw879HFELzOLjJZ0EOi4KJbtRSIeU)
+
+
 ## Try it here
 
 Gunja Agrawal kindly created a test API webpage for this project: [http://gunjaagrawal.com/langhack/](http://gunjaagrawal.com/langhack/)
@@ -39,7 +42,7 @@ Opensourced here: [twitter-korean-tokenizer-api](https://github.com/gunjaag/twit
 
 ## API
 
 [scaladoc](http://twitter.github.io/twitter-korean-text/scaladocs/#com.twitter.penguin.korean.TwitterKoreanProcessor$)
-[Auto-generated Documentation](http://twitter.github.io/twitter-korean-text)
+[mavendoc](http://twitter.github.io/twitter-korean-text)
 
 ## Maven
diff --git a/pom.xml b/pom.xml
index 0cbf769..698ef68 100644
--- a/pom.xml
+++ b/pom.xml
@@ -91,6 +91,7 @@
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-nop</artifactId>
       <version>1.5.8</version>
+      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
diff --git a/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala b/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala
index 877263e..0fe6679 100644
--- a/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala
+++ b/src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala
@@ -251,14 +251,20 @@
    * @return sequence of KoreanTokens
    */
   def tokenize(text: CharSequence): Seq[KoreanToken] = {
-    chunk(text).flatMap {
-      case token: KoreanToken if token.pos == Korean =>
-        // Get the best parse of each chunk
-        val parsed = parseKoreanChunk(token)
-
-        // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*)
-        collapseNouns(parsed)
-      case token: KoreanToken => Seq(token)
+    try {
+      chunk(text).flatMap {
+        case token: KoreanToken if token.pos == Korean =>
+          // Get the best parse of each chunk
+          val parsed = parseKoreanChunk(token)
+
+          // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*)
+          collapseNouns(parsed)
+        case token: KoreanToken => Seq(token)
+      }
+    } catch {
+      case e: Exception =>
+        System.err.println(s"Error tokenizing a chunk: $text")
+        throw e
     }
   }
 }
\ No newline at end of file