Merge branch 'master' into scala_2_10
Will Hohyon Ryu committed Nov 5, 2015
2 parents fe2d091 + ed75217 commit 782a545
Showing 3 changed files with 19 additions and 9 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -27,6 +27,9 @@ twitter-korean-text는 normalization, tokenization, stemming, phrase extraction

* 한국어를 처리하는 예시입니다 ㅋㅋ -> 한국어, 처리, 예시, 처리하는 예시

Introductory Presentation: [Google Slides](https://docs.google.com/presentation/d/10CZj8ry03oCk_Jqw879HFELzOLjJZ0EOi4KJbtRSIeU)


## Try it here

Gunja Agrawal kindly created a test API webpage for this project: [http://gunjaagrawal.com/langhack/](http://gunjaagrawal.com/langhack/)
@@ -39,7 +42,7 @@ Opensourced here: [twitter-korean-tokenizer-api](https://github.com/gunjaag/twit
## API
[scaladoc](http://twitter.github.io/twitter-korean-text/scaladocs/#com.twitter.penguin.korean.TwitterKoreanProcessor$)

-[Auto-generated Documentation](http://twitter.github.io/twitter-korean-text)
+[mavendoc](http://twitter.github.io/twitter-korean-text)


## Maven
1 change: 1 addition & 0 deletions pom.xml
@@ -91,6 +91,7 @@
<groupId>org.slf4j</groupId>
<artifactId>slf4j-nop</artifactId>
<version>1.5.8</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
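For context on the pom.xml hunk above: marking slf4j-nop as `provided` keeps it out of the packaged artifact, so downstream users can choose their own SLF4J binding instead of having a no-op logger forced on them. A sketch of the resulting dependency block after this commit (the surrounding coordinates are taken from the diff itself):

```xml
<!-- slf4j-nop is available at compile time but not bundled,
     because "provided" scope excludes it from the final artifact. -->
<dependency>
  <groupId>org.slf4j</groupId>
  <artifactId>slf4j-nop</artifactId>
  <version>1.5.8</version>
  <scope>provided</scope>
</dependency>
```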
@@ -251,14 +251,20 @@ object KoreanTokenizer {
* @return sequence of KoreanTokens
*/
   def tokenize(text: CharSequence): Seq[KoreanToken] = {
-    chunk(text).flatMap {
-      case token: KoreanToken if token.pos == Korean =>
-        // Get the best parse of each chunk
-        val parsed = parseKoreanChunk(token)
-
-        // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*)
-        collapseNouns(parsed)
-      case token: KoreanToken => Seq(token)
-    }
+    try {
+      chunk(text).flatMap {
+        case token: KoreanToken if token.pos == Korean =>
+          // Get the best parse of each chunk
+          val parsed = parseKoreanChunk(token)
+
+          // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*)
+          collapseNouns(parsed)
+        case token: KoreanToken => Seq(token)
+      }
+    } catch {
+      case e: Exception =>
+        System.err.println(s"Error tokenizing a chunk: $text")
+        throw e
+    }
   }
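The Scala change above wraps the tokenization pipeline in a try/catch that logs the offending input text before rethrowing, so failures can be traced back to the chunk that caused them. A minimal, self-contained sketch of the same pattern, where `simpleTokenize` is a hypothetical stand-in for the real `chunk`/`parseKoreanChunk` pipeline (not the library's API):

```scala
// Sketch of the error-logging pattern this commit adds to tokenize().
// `simpleTokenize` is a hypothetical stand-in, not twitter-korean-text's API.
object TokenizeLogging {
  def simpleTokenize(text: String): Seq[String] =
    text.split("\\s+").toSeq.filter(_.nonEmpty)

  def tokenize(text: String): Seq[String] =
    try {
      simpleTokenize(text)
    } catch {
      case e: Exception =>
        // Log which input failed before rethrowing, as the commit does.
        System.err.println(s"Error tokenizing a chunk: $text")
        throw e
    }
}
```

The design choice here is deliberate: the exception is rethrown rather than swallowed, so callers still see the failure, but the log line identifies which input triggered it.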
