merge conflict

twitter · Nov 20, 2015 · 435d761 · 435d761
2 parents d9350d7 + d22d114
commit 435d761
Show file tree

Hide file tree

Showing 23 changed files with 609 additions and 475 deletions.
diff --git a/pom.xml b/pom.xml
@@ -21,7 +21,7 @@
   <groupId>com.twitter.penguin</groupId>
   <artifactId>korean-text-scala-2.10</artifactId>
   <packaging>jar</packaging>
-  <version>4.1.6-SNAPSHOT</version>
+  <version>4.2.0-SNAPSHOT</version>
   <name>Korean Text Processing Utilities</name>
   <url>https://github.com/twitter/twitter-korean-text</url>
   <description>Scala library to process Korean text</description>
@@ -39,7 +39,7 @@
     <connection>scm:git:https://github.com/twitter/twitter-korean-text.git</connection>
     <developerConnection>scm:git:git@github.com:twitter/twitter-korean-text.git
     </developerConnection>
-    <tag>korean-text-scala-2.10-4.1.4</tag>
+    <tag>korean-text-scala-2.10-4.2.0</tag>
   </scm>
   <prerequisites>
     <maven>3.0.4</maven>

diff --git a/src/main/resources/com/twitter/penguin/korean/util/adjective/adjective.txt b/src/main/resources/com/twitter/penguin/korean/util/adjective/adjective.txt
@@ -325,6 +325,7 @@
 아름답
 아무렇
 아쉽
+아찔하
 아프
 안남
 안녕하

diff --git a/src/main/resources/com/twitter/penguin/korean/util/adverb/adverb.txt b/src/main/resources/com/twitter/penguin/korean/util/adverb/adverb.txt
@@ -1182,6 +1182,8 @@
 너뜰너뜰
 너무
 너무나
+너무나도
+너무도
 너물너물
 너벅너벅
 너벳벳이

diff --git a/src/main/resources/com/twitter/penguin/korean/util/auxiliary/exclamation.txt b/src/main/resources/com/twitter/penguin/korean/util/auxiliary/exclamation.txt
@@ -170,6 +170,7 @@
 프흐흐
 플
 피
+하
 하모
 하뿔싸
 하아

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/foreign.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/foreign.txt
@@ -421,10 +421,13 @@
 아나타노
 아노하나
 아뒤
+아따시
 아라마키
 아라비안
 아라키타
 아로마
+아로이
+아롱이
 아메미야
 아미
 아바타
@@ -442,6 +445,7 @@
 아이시떼루
 아이에프
 아이컨택
+아임인러브
 아즈마
 아츠야
 아카기
@@ -471,6 +475,7 @@
 야매
 야스토모
 야오
+야토가미
 얀데레
 양웬리
 어덜트
@@ -926,6 +931,7 @@
 피카추
 피티오
 피팅
+피피티
 핀업
 하나요
 하데스
@@ -947,6 +953,7 @@
 하이파이브회
 하이퍼
 하이퍼마켓
+하젠
 하치만
 하치켄
 하코네

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/nouns.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/nouns.txt
@@ -4288,8 +4288,6 @@
 너름새
 너리
 너머
-너무
-너무나
 너부죽이
 너비
 너울
@@ -8317,7 +8315,6 @@
 물레
 물레방아
 물력
-물론
 물리
 물리학
 물매
@@ -14771,6 +14768,8 @@
 아메바
 아멘
 아명
+아무
+아무나
 아바마마
 아방궁
 아버님
@@ -20123,7 +20122,6 @@
 제승
 제시
 제씨
-제아무리
 제안
 제압
 제약

diff --git a/src/main/resources/com/twitter/penguin/korean/util/noun/profane.txt b/src/main/resources/com/twitter/penguin/korean/util/noun/profane.txt
@@ -37,6 +37,8 @@
 씹새야
 야걸
 야동
+야설
+야애니
 엔조이
 오나니
 오피

diff --git a/src/main/resources/com/twitter/penguin/korean/util/substantives/suffix.txt b/src/main/resources/com/twitter/penguin/korean/util/substantives/suffix.txt
@@ -62,5 +62,6 @@
 킬로
 킬로미터
 틱
+하
 형
 화
diff --git a/src/main/resources/com/twitter/penguin/korean/util/typos/typos.txt b/src/main/resources/com/twitter/penguin/korean/util/typos/typos.txt
@@ -28,6 +28,7 @@
 그래욤 그래요
 그랫어요 그랬어요
 그러시져 그러시죠
+그로묜 그러면
 그지같 거지같
 그쵸 그렇죠
 글애요 그래요

diff --git a/src/main/resources/com/twitter/penguin/korean/util/verb/eomi.txt b/src/main/resources/com/twitter/penguin/korean/util/verb/eomi.txt
@@ -489,6 +489,7 @@
 마다에게
 마다의
 마따나
+마라
 마저
 마저나마라도
 마저도

diff --git a/src/main/resources/com/twitter/penguin/korean/util/verb/verb.txt b/src/main/resources/com/twitter/penguin/korean/util/verb/verb.txt
@@ -1288,7 +1288,9 @@
 피우
 피하
 하
+하고프
 하므
+하옵
 한잔하
 한정되
 한정하

diff --git a/src/main/scala/com/twitter/penguin/korean/TwitterKoreanProcessor.scala b/src/main/scala/com/twitter/penguin/korean/TwitterKoreanProcessor.scala
@@ -22,8 +22,8 @@ import com.twitter.penguin.korean.normalizer.KoreanNormalizer
 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor
 import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase
 import com.twitter.penguin.korean.stemmer.KoreanStemmer
-import com.twitter.penguin.korean.tokenizer.{KoreanSentenceSplitter, Sentence, KoreanTokenizer}
 import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken
+import com.twitter.penguin.korean.tokenizer.{KoreanSentenceSplitter, KoreanTokenizer, Sentence, TokenizerProfile}
 import com.twitter.penguin.korean.util.KoreanPos
 
 /**
@@ -48,6 +48,18 @@ object TwitterKoreanProcessor {
    */
   def tokenize(text: CharSequence): Seq[KoreanToken] = KoreanTokenizer.tokenize(text)
 
+  /**
+    * Tokenize text (with a custom profile) into a sequence of KoreanTokens,
+    * which includes part-of-speech information and whether a token is an out-of-vocabulary term.
+    *
+    * @param text input text
+    * @return A sequence of KoreanTokens.
+    */
+  def tokenize(
+    text: CharSequence,
+    profile: TokenizerProfile
+  ): Seq[KoreanToken] = KoreanTokenizer.tokenize(text, profile)
+
   /**
    * Wrapper for Korean stemmer
    *

diff --git a/src/main/scala/com/twitter/penguin/korean/stemmer/KoreanStemmer.scala b/src/main/scala/com/twitter/penguin/korean/stemmer/KoreanStemmer.scala
@@ -25,7 +25,7 @@ object KoreanStemmer {
     }
 
     val stemmed = tokens.foldLeft(List[KoreanToken]()) {
-      case (l: List[KoreanToken], token: KoreanToken) if Endings.contains(token.pos) =>
+      case (l: List[KoreanToken], token: KoreanToken) if l.nonEmpty && Endings.contains(token.pos) =>
         if (Predicates.contains(l.head.pos)) {
           val prevToken = l.head
           KoreanToken(
-Original file line number
+Diff line change
@@ Expand Up / @@ -170,6 +170,7 @@ @@
     프흐흐
     플
     피
+    하
     하모
     하뿔싸
     하아
@@ Expand Down @@