Skip to content

Commit

Permalink
Merge pull request #100 from jpark-dev/master
Browse files Browse the repository at this point in the history
Added a method to split by space instead of using regex to make token…
  • Loading branch information
hohyon-ryu authored Aug 10, 2016
2 parents 4f06c08 + f85e35d commit 00e221a
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken
import com.twitter.penguin.korean.util.KoreanPos
import com.twitter.penguin.korean.util.KoreanPos._

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

/**
 * A piece of text together with its position information.
 *
 * NOTE(review): presumably `offset` is the start index of `text` within the
 * original input and `length` is its character count -- confirm against callers.
 */
case class KoreanChunk(text: String, offset: Int, length: Int)

/**
Expand All @@ -47,8 +50,7 @@ object KoreanChunker {
Space -> """\s+""".r.pattern
)

private val CHUNKING_ORDER = Seq(Space, URL, Email, ScreenName, Hashtag, CashTag, Number, Korean, KoreanParticle, Alpha, Punctuation)
private val SPACE_REGEX_DELIMITER_KEEP_SPACES = """((?<=\s+)|(?=\s+))"""
private val CHUNKING_ORDER = Seq(URL, Email, ScreenName, Hashtag, CashTag, Number, Korean, KoreanParticle, Alpha, Punctuation)

protected[korean] def getChunks(input: String, keepSpace: Boolean = false): Seq[String] = {
chunk(input).map(_.text)
Expand All @@ -61,6 +63,27 @@ object KoreanChunker {
}
}

/**
 * Splits the input into alternating runs of non-whitespace and whitespace,
 * keeping every whitespace run as a token of its own.
 *
 * Each maximal run matched by `\s+` becomes exactly one token (so several
 * consecutive spaces yield a single token), and the text between two runs
 * becomes one token as well. Concatenating the returned tokens reproduces
 * the input exactly.
 *
 * @param s text to split
 * @return the tokens in input order; empty when `s` is empty
 */
def splitBySpaceKeepingSpace(s: CharSequence): Seq[String] = {
  val matcher = """\s+""".r.pattern.matcher(s)
  val tokens = List.newBuilder[String]

  // End offset of the previous whitespace run, i.e. where the next word starts.
  var index = 0
  while (matcher.find()) {
    // Emit the non-whitespace text sitting before this whitespace run, if any.
    if (index < matcher.start()) {
      tokens += s.subSequence(index, matcher.start()).toString
    }
    tokens += s.subSequence(matcher.start(), matcher.end()).toString
    index = matcher.end()
  }

  // Trailing non-whitespace text after the last whitespace run.
  if (index < s.length()) {
    tokens += s.subSequence(index, s.length()).toString
  }

  tokens.result()
}

/**
* Recursively call m.find() to find all the matches.
* Use tail-recursion optimization to avoid stack overflow.
Expand All @@ -80,16 +103,27 @@ object KoreanChunker {
}

// Breaks one piece of text into ChunkMatch spans, each tagged with a part of speech.
// NOTE(review): this span is a rendered diff without +/- markers, so the removed
// implementation (`text.split(SPACE_REGEX_DELIMITER_KEEP_SPACES).flatMap` + `foldLeft`)
// and the added implementation (`isSpaceChar` check + `ListBuffer`) appear interleaved.
private[this] def splitChunks(text: String): List[ChunkMatch] = {
val chunks = text.split(SPACE_REGEX_DELIMITER_KEEP_SPACES).flatMap { s =>
CHUNKING_ORDER.foldLeft(List[ChunkMatch]()) {
(l, pos) =>
val m = POS_PATTERNS(pos).matcher(s)

findAllPatterns(m, pos).filter(cm => l.forall(cm.disjoint)) ::: l
// NOTE(review): `isSpaceChar` is false for tab and newline, which the `\s+` pattern
// does match, so a token starting with a tab would take the else branch -- confirm
// whether `isWhitespace` was intended. `charAt(0)` also throws on an empty string;
// presumably callers never pass "" -- confirm.
if (text.charAt(0).isSpaceChar) {
// The whole text is reported as a single Space chunk.
List(ChunkMatch(0, text.length, text, Space))
} else {
val chunksBuf = new ListBuffer[ChunkMatch]()
// Total number of characters covered by the matches accepted so far.
var matchedLen = 0
CHUNKING_ORDER.foreach { pos =>
// Once the accepted matches cover the whole text, the remaining patterns are skipped.
if (matchedLen < text.length) {
val m = POS_PATTERNS(pos).matcher(text)
while (m.find()) {
val cm = ChunkMatch(m.start, m.end, m.group(), pos)
// Accept a match only if it is disjoint from every match accepted earlier, so
// patterns listed earlier in CHUNKING_ORDER take priority.
if (chunksBuf.forall(cm.disjoint)) {
chunksBuf += cm
matchedLen += cm.end - cm.start
}
}
}
}
}.sortBy(cm => cm.start)

fillInUnmatched(text, chunks, Foreign)
// Order the accepted matches by start offset and hand them to fillInUnmatched with
// Foreign as the tag (presumably applied to the uncovered gaps -- confirm).
val chunks = chunksBuf.sortBy(cm => cm.start).toList
fillInUnmatched(text, chunks, Foreign)
}
}

/**
Expand Down Expand Up @@ -144,17 +178,14 @@ object KoreanChunker {
* @return sequence of KoreanTokens
*/
def chunk(input: CharSequence): Seq[KoreanToken] = {
// NOTE(review): this span is a rendered diff without +/- markers; `splitRegex` and the
// first `val (l, i) = ...` line belong to the removed version, the second
// `val (l, i) = ...` line (using splitBySpaceKeepingSpace) to the added version.
val splitRegex = SPACE_REGEX_DELIMITER_KEEP_SPACES
val s = input.toString

val (l: List[KoreanToken], i: Int) = s.split(splitRegex).flatMap {
val (l: List[KoreanToken], i: Int) = splitBySpaceKeepingSpace(s).flatMap {
s => splitChunks(s)
// The fold accumulator is (tokens built so far, in reverse order; search cursor into `s`).
}.foldLeft(List[KoreanToken](), 0) {
case ((l: List[KoreanToken], i: Int), m: ChunkMatch) =>
// Find this chunk's text in the full string, searching from the cursor, to get
// its offset in `s`.
val segStart = s.indexOf(m.text, i)
// Prepend the token and move the cursor just past it.
(KoreanToken(m.text, m.pos, segStart, m.text.length) :: l, segStart + m.text.length)
}
// Tokens were prepended, so reverse to restore input order.
l.reverse

}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ RT @user: 에릭세트랑 찰스세트 바꾸실 분 안계실까요....ㅠㅠ
RT @user: 여러분 앤캐가 생기니까 세계가 핑크빛이네요 RT(Noun: 0, 2)/여러분(Noun: 10, 3)/여러분 앤캐(Noun: 10, 6)/생기(Noun: 18, 2)/세계(Noun: 23, 2)/핑크빛(Noun: 27, 3)/앤캐(Noun: 14, 2)/핑크(Noun: 27, 2)
RT @user: 으악 이제 잘래요! #제국의아이들 타이틀은 #숨소리 RT(Noun: 0, 2)/이제(Noun: 13, 2)/타이틀(Noun: 29, 3)/#제국의아이들(Hashtag: 21, 7)/#숨소리(Hashtag: 34, 4)
RT @user: 이 꼴통을 우쨔지? http://link.com RT(Noun: 0, 2)/이 꼴통(Noun: 10, 4)/우쨔(Noun: 16, 2)/꼴통(Noun: 12, 2)
RT @user: 이런좌빨놈 노동당사무국장 이런놈보기싫어서 일어나투표하러간다 시민들이여 일어나투표소로가자 어서들~~~~??? http://link.com" RT(Noun: 0, 2)/이런좌빨놈(Noun: 10, 5)/이런좌빨놈 노동당사무국장(Noun: 10, 13)/좌빨놈 노동당사무국장(Noun: 12, 11)/이런좌빨놈 노동당사무국장 이런놈보기(Noun: 10, 19)/좌빨놈 노동당사무국장 이런놈보기(Noun: 12, 17)/노동당사무국장 이런놈보기(Noun: 16, 13)/이런놈보기(Noun: 24, 5)/투표(Noun: 36, 2)/간다(Noun: 40, 2)/간다 시민들(Noun: 40, 7)/시민들(Noun: 44, 3)/투표소(Noun: 53, 3)/어서들(Noun: 61, 3)/좌빨(Noun: 12, 2)/노동당(Noun: 16, 3)/사무국장(Noun: 19, 4)/보기(Noun: 27, 2)
RT @user: 이런좌빨놈 노동당사무국장 이런놈보기싫어서 일어나투표하러간다 시민들이여 일어나투표소로가자 어서들~~~~??? http://link.com" RT(Noun: 0, 2)/이런좌빨놈(Noun: 10, 5)/이런좌빨놈 노동당사무국장(Noun: 10, 13)/좌빨놈 노동당사무국장(Noun: 12, 11)/이런좌빨놈 노동당사무국장 이런놈보기(Noun: 10, 19)/좌빨놈 노동당사무국장 이런놈보기(Noun: 12, 17)/노동당사무국장 이런놈보기(Noun: 16, 13)/이런놈보기(Noun: 24, 5)/투표(Noun: 36, 2)/간다(Noun: 40, 2)/간다 시민들(Noun: 40, 7)/투표소(Noun: 53, 3)/어서들(Noun: 61, 3)/좌빨(Noun: 12, 2)/노동당(Noun: 16, 3)/사무국장(Noun: 19, 4)/보기(Noun: 27, 2)/시민들(Noun: 44, 3)
RT @user: 이제 곧 '현충일' 국립묘지 돌보는 공군장병: 공군 중앙전산소 장병 120여 명이 3일 국립 대전현충원 묘역에서 태극기 교체 봉사를 하고 있다. 이날 장병들은 비가오는 중에도 현충원 참배와 태극기 ... http://link.com… RT(Noun: 0, 2)/이제(Noun: 10, 2)/이제 곧(Noun: 10, 4)/현충일(Noun: 16, 3)/국립묘지(Noun: 21, 4)/국립묘지 돌보는 공군장병(Noun: 21, 13)/공군장병(Noun: 30, 4)/공군(Noun: 36, 2)/공군 중앙전산소(Noun: 36, 8)/공군 중앙전산소 장병(Noun: 36, 11)/중앙전산소 장병(Noun: 39, 8)/공군 중앙전산소 장병 120여(Noun: 36, 16)/중앙전산소 장병 120여(Noun: 39, 13)/장병 120여(Noun: 45, 7)/공군 중앙전산소 장병 120여 명(Noun: 36, 18)/중앙전산소 장병 120여 명(Noun: 39, 15)/장병 120여 명(Noun: 45, 9)/120여 명(Noun: 48, 6)/3일(Noun: 56, 2)/3일 국립(Noun: 56, 5)/3일 국립 대전현충원(Noun: 56, 11)/국립 대전현충원(Noun: 59, 8)/3일 국립 대전현충원 묘역(Noun: 56, 14)/국립 대전현충원 묘역(Noun: 59, 11)/대전현충원 묘역(Noun: 62, 8)/태극기(Noun: 73, 3)/태극기 교체(Noun: 73, 6)/태극기 교체 봉사(Noun: 73, 9)/교체 봉사(Noun: 77, 5)/이날(Noun: 91, 2)/이날 장병들(Noun: 91, 6)/현충원(Noun: 108, 3)/현충원 참배(Noun: 108, 6)/현충원 참배와 태극기(Noun: 108, 11)/참배와 태극기(Noun: 112, 7)/장병(Noun: 32, 2)/중앙(Noun: 39, 2)/산소(Noun: 42, 2)/120(Noun: 48, 3)/국립(Noun: 59, 2)/대전(Noun: 62, 2)/묘역(Noun: 68, 2)/교체(Noun: 77, 2)/봉사(Noun: 80, 2)/장병들(Noun: 94, 3)/참배(Noun: 112, 2)
RT @user: 저도 아까 아침에 아무거나 막 트윗하다가 반성하고 지금 신중히 리튓중이예여 RT @user: 간간히 정말 함정 튓이 있다 RT(Noun: 0, 2)/아침(Noun: 16, 2)/아무거나(Noun: 20, 4)/아무거나 막(Noun: 20, 6)/아무거나 막 트윗(Noun: 20, 9)/막 트윗(Noun: 25, 4)/지금(Noun: 38, 2)/리튓중이(Noun: 45, 4)/정말(Noun: 66, 2)/정말 함정(Noun: 66, 5)/정말 함정 튓(Noun: 66, 7)/함정 튓(Noun: 69, 4)/아무(Noun: 20, 2)/거나(Noun: 22, 2)/트윗(Noun: 27, 2)/리튓(Noun: 45, 2)/중이(Noun: 47, 2)/함정(Noun: 69, 2)
RT @user: 천칭 씨의 눈치가 어설픈 점 하나 더. RT(Noun: 0, 2)/천칭(Noun: 10, 2)/천칭 씨(Noun: 10, 4)/천칭 씨의 눈치(Noun: 10, 8)/씨의 눈치(Noun: 13, 5)/눈치(Noun: 16, 2)/어설픈(Noun: 20, 3)/어설픈 점(Noun: 20, 5)/어설픈 점 하나(Noun: 20, 8)/점 하나(Noun: 24, 4)/어설픈 점 하나 더(Noun: 20, 10)/점 하나 더(Noun: 24, 6)/하나 더(Noun: 26, 4)/하나(Noun: 26, 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TwitterKoreanProcessorTest extends TestBase {
assert(
tokenize("이태민 복근있다..!!!!!! 11자...ㅋㅋㅋㅋ").mkString("/")
=== "이태민(Noun: 0, 3)/ (Space: 3, 1)/복근(Noun: 4, 2)/있다(Adjective: 6, 2)/" +
"..!!!!!!(Punctuation: 8, 8)/ (Space: 16, 1)/ (Space: 17, 1)/11(Number: 18, 2)/" +
"..!!!!!!(Punctuation: 8, 8)/ (Space: 16, 2)/11(Number: 18, 2)/" +
"자(Noun: 20, 1)/...(Punctuation: 21, 3)/ㅋㅋㅋㅋ(KoreanParticle: 24, 4)"
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ class KoreanChunkerTest extends TestBase {
getChunks("62:45의 결과").mkString("/")
=== "62:45/의/ /결과"
)

assert(
getChunks("여러 칸 띄어쓰기, 하나의 Space묶음으로 처리됩니다.").mkString("/")
=== "여러/ /칸/ /띄어쓰기/,/ /하나의/ /Space/묶음으로/ /처리됩니다/."
)
}

test("getChunkTokens should correctly find chunks with correct POS tags") {
Expand Down

0 comments on commit 00e221a

Please sign in to comment.