From 7685a8f557d4e2ac79aeffc91c023507afa1a36e Mon Sep 17 00:00:00 2001 From: KEINOS Date: Sat, 13 Apr 2024 13:17:43 +0900 Subject: [PATCH] chore: revamp sample dir as _examples Refactor directory structure of examples. - related issue #299 https://github.com/ikawaha/kagome/issues/299#issuecomment-2052892253 --- _examples/db_search/README.md | 65 +++++++++++++++++++ .../_example => _examples/db_search}/go.mod | 2 +- .../_example => _examples/db_search}/go.sum | 0 .../_example => _examples}/db_search/main.go | 34 ++-------- _examples/tokenize/go.mod | 12 ++++ _examples/tokenize/go.sum | 4 ++ .../_example => _examples}/tokenize/main.go | 0 _examples/user_dict/go.mod | 11 ++++ _examples/user_dict/go.sum | 4 ++ _examples/user_dict/main.go | 36 ++++++++++ .../dict => _examples/user_dict}/userdict.txt | 0 _examples/wakati/go.mod | 12 ++++ _examples/wakati/go.sum | 4 ++ {sample/_example => _examples}/wakati/main.go | 0 _examples/wasm/README.md | 23 +++++++ _examples/wasm/go.mod | 12 ++++ _examples/wasm/go.sum | 4 ++ {sample => _examples}/wasm/kagome.html | 0 {sample => _examples}/wasm/main.go | 4 +- sample/wasm/README.md | 21 ------ sample/wasm/go.mod | 3 - 21 files changed, 196 insertions(+), 55 deletions(-) create mode 100644 _examples/db_search/README.md rename {sample/_example => _examples/db_search}/go.mod (88%) rename {sample/_example => _examples/db_search}/go.sum (100%) rename {sample/_example => _examples}/db_search/main.go (71%) create mode 100644 _examples/tokenize/go.mod create mode 100644 _examples/tokenize/go.sum rename {sample/_example => _examples}/tokenize/main.go (100%) create mode 100644 _examples/user_dict/go.mod create mode 100644 _examples/user_dict/go.sum create mode 100644 _examples/user_dict/main.go rename {sample/dict => _examples/user_dict}/userdict.txt (100%) create mode 100644 _examples/wakati/go.mod create mode 100644 _examples/wakati/go.sum rename {sample/_example => _examples}/wakati/main.go (100%) create mode 100644 _examples/wasm/README.md 
create mode 100644 _examples/wasm/go.mod create mode 100644 _examples/wasm/go.sum rename {sample => _examples}/wasm/kagome.html (100%) rename {sample => _examples}/wasm/main.go (96%) delete mode 100644 sample/wasm/README.md delete mode 100644 sample/wasm/go.mod diff --git a/_examples/db_search/README.md b/_examples/db_search/README.md new file mode 100644 index 0000000..f308768 --- /dev/null +++ b/_examples/db_search/README.md @@ -0,0 +1,65 @@ +# Full-text search with Kagome and SQLite3 + +This example provides a practical example of how to work with Japanese text data and **perform efficient [full-text search](https://en.wikipedia.org/wiki/Full-text_search) using Kagome and SQLite3**. + +- Target text data is as follows: + +```text +人魚は、南の方の海にばかり棲んでいるのではありません。 +北の海にも棲んでいたのであります。 +北方の海の色は、青うございました。 +ある時、岩の上に、女の人魚があがって、 +あたりの景色を眺めながら休んでいました。 +小川未明 『赤い蝋燭と人魚』 +``` + +- Example output: + +```shellsession +$ cd /path/to/kagome/_examples/db_search +$ go run . +Searching for: 人魚 + Found content: 人魚は、南の方の海にばかり棲んでいるのではありません。 at line: 1 + Found content: ある時、岩の上に、女の人魚があがって、 at line: 4 + Found content: 小川未明 『赤い蝋燭と人魚』 at line: 6 +Searching for: 人 + No results found +Searching for: 北方 + Found content: 北方の海の色は、青うございました。 at line: 3 +Searching for: 北 + Found content: 北の海にも棲んでいたのであります。 at line: 2 +``` + +- [View main.go](main.go) + +## Details + +In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the words "人魚", "人", "北方", and "北". + +Note that the string tokenized by Kagome, a.k.a. "Wakati", is recorded in a separate table for [FTS4](https://www.sqlite.org/fts3.html) (Full-Text-Search) at the same time as the original text. + +This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. + +### Aim of this example + +This example can be useful in scenarios where you need to perform full-text searches on Japanese text. 
+ +It demonstrates how to tokenize Japanese text using Kagome, which is a common requirement when working with text data in the Japanese language. + +By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: + +1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. +2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. +3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. +4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for the Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. + +## Acknowledgements + +This example is based in part on the following book. 
+ +- p.204, 9.2 "データーベース登録プログラム", "Go言語プログラミングエッセンス エンジニア選書" + - Written by: [Mattn](https://github.com/mattn) + - Published: 2023/3/9 (技術評論社) + - ISBN: 4297134195 / 978-4297134198 + - ASIN: B0BVZCJQ4F / [https://amazon.co.jp/dp/4297134195](https://amazon.co.jp/dp/4297134195) + - Original sample code: [https://github.com/mattn/aozora-search](https://github.com/mattn/aozora-search) diff --git a/sample/_example/go.mod b/_examples/db_search/go.mod similarity index 88% rename from sample/_example/go.mod rename to _examples/db_search/go.mod index 1b747b4..c4c8eff 100644 --- a/sample/_example/go.mod +++ b/_examples/db_search/go.mod @@ -1,4 +1,4 @@ -module kagome/examples +module kagome/examples/db_search go 1.19 diff --git a/sample/_example/go.sum b/_examples/db_search/go.sum similarity index 100% rename from sample/_example/go.sum rename to _examples/db_search/go.sum diff --git a/sample/_example/db_search/main.go b/_examples/db_search/main.go similarity index 71% rename from sample/_example/db_search/main.go rename to _examples/db_search/main.go index a811a27..e92de2d 100644 --- a/sample/_example/db_search/main.go +++ b/_examples/db_search/main.go @@ -1,35 +1,9 @@ /* -# TL; DR +# Full-text search with Kagome and SQLite3 This example provides a practical example of how to work with Japanese text data and perform efficient full-text search using Kagome and SQLite3. -# TS; WM - -In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "人魚" and "人". - -Note that the string tokenized by Kagome, a.k.a. "Wakati", is recorded in a separate table for FTS (Full-Text-Search) at the same time as the original text. - -This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. - -Aim of this example: - -This example can be useful in scenarios where you need to perform full-text searches on Japanese text. 
It demonstrates how to tokenize Japanese text using Kagome, which is a common requirement when working with text data in the Japanese language. By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: - -1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. -2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. -3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. -4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. - -Acknowledgements: - -This example is taken in part from the following book for reference. - -- p.204, 9.2 "データーベース登録プログラム", "Go言語プログラミングエッセンス エンジニア選書" - - Written by: Mattn - - Published: 2023/3/9 (技術評論社) - - ISBN: 4297134195 / 978-4297134198 - - ASIN: B0BVZCJQ4F / https://amazon.co.jp/dp/4297134195 - - Original sample code: https://github.com/mattn/aozora-search +For details and acknowledgements, see the README.md file in the same directory. 
*/ package main @@ -39,6 +13,7 @@ import ( "fmt" "log" "os" + "slices" "strings" "github.com/ikawaha/kagome-dict/ipa" @@ -165,6 +140,9 @@ func insertSearchToken(db *sql.DB, rowID int64, content string) error { } seg := tknzr.Wakati(content) + + seg = slices.Compact(seg) // collapse runs of consecutive identical tokens (non-adjacent duplicates are kept) + tokenizedContent := strings.Join(seg, " ") _, err = db.Exec( diff --git a/_examples/tokenize/go.mod b/_examples/tokenize/go.mod new file mode 100644 index 0000000..9b8c0fa --- /dev/null +++ b/_examples/tokenize/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/tokenize + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/tokenize/go.sum b/_examples/tokenize/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/tokenize/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/_example/tokenize/main.go b/_examples/tokenize/main.go similarity index 100% rename from sample/_example/tokenize/main.go rename to _examples/tokenize/main.go diff --git a/_examples/user_dict/go.mod b/_examples/user_dict/go.mod new file mode 100644 index 0000000..8c61210 --- /dev/null +++ b/_examples/user_dict/go.mod @@ -0,0 +1,11 @@ +module kagome/examples/user_dict + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict v1.0.9 + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/user_dict/go.sum 
b/_examples/user_dict/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/user_dict/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/_examples/user_dict/main.go b/_examples/user_dict/main.go new file mode 100644 index 0000000..148e223 --- /dev/null +++ b/_examples/user_dict/main.go @@ -0,0 +1,36 @@ +package main + +import ( + "fmt" + + "github.com/ikawaha/kagome-dict/dict" + "github.com/ikawaha/kagome-dict/ipa" + "github.com/ikawaha/kagome/v2/tokenizer" +) + +func main() { + // Use IPA dictionary as a system dictionary. + sysDic := ipa.Dict() + + // Build a user dictionary from a file. + userDic, err := dict.NewUserDict("userdict.txt") + if err != nil { + panic(err) + } + + // Specify the user dictionary as an option. 
+ t, err := tokenizer.New(sysDic, tokenizer.UserDict(userDic), tokenizer.OmitBosEos()) + if err != nil { + panic(err) + } + + tokens := t.Analyze("関西国際空港限定トートバッグ", tokenizer.Search) + for _, token := range tokens { + fmt.Printf("%s\t%v\n", token.Surface, token.Features()) + } + + // Output: + // 関西国際空港 [テスト名詞 関西/国際/空港 カンサイ/コクサイ/クウコウ] + // 限定 [名詞 サ変接続 * * * * 限定 ゲンテイ ゲンテイ] + // トートバッグ [名詞 一般 * * * * *] +} diff --git a/sample/dict/userdict.txt b/_examples/user_dict/userdict.txt similarity index 100% rename from sample/dict/userdict.txt rename to _examples/user_dict/userdict.txt diff --git a/_examples/wakati/go.mod b/_examples/wakati/go.mod new file mode 100644 index 0000000..3193fd0 --- /dev/null +++ b/_examples/wakati/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/wakati + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/wakati/go.sum b/_examples/wakati/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/wakati/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/_example/wakati/main.go b/_examples/wakati/main.go similarity index 100% rename from sample/_example/wakati/main.go rename to _examples/wakati/main.go diff --git a/_examples/wasm/README.md b/_examples/wasm/README.md new file mode 100644 index 0000000..ffbdc01 --- /dev/null +++ b/_examples/wasm/README.md @@ -0,0 +1,23 @@ +# WebAssembly Example of Kagome + +- Build + +```sh +GOOS=js GOARCH=wasm go build -o kagome.wasm main.go +``` + 
+```shellsession +├── docs ... gh-pages +│   ├── index.html +│   ├── kagome.wasm +│   └── wasm_exec.js +├── _examples +│   └── wasm +│   ├── README.md ... this document +│   ├── kagome.html ... html sample +│   ├── main.go ... source code +│   ├── go.mod +│   └── go.sum +``` + +- Online demo: [https://ikawaha.github.io/kagome/](https://ikawaha.github.io/kagome/) diff --git a/_examples/wasm/go.mod b/_examples/wasm/go.mod new file mode 100644 index 0000000..a916734 --- /dev/null +++ b/_examples/wasm/go.mod @@ -0,0 +1,12 @@ +module kagome/examples/wasm + +go 1.19 + +require ( + github.com/ikawaha/kagome-dict/ipa v1.0.10 + github.com/ikawaha/kagome/v2 v2.9.3 +) + +require github.com/ikawaha/kagome-dict v1.0.9 // indirect + +replace github.com/ikawaha/kagome/v2 => ../../ diff --git a/_examples/wasm/go.sum b/_examples/wasm/go.sum new file mode 100644 index 0000000..2c9b28a --- /dev/null +++ b/_examples/wasm/go.sum @@ -0,0 +1,4 @@ +github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= +github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= +github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= +github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= diff --git a/sample/wasm/kagome.html b/_examples/wasm/kagome.html similarity index 100% rename from sample/wasm/kagome.html rename to _examples/wasm/kagome.html diff --git a/sample/wasm/main.go b/_examples/wasm/main.go similarity index 96% rename from sample/wasm/main.go rename to _examples/wasm/main.go index a4f08f0..24a2571 100644 --- a/sample/wasm/main.go +++ b/_examples/wasm/main.go @@ -1,5 +1,5 @@ -//go:build ignore -// +build ignore +//go:build js && wasm +// +build js,wasm package main diff --git a/sample/wasm/README.md b/sample/wasm/README.md deleted file mode 100644 index 9e72e2c..0000000 --- a/sample/wasm/README.md +++ /dev/null @@ -1,21 +0,0 @@ -WebAssembly Sample ---- 
- -``` -GOOS=js GOARCH=wasm go build -o kagome.wasm main.go -``` - - -``` -├── docs ... gh-pages -│   ├── index.html -│   ├── kagome.wasm -│   └── wasm_exec.js -├── sample -│   └── wasm -│   ├── README.md ... this document. -│   ├── go.mod -│   ├── kagome.html ... html sample -│   └── main.go ... -``` -demo. https://ikawaha.github.io/kagome/ \ No newline at end of file diff --git a/sample/wasm/go.mod b/sample/wasm/go.mod deleted file mode 100644 index 89d4416..0000000 --- a/sample/wasm/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module sample - -go 1.16