From 79157e22349fb5aeb789750fc396d42fb7e45c12 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 22 Mar 2023 16:10:14 +0800 Subject: [PATCH 1/3] updated docs Signed-off-by: ftgreat --- README.md | 2 +- README_zh.md | 2 +- test.py | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) delete mode 100644 test.py diff --git a/README.md b/README.md index f853b490..a2e97b8d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ FlagAI (Fast LArge-scale General AI models) is a fast, easy-to-use and extensibl The code is partially based on [GLM](https://github.com/THUDM/GLM), [Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). ## News -- [17 Mar 2023] release v1.6.2, Support application of new optimizers [#266](https://github.com/FlagAI-Open/FlagAI/pull/266); +- [17 Mar 2023] release v1.6.2, Support application of new optimizers [#266](https://github.com/FlagAI-Open/FlagAI/pull/266), and added a new gpt model name 'GPT2-base-en' for English; - [2 Mar 2023] release v1.6.1, Support Galactica model [#234](https://github.com/FlagAI-Open/FlagAI/pull/234); BMInf, a low-resource inference package [#238](https://github.com/FlagAI-Open/FlagAI/pull/238), and examples for p-tuning [#227](https://github.com/FlagAI-Open/FlagAI/pull/238) - [12 Jan 2023] release v1.6.0, support a new parallel lib called [**BMTrain**](https://github.com/OpenBMB/BMTrain) and integate [**Flash Attention**](https://github.com/HazyResearch/flash-attention) to speedup training of Bert and Vit models, examples in [FlashAttentionBERT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/bert_title_generation_english/train_flash_atten.py) and [FlashAttentionViT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/vit_cifar100/train_single_gpu_flash_atten.py). Also add the contrastive search based text generation method [**SimCTG**](https://github.com/yxuansu/SimCTG) and DreamBooth finetuning based on AltDiffusion, examples in [AltDiffusionNaruto](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/AltDiffusion/dreambooth.py). - [28 Nov 2022] release v1.5.0, support 1.1B [**EVA-CLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/EVA_CLIP) and [ALM: A large Arabic Language Model based on GLM], examples in [**ALM**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/ALM) diff --git a/README_zh.md b/README_zh.md index 6836f6d7..18d2b6f6 100644 --- a/README_zh.md +++ b/README_zh.md @@ -21,7 +21,7 @@ 本项目的部分代码基于[GLM](https://github.com/THUDM/GLM),[Transformers](https://github.com/huggingface/transformers),[timm](https://github.com/rwightman/pytorch-image-models) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). 
## 动态 -- [17 Mar 2023] 支持v1.6.2版本, 可以使用新的优化器 [#266](https://github.com/FlagAI-Open/FlagAI/pull/266); +- [17 Mar 2023] 支持v1.6.2版本, 可以使用新的优化器 [#266](https://github.com/FlagAI-Open/FlagAI/pull/266), 并增加了英文gpt模型GPT2-base-en; - [2 Mar 2023] 支持v1.6.1版本, 增加Galactica模型 [#234](https://github.com/FlagAI-Open/FlagAI/pull/234), 大模型推理的低资源工具包BMInf [#238](https://github.com/FlagAI-Open/FlagAI/pull/238), 以及P-tuning样例 [#227](https://github.com/FlagAI-Open/FlagAI/pull/238) - [12 Jan 2023] 发布v1.6.0版本, 新增支持并行训练库 [**BMTrain**](https://github.com/OpenBMB/BMTrain) 以及集成 [**Flash Attention**](https://github.com/HazyResearch/flash-attention) 到 Bert 和 Vit 模型提速端到端训练, 示例见 [FlashAttentionBERT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/bert_title_generation_english/train_flash_atten.py)和 [FlashAttentionViT](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/vit_cifar100/train_single_gpu_flash_atten.py). 同时增加了基于对比搜索的文本生成方法 [**SimCTG**](https://github.com/yxuansu/SimCTG) 以及基于 AltDiffusion 进行 DreamBooth 个性化微调, 示例见 [AltDiffusionNaruto](https://github.com/FlagAI-Open/FlagAI/blob/master/examples/AltDiffusion/dreambooth.py). - [28 Nov 2022] 发布v1.5.0版本, 支持1.1B参数的 [**EVA-CLIP**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/EVA_CLIP) 以及[ALM: 基于GLM的阿拉伯语大模型], 示例见[**ALM**](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/ALM) diff --git a/test.py b/test.py deleted file mode 100644 index b64b151c..00000000 --- a/test.py +++ /dev/null @@ -1,3 +0,0 @@ -import sys;sys.path.append("/home/yanzhaodong/FlagAI") -from flagai.data.tokenizer import Tokenizer -tokenizer = Tokenizer.from_pretrained("gpt2_new_1w") From 552ecf76f32af44b357f7a590983879d93e79c56 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 22 Mar 2023 18:53:17 +0800 Subject: [PATCH 2/3] fixed tokenizer test Signed-off-by: ftgreat --- examples/glm_seq2seq/generate.py | 3 +- examples/glm_seq2seq/train_deepspeed.py | 2 +- .../data/tokenizer/uni_tokenizer/tokenizer.py | 62 ++++++-------- tests/test_tokenizer.py | 81 ++++++++++--------- 4 files changed, 69 insertions(+), 79 deletions(-) diff --git a/examples/glm_seq2seq/generate.py b/examples/glm_seq2seq/generate.py index 85eea9b1..a3579b75 100644 --- a/examples/glm_seq2seq/generate.py +++ b/examples/glm_seq2seq/generate.py @@ -40,8 +40,7 @@ train_dataset = train_dataset[:1] valid_dataset = valid_dataset[:1] model = GLMForSeq2Seq.from_pretrain(model_name=model_name) -model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/310000/pytorch_model.bin")["module"]) -# model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints_lang/135000/pytorch_model.bin")["module"]) +# model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/310000/pytorch_model.bin")["module"]) print("model loaded") trainer = Trainer(env_type='pytorch', diff --git a/examples/glm_seq2seq/train_deepspeed.py b/examples/glm_seq2seq/train_deepspeed.py index d70ecc13..e0898220 100644 --- a/examples/glm_seq2seq/train_deepspeed.py +++ b/examples/glm_seq2seq/train_deepspeed.py @@ -39,7 +39,7 @@ task_name=task_name) model = GLMForSeq2Seq.from_pretrain(model_name=model_name) -model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/140000/pytorch_model.bin")["module"]) +# model.load_state_dict(torch.load("/home/yanzhaodong/anhforth/FlagAI/examples/glm_seq2seq/checkpoints/140000/pytorch_model.bin")["module"]) trainer = Trainer(env_type='deepspeed', 
epochs=10000000, batch_size=16, diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py index 673f56d3..5e010347 100644 --- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py @@ -93,9 +93,6 @@ def __init__(self, self.num_command_tokens = 6 self.num_text_tokens = self.num_tokens - 5 self.num_type_tokens = 2 - self.token_start_id = None - self.token_end_id = None - self.token_pad_id = None try: self._command_tokens = [ CommandToken( @@ -117,12 +114,6 @@ def __init__(self, 'eos', '[PAD]', self.text_tokenizer.convert_token_to_id('[PAD]')), ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '[CLS]') - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '[SEP]') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '[PAD]') self.text_tokenizer._token_cls = "[CLS]" self.text_tokenizer._token_sep = "[SEP]" @@ -148,12 +139,6 @@ def __init__(self, 'eos', '[PAD]', self.text_tokenizer.convert_token_to_id('')), ] - self.token_start_id = self.text_tokenizer.convert_token_to_id( - '') - # self.token_end_id = self.text_tokenizer.convert_token_to_id( - # '') - self.token_pad_id = self.text_tokenizer.convert_token_to_id( - '') self.text_tokenizer._token_cls = "" self.text_tokenizer._token_sep = "" if add_block_symbols: @@ -279,21 +264,16 @@ def __init__(self, [CommandToken('dBLOCK', '[dBLOCK]', self.num_tokens)]) self.num_tokens += 1 self.num_command_tokens += 1 + if self.tokenizer_model_name.lower().startswith('opt'): + self._command_tokens = [] + special_tokens = ['cls', 'pad', 'bos','eos', 'unk', 'mask'] elif self.tokenizer_class == "sp": + fix_command_token = True self.num_command_tokens = 0 self.num_text_tokens = self.text_tokenizer.vocab_size self.num_tokens = self.num_text_tokens - if self.tokenizer_model_name.lower().startswith('glm'): - pad_token_id = self.num_tokens - eos_token_id = self.num_tokens - unk_token_id = self.num_tokens + 4 - else: - pad_token_id = self.text_tokenizer.convert_token_to_id('') - eos_token_id = self.text_tokenizer.convert_token_to_id('') - unk_token_id = self.text_tokenizer.convert_token_to_id('') - self._command_tokens = [ CommandToken('pad', '<|endoftext|>', self.num_text_tokens), CommandToken('eos', '<|endoftext|>', self.num_text_tokens), @@ -371,7 +351,8 @@ def __init__(self, for tk in special_tokens: if tk not in self.command_name_map: res = self.search_special(tk) - self.add_command_token(tk, res,self.tokenizer_class) + if res: + self.add_command_token(tk, res,self.tokenizer_class) self.command_name_map = {tok.name: tok for tok in self._command_tokens} self.command_token_map = { @@ -381,18 +362,27 @@ def __init__(self, self.command_id_map = {tok.Id: tok for tok in self._command_tokens} self._command_token_tokens = list(self.command_token_map.keys()) vocab = self.text_tokenizer.get_vocab() - try: - self.token_start_id = self.TokenToId('') - except KeyError: - self.token_start_id = self.TokenToId('[CLS]') - try: - self.token_end_id = self.TokenToId("") - except KeyError: - self.token_end_id = self.TokenToId("<|endoftext|>") - except KeyError: - self.token_end_id = self.TokenToId("[SEP]") + for potential_start in ['', '[CLS]']: + try: + self.token_start_id = self.TokenToId(potential_start) + break + except KeyError: + pass + + for potential_end in ["", "<|endoftext|>", "[SEP]"]: + try: + self.token_end_id = self.TokenToId(potential_end) + break + except KeyError: + pass + for potential_pad in ['[PAD]', '']: + try: + 
self.token_pad_id = self.TokenToId(potential_pad) + break + except KeyError: + pass print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()])) def get_vocab(self): @@ -662,7 +652,7 @@ def prepare_for_model( self.get_command_id("cls") except KeyError: add_special_tokens = False - + if add_special_tokens: if pair_ids is not None: sequence = [self.get_command_id("cls")] + ids + [ diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index c72d34a7..34d8af39 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -35,17 +35,17 @@ def test_tokenizer_GLM_large_en(self): ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)}) - # def test_tokenizer_glm_10b_en(self): - # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # [25520, 9015, 1838, 502, 3772], '') - # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # 'fried chicken makes me happy', 'DecodeIds Error') - # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + # # # def test_tokenizer_glm_10b_en(self): + # # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') + # # # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # # # [25520, 9015, 1838, 502, 3772], '') + # # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), + # # # 'fried chicken makes me happy', 'DecodeIds Error') + # # # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # # # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # # # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # # # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) def test_tokenizer_t5(self): @@ -61,8 +61,9 @@ def test_tokenizer_t5(self): self.assertEqual(encode_plus_result['input_ids'], [101, 306, 1231, 798, 5447, 798, 266, 4017, 1738, 1166, 102], 'encode_plus Error') self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), - {('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - ('sep', '[SEP]', 102), ('pad', '[PAD]', 0)}, 'SpecialTokens error') + {('pad', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('eos', '[PAD]', 0), ('sop', '<|startofpiece|>', 50000), ('eop', '<|endofpiece|>', 50001), + ('gMASK', '[gMASK]', 50002), ('sMASK', '[sMASK]', 50003)}, 'SpecialTokens error') def test_tokenizer_roberta(self): @@ -77,8 +78,9 @@ def test_tokenizer_roberta(self): self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], [101, 791, 1921, 1391, 7649, 1391, 749, 5507, 2548, 1825, 102], 'encode_plus Error') self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), - {('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), ('mask', 
'[MASK]', 103), - ('eos', '[PAD]', 0), ('pad', '[PAD]', 0)}, 'SpecialTokens error') + {('pad', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + ('sep', '[SEP]', 102), ('eos', '[PAD]', 0), ('sop', '<|startofpiece|>', 21128), + ('eop', '<|endofpiece|>', 21129), ('gMASK', '[gMASK]', 21130), ('sMASK', '[sMASK]', 21131)}, 'SpecialTokens error') def test_tokenizer_bert(self): tokenizer = Tokenizer.from_pretrained('BERT-base-en') @@ -93,39 +95,38 @@ def test_tokenizer_bert(self): [101, 13017, 7975, 3084, 2033, 3407, 102], 'encode_plus Error') self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), {('eos', '[PAD]', 0), ('unk', '[UNK]', 100), ('cls', '[CLS]', 101), ('sep', '[SEP]', 102), - ('mask', '[MASK]', 103), ('pad', '[PAD]', 0)}, 'SpecialTokens error') - - # def test_tokenizer_cpm1(self): - # loader = AutoLoader(task_name="lm", - # model_name="CPM-large-ch", - # model_dir="./checkpoints/", - # only_download_config=True) + ('mask', '[MASK]', 103), ('pad', '[PAD]', 0),('sop', '<|startofpiece|>', 30522), + ('eop', '<|endofpiece|>', 30523), ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)}, 'SpecialTokens error') + + # # def test_tokenizer_cpm1(self): + # # loader = AutoLoader(task_name="lm", + # # model_name="CPM-large-ch", + # # model_dir="./checkpoints/", + # # only_download_config=True) - # tokenizer = loader.get_tokenizer() - # self.assertEqual(tokenizer.TokenToId("人"), 62, '') - # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), - # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') - # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), - # '今天吃饭吃了肯德基', 'DecodeIds Error') - # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') - # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') - # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), - # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), - # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') + # # tokenizer = loader.get_tokenizer() + # # self.assertEqual(tokenizer.TokenToId("人"), 62, '') + # # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + # # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + # # '今天吃饭吃了肯德基', 'DecodeIds Error') + # # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + # # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + # # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + # # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + # # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') def test_tokenizer_opt(self): tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') self.assertEqual(tokenizer.encode("day"), [1208], '') - self.assertEqual(tokenizer.encode_plus("fried chicken makes me happy")["input_ids"], - [0, 21209, 5884, 817, 162, 1372, 2], '') self.assertEqual(tokenizer.decode([21209, 5884, 817, 162, 1372]), 'fried chicken makes me happy', 'DecodeIds Error') self.assertEqual(tokenizer.tokenize('fried chicken makes me happy'), ['fried', 'Ġchicken', 
'Ġmakes', 'Ġme', 'Ġhappy'], 'tokenize Error') self.assertEqual(tokenizer.encode_plus('fried chicken makes me happy')['input_ids'], - [0, 21209, 5884, 817, 162, 1372, 2], 'encode_plus Error') + [2, 21209, 5884, 817, 162, 1372], 'encode_plus Error') self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), {('cls', '', 0), ('pad', '', 1), ('bos', '', 2), ('eos', '', 2), ('unk', '', 3), ('mask', '', 50264)}, 'SpecialTokens error') @@ -148,11 +149,11 @@ def suite(): suite = unittest.TestSuite() suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) - # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + # # # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) suite.addTest(TokenizerTestCase('test_tokenizer_t5')) suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # # # # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) suite.addTest(TokenizerTestCase('test_tokenizer_opt')) suite.addTest(TokenizerTestCase('test_tokenizer_clip')) suite.addTest(TokenizerTestCase('test_tokenizer_evaclip')) From 892bf0bd37930c7003057e92d67df9049d75b151 Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 22 Mar 2023 18:55:00 +0800 Subject: [PATCH 3/3] comment change Signed-off-by: ftgreat --- tests/test_tokenizer.py | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 34d8af39..f401be04 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -35,17 +35,17 @@ def test_tokenizer_GLM_large_en(self): ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)}) - # # # def test_tokenizer_glm_10b_en(self): - # # # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") - # # # self.assertEqual(tokenizer.TokenToId("day"), 820, '') - # # # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), - # # # [25520, 9015, 1838, 502, 3772], '') - # # # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), - # # # 'fried chicken makes me happy', 'DecodeIds Error') - # # # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], - # # # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), - # # # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), - # # # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) + # def test_tokenizer_glm_10b_en(self): + # tokenizer = Tokenizer.from_pretrained("GLM-10b-en") + # self.assertEqual(tokenizer.TokenToId("day"), 820, '') + # self.assertEqual(tokenizer.EncodeAsIds("fried chicken makes me happy"), + # [25520, 9015, 1838, 502, 3772], '') + # self.assertEqual(tokenizer.DecodeIds([25520, 9015, 1838, 502, 3772]), + # 'fried chicken makes me happy', 'DecodeIds Error') + # self.assertEqual([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()], + # [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('mask', '[MASK]', 103), ('unk', '[UNK]', 100), + # ('sep', '[SEP]', 102), ('pad', '[PAD]', 0), ('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), + # ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)]) def 
test_tokenizer_t5(self): @@ -98,25 +98,25 @@ def test_tokenizer_bert(self): ('mask', '[MASK]', 103), ('pad', '[PAD]', 0),('sop', '<|startofpiece|>', 30522), ('eop', '<|endofpiece|>', 30523), ('gMASK', '[gMASK]', 30524), ('sMASK', '[sMASK]', 30525)}, 'SpecialTokens error') - # # def test_tokenizer_cpm1(self): - # # loader = AutoLoader(task_name="lm", - # # model_name="CPM-large-ch", - # # model_dir="./checkpoints/", - # # only_download_config=True) + # def test_tokenizer_cpm1(self): + # loader = AutoLoader(task_name="lm", + # model_name="CPM-large-ch", + # model_dir="./checkpoints/", + # only_download_config=True) - # # tokenizer = loader.get_tokenizer() - # # self.assertEqual(tokenizer.TokenToId("人"), 62, '') - # # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), - # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') - # # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), - # # '今天吃饭吃了肯德基', 'DecodeIds Error') - # # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), - # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') - # # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], - # # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') - # # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), - # # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), - # # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') + # tokenizer = loader.get_tokenizer() + # self.assertEqual(tokenizer.TokenToId("人"), 62, '') + # self.assertEqual(tokenizer.encode("今天吃饭吃了肯德基"), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], '') + # self.assertEqual(tokenizer.DecodeIds([837, 3079, 1777, 3079, 139, 3687, 513, 1463]), + # '今天吃饭吃了肯德基', 'DecodeIds Error') + # self.assertEqual(tokenizer.tokenize('今天吃饭吃了肯德基'), + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'tokenize Error') + # self.assertEqual(tokenizer.encode_plus('今天吃饭吃了肯德基')['input_ids'], + # [837, 3079, 1777, 3079, 139, 3687, 513, 1463], 'encode_plus Error') + # self.assertEqual(set([(k, v.token, v.Id) for k,v in tokenizer.command_name_map.items()]), + # {('unk', '', 0), ('cls', '', 1), ('eos', '', 2), ('sep', '', 4), + # ('mask', '', 6), ('pad', '', 5),('eod', '', 7)}, 'SpecialTokens error') def test_tokenizer_opt(self): tokenizer = Tokenizer.from_pretrained('opt-1.3b-en') @@ -149,11 +149,11 @@ def suite(): suite = unittest.TestSuite() suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch')) suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en')) - # # # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) + # suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en')) suite.addTest(TokenizerTestCase('test_tokenizer_t5')) suite.addTest(TokenizerTestCase('test_tokenizer_roberta')) suite.addTest(TokenizerTestCase('test_tokenizer_bert')) - # # # # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) + # suite.addTest(TokenizerTestCase('test_tokenizer_cpm1')) suite.addTest(TokenizerTestCase('test_tokenizer_opt')) suite.addTest(TokenizerTestCase('test_tokenizer_clip')) suite.addTest(TokenizerTestCase('test_tokenizer_evaclip'))
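
The tokenizer change in PATCH 2/3 replaces the chained try/except lookups for token_start_id, token_end_id and token_pad_id with loops over lists of candidate spellings: the first spelling the vocabulary actually contains wins, and a missing spelling is skipped quietly instead of raising. The snippet below is a minimal standalone sketch of that pattern only; TinyTokenizer, its toy vocabulary and the candidate spellings are illustrative assumptions, not FlagAI's real Tokenizer class or its exact candidate lists.

# Standalone sketch of the candidate-loop fallback pattern from
# flagai/data/tokenizer/uni_tokenizer/tokenizer.py (PATCH 2/3).
# TinyTokenizer is a hypothetical stand-in; only the loop/KeyError
# structure mirrors the patch.

from typing import Dict, Optional


class TinyTokenizer:
    """Minimal vocab wrapper used to demonstrate the fallback lookups."""

    def __init__(self, vocab: Dict[str, int]):
        self._vocab = vocab

    def TokenToId(self, token: str) -> int:
        # Raises KeyError when the token is absent, which is the code
        # path the patch wraps in try/except.
        return self._vocab[token]

    def resolve_special_ids(self) -> Dict[str, Optional[int]]:
        resolved: Dict[str, Optional[int]] = {"start": None, "end": None, "pad": None}
        # Candidate spellings are assumptions chosen for illustration;
        # the idea matches the patch: try several conventions in order.
        candidates = {
            "start": ["[CLS]", "<|startoftext|>"],
            "end": ["<|endoftext|>", "[SEP]"],
            "pad": ["[PAD]", "<|endoftext|>"],
        }
        for name, tokens in candidates.items():
            for token in tokens:
                try:
                    resolved[name] = self.TokenToId(token)
                    break                # first spelling found wins
                except KeyError:
                    continue             # spelling missing, try the next one
        return resolved


if __name__ == "__main__":
    # Toy BERT-style vocab: only bracketed special tokens exist, so the
    # GPT-style candidates are skipped without raising.
    vocab = {"[PAD]": 0, "[CLS]": 101, "[SEP]": 102, "hello": 7592}
    print(TinyTokenizer(vocab).resolve_special_ids())
    # -> {'start': 101, 'end': 102, 'pad': 0}

Run against the toy vocabulary, the sketch prints {'start': 101, 'end': 102, 'pad': 0}: the GPT-style spellings raise KeyError and are skipped, and the bracketed tokens are kept, which mirrors how the patched lookup lets a single code path serve several vocabulary conventions.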