diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index fe5b16ae..f67e8219 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -4,8 +4,9 @@ import importlib import os import copy -from flagai.model.file_utils import _get_model_id - +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM +import torch class LazyImport(object): @@ -16,7 +17,7 @@ def __init__(self, name): def __getattr__(self, name): mod = self.cache.get(self.mod_name) if not mod: - mod = importlib.import_module(self.mod_name) + mod = importlib.import_module(self.mod_name) self.cache[self.mod_name] = mod return getattr(mod, name) @@ -163,7 +164,12 @@ def __init__(self, model_name: str = "RoBERTa-base-ch", model_dir: str = "./checkpoints/", only_download_config: bool = False, - device="cpu", + device="cuda", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + lora_dir=None, + qlora_dir=None, + quantization_config=None, **kwargs): """ Args: @@ -194,66 +200,149 @@ def __init__(self, raw_model_name = copy.deepcopy(model_name) model_name = model_name.lower() - if model_name not in MODEL_DICT: + if model_name not in MODEL_DICT and task_name != "aquila2": print(f"The model_name: {model_name} is not be supported") print(f"All supported models are {list(MODEL_DICT.keys())}") return + if task_name == "aquila2": + download_path = os.path.join(model_dir, model_name) + + if not os.path.exists(download_path): + # Try to download from ModelHub + try: + model_id = _get_model_id(model_name) + except: + raise FileNotFoundError("Model name not found in local path and ModelHub") + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) - brief_model_name = MODEL_DICT[model_name][2] - model_type = MODEL_DICT[model_name][3] - # The dir to save config, vocab and model. + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 - self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) - if self.model_name is None: - print(f"For the model_name: {model_name}, task_name: {task_name} \ - is not be supported.") - tasks = self.get_task_name(brief_model_name) - print( - f"For the model_name: {model_name}, these tasks are be supported: {tasks}" - ) - return - download_path = os.path.join(model_dir, raw_model_name) - print("*" * 20, task_name, model_name) - model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) - self.model = getattr(LazyImport(self.model_name[0]), - self.model_name[1]).from_pretrain( - download_path=model_dir, - model_name=model_name_, - only_download_config=only_download_config, - device=device, - **kwargs) + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). 
+ for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) - if model_type == "nlp": - if brief_model_name in ["galactica",]: - self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5])(download_path) - else : - tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), - "Tokenizer") - self.tokenizer = tokenizer_class.from_pretrained( - model_name, cache_dir=download_path) + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + if qlora_dir: + from transformers import BitsAndBytesConfig + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch_dtype, + ) - elif model_type == "mm": - if model_name.startswith("altdiffusion"): - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) - self.tokenizer = self.process.tokenizer - self.model.tokenizer = self.tokenizer - elif "altclip" not in model_name: - from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer - self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) - self.transform = None - else: - - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained( - os.path.join(model_dir, raw_model_name)) - self.transform = self.process.feature_extractor - self.tokenizer = self.process.tokenizer + model = AquilaForCausalLM.from_pretrained(download_path, + low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, + quantization_config=quantization_config) + + model.eval() + # from accelerate import load_checkpoint_and_dispatch + # model = load_checkpoint_and_dispatch( + # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) + if not qlora_dir: + model.to(device) + if lora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, lora_dir) + print("lora modules loaded") + if qlora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, qlora_dir) + print("Qlora modules loaded") + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) + self.model = model + self.tokenizer = tokenizer else: - self.tokenizer = None - self.transform = None + brief_model_name = MODEL_DICT[model_name][2] + model_type = MODEL_DICT[model_name][3] + # The dir to save config, vocab and model. 
+ + self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) + if self.model_name is None: + print(f"For the model_name: {model_name}, task_name: {task_name} \ + is not be supported.") + tasks = self.get_task_name(brief_model_name) + print( + f"For the model_name: {model_name}, these tasks are be supported: {tasks}" + ) + return + download_path = os.path.join(model_dir, raw_model_name) + print("*" * 20, task_name, model_name) + model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) + self.model = getattr(LazyImport(self.model_name[0]), + self.model_name[1]).from_pretrain( + download_path=model_dir, + model_name=model_name_, + only_download_config=only_download_config, + device=device, + **kwargs) + + if model_type == "nlp": + if brief_model_name in ["galactica",]: + self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5])(download_path) + # elif 'Aquila2-7b' in model_name: + + else : + tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), + "Tokenizer") + self.tokenizer = tokenizer_class.from_pretrained( + model_name, cache_dir=download_path) + + elif model_type == "mm": + if model_name.startswith("altdiffusion"): + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) + self.tokenizer = self.process.tokenizer + self.model.tokenizer = self.tokenizer + elif "altclip" not in model_name: + from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer + self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) + self.transform = None + else: + + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained( + os.path.join(model_dir, raw_model_name)) + self.transform = self.process.feature_extractor + self.tokenizer = self.process.tokenizer + + else: + self.tokenizer = None + self.transform = None def is_exist_finetuned_model(self, raw_model_name, task_name): try: diff --git a/flagai/model/aquila2/configuration_aquila.py b/flagai/model/aquila2/configuration_aquila.py new file mode 100644 index 00000000..f364ebd6 --- /dev/null +++ b/flagai/model/aquila2/configuration_aquila.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Aquila model configuration""" + +from transformers import PretrainedConfig + + + +class AquilaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AquilaModel`]. 
It is used to instantiate an Aquila + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Aquila-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AquilaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import AquilaModel, AquilaConfig + + >>> # Initializing a Aquila aquila-7b style configuration + >>> configuration = AquilaConfig() + + >>> # Initializing a model from the aquila-7b style configuration + >>> model = AquilaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "aquila" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=100008, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/flagai/model/aquila_modeling_hf.py b/flagai/model/aquila2/modeling_aquila.py old mode 100644 new mode 100755 similarity index 72% rename from flagai/model/aquila_modeling_hf.py rename to flagai/model/aquila2/modeling_aquila.py index 254669d2..b1ae0cac --- a/flagai/model/aquila_modeling_hf.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -31,6 +31,17 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_aquila import AquilaConfig +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from .utils import * logger = logging.get_logger(__name__) @@ -93,34 +104,83 @@ def forward(self, hidden_states): class AquilaRotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.register_buffer("inv_freq", inv_freq) + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. 
- self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + return ( self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), ) +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila +class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila +class AquilaDynamicNTKScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + def rotate_half(x): """Rotates half the hidden dims of the input.""" @@ -142,33 +202,64 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila class AquilaMLP(nn.Module): - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): + def __init__(self, config): super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila class AquilaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AquilaConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -176,10 +267,37 @@ def __init__(self, config: AquilaConfig): f" and `num_heads`: {self.num_heads})." ) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.rotary_emb = AquilaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = AquilaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = AquilaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = AquilaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -195,16 +313,37 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = 
self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] if past_key_value is not None: # reuse k, v, self_attention @@ -213,8 +352,11 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) 
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -228,9 +370,6 @@ def forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) - ) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -242,10 +381,15 @@ def forward( f" {attn_output.size()}" ) - attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -259,11 +403,7 @@ def __init__(self, config: AquilaConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = AquilaAttention(config=config) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) + self.mlp = AquilaMLP(config) self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -321,7 +461,6 @@ def forward( return outputs - AQUILA_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -350,7 +489,6 @@ class AquilaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["AquilaDecoderLayer"] _skip_keys_device_placement = "past_key_values" - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -570,7 +708,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value - return module(*inputs, output_attentions, None) + return module(*inputs, past_key_value, output_attentions) return custom_forward @@ -579,7 +717,6 @@ def custom_forward(*inputs): hidden_states, attention_mask, position_ids, - None, ) else: layer_outputs = decoder_layer( @@ -615,13 +752,14 @@ def custom_forward(*inputs): attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila class AquilaForCausalLM(AquilaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + def __init__(self, config): super().__init__(config) self.model = AquilaModel(config) - + self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing @@ -706,7 +844,13 @@ def forward( ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states) + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() loss = None if labels is not None: @@ -767,9 +911,117 @@ def prepare_inputs_for_generation( def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) return reordered_past + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + 
logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out @add_start_docstrings( """ @@ -852,7 +1104,9 @@ def forward( sequence_lengths = -1 else: if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) else: sequence_lengths = -1 diff --git a/flagai/model/aquila2/utils.py b/flagai/model/aquila2/utils.py new file mode 100755 index 00000000..8e3de5f9 --- /dev/null +++ b/flagai/model/aquila2/utils.py @@ -0,0 +1,38 @@ +import random +import numpy as np +import torch +from fastchat.conversation import get_conv_template + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file
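
A minimal usage sketch of the `aquila2` path added to `AutoLoader` above (a hedged example, not part of the patch): the model name `AquilaChat2-7B` and the prompt are illustrative assumptions; the loader lowercases `model_name` and resolves the checkpoint under `model_dir`, stores the loaded objects on `self.model` / `self.tokenizer` as shown in the diff, and generation goes through the new `predict()` helper on `AquilaForCausalLM`.

# Sketch only: assumes an Aquila2 chat checkpoint is available locally under
# model_dir + model_name.lower(), or downloadable from ModelHub by the loader.
import torch
from flagai.auto_model.auto_loader import AutoLoader

loader = AutoLoader(
    task_name="aquila2",
    model_name="AquilaChat2-7B",   # example name; resolved as ./checkpoints/aquilachat2-7b
    model_dir="./checkpoints/",
    torch_dtype=torch.float16,
    device="cuda",
    # lora_dir="..." or qlora_dir="..." would additionally wrap the model with PeftModel,
    # and qlora_dir triggers the 4-bit BitsAndBytesConfig path shown in the diff.
)

model = loader.model          # set directly by the aquila2 branch of AutoLoader.__init__
tokenizer = loader.tokenizer  # transformers AutoTokenizer loaded from the checkpoint dir

out = model.predict(
    "What is FlagAI?",
    tokenizer=tokenizer,
    sft=True,                      # wrap the prompt with the conversation template
    convo_template="aquila-chat",  # default template used by predict()
    max_gen_len=200,
    top_p=0.95,
    temperature=0.9,
)
print(out)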