From 0ede7b67f5d0767ea8fb1333fb162a5542e9916f Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Tue, 12 Sep 2023 09:49:32 +0000 Subject: [PATCH 01/11] add hf aquila Signed-off-by: shunxing1234 --- .../Aquila/Aquila-chat-hf/modeling_aquila.py | 1030 +++++++++++++++++ 1 file changed, 1030 insertions(+) create mode 100644 examples/Aquila/Aquila-chat-hf/modeling_aquila.py diff --git a/examples/Aquila/Aquila-chat-hf/modeling_aquila.py b/examples/Aquila/Aquila-chat-hf/modeling_aquila.py new file mode 100644 index 00000000..32107375 --- /dev/null +++ b/examples/Aquila/Aquila-chat-hf/modeling_aquila.py @@ -0,0 +1,1030 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Aquila model.""" +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_aquila import AquilaConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "AquilaConfig" + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Aquila +class AquilaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + AquilaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + return (self.weight * hidden_states).to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Aquila +class AquilaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila +class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev"""

+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        t = t / self.scaling_factor
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila
+class AquilaDynamicNTKScalingRotaryEmbedding(AquilaRotaryEmbedding):
+    """AquilaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
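+    # After the squeezes, cos/sin have shape [seq_len, dim]; indexing them with position_ids and
+    # unsqueezing dim 1 yields [bs, 1, seq_len, dim], which broadcasts against q/k of shape
+    # [bs, num_heads, seq_len, head_dim] in the element-wise products below.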
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila +class AquilaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila +class AquilaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + def __init__(self, config: AquilaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = AquilaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = AquilaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = AquilaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], 
dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Aquila +class AquilaDecoderLayer(nn.Module): + def __init__(self, config: AquilaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = AquilaAttention(config=config) + self.mlp = AquilaMLP(config) + self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + +AQUILA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AquilaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aquila Model outputting raw hidden-states without any specific head on top.", + AQUILA_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Aquila +class AquilaPreTrainedModel(PreTrainedModel): + config_class = AquilaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["AquilaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, AquilaModel): + module.gradient_checkpointing = value + + +AQUILA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Aquila Model outputting raw hidden-states without any specific head on top.", + AQUILA_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaModel with LLAMA->AQUILA,Llama->Aquila +class AquilaModel(AquilaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`AquilaDecoderLayer`] + + Args: + config: AquilaConfig + """ + + def __init__(self, config: AquilaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([AquilaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = 
position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + +# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila +class AquilaForCausalLM(AquilaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = AquilaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: 
Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AquilaForCausalLM + + >>> model = AquilaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = 
kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`AquilaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + AQUILA_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->AQUILA,Llama->Aquila +class AquilaForSequenceClassification(AquilaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = AquilaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From de6410538b81d93ee4abdeff451df35f5d3095a9 Mon Sep 17 00:00:00 2001 From: shunxing1234 <33774367+shunxing1234@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:15:55 +0800 Subject: [PATCH 02/11] Update modeling_aquila.py --- examples/Aquila/Aquila-chat-hf/modeling_aquila.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Aquila/Aquila-chat-hf/modeling_aquila.py b/examples/Aquila/Aquila-chat-hf/modeling_aquila.py index 32107375..e6c9a00c 100644 --- a/examples/Aquila/Aquila-chat-hf/modeling_aquila.py +++ b/examples/Aquila/Aquila-chat-hf/modeling_aquila.py @@ -346,7 +346,7 @@ def forward( value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - + attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) 
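+        # Clamping the pre-softmax attention scores bounds them to [-1024, 1024], which guards
+        # against overflow to inf/NaN when running in fp16/bf16; the specific bound appears to be
+        # an empirical choice.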
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" From 4a9bd36cc881fabeeffedff83e482703202af46e Mon Sep 17 00:00:00 2001 From: shunxing1234 <33774367+shunxing1234@users.noreply.github.com> Date: Wed, 20 Sep 2023 15:15:56 +0800 Subject: [PATCH 03/11] Rename modeling_aquila.py to aquila2_model.py --- .../Aquila-chat-hf/{modeling_aquila.py => aquila2_model.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/Aquila/Aquila-chat-hf/{modeling_aquila.py => aquila2_model.py} (100%) diff --git a/examples/Aquila/Aquila-chat-hf/modeling_aquila.py b/examples/Aquila/Aquila-chat-hf/aquila2_model.py similarity index 100% rename from examples/Aquila/Aquila-chat-hf/modeling_aquila.py rename to examples/Aquila/Aquila-chat-hf/aquila2_model.py From a3d229a5e25bd3199b5d13ae4b123fda3a187c54 Mon Sep 17 00:00:00 2001 From: shunxing1234 <33774367+shunxing1234@users.noreply.github.com> Date: Wed, 20 Sep 2023 15:30:59 +0800 Subject: [PATCH 04/11] Create configuration_aquila.py --- .../Aquila-chat-hf/configuration_aquila.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 examples/Aquila/Aquila-chat-hf/configuration_aquila.py diff --git a/examples/Aquila/Aquila-chat-hf/configuration_aquila.py b/examples/Aquila/Aquila-chat-hf/configuration_aquila.py new file mode 100644 index 00000000..8ba531b0 --- /dev/null +++ b/examples/Aquila/Aquila-chat-hf/configuration_aquila.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Aquila model configuration""" + +from transformers import PretrainedConfig + + + +class AquilaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Aquila-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AquilaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. 
+ num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import AquilaModel, AquilaConfig + + >>> # Initializing a Aquila aquila-7b style configuration + >>> configuration = AquilaConfig() + + >>> # Initializing a model from the aquila-7b style configuration + >>> model = AquilaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "aquila" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=100008, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) From 42ba8500efc5727b8f3ec707d7c3600f37369179 Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Thu, 21 Sep 2023 17:50:13 +0800 Subject: [PATCH 05/11] add aquila2 Signed-off-by: shunxing1234 --- ...aquila_modeling_hf.py => aquila2_model.py} | 244 ++++++++++++++---- flagai/model/configuration_aquila.py | 128 +++++++++ 2 files changed, 318 insertions(+), 54 deletions(-) rename flagai/model/{aquila_modeling_hf.py => aquila2_model.py} (78%) create mode 100644 flagai/model/configuration_aquila.py diff --git a/flagai/model/aquila_modeling_hf.py b/flagai/model/aquila2_model.py similarity index 78% rename from flagai/model/aquila_modeling_hf.py rename to flagai/model/aquila2_model.py index 254669d2..17c5c58a 100644 --- a/flagai/model/aquila_modeling_hf.py +++ b/flagai/model/aquila2_model.py @@ -93,34 +93,83 @@ def forward(self, hidden_states): class AquilaRotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() 
- inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.register_buffer("inv_freq", inv_freq) + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + return ( self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), ) +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila +class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila +class AquilaDynamicNTKScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + def rotate_half(x): """Rotates half the hidden dims of the input.""" @@ -142,33 +191,64 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila class AquilaMLP(nn.Module): - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): + def __init__(self, config): super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = 
self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila class AquilaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AquilaConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -176,10 +256,37 @@ def __init__(self, config: AquilaConfig): f" and `num_heads`: {self.num_heads})." 
) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.rotary_emb = AquilaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = AquilaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = AquilaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = AquilaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -195,16 +302,37 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, sin = 
self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] if past_key_value is not None: # reuse k, v, self_attention @@ -213,8 +341,11 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -228,9 +359,6 @@ def forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) - ) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -242,10 +370,15 @@ def forward( f" {attn_output.size()}" ) - attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -259,11 +392,7 @@ def __init__(self, config: AquilaConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = AquilaAttention(config=config) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) + self.mlp = AquilaMLP(config) self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -321,7 +450,6 @@ def forward( return outputs - AQUILA_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
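# Illustrative sketch (not part of the patch above): two numerical details in the attention
# forward — scores are clamped to [-1024, 1024] before the additive mask is applied, and
# softmax is computed in float32 and then cast back to the activation dtype. A toy
# half-precision example with made-up logits:
import torch

scores = torch.tensor([[2000.0, -3.0, 0.5]], dtype=torch.float16)
scores = torch.clamp(scores, min=-1024.0, max=1024.0)   # bound extreme logits
probs = torch.nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(scores.dtype)
assert probs.dtype == torch.float16
assert torch.isfinite(probs).all()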
Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -350,7 +478,6 @@ class AquilaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["AquilaDecoderLayer"] _skip_keys_device_placement = "past_key_values" - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -570,7 +697,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value - return module(*inputs, output_attentions, None) + return module(*inputs, past_key_value, output_attentions) return custom_forward @@ -579,7 +706,6 @@ def custom_forward(*inputs): hidden_states, attention_mask, position_ids, - None, ) else: layer_outputs = decoder_layer( @@ -615,13 +741,14 @@ def custom_forward(*inputs): attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila class AquilaForCausalLM(AquilaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + def __init__(self, config): super().__init__(config) self.model = AquilaModel(config) - + self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing @@ -706,7 +833,13 @@ def forward( ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states) + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() loss = None if labels is not None: @@ -767,10 +900,11 @@ def prepare_inputs_for_generation( def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) return reordered_past - @add_start_docstrings( """ The LLaMa Model transformer with a sequence classification head on top (linear layer). @@ -852,7 +986,9 @@ def forward( sequence_lengths = -1 else: if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) else: sequence_lengths = -1 diff --git a/flagai/model/configuration_aquila.py b/flagai/model/configuration_aquila.py new file mode 100644 index 00000000..f364ebd6 --- /dev/null +++ b/flagai/model/configuration_aquila.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
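# Illustrative sketch (not part of the patch above): the pretraining_tp > 1 branches added
# in this patch split a projection's weight into row blocks, run F.linear per block and
# concatenate, which reproduces the undivided projection (note they call F.linear, so the
# module must have torch.nn.functional imported as F). A toy equivalence check with
# made-up sizes and tp = 2:
import torch
import torch.nn.functional as F

tp = 2
proj = torch.nn.Linear(16, 8, bias=False)
x = torch.randn(3, 5, 16)

full = proj(x)
weight_slices = proj.weight.split(8 // tp, dim=0)        # two (4, 16) row blocks
pieces = [F.linear(x, weight_slices[i]) for i in range(tp)]
assert torch.allclose(torch.cat(pieces, dim=-1), full, atol=1e-6)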
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Aquila model configuration""" + +from transformers import PretrainedConfig + + + +class AquilaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Aquila-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AquilaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import AquilaModel, AquilaConfig + + >>> # Initializing a Aquila aquila-7b style configuration + >>> configuration = AquilaConfig() + + >>> # Initializing a model from the aquila-7b style configuration + >>> model = AquilaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "aquila" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=100008, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) From cda28f7e9be85899f2d02b2a2a02e349ff05e05c Mon Sep 17 00:00:00 2001 From: shunxing1234 Date: Fri, 22 Sep 2023 09:06:00 +0800 Subject: [PATCH 06/11] mv aquila2 to model file Signed-off-by: shunxing1234 --- .../Aquila/Aquila-chat-hf/aquila2_model.py | 1030 ----------------- .../Aquila-chat-hf/configuration_aquila.py | 113 -- .../{ => aquila2}/configuration_aquila.py | 0 .../modeling_aquila.py} | 0 4 files changed, 1143 deletions(-) delete mode 100644 examples/Aquila/Aquila-chat-hf/aquila2_model.py delete mode 100644 examples/Aquila/Aquila-chat-hf/configuration_aquila.py rename flagai/model/{ => aquila2}/configuration_aquila.py (100%) rename flagai/model/{aquila2_model.py => aquila2/modeling_aquila.py} (100%) diff --git a/examples/Aquila/Aquila-chat-hf/aquila2_model.py b/examples/Aquila/Aquila-chat-hf/aquila2_model.py deleted file mode 100644 index e6c9a00c..00000000 --- a/examples/Aquila/Aquila-chat-hf/aquila2_model.py +++ /dev/null @@ -1,1030 +0,0 @@ -# coding=utf-8 -# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
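# Illustrative sketch (not part of the patch above): the configuration gains
# num_key_value_heads (defaulting to num_attention_heads for backward compatibility),
# pretraining_tp, rope_theta and rope_scaling. A hedged usage example — the import path
# is an assumption, adjust it to wherever configuration_aquila.py sits in your checkout,
# and the sizes are deliberately tiny:
from configuration_aquila import AquilaConfig   # hypothetical local import path

tiny_cfg = AquilaConfig(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=688,
    num_hidden_layers=2,
    num_attention_heads=8,
    num_key_value_heads=2,                            # grouped-query attention: 4 query heads per KV head
    rope_theta=10000.0,
    rope_scaling={"type": "dynamic", "factor": 2.0},  # or {"type": "linear", "factor": 2.0}
)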
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Aquila model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_aquila import AquilaConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "AquilaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Aquila -class AquilaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - AquilaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - return (self.weight * hidden_states).to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Aquila -class AquilaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila -class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila -class AquilaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila -class AquilaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila -class AquilaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AquilaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = AquilaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = AquilaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = AquilaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], 
dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Aquila -class AquilaDecoderLayer(nn.Module): - def __init__(self, config: AquilaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = AquilaAttention(config=config) - self.mlp = AquilaMLP(config) - self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - -AQUILA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`AquilaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Aquila Model outputting raw hidden-states without any specific head on top.", - AQUILA_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->Aquila -class AquilaPreTrainedModel(PreTrainedModel): - config_class = AquilaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["AquilaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, AquilaModel): - module.gradient_checkpointing = value - - -AQUILA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Aquila Model outputting raw hidden-states without any specific head on top.", - AQUILA_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with LLAMA->AQUILA,Llama->Aquila -class AquilaModel(AquilaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`AquilaDecoderLayer`] - - Args: - config: AquilaConfig - """ - - def __init__(self, config: AquilaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([AquilaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = 
position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - -# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila -class AquilaForCausalLM(AquilaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = AquilaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: 
Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, AquilaForCausalLM - - >>> model = AquilaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = 
kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`AquilaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - AQUILA_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->AQUILA,Llama->Aquila -class AquilaForSequenceClassification(AquilaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = AquilaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(AQUILA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
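# Illustrative sketch (not part of the patch above): during generation,
# prepare_inputs_for_generation rebuilds position_ids from the attention mask — a
# cumulative sum minus one, with padded slots pinned to 1 — so a left-padded batch still
# gets contiguous positions for its real tokens. Toy left-padded mask:
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
assert position_ids.tolist() == [[1, 1, 0, 1, 2]]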
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( - logits.device - ) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/Aquila/Aquila-chat-hf/configuration_aquila.py b/examples/Aquila/Aquila-chat-hf/configuration_aquila.py deleted file mode 100644 index 8ba531b0..00000000 --- a/examples/Aquila/Aquila-chat-hf/configuration_aquila.py +++ /dev/null @@ -1,113 +0,0 @@ -# coding=utf-8 -# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" Aquila model configuration""" - -from transformers import PretrainedConfig - - - -class AquilaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the Aquila-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`AquilaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. 
- tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - Example: - - ```python - >>> from transformers import AquilaModel, AquilaConfig - - >>> # Initializing a Aquila aquila-7b style configuration - >>> configuration = AquilaConfig() - - >>> # Initializing a model from the aquila-7b style configuration - >>> model = AquilaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "aquila" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=100008, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/flagai/model/configuration_aquila.py b/flagai/model/aquila2/configuration_aquila.py similarity index 100% rename from flagai/model/configuration_aquila.py rename to flagai/model/aquila2/configuration_aquila.py diff --git a/flagai/model/aquila2_model.py b/flagai/model/aquila2/modeling_aquila.py similarity index 100% rename from flagai/model/aquila2_model.py rename to flagai/model/aquila2/modeling_aquila.py From 0ed67d43ae140748350b876481ad6a8c19ce1a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Mon, 25 Sep 2023 09:58:43 +0000 Subject: [PATCH 07/11] flagai fit for aquila2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- examples/Aquila2/generate_chat.py | 29 ++++ examples/Aquila2/utils.py | 26 ++++ flagai/auto_model/auto_loader.py | 154 +++++++++++++------- flagai/model/aquila2_model.py | 224 ++++++++++++++++++++++++++++++ 4 files changed, 379 insertions(+), 54 deletions(-) create mode 100755 examples/Aquila2/generate_chat.py create mode 100755 examples/Aquila2/utils.py create mode 100755 flagai/model/aquila2_model.py diff --git a/examples/Aquila2/generate_chat.py b/examples/Aquila2/generate_chat.py new file mode 100755 index 00000000..0cff2e8a --- /dev/null +++ b/examples/Aquila2/generate_chat.py @@ -0,0 +1,29 @@ +from flagai.auto_model.auto_loader import AutoLoader + +state_dict = "./checkpoints/" +model_name = 'Aquila2Chat-hf' + +state_dict = "/data2/20230907/" +model_name = 'iter_0205000_hf' + +autoloader = AutoLoader("aquila2", + model_dir=state_dict, + model_name=model_name, + qlora_dir="/data2/yzd/FastChat/checkpoints_out/30bhf_save/checkpoint-4200",) + # qlora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/qlora/aquila2chat-hf') + # lora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/lora/aquila2chat-hf') + # ) + +model = autoloader.get_model() +tokenizer = autoloader.get_tokenizer() +# + +test_data = [ + "请介绍下北京有哪些景点。", + "唾面自干是什么意思", + "'我'字有几个笔划", +] + +for text in test_data: + print(model.predict(text, 
tokenizer=tokenizer)) + diff --git a/examples/Aquila2/utils.py b/examples/Aquila2/utils.py new file mode 100755 index 00000000..336c04b8 --- /dev/null +++ b/examples/Aquila2/utils.py @@ -0,0 +1,26 @@ +from fastchat.conversation import get_conv_template + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index fe5b16ae..7375aff3 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -5,6 +5,7 @@ import os import copy from flagai.model.file_utils import _get_model_id +import torch class LazyImport(object): @@ -16,7 +17,7 @@ def __init__(self, name): def __getattr__(self, name): mod = self.cache.get(self.mod_name) if not mod: - mod = importlib.import_module(self.mod_name) + mod = importlib.import_module(self.mod_name) self.cache[self.mod_name] = mod return getattr(mod, name) @@ -107,6 +108,8 @@ def __getattr__(self, name): "aquilacode-7b-ts": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], "aquilacode-multi": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], "aquilacode-python": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], + "aquila2-7b": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], + "aquila2chat-hf":["flagai.model.aquila2_model", "AQUILAModel", "aquila", "nlp"], "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"], "vit-base-p16-384": @@ -163,7 +166,12 @@ def __init__(self, model_name: str = "RoBERTa-base-ch", model_dir: str = "./checkpoints/", only_download_config: bool = False, - device="cpu", + device="cuda", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + lora_dir=None, + qlora_dir=None, + quantization_config=None, **kwargs): """ Args: @@ -194,66 +202,104 @@ def __init__(self, raw_model_name = copy.deepcopy(model_name) model_name = model_name.lower() - if model_name not in MODEL_DICT: + if model_name not in MODEL_DICT and task_name != "aquila2": print(f"The model_name: {model_name} is not be supported") print(f"All supported models are {list(MODEL_DICT.keys())}") return + if task_name == "aquila2": + from flagai.model.aquila2_model import Aquila2Model + from accelerate import init_empty_weights, load_checkpoint_and_dispatch + if qlora_dir: + from transformers import BitsAndBytesConfig + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch_dtype, + ) + model = Aquila2Model.from_pretrain(model_dir, model_name, + low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, + 
quantization_config=quantization_config) + + model.eval() + # model = load_checkpoint_and_dispatch( + # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) + if not qlora_dir: + model.to(device) + if lora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, lora_dir) + print("lora modules loaded") + if qlora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, qlora_dir) + print("Qlora modules loaded") + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) + #args.cuda_index = 0 + # device = f"cuda" + self.model = model + self.tokenizer = tokenizer - brief_model_name = MODEL_DICT[model_name][2] - model_type = MODEL_DICT[model_name][3] - # The dir to save config, vocab and model. + else: - self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) - if self.model_name is None: - print(f"For the model_name: {model_name}, task_name: {task_name} \ - is not be supported.") - tasks = self.get_task_name(brief_model_name) - print( - f"For the model_name: {model_name}, these tasks are be supported: {tasks}" - ) - return - download_path = os.path.join(model_dir, raw_model_name) - print("*" * 20, task_name, model_name) - model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) - self.model = getattr(LazyImport(self.model_name[0]), - self.model_name[1]).from_pretrain( - download_path=model_dir, - model_name=model_name_, - only_download_config=only_download_config, - device=device, - **kwargs) + brief_model_name = MODEL_DICT[model_name][2] + model_type = MODEL_DICT[model_name][3] + # The dir to save config, vocab and model. - if model_type == "nlp": - if brief_model_name in ["galactica",]: - self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5])(download_path) - else : - tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), - "Tokenizer") - self.tokenizer = tokenizer_class.from_pretrained( - model_name, cache_dir=download_path) + self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) + if self.model_name is None: + print(f"For the model_name: {model_name}, task_name: {task_name} \ + is not be supported.") + tasks = self.get_task_name(brief_model_name) + print( + f"For the model_name: {model_name}, these tasks are be supported: {tasks}" + ) + return + download_path = os.path.join(model_dir, raw_model_name) + print("*" * 20, task_name, model_name) + model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) + self.model = getattr(LazyImport(self.model_name[0]), + self.model_name[1]).from_pretrain( + download_path=model_dir, + model_name=model_name_, + only_download_config=only_download_config, + device=device, + **kwargs) - elif model_type == "mm": - if model_name.startswith("altdiffusion"): - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) - self.tokenizer = self.process.tokenizer - self.model.tokenizer = self.tokenizer - elif "altclip" not in model_name: - from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer - self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) - self.transform = None - else: - - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained( - os.path.join(model_dir, raw_model_name)) - self.transform = 
self.process.feature_extractor - self.tokenizer = self.process.tokenizer + if model_type == "nlp": + if brief_model_name in ["galactica",]: + self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5])(download_path) + # elif 'Aquila2-7b' in model_name: - else: - self.tokenizer = None - self.transform = None + else : + tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), + "Tokenizer") + self.tokenizer = tokenizer_class.from_pretrained( + model_name, cache_dir=download_path) + + elif model_type == "mm": + if model_name.startswith("altdiffusion"): + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) + self.tokenizer = self.process.tokenizer + self.model.tokenizer = self.tokenizer + elif "altclip" not in model_name: + from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer + self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) + self.transform = None + else: + + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained( + os.path.join(model_dir, raw_model_name)) + self.transform = self.process.feature_extractor + self.tokenizer = self.process.tokenizer + + else: + self.tokenizer = None + self.transform = None def is_exist_finetuned_model(self, raw_model_name, task_name): try: diff --git a/flagai/model/aquila2_model.py b/flagai/model/aquila2_model.py new file mode 100755 index 00000000..48cb3a53 --- /dev/null +++ b/flagai/model/aquila2_model.py @@ -0,0 +1,224 @@ +from transformers import AutoTokenizer, LlamaForCausalLM , AutoModelForCausalLM +import random +import numpy as np +import torch +from utils import covert_prompt_to_input_ids_with_history +import os +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from fastchat.conversation import get_conv_template + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +class Aquila2Model(LlamaForCausalLM): + + @classmethod + def from_pretrain(self, model_dir, model_name, **kwargs): + download_path = os.path.join(model_dir, model_name) + if os.path.exists(download_path): + return 
self.from_pretrained(download_path, **kwargs) + + + config_path = os.path.join(download_path, "config.json") + checkpoint_path = os.path.join(download_path, "pytorch_model.bin") + from flagai.model.file_utils import _get_model_id + model_id = _get_model_id(model_name) + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). + for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + # checkpoint_to_load = torch.load(os.path.join( + # download_path, file_to_load), + # map_location="cpu") + # for k, v in checkpoint_to_load.items(): + # checkpoint_merge[k] = v + # # save all parameters + # torch.save( + # checkpoint_merge, + # os.path.join(download_path, "pytorch_model.bin")) + + + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, 
dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out + # return out, convert_tokens, probs From 5d2af908f789ab2db0614c7f97b035512f2532fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Mon, 25 Sep 2023 10:00:43 +0000 Subject: [PATCH 08/11] remove utils.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- examples/Aquila2/utils.py | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100755 examples/Aquila2/utils.py diff --git a/examples/Aquila2/utils.py b/examples/Aquila2/utils.py deleted file mode 100755 index 336c04b8..00000000 --- a/examples/Aquila2/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -from fastchat.conversation import get_conv_template - -def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): - # aquila-chat as default - conv = get_conv_template(convo_template) - - conv.append_message(conv.roles[1], None) - conv.append_message(conv.roles[0], text) - - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - while(len(history) > 0 and (len(example) < max_token)): - tmp = history.pop() - if tmp[0] == 'ASSISTANT': - conv.append_message(conv.roles[1], tmp[1]) - else: - conv.append_message(conv.roles[0], tmp[1]) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - if len(example) >= max_token: - conv.messages.pop() - conv.messages = conv.messages[::-1] - print('model in:', conv.get_prompt()) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - return example \ No newline at end of file From 2fb5318e7d5a4800ccbd9cdae4e420cf6eecc257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Tue, 26 Sep 2023 07:21:02 +0000 Subject: [PATCH 09/11] updated model usage method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- examples/Aquila2/generate_chat.py | 29 --- flagai/auto_model/auto_loader.py | 63 ++++++- flagai/model/aquila2/modeling_aquila.py | 169 ++++++++++++++++++ flagai/model/aquila2/utils.py | 38 ++++ flagai/model/aquila2_model.py | 224 ------------------------ 5 files changed, 261 insertions(+), 262 deletions(-) delete mode 100755 examples/Aquila2/generate_chat.py mode change 100644 => 
100755 flagai/model/aquila2/modeling_aquila.py create mode 100755 flagai/model/aquila2/utils.py delete mode 100755 flagai/model/aquila2_model.py diff --git a/examples/Aquila2/generate_chat.py b/examples/Aquila2/generate_chat.py deleted file mode 100755 index 0cff2e8a..00000000 --- a/examples/Aquila2/generate_chat.py +++ /dev/null @@ -1,29 +0,0 @@ -from flagai.auto_model.auto_loader import AutoLoader - -state_dict = "./checkpoints/" -model_name = 'Aquila2Chat-hf' - -state_dict = "/data2/20230907/" -model_name = 'iter_0205000_hf' - -autoloader = AutoLoader("aquila2", - model_dir=state_dict, - model_name=model_name, - qlora_dir="/data2/yzd/FastChat/checkpoints_out/30bhf_save/checkpoint-4200",) - # qlora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/qlora/aquila2chat-hf') - # lora_dir='/data2/yzd/FlagAI/examples/Aquila2/checkpoints/lora/aquila2chat-hf') - # ) - -model = autoloader.get_model() -tokenizer = autoloader.get_tokenizer() -# - -test_data = [ - "请介绍下北京有哪些景点。", - "唾面自干是什么意思", - "'我'字有几个笔划", -] - -for text in test_data: - print(model.predict(text, tokenizer=tokenizer)) - diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index 7375aff3..b068d308 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -4,10 +4,10 @@ import importlib import os import copy -from flagai.model.file_utils import _get_model_id +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM import torch - class LazyImport(object): def __init__(self, name): @@ -207,8 +207,54 @@ def __init__(self, print(f"All supported models are {list(MODEL_DICT.keys())}") return if task_name == "aquila2": - from flagai.model.aquila2_model import Aquila2Model - from accelerate import init_empty_weights, load_checkpoint_and_dispatch + download_path = os.path.join(model_dir, model_name) + + if not os.path.exists(download_path): + # Try to download from ModelHub + try: + model_id = _get_model_id(model_name) + except: + raise FileNotFoundError("Model name not found in local path and ModelHub") + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). 
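The branch above resolves weights either from a local directory or from ModelHub before handing them to AquilaForCausalLM. Below is a condensed sketch of that resolution flow, assuming the flagai.model.file_utils helpers behave as they are used in this patch; the deepspeed/model-parallel branch is left out.

```python
# Condensed, single-GPU sketch of the checkpoint resolution in the "aquila2" branch.
# Helper functions come from flagai.model.file_utils, as used in the patch above.
import os
from flagai.model.file_utils import (_get_model_id, _get_model_files,
                                     _get_vocab_path, _get_checkpoint_path)

def resolve_aquila2_checkpoint(model_dir: str, model_name: str) -> str:
    """Return a local directory holding config, tokenizer files and weights."""
    download_path = os.path.join(model_dir, model_name)
    if os.path.exists(download_path):
        return download_path                      # already cached locally
    model_id = _get_model_id(model_name)          # look the checkpoint up on ModelHub
    model_files = eval(_get_model_files(model_name))  # the patch parses the listing with eval()
    for file_name in model_files:
        if not file_name.endswith("bin"):
            _get_vocab_path(download_path, file_name, model_id)        # config / tokenizer files
        elif file_name == "pytorch_model.bin" or "pytorch_model-0" in file_name:
            _get_checkpoint_path(download_path, file_name, model_id)   # weight file or shards
    return download_path
```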
+ for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + if qlora_dir: from transformers import BitsAndBytesConfig quantization_config=BitsAndBytesConfig( @@ -217,11 +263,14 @@ def __init__(self, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch_dtype, ) - model = Aquila2Model.from_pretrain(model_dir, model_name, + + + model = AquilaForCausalLM.from_pretrained(download_path, low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, quantization_config=quantization_config) model.eval() + # from accelerate import load_checkpoint_and_dispatch # model = load_checkpoint_and_dispatch( # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) if not qlora_dir: @@ -236,13 +285,9 @@ def __init__(self, print("Qlora modules loaded") from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) - #args.cuda_index = 0 - # device = f"cuda" self.model = model self.tokenizer = tokenizer - else: - brief_model_name = MODEL_DICT[model_name][2] model_type = MODEL_DICT[model_name][3] # The dir to save config, vocab and model. diff --git a/flagai/model/aquila2/modeling_aquila.py b/flagai/model/aquila2/modeling_aquila.py old mode 100644 new mode 100755 index 17c5c58a..b0731cce --- a/flagai/model/aquila2/modeling_aquila.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -31,6 +31,17 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_aquila import AquilaConfig +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from .utils import * logger = logging.get_logger(__name__) @@ -754,6 +765,57 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @classmethod + def from_pretrain(self, model_dir, model_name, **kwargs): + download_path = os.path.join(model_dir, model_name) + if os.path.exists(download_path): + return self.from_pretrained(download_path, **kwargs) + + + config_path = os.path.join(download_path, "config.json") + checkpoint_path = os.path.join(download_path, "pytorch_model.bin") + from flagai.model.file_utils import _get_model_id + model_id = _get_model_id(model_name) + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) + + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. 
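Putting the pieces above together, loading a checkpoint in 4-bit NF4 and attaching a QLoRA adapter looks roughly like the sketch below. The directory names are placeholders, and the PeftModel import path follows this patch (flagai.model.tools.peft).

```python
# Rough sketch of the QLoRA loading path added above; directory names are placeholders.
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig
from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM
from flagai.model.tools.peft import PeftModel

checkpoint_dir = "./checkpoints/aquila2chat-hf"      # placeholder
qlora_dir = "./checkpoints/qlora/aquila2chat-hf"     # placeholder

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # keep base weights in 4-bit NF4
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the actual matmuls
)

model = AquilaForCausalLM.from_pretrained(
    checkpoint_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
).eval()

model = PeftModel.from_pretrained(model, qlora_dir)  # attach the QLoRA adapter
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
```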
+ model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 + + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). + for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + def get_input_embeddings(self): return self.model.embed_tokens @@ -905,6 +967,113 @@ def _reorder_cache(past_key_values, beam_idx): ) return reordered_past + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = 
convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out + @add_start_docstrings( """ The LLaMa Model transformer with a sequence classification head on top (linear layer). diff --git a/flagai/model/aquila2/utils.py b/flagai/model/aquila2/utils.py new file mode 100755 index 00000000..8e3de5f9 --- /dev/null +++ b/flagai/model/aquila2/utils.py @@ -0,0 +1,38 @@ +import random +import numpy as np +import torch +from fastchat.conversation import get_conv_template + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file diff --git a/flagai/model/aquila2_model.py b/flagai/model/aquila2_model.py deleted file mode 100755 index 48cb3a53..00000000 --- a/flagai/model/aquila2_model.py +++ /dev/null @@ -1,224 +0,0 @@ -from transformers import AutoTokenizer, LlamaForCausalLM , AutoModelForCausalLM -import random -import numpy as np -import torch -from utils import covert_prompt_to_input_ids_with_history -import os -from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files -from transformers import ( - LogitsProcessorList, - MinLengthLogitsProcessor, - TopKLogitsWarper, - TemperatureLogitsWarper, - TopPLogitsWarper, - StoppingCriteriaList, - MaxLengthCriteria, - BitsAndBytesConfig, -) -from fastchat.conversation import get_conv_template - - -def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): - # aquila-chat as default - conv = get_conv_template(convo_template) - - conv.append_message(conv.roles[1], None) - conv.append_message(conv.roles[0], text) - - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - while(len(history) > 0 and (len(example) < max_token)): - tmp = history.pop() - if tmp[0] == 'ASSISTANT': - conv.append_message(conv.roles[1], tmp[1]) - else: - conv.append_message(conv.roles[0], tmp[1]) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - if len(example) >= max_token: - conv.messages.pop() - conv.messages = conv.messages[::-1] - print('model in:', conv.get_prompt()) - example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] - - return example - - -def set_random_seed(seed): - """Set random seed for reproducability.""" - if seed is not None and seed > 0: - random.seed(seed) 
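The predict helpers above build the chat prompt with fastchat's "aquila-chat" conversation template before tokenizing. A simplified sketch of that assembly follows; the token-budget trimming done by covert_prompt_to_input_ids_with_history is omitted, and the history below is invented.

```python
# Simplified prompt assembly with the fastchat "aquila-chat" template; history is made up.
from fastchat.conversation import get_conv_template

conv = get_conv_template("aquila-chat")
history = [("USER", "请介绍下北京有哪些景点。"),
           ("ASSISTANT", "北京有故宫、长城、颐和园等景点。")]

# As in the helper: pending assistant turn first, then the new user turn,
# then history from newest to oldest, finally reversed into chronological order.
conv.append_message(conv.roles[1], None)
conv.append_message(conv.roles[0], "天坛在哪儿?")
for role, text in reversed(history):
    conv.append_message(conv.roles[1] if role == "ASSISTANT" else conv.roles[0], text)
conv.messages = conv.messages[::-1]
print(conv.get_prompt())
```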
- np.random.seed(seed) - torch.manual_seed(seed) - - -class Aquila2Model(LlamaForCausalLM): - - @classmethod - def from_pretrain(self, model_dir, model_name, **kwargs): - download_path = os.path.join(model_dir, model_name) - if os.path.exists(download_path): - return self.from_pretrained(download_path, **kwargs) - - - config_path = os.path.join(download_path, "config.json") - checkpoint_path = os.path.join(download_path, "pytorch_model.bin") - from flagai.model.file_utils import _get_model_id - model_id = _get_model_id(model_name) - if model_id and model_id != "null": - model_files = eval(_get_model_files(model_name)) - print("model files:" + str(model_files)) - for file_name in model_files: - if not file_name.endswith("bin"): - _get_vocab_path(download_path, file_name, model_id) - - if os.path.exists( - os.path.join(download_path, 'config.json')): - if os.getenv('ENV_TYPE') == 'deepspeed+mpu': - model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) - if model_parallel_size > 1: - # if gpus == nums_of_modelhub_models - # can load - # else need to download the pytorch_model.bin and to recut. - model_hub_parallel_size = 0 - for f in model_files: - if "pytorch_model_" in f: - model_hub_parallel_size += 1 - else: - model_parallel_size = 1 - - if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: - # Only to download the model slices(megatron-lm). - for file_to_load in model_files: - if "pytorch_model_" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - - elif 'pytorch_model.bin' in model_files: - checkpoint_path = _get_checkpoint_path( - download_path, 'pytorch_model.bin', model_id) - else: - checkpoint_merge = {} - # maybe multi weights files - for file_to_load in model_files: - if "pytorch_model-0" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - # checkpoint_to_load = torch.load(os.path.join( - # download_path, file_to_load), - # map_location="cpu") - # for k, v in checkpoint_to_load.items(): - # checkpoint_merge[k] = v - # # save all parameters - # torch.save( - # checkpoint_merge, - # os.path.join(download_path, "pytorch_model.bin")) - - - def predict(self, text, tokenizer=None, - max_gen_len=200, top_p=0.95, - seed=1234, topk=100, - temperature=0.9, - sft=True, convo_template = "aquila-chat", - device = "cuda"): - - vocab = tokenizer.get_vocab() - #device = device - id2word = {v:k for k, v in vocab.items()} - - - set_random_seed(seed) - if temperature == 0: - topk = 1 - temperature = 1.0 - if sft: - tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) - tokens = torch.tensor(tokens)[None,].to(device) - else : - tokens = tokenizer.encode_plus(text)["input_ids"] - print(tokenizer.decode(tokens)) - tokens = torch.tensor(tokens)[None,].to(device) - input_length = len(tokens[0]) - with torch.no_grad(): - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - MinLengthLogitsProcessor(1, eos_token_id=100007), - ] - ) - # instantiate logits processors - logits_warper = LogitsProcessorList( - [ - TopPLogitsWarper(top_p), - TopKLogitsWarper(topk), - TemperatureLogitsWarper(temperature), - - ] - ) - - stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) - out = self.sample( - tokens, - logits_processor=logits_processor, - logits_warper=logits_warper, - stopping_criteria=stopping_criteria, - 
return_dict_in_generate=True, - output_scores=True, - ) - - - # print(out) - out_ids = out["sequences"][0][input_length:].cpu().numpy() - - out_scores = out["scores"] - - out_scores = torch.cat(out_scores, dim=0) - out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() - - probs = [] - for i in range(len(out_ids)): - probs.append(float(out_scores[i][out_ids[i]])) - - # print(f"probs is {probs}") - - convert_tokens = [] - for t in out_ids: - if t == 100006: - convert_tokens.append("[CLS]") - else : - convert_tokens.append(id2word.get(t, "[unkonwn_token]")) - - out_text = tokenizer.decode(out_ids.tolist()) - - - out = out_text - - if "###" in out: - special_index = out.index("###") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "[UNK]" in out: - special_index = out.index("[UNK]") - out = out[:special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if "" in out: - special_index = out.index("") - out = out[: special_index] - token_length = len(tokenizer.encode_plus(out)["input_ids"]) - convert_tokens = convert_tokens[:token_length] - probs = probs[:token_length] - - if len(out) > 0 and out[0] == " ": - out = out[1:] - - convert_tokens = convert_tokens[1:] - probs = probs[1:] - return out - # return out, convert_tokens, probs From 349d8dfbc256b9422800f7f25906d6d2e39f58d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Tue, 26 Sep 2023 07:28:39 +0000 Subject: [PATCH 10/11] removed unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- flagai/model/aquila2/modeling_aquila.py | 51 ------------------------- 1 file changed, 51 deletions(-) diff --git a/flagai/model/aquila2/modeling_aquila.py b/flagai/model/aquila2/modeling_aquila.py index b0731cce..b1ae0cac 100755 --- a/flagai/model/aquila2/modeling_aquila.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -765,57 +765,6 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - @classmethod - def from_pretrain(self, model_dir, model_name, **kwargs): - download_path = os.path.join(model_dir, model_name) - if os.path.exists(download_path): - return self.from_pretrained(download_path, **kwargs) - - - config_path = os.path.join(download_path, "config.json") - checkpoint_path = os.path.join(download_path, "pytorch_model.bin") - from flagai.model.file_utils import _get_model_id - model_id = _get_model_id(model_name) - if model_id and model_id != "null": - model_files = eval(_get_model_files(model_name)) - print("model files:" + str(model_files)) - for file_name in model_files: - if not file_name.endswith("bin"): - _get_vocab_path(download_path, file_name, model_id) - - if os.path.exists( - os.path.join(download_path, 'config.json')): - if os.getenv('ENV_TYPE') == 'deepspeed+mpu': - model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) - if model_parallel_size > 1: - # if gpus == nums_of_modelhub_models - # can load - # else need to download the pytorch_model.bin and to recut. 
- model_hub_parallel_size = 0 - for f in model_files: - if "pytorch_model_" in f: - model_hub_parallel_size += 1 - else: - model_parallel_size = 1 - - if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: - # Only to download the model slices(megatron-lm). - for file_to_load in model_files: - if "pytorch_model_" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - - elif 'pytorch_model.bin' in model_files: - checkpoint_path = _get_checkpoint_path( - download_path, 'pytorch_model.bin', model_id) - else: - checkpoint_merge = {} - # maybe multi weights files - for file_to_load in model_files: - if "pytorch_model-0" in file_to_load: - _get_checkpoint_path(download_path, file_to_load, - model_id) - def get_input_embeddings(self): return self.model.embed_tokens From fbbcaca26dd0b33f946665f5feb1affb824e4360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=A5=E7=85=A7=E4=B8=9C?= Date: Tue, 26 Sep 2023 07:39:15 +0000 Subject: [PATCH 11/11] remove unused files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 严照东 --- flagai/auto_model/auto_loader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index b068d308..f67e8219 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -108,8 +108,6 @@ def __getattr__(self, name): "aquilacode-7b-ts": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], "aquilacode-multi": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], "aquilacode-python": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], - "aquila2-7b": ["flagai.model.aquila_model", "AQUILAModel", "aquila", "nlp"], - "aquila2chat-hf":["flagai.model.aquila2_model", "AQUILAModel", "aquila", "nlp"], "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"], "vit-base-p16-384":
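After the full series, the intended entry point is the "aquila2" task of AutoLoader together with the predict method on AquilaForCausalLM, as the removed generate_chat.py example demonstrated. A minimal usage sketch with placeholder checkpoint directory and name:

```python
# Minimal usage sketch of the finished "aquila2" path; directory and model name are placeholders.
from flagai.auto_model.auto_loader import AutoLoader

loader = AutoLoader(
    "aquila2",
    model_dir="./checkpoints/",
    model_name="aquila2chat-hf",
)
model = loader.get_model()
tokenizer = loader.get_tokenizer()

for text in ["请介绍下北京有哪些景点。", "唾面自干是什么意思"]:
    print(model.predict(text, tokenizer=tokenizer))
```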