diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py index fe5b16ae..f67e8219 100755 --- a/flagai/auto_model/auto_loader.py +++ b/flagai/auto_model/auto_loader.py @@ -4,8 +4,9 @@ import importlib import os import copy -from flagai.model.file_utils import _get_model_id - +from flagai.model.file_utils import _get_model_id, _get_checkpoint_path, _get_vocab_path, _get_model_files +from flagai.model.aquila2.modeling_aquila import AquilaForCausalLM +import torch class LazyImport(object): @@ -16,7 +17,7 @@ def __init__(self, name): def __getattr__(self, name): mod = self.cache.get(self.mod_name) if not mod: - mod = importlib.import_module(self.mod_name) + mod = importlib.import_module(self.mod_name) self.cache[self.mod_name] = mod return getattr(mod, name) @@ -163,7 +164,12 @@ def __init__(self, model_name: str = "RoBERTa-base-ch", model_dir: str = "./checkpoints/", only_download_config: bool = False, - device="cpu", + device="cuda", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + lora_dir=None, + qlora_dir=None, + quantization_config=None, **kwargs): """ Args: @@ -194,66 +200,149 @@ def __init__(self, raw_model_name = copy.deepcopy(model_name) model_name = model_name.lower() - if model_name not in MODEL_DICT: + if model_name not in MODEL_DICT and task_name != "aquila2": print(f"The model_name: {model_name} is not be supported") print(f"All supported models are {list(MODEL_DICT.keys())}") return + if task_name == "aquila2": + download_path = os.path.join(model_dir, model_name) + + if not os.path.exists(download_path): + # Try to download from ModelHub + try: + model_id = _get_model_id(model_name) + except: + raise FileNotFoundError("Model name not found in local path and ModelHub") + if model_id and model_id != "null": + model_files = eval(_get_model_files(model_name)) + print("model files:" + str(model_files)) + for file_name in model_files: + if not file_name.endswith("bin"): + _get_vocab_path(download_path, file_name, model_id) - brief_model_name = MODEL_DICT[model_name][2] - model_type = MODEL_DICT[model_name][3] - # The dir to save config, vocab and model. + if os.path.exists( + os.path.join(download_path, 'config.json')): + if os.getenv('ENV_TYPE') == 'deepspeed+mpu': + model_parallel_size = int(os.getenv("MODEL_PARALLEL_SIZE")) + if model_parallel_size > 1: + # if gpus == nums_of_modelhub_models + # can load + # else need to download the pytorch_model.bin and to recut. + model_hub_parallel_size = 0 + for f in model_files: + if "pytorch_model_" in f: + model_hub_parallel_size += 1 + else: + model_parallel_size = 1 - self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) - if self.model_name is None: - print(f"For the model_name: {model_name}, task_name: {task_name} \ - is not be supported.") - tasks = self.get_task_name(brief_model_name) - print( - f"For the model_name: {model_name}, these tasks are be supported: {tasks}" - ) - return - download_path = os.path.join(model_dir, raw_model_name) - print("*" * 20, task_name, model_name) - model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) - self.model = getattr(LazyImport(self.model_name[0]), - self.model_name[1]).from_pretrain( - download_path=model_dir, - model_name=model_name_, - only_download_config=only_download_config, - device=device, - **kwargs) + if "pytorch_model_01.bin" in model_files and model_parallel_size > 1 and model_hub_parallel_size == model_parallel_size: + # Only to download the model slices(megatron-lm). 
+ for file_to_load in model_files: + if "pytorch_model_" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) - if model_type == "nlp": - if brief_model_name in ["galactica",]: - self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5])(download_path) - else : - tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), - "Tokenizer") - self.tokenizer = tokenizer_class.from_pretrained( - model_name, cache_dir=download_path) + elif 'pytorch_model.bin' in model_files: + checkpoint_path = _get_checkpoint_path( + download_path, 'pytorch_model.bin', model_id) + else: + checkpoint_merge = {} + # maybe multi weights files + for file_to_load in model_files: + if "pytorch_model-0" in file_to_load: + _get_checkpoint_path(download_path, file_to_load, + model_id) + + if qlora_dir: + from transformers import BitsAndBytesConfig + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch_dtype, + ) - elif model_type == "mm": - if model_name.startswith("altdiffusion"): - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) - self.tokenizer = self.process.tokenizer - self.model.tokenizer = self.tokenizer - elif "altclip" not in model_name: - from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer - self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) - self.transform = None - else: - - self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), - MODEL_DICT[model_name][5]).from_pretrained( - os.path.join(model_dir, raw_model_name)) - self.transform = self.process.feature_extractor - self.tokenizer = self.process.tokenizer + model = AquilaForCausalLM.from_pretrained(download_path, + low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, + quantization_config=quantization_config) + + model.eval() + # from accelerate import load_checkpoint_and_dispatch + # model = load_checkpoint_and_dispatch( + # model, model_dir+model_name, device_map="balanced", no_split_module_classes=["LlamaDecoderLayer"]) + if not qlora_dir: + model.to(device) + if lora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, lora_dir) + print("lora modules loaded") + if qlora_dir: + from flagai.model.tools.peft import PeftModel + model = PeftModel.from_pretrained(model, qlora_dir) + print("Qlora modules loaded") + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_dir+model_name) + self.model = model + self.tokenizer = tokenizer else: - self.tokenizer = None - self.transform = None + brief_model_name = MODEL_DICT[model_name][2] + model_type = MODEL_DICT[model_name][3] + # The dir to save config, vocab and model. 
+ + self.model_name = ALL_TASK.get(f"{brief_model_name}_{task_name}", None) + if self.model_name is None: + print(f"For the model_name: {model_name}, task_name: {task_name} \ + is not be supported.") + tasks = self.get_task_name(brief_model_name) + print( + f"For the model_name: {model_name}, these tasks are be supported: {tasks}" + ) + return + download_path = os.path.join(model_dir, raw_model_name) + print("*" * 20, task_name, model_name) + model_name_ = self.is_exist_finetuned_model(raw_model_name, task_name) + self.model = getattr(LazyImport(self.model_name[0]), + self.model_name[1]).from_pretrain( + download_path=model_dir, + model_name=model_name_, + only_download_config=only_download_config, + device=device, + **kwargs) + + if model_type == "nlp": + if brief_model_name in ["galactica",]: + self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5])(download_path) + # elif 'Aquila2-7b' in model_name: + + else : + tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"), + "Tokenizer") + self.tokenizer = tokenizer_class.from_pretrained( + model_name, cache_dir=download_path) + + elif model_type == "mm": + if model_name.startswith("altdiffusion"): + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained(os.path.join(model_dir, raw_model_name)) + self.tokenizer = self.process.tokenizer + self.model.tokenizer = self.tokenizer + elif "altclip" not in model_name: + from flagai.data.tokenizer.clip.tokenizer import ClipTokenizer + self.tokenizer = ClipTokenizer(bpe_path=os.path.join(download_path, 'bpe_simple_vocab_16e6.txt.gz')) + self.transform = None + else: + + self.process = getattr(LazyImport(MODEL_DICT[model_name][4]), + MODEL_DICT[model_name][5]).from_pretrained( + os.path.join(model_dir, raw_model_name)) + self.transform = self.process.feature_extractor + self.tokenizer = self.process.tokenizer + + else: + self.tokenizer = None + self.transform = None def is_exist_finetuned_model(self, raw_model_name, task_name): try: diff --git a/flagai/model/aquila2/configuration_aquila.py b/flagai/model/aquila2/configuration_aquila.py new file mode 100644 index 00000000..f364ebd6 --- /dev/null +++ b/flagai/model/aquila2/configuration_aquila.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Aquila model configuration""" + +from transformers import PretrainedConfig + + + +class AquilaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AquilaModel`]. 
It is used to instantiate an Aquila + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Aquila-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AquilaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + ```python + >>> from transformers import AquilaModel, AquilaConfig + + >>> # Initializing a Aquila aquila-7b style configuration + >>> configuration = AquilaConfig() + + >>> # Initializing a model from the aquila-7b style configuration + >>> model = AquilaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "aquila" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=100008, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/flagai/model/aquila_modeling_hf.py b/flagai/model/aquila2/modeling_aquila.py old mode 100644 new mode 100755 similarity index 72% rename from flagai/model/aquila_modeling_hf.py rename to flagai/model/aquila2/modeling_aquila.py index 254669d2..b1ae0cac --- a/flagai/model/aquila_modeling_hf.py +++ b/flagai/model/aquila2/modeling_aquila.py @@ -31,6 +31,17 @@ from transformers.modeling_utils import PreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_aquila import AquilaConfig +from transformers import ( + LogitsProcessorList, + MinLengthLogitsProcessor, + TopKLogitsWarper, + TemperatureLogitsWarper, + TopPLogitsWarper, + StoppingCriteriaList, + MaxLengthCriteria, + BitsAndBytesConfig, +) +from .utils import * logger = logging.get_logger(__name__) @@ -93,34 +104,83 @@ def forward(self, hidden_states): class AquilaRotaryEmbedding(torch.nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.register_buffer("inv_freq", inv_freq) + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. 
- self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + return ( self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), ) +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Aquila +class AquilaLinearScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Aquila +class AquilaDynamicNTKScalingRotaryEmbedding(AquilaRotaryEmbedding): + """AquilaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + def rotate_half(x): """Rotates half the hidden dims of the input.""" @@ -142,33 +202,64 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Aquila class AquilaMLP(nn.Module): - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): + def __init__(self, config): super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Aquila class AquilaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AquilaConfig): super().__init__() self.config = config self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -176,10 +267,37 @@ def __init__(self, config: AquilaConfig): f" and `num_heads`: {self.num_heads})." ) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.rotary_emb = AquilaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = AquilaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = AquilaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = AquilaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -195,16 +313,37 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = 
self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] if past_key_value is not None: # reuse k, v, self_attention @@ -213,8 +352,11 @@ def forward( past_key_value = (key_states, value_states) if use_cache else None - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) attn_weights = torch.clamp(attn_weights, min=-1024., max=1024.) 
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( @@ -228,9 +370,6 @@ def forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) - ) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -242,10 +381,15 @@ def forward( f" {attn_output.size()}" ) - attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -259,11 +403,7 @@ def __init__(self, config: AquilaConfig): super().__init__() self.hidden_size = config.hidden_size self.self_attn = AquilaAttention(config=config) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) + self.mlp = AquilaMLP(config) self.input_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -321,7 +461,6 @@ def forward( return outputs - AQUILA_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -350,7 +489,6 @@ class AquilaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["AquilaDecoderLayer"] _skip_keys_device_placement = "past_key_values" - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -570,7 +708,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value - return module(*inputs, output_attentions, None) + return module(*inputs, past_key_value, output_attentions) return custom_forward @@ -579,7 +717,6 @@ def custom_forward(*inputs): hidden_states, attention_mask, position_ids, - None, ) else: layer_outputs = decoder_layer( @@ -615,13 +752,14 @@ def custom_forward(*inputs): attentions=all_self_attns, ) - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->AQUILA,Llama->Aquila class AquilaForCausalLM(AquilaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + def __init__(self, config): super().__init__(config) self.model = AquilaModel(config) - + self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing @@ -706,7 +844,13 @@ def forward( ) hidden_states = outputs[0] - logits = self.lm_head(hidden_states) + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() loss = None if labels is not None: @@ -767,9 +911,117 @@ def prepare_inputs_for_generation( def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) return reordered_past + def predict(self, text, tokenizer=None, + max_gen_len=200, top_p=0.95, + seed=1234, topk=100, + temperature=0.9, + sft=True, convo_template = "aquila-chat", + device = "cuda"): + + vocab = tokenizer.get_vocab() + #device = device + id2word = {v:k for k, v in vocab.items()} + + + set_random_seed(seed) + if temperature == 0: + topk = 1 + temperature = 1.0 + if sft: + tokens = covert_prompt_to_input_ids_with_history(text, history=[], tokenizer=tokenizer, max_token=2048, convo_template=convo_template) + tokens = torch.tensor(tokens)[None,].to(device) + else : + tokens = tokenizer.encode_plus(text)["input_ids"] + print(tokenizer.decode(tokens)) + tokens = torch.tensor(tokens)[None,].to(device) + input_length = len(tokens[0]) + with torch.no_grad(): + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(1, eos_token_id=100007), + ] + ) + # instantiate logits processors + logits_warper = LogitsProcessorList( + [ + TopPLogitsWarper(top_p), + TopKLogitsWarper(topk), + TemperatureLogitsWarper(temperature), + + ] + ) + + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)]) + out = self.sample( + tokens, + 
logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + return_dict_in_generate=True, + output_scores=True, + ) + + + # print(out) + out_ids = out["sequences"][0][input_length:].cpu().numpy() + + out_scores = out["scores"] + + out_scores = torch.cat(out_scores, dim=0) + out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy() + + probs = [] + for i in range(len(out_ids)): + probs.append(float(out_scores[i][out_ids[i]])) + + # print(f"probs is {probs}") + + convert_tokens = [] + for t in out_ids: + if t == 100006: + convert_tokens.append("[CLS]") + else : + convert_tokens.append(id2word.get(t, "[unkonwn_token]")) + + out_text = tokenizer.decode(out_ids.tolist()) + + + out = out_text + + if "###" in out: + special_index = out.index("###") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "[UNK]" in out: + special_index = out.index("[UNK]") + out = out[:special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if "" in out: + special_index = out.index("") + out = out[: special_index] + token_length = len(tokenizer.encode_plus(out)["input_ids"]) + convert_tokens = convert_tokens[:token_length] + probs = probs[:token_length] + + if len(out) > 0 and out[0] == " ": + out = out[1:] + + convert_tokens = convert_tokens[1:] + probs = probs[1:] + return out @add_start_docstrings( """ @@ -852,7 +1104,9 @@ def forward( sequence_lengths = -1 else: if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) else: sequence_lengths = -1 diff --git a/flagai/model/aquila2/utils.py b/flagai/model/aquila2/utils.py new file mode 100755 index 00000000..8e3de5f9 --- /dev/null +++ b/flagai/model/aquila2/utils.py @@ -0,0 +1,38 @@ +import random +import numpy as np +import torch +from fastchat.conversation import get_conv_template + +def set_random_seed(seed): + """Set random seed for reproducability.""" + if seed is not None and seed > 0: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"): + # aquila-chat as default + conv = get_conv_template(convo_template) + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + return example \ No newline at end of file
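
A minimal usage sketch of the `aquila2` path added to `AutoLoader` above (a hedged example, not part of the patch): the model name `AquilaChat2-7B` and the prompt are illustrative assumptions; the loader lowercases `model_name` and resolves the checkpoint under `model_dir`, stores the loaded objects on `self.model` / `self.tokenizer` as shown in the diff, and generation goes through the new `predict()` helper on `AquilaForCausalLM`.

# Sketch only: assumes an Aquila2 chat checkpoint is available locally under
# model_dir + model_name.lower(), or downloadable from ModelHub by the loader.
import torch
from flagai.auto_model.auto_loader import AutoLoader

loader = AutoLoader(
    task_name="aquila2",
    model_name="AquilaChat2-7B",   # example name; resolved as ./checkpoints/aquilachat2-7b
    model_dir="./checkpoints/",
    torch_dtype=torch.float16,
    device="cuda",
    # lora_dir="..." or qlora_dir="..." would additionally wrap the model with PeftModel,
    # and qlora_dir triggers the 4-bit BitsAndBytesConfig path shown in the diff.
)

model = loader.model          # set directly by the aquila2 branch of AutoLoader.__init__
tokenizer = loader.tokenizer  # transformers AutoTokenizer loaded from the checkpoint dir

out = model.predict(
    "What is FlagAI?",
    tokenizer=tokenizer,
    sft=True,                      # wrap the prompt with the conversation template
    convo_template="aquila-chat",  # default template used by predict()
    max_gen_len=200,
    top_p=0.95,
    temperature=0.9,
)
print(out)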