# Dataclass

We primarily define two dataclasses to contain the multimodal data to be processed by MLLMs: one for text-only samples and the other for image-text pairs. The attributes of each dataclass are introduced below.

- `TxtSample`: supports a text-only sample
    - `text`: prompt in text
    - `target`: ground-truth label (Default: None)
    - `extra`: auxiliary arguments that may help in the processing afterwards, e.g., adversarial example generation (Default: None)
- `ImageTxtSample`: supports multimodal input, i.e., an image-text pair
    - `image_path`: path to the image file
    - `text`: prompt in text
    - `target`: ground-truth label (Default: None)
    - `extra`: auxiliary arguments that may help in the processing afterwards, e.g., adversarial example generation (Default: None)

The type of the output from an MLLM is also restricted to these two dataclasses.
`_OutputType = Union[ImageTxtSample, TxtSample]`

Source code in `mmte/__init__.py`:

```python
@dataclass
class TxtSample:
    text: str
    target: Optional[str] = None
    extra: Optional[Dict[str, Any]] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "TxtSample":
        return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)

    def __getitem__(self, item):
        return getattr(self, item)


@dataclass
class ImageTxtSample:
    image_path: str
    text: str
    target: Optional[str] = None
    extra: Optional[Dict[str, Any]] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ImageTxtSample":
        return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)

    def __getitem__(self, item):
        return getattr(self, item)


_OutputType = Union[ImageTxtSample, TxtSample]
```
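For illustration, a small usage sketch of the two dataclasses is shown below (it assumes they are importable from the `mmte` package, as the file location above suggests; the image path is a placeholder):

```python
from mmte import ImageTxtSample, TxtSample  # assumed import path, based on mmte/__init__.py

# A text-only sample and an image-text pair (the image path is a placeholder).
txt = TxtSample(text="What is the capital of France?", target="Paris")
pair = ImageTxtSample(image_path="data/images/example.jpg", text="Describe this image.")

# Round-trip through plain dictionaries; unknown keys are dropped by from_dict.
restored = ImageTxtSample.from_dict(pair.to_dict())

# Item-style access is forwarded to the attributes.
assert restored["text"] == restored.text
```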
# Modules

In this section, we introduce each element in the task flow, covering their interfaces, typical usage, etc., in the order they appear in the workflow. All elements are registered with a unique identifier in the global registry (defined in `mmte/utils/registry.py`) and can be accessed by the `registry.get_**_class(id)` method.
## Datasets

Datasets are defined to collect the samples to be tested for a specific task. A dataset provides the prompt, image path, labels, and possibly other information about each data point to the subsequent steps. Here are some technical notes about this class.

- The class is a subclass of `torch.utils.data.Dataset`, and users can iterate through the dataset with the default `torch.utils.data.DataLoader`. To customize a dataset, users need to define `__getitem__` and `__len__` as usual, along with a `collate_fn` so that the dataloader can support the dataclasses `TxtSample` and `ImageTxtSample`. We provide a default one in `mmte.datasets.base.collate_fn`, which should work for most cases; you can customize your own `collate_fn` if necessary.
- A method to further process the data for a certain task, which can be independent from the original dataset, can optionally be specified via the argument `method_hook` at initialization. This makes additional augmentation, attacks, and other preprocessing of existing datasets more convenient; it is illustrated in the Methods part.
- Some information about the dataset, such as the directory of images and the path to the annotation file, can be configured through a yaml config file.
- `dataset_ids` is the list of supported `dataset_id` values for this class, which specify the different splits and processors used in sample preparation.

Source code in `mmte/datasets/base.py`. Refer to `mmte/datasets/celeb.py` for an example, and see the subclass sketch after the base class below.
```python
class BaseDataset(Dataset, ABC):
    """
    Base class for datasets, __getitem__ function return Union[ImageTxtSample, TxtSample].
    """

    dataset_id: str                     # Identifier for the dataset
    dataset_ids: Sequence[str] = []     # List of available datasets
    dataset_config: Optional[str] = ""  # dataset config path

    def __init__(self, dataset_id: str, method_hook: Optional[BaseMethod] = None, **kwargs) -> None:
        """
        Initializing dataset instance.

        Arguments:
            dataset_id: Identifier for the dataset
            method_hook: A method instance, which is used as a preprocess hook for __getitem__ funtion
            kwargs: extra configurations

        """

        assert dataset_id in self.dataset_ids, f"Dataset {dataset_id} must be one of {self.dataset_ids}."
        self.dataset_id = dataset_id
        if method_hook:
            self.method_hook = method_hook
        else:
            self.method_hook = None
        self.dataset: List[Any] = []

    @abstractmethod
    def __getitem__(self, index: int) -> _OutputType:
        if self.method_hook:
            return self.method_hook.run(self.dataset[index])
        return self.dataset[index]

    @abstractmethod
    def __len__(self) -> int:
        return len(self.dataset)
```
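A minimal sketch of a custom subclass is given below (illustrative only: the `dataset_id`, the hard-coded samples, and the import paths are assumptions, and the registration step shown in `mmte/datasets/celeb.py` is omitted):

```python
from typing import Any, List, Optional, Sequence

from torch.utils.data import DataLoader

from mmte import ImageTxtSample, _OutputType            # assumed exports of mmte/__init__.py
from mmte.datasets.base import BaseDataset, collate_fn  # default collate_fn mentioned above
from mmte.methods.base import BaseMethod


class ToyDataset(BaseDataset):
    dataset_ids: Sequence[str] = ["toy-dataset"]         # hypothetical dataset_id

    def __init__(self, dataset_id: str, method_hook: Optional[BaseMethod] = None, **kwargs) -> None:
        super().__init__(dataset_id=dataset_id, method_hook=method_hook, **kwargs)
        # A real dataset would read these entries from the annotation file given in its yaml config.
        self.dataset: List[Any] = [
            ImageTxtSample(image_path="data/images/0001.jpg", text="Describe this image.", target="a cat"),
            ImageTxtSample(image_path="data/images/0002.jpg", text="Describe this image.", target="a dog"),
        ]

    def __getitem__(self, index: int) -> _OutputType:
        return super().__getitem__(index)                 # the base class applies method_hook if present

    def __len__(self) -> int:
        return len(self.dataset)


# The default collate_fn keeps TxtSample/ImageTxtSample instances intact inside batches.
dataloader = DataLoader(ToyDataset("toy-dataset"), batch_size=2, collate_fn=collate_fn)
```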
## Methods

Methods are designed for additional data processing that is independent from, and universal across, datasets, for instance generating adversarial examples or pairing text prompts with diverse images. Users do not need to modify the dataset code; they only implement a new `Method` class and pass it as a hook to the dataset. Here are some technical notes about this class.

- This class works as an optional hook in the `__getitem__` function of a dataset.
- For cases where new images are generated with time-consuming methods, or where reproducibility is needed, we can set `lazy_mode=True` to reuse previously generated samples. To tackle the challenge that text-only data may not have clear identifiers pointing to a sample, a `hash` function can be defined to generate the filename for the generated data.

Source code in `mmte/methods/base.py`. Refer to `mmte/methods/unrelated_color.py` for an example, and see the sketch after the base class below.
```python
class BaseMethod(ABC):
    """
    Base class for methods, which can be applied to any Dataset inherits from BaseDataset as a hook in __getitem__ function.
    """

    method_id: str          # Identifier for the method
    method_ids: List[str]   # List of available methods

    def __init__(self, method_id: str, img_dir: str, lazy_mode: bool = True) -> None:
        """
        Initializing method instance.

        Arguments:
            method_id: Identifier for the method
            img_dir: Folder to save images
            lazy_mode: If True, it will reuse the already generated dataset. If False, it will regenerate the result

        Return:
            evaluation result
        """
        assert method_id in self.method_ids, f"Method {self.method_id} is not available. Only methods in {self.method_ids} can be used."
        self.method_id = method_id
        self.img_dir = img_dir
        self.lazy_mode = lazy_mode

    @abstractmethod
    def run(self, data: _OutputType, **kwargs) -> _OutputType:
        """
        Preprocess each sample in the Dataset one by one.

        Arguments:
            data: Union[ImageTxtSample, TxtSample], one sample in Dataset
            kwargs: extra configurations

        Return:
            processed sample
        """
        raise NotImplementedError

    @abstractmethod
    def hash(self, to_hash_str: str, **kwargs) -> str:
        """
        Get hash code given to_hash_str, to provide an identifier for the data sample and to reuse the generated samples.

        Arguments:
            to_hash_str: str
            kwargs: extra configurations

        Return:
            hash code
        """
        raise NotImplementedError

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        return self.run(*args, **kwds)
```
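A hypothetical method subclass might look like the following (this is not the actual `unrelated_color` implementation; the `method_id`, the solid-color image, and the use of Pillow are assumptions for illustration, and the registration step is omitted):

```python
import hashlib
import os

from PIL import Image

from mmte import ImageTxtSample, TxtSample, _OutputType   # assumed exports of mmte/__init__.py
from mmte.methods.base import BaseMethod


class SolidColorImage(BaseMethod):
    method_ids = ["solid-color-image"]     # hypothetical method_id

    def hash(self, to_hash_str: str, **kwargs) -> str:
        # Text-only samples have no natural file identifier, so hash the prompt itself.
        return hashlib.md5(to_hash_str.encode("utf-8")).hexdigest()

    def run(self, data: _OutputType, **kwargs) -> _OutputType:
        assert isinstance(data, TxtSample), "this toy method only pairs text-only samples"
        filename = os.path.join(self.img_dir, f"{self.hash(data.text)}.png")
        if not (self.lazy_mode and os.path.exists(filename)):
            # Regenerate the image only when it is missing or lazy_mode is off.
            os.makedirs(self.img_dir, exist_ok=True)
            Image.new("RGB", (224, 224), color=(0, 128, 255)).save(filename)
        return ImageTxtSample(image_path=filename, text=data.text, target=data.target, extra=data.extra)


method = SolidColorImage(method_id="solid-color-image", img_dir="./generated_images", lazy_mode=True)
paired = method(TxtSample(text="Is it safe to share my password?"))
```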
## Models

Models encapsulate the chat models of MLLMs into a unified interface for inference. This enables more convenient, standardized evaluation of diverse models. Here are some technical notes about this class.

- `chat` unifies the interface for generation. `messages` is a list representing the conversation history; `generation_kwargs` is the generation configuration, indicating whether to `do_sample`, the settings of `temperature`, `max_new_tokens`, etc. The generation configuration follows that of Hugging Face transformers.
- `model_id` is the unique identifier used to retrieve the chat model class from the registry, and `model_family` defines the list of available model identifiers.

Source code in `mmte/models/base.py`. Refer to `mmte/models/openai_chat.py` for an example, and see the toy subclass after the base class below.
```python
class BaseChat(ABC):
    """
    Base class for models to be evaluated in a generative/chat manner.
    """

    model_id: str = ''            # ID for a chat model, e.g., minigpt-4-vicuna-7b-v0
    model_arch: str = ''          # Architecture of the model, e.g., minigpt-4
    model_family: List[str] = []  # List of available model_ids

    def __init__(self, model_id: str) -> None:
        self.model_id = model_id
        assert self.model_id in self.model_family, f"Model {self.model_id} is not available. Only models in {self.model_family} can be used."

    @abstractmethod
    def chat(self,
             messages: List,
             **generation_kwargs,
             ) -> "Response":
        """
        Chat interface for generative evaluation with batch size of 1.

        messages: a list of messages, comprising the conversation history and following the format
            [
                {
                    'role': 'system'/'user'/'assistant',
                    'content': str/dict
                },
                ...
            ],
            where content is a dict {'text': str, 'image_path': str} when it's multimodal.
        generation_kwargs: generation configuration specified for different models, including:
            temperature: float, usually between 0-2, smaller means more deterministic
            do_sample: bool, whether take sampling as the decoding strategy
            num_beams: int, the parameter for beam search
            max_new_tokens: int, maximal number of tokens to be generated
            stop_sequences: str/List[str], stop words where the model will stop generating further tokens
            output_scores: bool, whether return the logits of the generated tokens (not very practical)
        """
        raise NotImplementedError
```
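A toy subclass can illustrate the unified interface and the messages format (illustrative only; real wrappers such as `mmte/models/openai_chat.py` return a `Response` object, whose definition is not shown on this page, so this stub simply returns a string):

```python
from typing import List

from mmte.models.base import BaseChat


class EchoChat(BaseChat):
    model_id = "echo-chat"            # hypothetical model_id
    model_arch = "echo"
    model_family = ["echo-chat"]

    def chat(self, messages: List, **generation_kwargs):
        # Take the latest turn; content is a dict for multimodal input.
        content = messages[-1]["content"]
        text = content["text"] if isinstance(content, dict) else content
        return f"[echo] {text}"


model = EchoChat(model_id="echo-chat")
reply = model.chat(
    [{"role": "user", "content": {"text": "Describe this image.", "image_path": "img.jpg"}}],
    max_new_tokens=64,                # generation kwargs follow the Hugging Face conventions
)
```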
## Evaluators

Source code in `mmte/evaluators/base.py`.

### BaseEvaluator

This class is primarily used for evaluating the results output by a chat model. The `process` function preprocesses the input sequences of predictions (`preds`) and labels (`labels`); these sequences can be numerical or textual. The preprocessing aims to generate simple numerical sequences that can be directly used in the `eval` function, which subsequently calls the metrics functions (the metrics functions only accept numerical sequences).

Currently, there are three main types of evaluators:

- ChatModel Evaluator: processes and evaluates the results using a chat model.
- Classifier Evaluator: scores the results using a classifier. Currently, it supports the `longformer-action-ro` model.
- Rule-based Evaluator: includes tools for template matching (e.g., detecting refusals) and score extraction.

Evaluators can be chained together to form an evaluator sequence (refer to `SequentialEvaluator` for details). The primary purpose is to reuse the `process` functions of different evaluators (for example, first using the chat model evaluator for text preprocessing, followed by the rule-based evaluator for scoring).
```python
class BaseEvaluator(ABC):
    """
    Base class for evaluators, to evaluate the responses from chatmodels.
    """

    evaluator_ids: List[str] = []

    def __init__(self, evaluator_id: str, metrics_cfg: Dict[str, Any]) -> None:
        """
        Initializing evaluator instance.

        Arguments:
            evaluator_id: Identifier for the evaluator
            metrics_cfg: config dict for metrics hooks, format: {metrics_id: metrics_kwargs, ...}

        """
        assert evaluator_id in self.evaluator_ids, f"Evaluator {self.evaluator_id} is not available. Only Evaluators in {self.evaluator_ids} can be used."

        self.evaluator_id = evaluator_id

        self.metrics_cfg = metrics_cfg
        for metrics_id in self.metrics_cfg.keys():
            assert metrics_id in _supported_metrics.keys(), f"{metrics_id} is not supported."

    @abstractmethod
    def process(self, preds: Sequence[Any], labels: Optional[Sequence[Any]] = None, extras: Optional[Sequence[Any]] = None, **kwargs) -> Tuple[Sequence[Any], Sequence[Any]]:
        """
        1. Perform some processing on sequence data, mainly including scoring/text-extraction with chatmodel/classifier/rule-based, etc.
        2. Different evaluators can be concatenated, and the process function can be cascaded to perform multi-step processing on sequence data.

        Arguments:
            preds: responses from chatmodels or preds from `process` function of another evaluator
            labels: groundtruth labels or labels from `process` function of another evaluator
            extras: extra parameters or extra sequence from `process` function of another evaluator

        Return:
            preds: processed preds sequence
            labels: processed labels sequence
            extras: processed extra sequence
        """

        # no-op
        return preds, labels, extras

    def eval(self, preds: Sequence[Any], labels: Optional[Sequence[Any]] = None, extras: Optional[Sequence[Any]] = None, **kwargs) -> Dict[str, Union[Sequence, float]]:
        """
        Evaluate pipeline including data processing and metrics calculation.

        Arguments:
            preds: responses from chatmodels
            labels: groundtruth labels
            extras: extra parameters

        Return:
            results
        """

        processed_preds, processed_labels, processed_extras = self.process(preds, labels, extras)
        results = {}

        for metrics_id, kwargs in self.metrics_cfg.items():
            metrics_fn = _supported_metrics[metrics_id]
            results[metrics_id] = metrics_fn(processed_labels, processed_preds, **kwargs)

        return results

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        return self.eval(*args, **kwds)
```
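A minimal concrete evaluator could look like the sketch below (illustrative: the `evaluator_id` is hypothetical, the registration step is omitted, and the yes/no extraction rule only stands in for the rule-based style):

```python
from typing import Any, Optional, Sequence, Tuple

from mmte.evaluators.base import BaseEvaluator


class YesNoEvaluator(BaseEvaluator):
    evaluator_ids = ["yes-no-eval"]     # hypothetical evaluator_id

    def process(self, preds: Sequence[Any], labels: Optional[Sequence[Any]] = None,
                extras: Optional[Sequence[Any]] = None, **kwargs) -> Tuple[Sequence[Any], Sequence[Any], Sequence[Any]]:
        # Map free-form responses to 1/0 so that numerical metrics can be applied.
        processed = [1 if "yes" in str(p).lower() else 0 for p in preds]
        return processed, labels, extras


evaluator = YesNoEvaluator("yes-no-eval", metrics_cfg={"accuracy_score": {}})
results = evaluator(["Yes, it is.", "No."], labels=[1, 0])   # {"accuracy_score": 1.0}
```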
### SequentialEvaluator

```python
class SequentialEvaluator:
    """
    Class for cascading evaluators to perform multi-step processing on sequence data and get results from final sequence data.
    """

    def __init__(self, evaluator_seq_cfg: Dict[str, Any]) -> None:
        """
        Initializing sequence-evaluator instance.

        Arguments:
            evaluator_seq_cfg: config dict for instantiatizing evaluators, format: {evaluator: evaluator_kwargs, ...}

        """

        evaluator_seq, evaluator_cls_names = [], []
        for evaluator_id, evaluator_kwargs in evaluator_seq_cfg.items():
            evaluator_cls = registry.get_evaluator_class(evaluator_id)
            evaluator = evaluator_cls(evaluator_id, **evaluator_kwargs)
            evaluator_cls_names.append(evaluator_cls.__name__)
            evaluator_seq.append(evaluator)
        self.evaluator_seq = evaluator_seq
        self.keyname_prefix_seq = self.create_sequence_list(evaluator_cls_names)

    def create_sequence_list(self, input_list: List[str]) -> List[str]:
        result = []
        current = ""
        for item in input_list:
            if current:
                current += f"->{item}"
            else:
                current = item
            result.append(current)
        return result

    def eval(self, preds: Sequence[Any], labels: Optional[Sequence[Any]] = None, extras: Optional[Sequence[Any]] = None, **kwargs) -> Dict[str, Union[Sequence, float]]:
        """
        Evaluate pipeline including data processing and metrics calculation.

        Arguments:
            preds: responses from chatmodels
            labels: groundtruth labels
            extras: extra parameters

        Return:
            results
        """
        prefix_results = {}
        seq_len = len(self.evaluator_seq)
        for evaluator_idx, (evaluator, keyname_prefix) in enumerate(zip(self.evaluator_seq, self.keyname_prefix_seq)):
            if evaluator_idx < seq_len - 1:
                preds, labels, extras = evaluator.process(preds, labels, extras)
                prefix_results.update({f"{keyname_prefix}:pred_no_op": preds})
            else:
                # final evaluator
                results = evaluator(preds, labels, extras)
                prefix_results.update({f"{keyname_prefix}:{key}": value for key, value in results.items()})

        return prefix_results

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        return self.eval(*args, **kwds)
```
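For example, chaining a chat-model preprocessing step with a rule-based scorer might be configured as follows (a sketch of the config format only: `chatmodel_text_extract` is a placeholder id, and the resulting key prefixes depend on the actual evaluator class names):

```python
# {evaluator_id: evaluator_kwargs, ...} as consumed by SequentialEvaluator.
evaluator_seq_cfg = {
    "chatmodel_text_extract": {"metrics_cfg": {}},   # placeholder id for a chat-model preprocessing evaluator
    "rule_match_and_score_eval": {"metrics_cfg": {"pearson_corr": {}, "failure": {}}},
}

seq_evaluator = SequentialEvaluator(evaluator_seq_cfg)
results = seq_evaluator(
    preds=["I would rate this 4 out of 5.", "Sorry, I cannot help with that."],
    labels=[4, 2],
)
# Result keys are prefixed by the evaluator chain, e.g. something like
# "ChatModelEvaluator->RuleEvaluator:pearson_corr".
```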
## Metrics

We pre-define some common metrics for users to call. These metric functions take two array-like arguments of numbers and compute statistical or sample-wise results. We also cover cases where simple aggregation operations are needed, e.g., sum and mean.

Source code in `mmte/evaluators/metrics.py`:
"""
+Input Requirement
+y_true: 1d array-like
+y_pred: 1d array-like
+"""
+
+_supported_metrics = {
+ # aggregation op
+ "pred_no_op": pred_no_op,
+ "pred_sum": pred_sum,
+ "pred_mean": pred_mean,
+
+ # general metrics
+ "accuracy_score": accuracy_score,
+ "precision_score": precision_score,
+ "recall_score": recall_score,
+ "f1_score": f1_score,
+ "pearson_corr": pearson_corr,
+ "failure": failure,
+}
+
+
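To add a custom metric, a function following the same `(y_true, y_pred)` convention can be defined and registered in `_supported_metrics`; a hypothetical example:

```python
# A hypothetical custom metric following the stated input requirement
# (y_true and y_pred are 1d array-like); registering it in _supported_metrics
# makes it available to any evaluator's metrics_cfg.
def exact_match_rate(y_true, y_pred, **kwargs) -> float:
    assert len(y_true) == len(y_pred)
    return sum(int(t == p) for t, p in zip(y_true, y_pred)) / len(y_true)

_supported_metrics["exact_match_rate"] = exact_match_rate   # hypothetical metrics_id
```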
else if (lang.length > 1) { + this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility + } + this.field('title'); + this.field('text'); + this.ref('location'); + + for (var i=0; i < data.docs.length; i++) { + var doc = data.docs[i]; + this.add(doc); + documents[doc.location] = doc; + } + }); + console.log('Lunr index built, search ready'); + } + allowSearch = true; + postMessage({config: data.config}); + postMessage({allowSearch: allowSearch}); +} + +function init () { + var oReq = new XMLHttpRequest(); + oReq.addEventListener("load", onJSONLoaded); + var index_path = base_path + '/search_index.json'; + if( 'function' === typeof importScripts ){ + index_path = 'search_index.json'; + } + oReq.open("GET", index_path); + oReq.send(); +} + +function search (query) { + if (!allowSearch) { + console.error('Assets for search still loading'); + return; + } + + var resultDocuments = []; + var results = index.search(query); + for (var i=0; i < results.length; i++){ + var result = results[i]; + doc = documents[result.ref]; + doc.summary = doc.text.substring(0, 200); + resultDocuments.push(doc); + } + return resultDocuments; +} + +if( 'function' === typeof importScripts ) { + onmessage = function (e) { + if (e.data.init) { + init(); + } else if (e.data.query) { + postMessage({ results: search(e.data.query) }); + } else { + console.error("Worker - Unrecognized message: " + e); + } + }; +} diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..0f8724e --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,3 @@ + +./
├── mmte
│   ├── __init__.py
│   ├── configs
│   │   ├── datasets/*.yaml
│   │   ├── models/*/*.yaml
│   │   └── task/*/*.yaml
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── *.py
│   ├── evaluators
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── metrics.py
│   │   └── *.py
│   ├── methods
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── *.py
│   ├── models
│   │   ├── [model_dependence]
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── *.py
│   ├── tasks/base.py
│   └── utils/
├── scripts/
│   ├── run/
│   └── score/
├── docs/
├── data/
├── env/
├── LICENSE
├── README.md
└── run_task.py
Task Workflow
The basic workflow of a task in MMTrustEval follows the pipeline above. The image-text pairs (or text-only samples) are retrieved from the customized dataset. They may be further processed with a pre-defined method (e.g., pairing the text with synthesized images, or imposing adversarial noise on the images) via the method_hook passed into the dataset. Data in multiple modalities is gathered into a dataclass, TxtSample or ImageTxtSample. The samples ready for inference are then fed to the MLLMs through the unified chat interface. Finally, the generated content is processed by diverse evaluators (e.g., keyword extraction, GPT-4 rating, a classifier) and standardized so that it can be scored with the specified metrics (e.g., accuracy, Pearson correlation coefficient).
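To make the data flow concrete, the sketch below builds an ImageTxtSample and passes it through a toy preprocessing hook. Only the dataclass fields come from the framework; the image path, the prompt, and the hook function are hypothetical stand-ins for the real Method classes under mmte/methods/, whose interface is not shown here.

from mmte import ImageTxtSample

# Illustrative sample; the image path and prompt are placeholders.
sample = ImageTxtSample(
    image_path="data/images/example_0001.jpg",
    text="What is happening in this image?",
)

# Toy stand-in for a method hook. A real Method (mmte/methods/) would, e.g.,
# write a perturbed copy of the image and return a sample pointing to it.
def toy_hook(s: ImageTxtSample) -> ImageTxtSample:
    return ImageTxtSample(image_path=s.image_path, text=s.text,
                          target=s.target, extra=s.extra)

processed = toy_hook(sample)
print(processed.to_dict())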
Task
A task powered by MMTrustEval is a combination of the aforementioned modules: a certain model is evaluated on a certain dataset with certain metrics. Here, we introduce the basic logic of a task workflow in MMTrustEval and how to construct a task via a configuration file.
Note that the purpose of MMTrustEval is to provide modularized tools for developing new tasks that evaluate the trustworthiness of MLLMs, not to restrict implementations to this modularized configuration. While the tasks in MultiTrust are organized with these modules, users are free to implement their own tasks in a more customized style and use only one or a few modules, such as the unified models and the off-the-shelf evaluators, to facilitate their evaluation.
Task Pipeline
We provide a BaseTask class that organizes the modules in a standardized logic and offers a way for modular customization. The modules for the different elements are instantiated automatically from the configuration in a yaml file or from command-line arguments, and the user can launch the evaluation with a single command.
Source code in mmte/tasks/base.py
class BaseTask(ABC):
    def __init__(self, dataset_id: str, model_id: str, method_cfg: Optional[Dict] = {},
                 dataset_cfg: Optional[Dict] = {}, evaluator_seq_cfgs: List = [],
                 log_file: Optional[str] = None) -> None:
        self.dataset_id = dataset_id
        self.model_id = model_id
        self.method_cfg = method_cfg
        self.dataset_cfg = dataset_cfg
        self.evaluator_seq_cfgs = evaluator_seq_cfgs
        self.log_file = log_file

    def pipeline(self) -> None:
        self.get_handlers()                    # automatic module instantiation
        dataloader = self.get_dataloader()
        responses = self.generate(dataloader)  # unified model inference
        results = self.eval(responses)         # response postprocessing and metrics computation
        self.save_results(results)             # result logging
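For orientation, here is a minimal sketch of driving this pipeline directly from Python. It assumes that BaseTask (or, more likely, a concrete subclass of it, since BaseTask is an ABC) can be instantiated with the arguments listed in the next section; in practice run_task.py builds the task from a yaml configuration instead. The IDs and paths are taken from the tutorial below.

from mmte.tasks.base import BaseTask

task = BaseTask(
    dataset_id="confaide-unrelated-image-color",
    model_id="llava-v1.5-7b",
    log_file="./logs/privacy/infoflow.json",
    evaluator_seq_cfgs=[
        {"rule_match_and_score_eval": {"metrics_cfg": {"pearson_corr": {}, "failure": {}}}},
    ],
)
task.pipeline()  # get_handlers -> generate -> eval -> save_results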
Task Configuration
model_id: The ID of the model
dataset_id: The ID of the dataset
log_file: The file path for the output JSON file
method_cfg: Configuration for the method, formatted as {method_id: method_kwargs, ...}. Default: {}
dataset_cfg: Additional parameters required for initializing the dataset, formatted as dataset_kwargs. Default: {}
generation_kwargs: Additional parameters used for inference with the chat model. Default: {}
evaluator_seq_cfgs: Configuration for the evaluator sequences. Each list item represents a different evaluator sequence; each sequence contains multiple evaluators, and each evaluator corresponds to multiple metrics. Default: [] (the schema is shown next, followed by a concrete example)

List[
    {evaluator_id:
        {metrics_cfg:
            {metrics_id: metrics_kwargs},
            {metrics_id: metrics_kwargs},
            ...
        },
        ...
    },
    ...
]
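As a concrete example, the evaluator sequence used by the privacy infoflow task in the tutorial below should correspond to the following Python value once the yaml configuration is parsed (shown only to make the nesting explicit):

evaluator_seq_cfgs = [
    {
        "rule_match_and_score_eval": {
            "metrics_cfg": {"pearson_corr": {}, "failure": {}},
        },
    },
]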
Tutorial to Add Tasks
Using the example of Privacy Task P3: infoflow-expectation with the dataset confaide-unrelated-image-color:
Define the Dataset
Define the dataset and register the dataset_id: confaide-unrelated-image-color (refer to mmte/datasets/confaide.py). If you need to apply some preprocessing to your custom dataset, define a Method and use it as a hook function in your dataset; a minimal sketch of the dataset side is given below.
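The following is a minimal, framework-agnostic sketch of what a dataset needs to provide: an item getter that returns the dataclasses and a length. In MMTrustEval you would subclass mmte.datasets.base.BaseDataset instead, declare its supported dataset_ids, accept an optional method_hook, and register the class into the global registry; the image paths and prompts here are placeholders.

from torch.utils.data import DataLoader, Dataset

from mmte import ImageTxtSample
from mmte.datasets.base import collate_fn  # default collate_fn for the dataclasses

class ToyColorDataset(Dataset):
    """Hypothetical dataset pairing unrelated color images with prompts."""

    def __init__(self) -> None:
        # Placeholder annotations; a real dataset would load these from disk.
        self.annotations = [
            {"image_path": "data/colors/blue.png", "text": "How much do people agree with this statement?"},
            {"image_path": "data/colors/red.png", "text": "How much do people agree with this statement?"},
        ]

    def __getitem__(self, index: int) -> ImageTxtSample:
        ann = self.annotations[index]
        return ImageTxtSample(image_path=ann["image_path"], text=ann["text"])

    def __len__(self) -> int:
        return len(self.annotations)

loader = DataLoader(ToyColorDataset(), batch_size=2, collate_fn=collate_fn)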
Define the Evaluator
Define the evaluator and register the evaluator_id: rule_match_and_score_eval (refer to mmte/evaluators/rule_eval.py). If additional metrics are required, modify mmte/evaluators/metrics.py and register the metrics_fn in _supported_metrics; a hedged sketch of a custom metric follows.
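The sketch below shows what registering an extra metric might look like. The metric name, its signature (parallel sequences of ground-truth labels and predictions returning a float), and the exact shape of the _supported_metrics registry are assumptions; check mmte/evaluators/metrics.py for the real conventions before copying this.

from typing import Sequence

def exact_match_rate(targets: Sequence[str], predictions: Sequence[str]) -> float:
    """Hypothetical metric: fraction of predictions matching the label exactly."""
    assert len(targets) == len(predictions)
    hits = sum(t.strip().lower() == p.strip().lower() for t, p in zip(targets, predictions))
    return hits / max(len(targets), 1)

# Registration, assuming _supported_metrics maps metrics_id -> metrics_fn:
# _supported_metrics["exact_match_rate"] = exact_match_rate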
Edit task_config.yaml
(refer to mmte/configs/task/privacy/infoflow.yaml)
dataset_id: "confaide-unrelated-image-color"
+model_id: "llava-v1.5-7b"
+
+log_file: "./logs/privacy/infoflow.json"
+
+evaluator_seq_cfgs:
+ [
+ {
+ "rule_match_and_score_eval":
+ { metrics_cfg: { pearson_corr: {}, failure: {} } },
+ },
+ ]
+
+
Run with a Single Command
python run_task.py --config mmte/configs/task/privacy/infoflow.yaml

To modify configurations without changing the yaml file, one can ADD or OVERWRITE configurations in yaml files using the --cfg-options parameter. For example:
python run_task.py --config mmte/configs/task/privacy/infoflow.yaml --cfg-options dataset_id=confaide-image log_file="logs/privacy/infoflow-confaide-image.json"