From dcafcb017cbee8741d93b3486198885b41caeb44 Mon Sep 17 00:00:00 2001 From: facebook-github-bot Date: Thu, 26 Sep 2024 01:52:22 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20pytorch/?= =?UTF-8?q?FBGEMM@b1523395c85de0f7bf512d574eb2af5efb60ef33=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- _modules/fbgemm_gpu/docs/examples.html | 3 +- ...table_batched_embeddings_ops_training.html | 375 ++++++--- _modules/index.html | 3 +- .../jagged_tensor_ops.rst.txt | 2 + .../pooled_embedding_ops.rst.txt | 6 + .../table_batched_embedding_ops.rst.txt | 11 +- _sources/index.rst.txt | 1 + fbgemm-cpp-api/QuantUtils.html | 3 +- fbgemm-cpp-api/tbe_cpu_autovec.html | 3 +- fbgemm-development/BuildInstructions.html | 3 +- fbgemm_gpu-cpp-api/embedding_ops.html | 3 +- fbgemm_gpu-cpp-api/experimental_ops.html | 7 +- fbgemm_gpu-cpp-api/input_combine.html | 3 +- fbgemm_gpu-cpp-api/jagged_tensor_ops.html | 3 +- fbgemm_gpu-cpp-api/layout_transform_ops.html | 3 +- fbgemm_gpu-cpp-api/memory_utils.html | 3 +- .../merge_pooled_embeddings.html | 3 +- fbgemm_gpu-cpp-api/quantize_ops.html | 3 +- fbgemm_gpu-cpp-api/sparse_ops.html | 3 +- .../split_table_batched_embeddings.html | 3 +- fbgemm_gpu-cpp-api/ssd_embedding_ops.html | 3 +- fbgemm_gpu-development/BuildInstructions.html | 3 +- .../InstallationInstructions.html | 3 +- fbgemm_gpu-development/TestInstructions.html | 3 +- .../jagged-tensor-ops/JaggedTensorOps.html | 3 +- fbgemm_gpu-python-api/jagged_tensor_ops.html | 275 ++++++- .../pooled_embedding_ops.html | 777 ++++++++++++++++++ .../table_batched_embedding_ops.html | 229 +++--- general/ContactUs.html | 3 +- general/Contributing.html | 3 +- general/License.html | 3 +- general/documentation/Cpp.html | 3 +- general/documentation/Overview.html | 3 +- general/documentation/Python.html | 3 +- general/documentation/Sphinx.html | 3 +- genindex.html | 73 +- index.html | 6 +- objects.inv | Bin 16438 -> 16575 bytes output.json | 52 +- output.txt | 2 +- py-modindex.html | 754 +++++++++++++++++ search.html | 3 +- searchindex.js | 2 +- 43 files changed, 2334 insertions(+), 319 deletions(-) create mode 100644 _sources/fbgemm_gpu-python-api/pooled_embedding_ops.rst.txt create mode 100644 fbgemm_gpu-python-api/pooled_embedding_ops.html create mode 100644 py-modindex.html diff --git a/_modules/fbgemm_gpu/docs/examples.html b/_modules/fbgemm_gpu/docs/examples.html index 61bd07e83..0f3eb51dc 100644 --- a/_modules/fbgemm_gpu/docs/examples.html +++ b/_modules/fbgemm_gpu/docs/examples.html @@ -336,8 +336,9 @@

FBGEMM_GPU Python API

diff --git a/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_training.html b/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_training.html index d6fa0a93b..f429084a4 100644 --- a/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_training.html +++ b/_modules/fbgemm_gpu/split_table_batched_embeddings_ops_training.html @@ -336,8 +336,9 @@

FBGEMM_GPU Python API

@@ -768,7 +769,7 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< (5) `MTIA` = placing an embedding table in the MTIA memory - Available `ComputeDevice`options are + Available `ComputeDevice` options are (1) `CPU` = performing table lookup on CPU @@ -962,86 +963,9 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< A config for global weight decay uvm_host_mapped (bool = False): If True, allocate every UVM tensor - using `malloc`+`cudaHostRegister`. Otherwise use + using `malloc` + `cudaHostRegister`. Otherwise use `cudaMallocManaged` - - Inputs: - indices (torch.Tensor): A 1D-tensor that contains indices to be accessed - in all embedding table - - offsets (torch.Tensor): A 1D-tensor that conatins offsets of indices. - Shape `(B * T + 1)` where `B` = batch size and `T` = number of tables. - `offsets[t * B + b + 1] - offsets[t * B + b]` is the length of bag `b` - of table `t` - - per_sample_weights (torch.Tensor): An optional 1D-tensor that contains - positional weights. Shape `(max(bag length))`. Positional weight `i` is - multiplied to all columns of row `i` in each bag after its read from the - embedding table and before pooling (if pooling mode is not - PoolingMode.NONE). - - feature_requires_grad (torch.Tensor): An optional tensor for checking if - `per_sample_weights` requires gradient - - Returns: - A 2D-tensor containing looked up data. Shape `(B, total_D)` where `B` = - batch size and `total_D` = the sum of all embedding dimensions in the - table - - Example: - >>> import torch - >>> - >>> from fbgemm_gpu.split_table_batched_embeddings_ops_common import ( - >>> EmbeddingLocation, - >>> ) - >>> from fbgemm_gpu.split_table_batched_embeddings_ops_training import ( - >>> SplitTableBatchedEmbeddingBagsCodegen, - >>> ComputeDevice, - >>> ) - >>> - >>> # Two tables - >>> embedding_specs = [ - >>> (3, 8, EmbeddingLocation.DEVICE, ComputeDevice.CUDA), - >>> (5, 4, EmbeddingLocation.MANAGED, ComputeDevice.CUDA) - >>> ] - >>> - >>> tbe = SplitTableBatchedEmbeddingBagsCodegen(embedding_specs) - >>> tbe.init_embedding_weights_uniform(-1, 1) - >>> - >>> print(tbe.split_embedding_weights()) - [tensor([[-0.9426, 0.7046, 0.4214, -0.0419, 0.1331, -0.7856, -0.8124, -0.2021], - [-0.5771, 0.5911, -0.7792, -0.1068, -0.6203, 0.4813, -0.1677, 0.4790], - [-0.5587, -0.0941, 0.5754, 0.3475, -0.8952, -0.1964, 0.0810, -0.4174]], - device='cuda:0'), tensor([[-0.2513, -0.4039, -0.3775, 0.3273], - [-0.5399, -0.0229, -0.1455, -0.8770], - [-0.9520, 0.4593, -0.7169, 0.6307], - [-0.1765, 0.8757, 0.8614, 0.2051], - [-0.0603, -0.9980, -0.7958, -0.5826]], device='cuda:0')] - - - >>> # Batch size = 3 - >>> indices = torch.tensor([0, 1, 2, 0, 1, 2, 0, 3, 1, 4, 2, 0, 0], - >>> device="cuda", - >>> dtype=torch.long) - >>> offsets = torch.tensor([0, 2, 5, 7, 9, 12, 13], - >>> device="cuda", - >>> dtype=torch.long) - >>> - >>> output = tbe(indices, offsets) - >>> - >>> # Batch size = 3, total embedding dimension = 12 - >>> print(output.shape) - torch.Size([3, 12]) - - >>> print(output) - tensor([[-1.5197, 1.2957, -0.3578, -0.1487, -0.4873, -0.3044, -0.9801, 0.2769, - -0.7164, 0.8528, 0.7159, -0.6719], - [-2.0784, 1.2016, 0.2176, 0.1988, -1.3825, -0.5008, -0.8991, -0.1405, - -1.2637, -0.9427, -1.8902, 0.3754], - [-1.5013, 0.6105, 0.9968, 0.3057, -0.7621, -0.9821, -0.7314, -0.6195, - -0.2513, -0.4039, -0.3775, 0.3273]], device='cuda:0', - grad_fn=<CppNode<SplitLookupFunction_sgd_Op>>) """ embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]] @@ -1712,10 +1636,19 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< self._debug_print_input_stats_factory() ) -
[docs] @torch.jit.ignore + @torch.jit.ignore def log(self, msg: str) -> None: - """Log with TBE id prefix to distinguish between multiple TBE instances per process.""" - logging.info(f"[TBE={self.uuid}] {msg}")
+ """ + Log with TBE id prefix to distinguish between multiple TBE instances + per process + + Args: + msg (str): The message to print + + Returns: + None + """ + logging.info(f"[TBE={self.uuid}] {msg}") def _register_nonpersistent_buffers(self, prefix: str) -> None: # NOTE: make TorchScript work! @@ -1745,12 +1678,18 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< persistent=False, ) -
[docs] @staticmethod + @staticmethod def get_table_name_for_logging(table_names: Optional[List[str]]) -> str: """ - Given list of all table names in the TBE, generate a string to represent - them in logging. If there's more than one table, this method will count - them than list them. + Given a list of all table names in the TBE, generate a string to + represent them in logging. If there is more than one table, this method + will count them rather than list them. + + Args: + table_names (Optional[List[str]]): A list of table names in the TBE + + Returns: + A string that represents the tables in logging """ if table_names is None: return "<Unknown>" @@ -1759,22 +1698,38 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< table_name_set = set(table_names) if len(table_name_set) == 1: return next(iter(table_name_set)) - return f"<{len(table_name_set)} tables>"
+ return f"<{len(table_name_set)} tables>" -
[docs] @staticmethod + @staticmethod def get_prefetch_passes( multipass_prefetch_config: Optional[MultiPassPrefetchConfig], input_tensor: Tensor, output_tensor: Tensor, ) -> List[Tuple[Tensor, Tensor, int]]: """ - Given input (the indices to forward), return the segmentation for each pass - in the format of (input[start_idx:end_idx], output[start_idx:end_idx], start_idx). + Given inputs (the indices to forward), partition the input and output + into smaller chunks and return them as a list of tuples + (input[start_idx:end_idx], output[start_idx:end_idx], start_idx). + + The caller must guarantee that input and output have non-zero dimension + 0. The returned segments are guaranteed to completely and + non-overlappingly cover the input tensor. + + In non-multipass-prefetch mode, it returns the input/output tensor + itself. + + Args: + multipass_prefetch_config (Optional[MultiPassPrefetchConfig]): + A config for multi-pass cache prefetch. If None, multi-pass + prefetch is not used. + + input_tensor (Tensor): The input tensor to be partitioned - Caller should guarantee input and output are having the size on dimension 0 - The returned segments are guaranteed to completely and non-overlappingly cover the input tensor. + output_tensor (Tensor): The output tensor to be partitioned - In non-multipass-prefetch mode, it returns the input/output tensor itself. + Returns: + A list of partitioned inputs and outputs (List[Tuple[Tensor, + Tensor, int]]) """ if multipass_prefetch_config is None: return [(input_tensor, output_tensor, 0)] @@ -1796,9 +1751,32 @@
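A sketch of the non-multipass case shown above (tensor contents are arbitrary):

>>> import torch
>>> inp = torch.arange(6)
>>> out = torch.empty_like(inp)
>>> passes = SplitTableBatchedEmbeddingBagsCodegen.get_prefetch_passes(None, inp, out)
>>> len(passes), passes[0][2]  # a single pass covering the whole input, starting at index 0
(1, 0)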

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< torch.split(output_tensor, pass_size), range(0, N, pass_size), ) - )
+ ) def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + """ + Get the state tensors for a given `prefix` + + Args: + prefix (str): The prefix of the state to obtain + + Returns: + A tuple of tensors corresponding to the obtained state containing + + (1) A GPU state tensor + + (2) A CPU state tensor + + (3) A UVM state tensor + + (4) A placement tensor - containing placements of embedding tables + (torch.int32 tensor). (0 = DEVICE, 1 = MANAGED, 2 = + MANAGED_CACHING, 3 = HOST, 4 = MTIA) + + (5) An offset tensor - containing the relative positions of + embedding tables in the corresponding state tensor (GPU, CPU, + or UVM state tensor) + """ if not hasattr(self, f"{prefix}_physical_placements"): raise DoesNotHavePrefix() dev_param = getattr(self, f"{prefix}_dev") @@ -1815,6 +1793,15 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< ) def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]: + """ + Get all states in the TBE (`weights`, `momentum1`, `momentum2`, + `prev_iter`, and `row_counter`) + + Returns: + A list of states. Each state is a tuple of tensors (GPU state + tensor, CPU state tensor, UVM state tensor, placement tensor and + offset tensor) + """ all_states = [] for prefix in ["weights", "momentum1", "momentum2", "prev_iter", "row_counter"]: try: @@ -1825,16 +1812,29 @@
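A hedged sketch of walking the returned tuples (assumes a constructed `tbe`; this is an internal helper, shown only to illustrate the tuple layout documented above):

>>> for dev_t, host_t, uvm_t, placements, offsets in tbe.get_all_states():
>>>     print(dev_t.numel(), host_t.numel(), uvm_t.numel(), placements.tolist())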

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< @torch.jit.export def get_cache_miss_counter(self) -> Tensor: - # cache_miss_counter contains two items: - # The first one is cache_miss_forward_count which records the total number of forwards which has at least one cache miss - # The second one is the unique_cache_miss_count which records to total number of unique (dedup) cache misses + """ + Get the cache miss counter. `cache_miss_counter` contains two items: + (1) `cache_miss_forward_count` which records the total number of + forwards which have at least one cache miss + + (2) `unique_cache_miss_count` which records the total number of unique + (dedup) cache misses + + Returns: + The cache miss counter + """ return self.cache_miss_counter @torch.jit.export def get_table_wise_cache_miss(self) -> Tensor: - # table_wise_cache_miss contains all the cache miss count for each table in this embedding table object: + """ + Get the table-wise cache miss tensor. `table_wise_cache_miss` contains + the cache miss counts for each table in this embedding table object + + Returns: + The table-wise cache miss tensor + """ return self.table_wise_cache_miss # The callback function for AsyncTimer to record duration to different event @@ -1959,11 +1959,122 @@
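A hedged sketch of reading these counters (assumes a `tbe` built with `EmbeddingLocation.MANAGED_CACHING` and cache metrics recording enabled so the counters are populated):

>>> miss_fwd_count, unique_miss_count = tbe.get_cache_miss_counter()
>>> per_table_misses = tbe.get_table_wise_cache_miss()  # one count per table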

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< offsets: Tensor, per_sample_weights: Optional[Tensor] = None, feature_requires_grad: Optional[Tensor] = None, - # 2D tensor of batch size for each rank and feature. - # Shape (number of features, number of ranks) batch_size_per_feature_per_rank: Optional[List[List[int]]] = None, total_unique_indices: Optional[int] = None, ) -> Tensor: + """ + The forward pass function that + + (1) Performs input bound checking + + (2) Generates necessary variable batch size embedding (VBE) metadata (if + VBE is used) + + (3) Prefetches data from UVM to cache (if + `EmbeddingLocation.MANAGED_CACHING` is used and the user has not + explicitly prefetched data) + + (4) Performs the embedding table lookup by invoking a corresponding + Autograd function (based on the chosen optimizer) + + Args: + indices (Tensor): A 1D-tensor that contains indices to be looked up + from all embedding tables + + offsets (Tensor): A 1D-tensor that contains offsets of indices. + Shape `(B * T + 1)` where `B` = batch size and `T` = the number + of features. `offsets[t * B + b + 1] - offsets[t * B + b]` is + the length of bag `b` of feature `t` + + per_sample_weights (Optional[Tensor]): An optional 1D-float-tensor that + contains per sample weights. If None, **unweighted** embedding + lookup will be performed. Otherwise, **weighted** embedding lookup + will be used. The length of this tensor must be the same as the + length of the `indices` tensor. The value of `per_sample_weights[i]` + will be used to multiply every element in the looked-up row + `indices[i]`, where `0 <= i < len(per_sample_weights)`. + + feature_requires_grad (Optional[Tensor]): An optional 1D-tensor for + indicating if `per_sample_weights` requires gradient. The + length of the tensor must be equal to the number of features + + batch_size_per_feature_per_rank (Optional[List[List[int]]]): An + optional 2D-tensor that contains batch sizes for every rank and + every feature. If None, TBE assumes that **every feature has the + same batch size** and computes the batch size from the `offsets` + shape. Otherwise, TBE assumes that different features can have + different batch sizes and uses the **variable batch size + embedding lookup mode (VBE)**. Shape (number of features, + number of ranks). `batch_size_per_feature_per_rank[f][r]` + represents the batch size of feature `f` and rank `r` + + total_unique_indices (Optional[int]): An optional integer that + represents the total number of unique indices. This value must + be set when using `OptimType.NONE`. This is because TBE + requires this information for allocating the weight gradient + tensor in the backward pass. + + Returns: + A 2D-tensor containing looked up data. 
Shape `(B, total_D)` where `B` = + batch size and `total_D` = the sum of all embedding dimensions in the + table + + Example: + + >>> import torch + >>> + >>> from fbgemm_gpu.split_table_batched_embeddings_ops_common import ( + >>> EmbeddingLocation, + >>> ) + >>> from fbgemm_gpu.split_table_batched_embeddings_ops_training import ( + >>> SplitTableBatchedEmbeddingBagsCodegen, + >>> ComputeDevice, + >>> ) + >>> + >>> # Two tables + >>> embedding_specs = [ + >>> (3, 8, EmbeddingLocation.DEVICE, ComputeDevice.CUDA), + >>> (5, 4, EmbeddingLocation.MANAGED, ComputeDevice.CUDA) + >>> ] + >>> + >>> tbe = SplitTableBatchedEmbeddingBagsCodegen(embedding_specs) + >>> tbe.init_embedding_weights_uniform(-1, 1) + >>> + >>> print(tbe.split_embedding_weights()) + [tensor([[-0.9426, 0.7046, 0.4214, -0.0419, 0.1331, -0.7856, -0.8124, -0.2021], + [-0.5771, 0.5911, -0.7792, -0.1068, -0.6203, 0.4813, -0.1677, 0.4790], + [-0.5587, -0.0941, 0.5754, 0.3475, -0.8952, -0.1964, 0.0810, -0.4174]], + device='cuda:0'), tensor([[-0.2513, -0.4039, -0.3775, 0.3273], + [-0.5399, -0.0229, -0.1455, -0.8770], + [-0.9520, 0.4593, -0.7169, 0.6307], + [-0.1765, 0.8757, 0.8614, 0.2051], + [-0.0603, -0.9980, -0.7958, -0.5826]], device='cuda:0')] + + + >>> # Batch size = 3 + >>> indices = torch.tensor([0, 1, 2, 0, 1, 2, 0, 3, 1, 4, 2, 0, 0], + >>> device="cuda", + >>> dtype=torch.long) + >>> offsets = torch.tensor([0, 2, 5, 7, 9, 12, 13], + >>> device="cuda", + >>> dtype=torch.long) + >>> + >>> output = tbe(indices, offsets) + >>> + >>> # Batch size = 3, total embedding dimension = 12 + >>> print(output.shape) + torch.Size([3, 12]) + + >>> print(output) + tensor([[-1.5197, 1.2957, -0.3578, -0.1487, -0.4873, -0.3044, -0.9801, 0.2769, + -0.7164, 0.8528, 0.7159, -0.6719], + [-2.0784, 1.2016, 0.2176, 0.1988, -1.3825, -0.5008, -0.8991, -0.1405, + -1.2637, -0.9427, -1.8902, 0.3754], + [-1.5013, 0.6105, 0.9968, 0.3057, -0.7621, -0.9821, -0.7314, -0.6195, + -0.2513, -0.4039, -0.3775, 0.3273]], device='cuda:0', + grad_fn=<CppNode<SplitLookupFunction_sgd_Op>>) + + """ ( indices, offsets, @@ -2487,13 +2598,13 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< if self.should_log(): self.print_uvm_cache_stats(use_local_cache=False) -
[docs] def should_log(self) -> bool: + def should_log(self) -> bool: """Determines if we should log for this step, using exponentially decreasing frequency. Logs for steps: 100 200 ... 1,000 2,000 ... 10,000 20,000 ... 100,000 200,000 ... """ s = self.step + 1 # step starts at 0 - return s >= 100 and s % (10 ** int(math.log10(s))) == 0
+ return s >= 100 and s % (10 ** int(math.log10(s))) == 0 def _prefetch_tensors_record_stream( self, forward_stream: torch.cuda.Stream @@ -2572,7 +2683,10 @@
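The logging schedule can be verified directly from the expression above (pure Python, no TBE instance required):

>>> import math
>>> [s for s in range(1, 1001) if s >= 100 and s % (10 ** int(math.log10(s))) == 0]
[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]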

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

<
[docs] @torch.jit.ignore def split_embedding_weights(self) -> List[Tensor]: """ - Returns a list of weights, split by table + Returns a list of embedding weights (views), split by table + + Returns: + A list of weights. Length = the number of tables """ splits = [] for t, (rows, dim, _, _) in enumerate(self.embedding_specs): @@ -2604,7 +2718,7 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< return buffer return torch.tensor(0) -
[docs] @torch.jit.export + @torch.jit.export def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: r""" Get the optimizer state dict that matches the OSS Pytorch optims @@ -2656,14 +2770,47 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< f"Getting optimizer state {self.optimizer} is not implmeneted" ) - return list_of_state_dict
+ return list_of_state_dict
[docs] @torch.jit.ignore def split_optimizer_states( self, ) -> List[List[torch.Tensor]]: """ - Returns a list of states, split by table + Returns a list of optimizer states (views), split by table + + Returns: + A list of lists of states. Shape = (the number of tables, the number + of states). + + The following shows the list of states (in the returned order) for + each optimizer: + + (1) `ADAM`: `momentum1`, `momentum2` + + (2) `EXACT_ADAGRAD`: `momentum1` + + (3) `EXACT_ROWWISE_ADAGRAD`: `momentum1` (rowwise), `prev_iter` + (rowwise; only when using `WeightDecayMode` = `COUNTER` or + `COWCLIP` or `global_weight_decay` is not None), `row_counter` + (rowwise; only when using `WeightDecayMode` = `COUNTER` or + `COWCLIP`) + + (4) `EXACT_SGD`: no states + + (5) `LAMB`: `momentum1`, `momentum2` + + (6) `LARS_SGD`: `momentum1` + + (7) `PARTIAL_ROWWISE_ADAM`: `momentum1`, `momentum2` (rowwise) + + (8) `PARTIAL_ROWWISE_LAMB`: `momentum1`, `momentum2` (rowwise) + + (9) `ENSEMBLE_ROWWISE_ADAGRAD`: `momentum2` (rowwise), `momentum1`, + `prev_iter` (rowwise), `row_counter` (rowwise) + + (10) `NONE`: no states (raises an error) + """ if self.optimizer == OptimType.NONE: raise NotImplementedError( @@ -2777,6 +2924,9 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< def set_learning_rate(self, lr: float) -> None: """ Sets the learning rate. + + Args: + lr (float): The learning rate value to set """ if self.optimizer == OptimType.NONE: raise NotImplementedError( @@ -2788,6 +2938,10 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< def update_hyper_parameters(self, params_dict: Dict[str, float]) -> None: """ Sets hyper-parameters from external control flow. + + Args: + params_dict (Dict[str, float]): The dict that contains the + hyper-parameter names and their values """ if self.optimizer == OptimType.NONE: raise NotImplementedError( @@ -2824,6 +2978,9 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< def set_optimizer_step(self, step: int) -> None: """ Sets the optimizer step. + + Args: + step (int): The step value to set """ self.log(f"set_optimizer_step from {self.iter[0]} to {step}") if self.optimizer == OptimType.NONE: @@ -3275,7 +3432,7 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< total_cache_hash_size=total_cache_hash_size, ) -
[docs] def prepare_inputs( + def prepare_inputs( self, indices: Tensor, offsets: Tensor, @@ -3340,7 +3497,7 @@

Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

< max_B=vbe_metadata.max_B, ) - return indices, offsets, per_sample_weights, vbe_metadata
+ return indices, offsets, per_sample_weights, vbe_metadata def _debug_print_input_stats_factory(self) -> Callable[..., None]: """ diff --git a/_modules/index.html b/_modules/index.html index 41a397f8c..6d72bd7d9 100644 --- a/_modules/index.html +++ b/_modules/index.html @@ -336,8 +336,9 @@

FBGEMM_GPU Python API

diff --git a/_sources/fbgemm_gpu-python-api/jagged_tensor_ops.rst.txt b/_sources/fbgemm_gpu-python-api/jagged_tensor_ops.rst.txt index ca1cbe522..92e8f1148 100644 --- a/_sources/fbgemm_gpu-python-api/jagged_tensor_ops.rst.txt +++ b/_sources/fbgemm_gpu-python-api/jagged_tensor_ops.rst.txt @@ -1,6 +1,8 @@ Jagged Tensor Operators ======================= +.. automodule:: fbgemm_gpu + .. autofunction:: torch.ops.fbgemm.jagged_2d_to_dense .. autofunction:: torch.ops.fbgemm.jagged_1d_to_dense diff --git a/_sources/fbgemm_gpu-python-api/pooled_embedding_ops.rst.txt b/_sources/fbgemm_gpu-python-api/pooled_embedding_ops.rst.txt new file mode 100644 index 000000000..519b74e6b --- /dev/null +++ b/_sources/fbgemm_gpu-python-api/pooled_embedding_ops.rst.txt @@ -0,0 +1,6 @@ +Pooled Embedding Operators +========================== + +.. automodule:: fbgemm_gpu + +.. autofunction:: torch.ops.fbgemm.merge_pooled_embeddings diff --git a/_sources/fbgemm_gpu-python-api/table_batched_embedding_ops.rst.txt b/_sources/fbgemm_gpu-python-api/table_batched_embedding_ops.rst.txt index 2059b7a6d..bbd39d873 100644 --- a/_sources/fbgemm_gpu-python-api/table_batched_embedding_ops.rst.txt +++ b/_sources/fbgemm_gpu-python-api/table_batched_embedding_ops.rst.txt @@ -1,5 +1,10 @@ -Table Batched Embedding (TBE) Operators -======================================= +Table Batched Embedding (TBE) Training Module +============================================= .. autoclass:: fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen - :members: + :members: forward, + split_embedding_weights, + split_optimizer_states, + set_learning_rate, + update_hyper_parameters, + set_optimizer_step diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt index a71a58995..c4d98c720 100644 --- a/_sources/index.rst.txt +++ b/_sources/index.rst.txt @@ -91,3 +91,4 @@ Table of Contents fbgemm_gpu-python-api/table_batched_embedding_ops.rst fbgemm_gpu-python-api/jagged_tensor_ops.rst + fbgemm_gpu-python-api/pooled_embedding_ops.rst diff --git a/fbgemm-cpp-api/QuantUtils.html b/fbgemm-cpp-api/QuantUtils.html index 2a7c883c7..4abbfee72 100644 --- a/fbgemm-cpp-api/QuantUtils.html +++ b/fbgemm-cpp-api/QuantUtils.html @@ -339,8 +339,9 @@

FBGEMM_GPU Python API

diff --git a/fbgemm-cpp-api/tbe_cpu_autovec.html b/fbgemm-cpp-api/tbe_cpu_autovec.html index 77c437c1b..fbad6fa00 100644 --- a/fbgemm-cpp-api/tbe_cpu_autovec.html +++ b/fbgemm-cpp-api/tbe_cpu_autovec.html @@ -339,8 +339,9 @@

FBGEMM_GPU Python API

diff --git a/fbgemm-development/BuildInstructions.html b/fbgemm-development/BuildInstructions.html index 4bfd0f64a..b40a242fa 100644 --- a/fbgemm-development/BuildInstructions.html +++ b/fbgemm-development/BuildInstructions.html @@ -339,8 +339,9 @@

FBGEMM_GPU Python API

diff --git a/fbgemm_gpu-cpp-api/embedding_ops.html b/fbgemm_gpu-cpp-api/embedding_ops.html index 9e59accbf..21f9cee79 100644 --- a/fbgemm_gpu-cpp-api/embedding_ops.html +++ b/fbgemm_gpu-cpp-api/embedding_ops.html @@ -339,8 +339,9 @@

FBGEMM_GPU Python API

diff --git a/fbgemm_gpu-cpp-api/experimental_ops.html b/fbgemm_gpu-cpp-api/experimental_ops.html index e44001223..e1c2e70eb 100644 --- a/fbgemm_gpu-cpp-api/experimental_ops.html +++ b/fbgemm_gpu-cpp-api/experimental_ops.html @@ -33,7 +33,7 @@

Pooled Embedding Operators¶

+
+
+torch.ops.fbgemm.merge_pooled_embeddings(pooled_embeddings, uncat_dim_size, target_device, cat_dim=1) Tensor¶
+

Concatenate embedding outputs from different devices (on the same host) onto the target device.

Parameters:

• pooled_embeddings (List[Tensor]) – A list of embedding outputs from different devices on the same host. Each output has 2 dimensions.

• uncat_dim_size (int) – The size of the dimension that is not concatenated, i.e., if cat_dim=0, uncat_dim_size is the size of dim 1 and vice versa.

• target_device (torch.device) – The target device that aggregates all the embedding outputs.

• cat_dim (int = 1) – The dimension along which the tensors are concatenated

Returns:

The concatenated embedding output (2D) on the target device
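Example (a minimal sketch that assumes a host with two CUDA devices; shapes are illustrative):

>>> import torch
>>> a = torch.randn(2, 3, device="cuda:0")  # batch size 2, dim 3
>>> b = torch.randn(2, 4, device="cuda:1")  # batch size 2, dim 4
>>> # uncat_dim_size=2 is the batch size, since cat_dim=1 leaves dim 0 unconcatenated
>>> out = torch.ops.fbgemm.merge_pooled_embeddings(
>>>     [a, b], 2, torch.device("cuda:0"), 1
>>> )
>>> out.shape
torch.Size([2, 7])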
\ No newline at end of file diff --git a/fbgemm_gpu-python-api/table_batched_embedding_ops.html b/fbgemm_gpu-python-api/table_batched_embedding_ops.html index 5fe762c89..6d0974c7f 100644 --- a/fbgemm_gpu-python-api/table_batched_embedding_ops.html +++ b/fbgemm_gpu-python-api/table_batched_embedding_ops.html @@ -10,7 +10,7 @@ - Table Batched Embedding (TBE) Operators — FBGEMM 0.8.0 documentation + Table Batched Embedding (TBE) Training Module — FBGEMM 0.8.0 documentation @@ -339,8 +339,9 @@

FBGEMM_GPU Python API

@@ -381,7 +382,7 @@ -
  • Table Batched Embedding (TBE) Operators
  • +
  • Table Batched Embedding (TBE) Training Module
  • @@ -417,11 +418,11 @@
    -
    -

    Table Batched Embedding (TBE) Operators¶

    +
    +

    Table Batched Embedding (TBE) Training Module¶

    -class fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen(embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]], feature_table_map: List[int] | None = None, cache_algorithm: CacheAlgorithm = CacheAlgorithm.LRU, cache_load_factor: float = 0.2, cache_sets: int = 0, cache_reserved_memory: float = 0.0, cache_precision: SparseType = SparseType.FP32, weights_precision: SparseType = SparseType.FP32, output_dtype: SparseType = SparseType.FP32, enforce_hbm: bool = False, optimizer: EmbOptimType = EmbOptimType.EXACT_SGD, record_cache_metrics: RecordCacheMetrics | None = None, gather_uvm_cache_stats: bool | None = False, stochastic_rounding: bool = True, gradient_clipping: bool = False, max_gradient: float = 1.0, max_norm: float = 0.0, learning_rate: float = 0.01, eps: float = 1e-08, momentum: float = 0.9, weight_decay: float = 0.0, weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, eta: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, step_ema: float = 10000, step_swap: float = 10000, step_start: float = 0, step_mode: StepMode = StepMode.USE_ITER, counter_based_regularization: CounterBasedRegularizationDefinition | None = None, cowclip_regularization: CowClipDefinition | None = None, pooling_mode: PoolingMode = PoolingMode.SUM, device: str | int | device | None = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, uvm_non_rowwise_momentum: bool = False, use_experimental_tbe: bool = False, prefetch_pipeline: bool = False, stats_reporter_config: TBEStatsReporterConfig | None = None, table_names: List[str] | None = None, optimizer_state_dtypes: Dict[str, SparseType] | None = None, multipass_prefetch_config: MultiPassPrefetchConfig | None = None, global_weight_decay: GlobalWeightDecayDefinition | None = None, uvm_host_mapped: bool = False)[source]¶
    +class fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen(embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]], feature_table_map: List[int] | None = None, cache_algorithm: CacheAlgorithm = CacheAlgorithm.LRU, cache_load_factor: float = 0.2, cache_sets: int = 0, cache_reserved_memory: float = 0.0, cache_precision: SparseType = SparseType.FP32, weights_precision: SparseType = SparseType.FP32, output_dtype: SparseType = SparseType.FP32, enforce_hbm: bool = False, optimizer: EmbOptimType = EmbOptimType.EXACT_SGD, record_cache_metrics: RecordCacheMetrics | None = None, gather_uvm_cache_stats: bool | None = False, stochastic_rounding: bool = True, gradient_clipping: bool = False, max_gradient: float = 1.0, max_norm: float = 0.0, learning_rate: float = 0.01, eps: float = 1e-08, momentum: float = 0.9, weight_decay: float = 0.0, weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, eta: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, step_ema: float = 10000, step_swap: float = 10000, step_start: float = 0, step_mode: StepMode = StepMode.USE_ITER, counter_based_regularization: CounterBasedRegularizationDefinition | None = None, cowclip_regularization: CowClipDefinition | None = None, pooling_mode: PoolingMode = PoolingMode.SUM, device: str | int | device | None = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, uvm_non_rowwise_momentum: bool = False, use_experimental_tbe: bool = False, prefetch_pipeline: bool = False, stats_reporter_config: TBEStatsReporterConfig | None = None, table_names: List[str] | None = None, optimizer_state_dtypes: Dict[str, SparseType] | None = None, multipass_prefetch_config: MultiPassPrefetchConfig | None = None, global_weight_decay: GlobalWeightDecayDefinition | None = None, uvm_host_mapped: bool = False)[source]¶

Table Batched Embedding (TBE) operator. Looks up one or more embedding tables. The module is applicable for training. The backward operator is fused with the optimizer. Thus, the embedding tables are updated during @@ -445,7 +446,7 @@

    Table Batched Embedding (TBE) Operators`ComputeDevice`options are

    +

    Available ComputeDevice options are

    1. CPU = performing table lookup on CPU

    2. CUDA = performing table lookup on GPU

    3. @@ -552,7 +553,7 @@

      Table Batched Embedding (TBE) Operatorsstr, int, torch.device]] = None) – The current +
    4. device (Optional[Union[str, int, torch.device]] = None) – The current device to place tensors on

    5. bounds_check_mode (BoundsCheckMode = BoundsCheckMode.WARNING) –

      Input checking mode. Available BoundsCheckMode options are

      @@ -588,30 +589,62 @@

      Table Batched Embedding (TBE) Operators +
      +forward(indices: Tensor, offsets: Tensor, per_sample_weights: Tensor | None = None, feature_requires_grad: Tensor | None = None, batch_size_per_feature_per_rank: List[List[int]] | None = None, total_unique_indices: int | None = None) Tensor[source]¶
      +

      The forward pass function that

      +
        +
1. Performs input bound checking

2. Generates necessary variable batch size embedding (VBE) metadata (if VBE is used)

3. Prefetches data from UVM to cache (if EmbeddingLocation.MANAGED_CACHING is used and the user has not explicitly prefetched data)

4. Performs the embedding table lookup by invoking a corresponding Autograd function (based on the chosen optimizer)
      -
      Returns:
      -

      A 2D-tensor containing looked up data. Shape (B, total_D) where B = +

      Parameters:
      +
        +
• indices (Tensor) – A 1D-tensor that contains indices to be looked up from all embedding tables

• offsets (Tensor) – A 1D-tensor that contains offsets of indices. Shape (B * T + 1) where B = batch size and T = the number of features. offsets[t * B + b + 1] - offsets[t * B + b] is the length of bag b of feature t

• per_sample_weights (Optional[Tensor]) – An optional 1D-float-tensor that contains per sample weights. If None, unweighted embedding lookup will be performed. Otherwise, weighted embedding lookup will be used. The length of this tensor must be the same as the length of the indices tensor. The value of per_sample_weights[i] will be used to multiply every element in the looked-up row indices[i], where 0 <= i < len(per_sample_weights).

• feature_requires_grad (Optional[Tensor]) – An optional 1D-tensor for indicating if per_sample_weights requires gradient. The length of the tensor must be equal to the number of features

• batch_size_per_feature_per_rank (Optional[List[List[int]]]) – An optional 2D-tensor that contains batch sizes for every rank and every feature. If None, TBE assumes that every feature has the same batch size and computes the batch size from the offsets shape. Otherwise, TBE assumes that different features can have different batch sizes and uses the variable batch size embedding lookup mode (VBE). Shape (number of features, number of ranks). batch_size_per_feature_per_rank[f][r] represents the batch size of feature f and rank r

• total_unique_indices (Optional[int]) – An optional integer that represents the total number of unique indices. This value must be set when using OptimType.NONE. This is because TBE requires this information for allocating the weight gradient tensor in the backward pass.

      • +
      +
      +
      Returns:
      +

A 2D-tensor containing looked up data. Shape (B, total_D) where B = batch size and total_D = the sum of all embedding dimensions in the tables

      @@ -672,117 +705,83 @@

      Table Batched Embedding (TBE) Operators grad_fn=<CppNode<SplitLookupFunction_sgd_Op>>)

  • -
    -
    -forward(indices: Tensor, offsets: Tensor, per_sample_weights: Tensor | None = None, feature_requires_grad: Tensor | None = None, batch_size_per_feature_per_rank: List[List[int]] | None = None, total_unique_indices: int | None = None) Tensor[source]¶
    -

    Define the computation performed at every call.

    -

    Should be overridden by all subclasses.

    -
    -

    Note

    -

    Although the recipe for forward pass needs to be defined within -this function, one should call the Module instance afterwards -instead of this since the former takes care of running the -registered hooks while the latter silently ignores them.

    -
    -
    - -
    -
    -get_optimizer_state() List[Dict[str, Tensor]][source]¶
    -

    Get the optimizer state dict that matches the OSS Pytorch optims -TODO: populate the supported list of optimizers

    -
    - -
    -
    -static get_prefetch_passes(multipass_prefetch_config: MultiPassPrefetchConfig | None, input_tensor: Tensor, output_tensor: Tensor) List[Tuple[Tensor, Tensor, int]][source]¶
    -

    Given input (the indices to forward), return the segmentation for each pass -in the format of (input[start_idx:end_idx], output[start_idx:end_idx], start_idx).

    -

    Caller should guarantee input and output are having the size on dimension 0 -The returned segments are guaranteed to completely and non-overlappingly cover the input tensor.

    -

    In non-multipass-prefetch mode, it returns the input/output tensor itself.

    -
    - -
    -
    -static get_table_name_for_logging(table_names: List[str] | None) str[source]¶
    -

    Given list of all table names in the TBE, generate a string to represent -them in logging. If there’s more than one table, this method will count -them than list them.

    -
    -log(msg: str) None[source]¶
    -

    Log with TBE id prefix to distinguish between multiple TBE instances per process.

    -
    - -
    -
    -prepare_inputs(indices: Tensor, offsets: Tensor, per_sample_weights: Tensor | None = None, batch_size_per_feature_per_rank: List[List[int]] | None = None, force_cast_input_types: bool = True) Tuple[Tensor, Tensor, Tensor | None, VBEMetadata][source]¶
    -

    Prepare TBE inputs as follows:

    -
      -
    1. Create VBE metadata

    2. -
    3. Convert input types if force_cast_input_types=True

    4. -
    5. Run bounds_check_indices if bounds_check_mode is not -BoundsCheckMode.NONE

    6. -
    +
    +set_learning_rate(lr: float) None[source]¶
    +

    Sets the learning rate.

    Parameters:
    -
      -
    • indices (Tensor) – Input indices

    • -
    • offsets (Tensor) – Input offsets

    • -
    • per_sample_weights (Optional[Tensor]) – Input per sample -weights

    • -
    • batch_size_per_feature_per_rank – (Optional[List[List[int]]]): A 2D tensor of batch size -for each rank and feature. Shape = (number of -features, number of ranks)

    • -
    • force_cast_input_types (bool) – A flag to force convert -input types if set to True

    • -
    -
    -
    Returns:
    -

    A tuple of indices, offsets, per_sample_weights, and VBE -metadata

    +

lr (float) – The learning rate value to set

    -
    -
    -set_learning_rate(lr: float) None[source]¶
    -

    Sets the learning rate.

    -
    -
    set_optimizer_step(step: int) None[source]¶

    Sets the optimizer step.

    -
    - -
    -
    -should_log() bool[source]¶
    -

    Determines if we should log for this step, using exponentially decreasing frequency.

    -

    Logs for steps: 100 200 … 1,000 2,000 … 10,000 20,000 … 100,000 200,000 …

    +
    +
    Parameters:
    +

step (int) – The step value to set

    +
    +
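A short sketch of the setters documented above (values are illustrative and continue the tbe example; the accepted hyper-parameter keys depend on the chosen optimizer):

>>> tbe.set_learning_rate(0.02)
>>> tbe.set_optimizer_step(1000)
>>> tbe.update_hyper_parameters({"lr": 0.01})  # "lr" is assumed to be an accepted key here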
    -split_embedding_weights() List[Tensor][source]¶
    -

    Returns a list of weights, split by table

    +split_embedding_weights() List[Tensor][source]¶ +

    Returns a list of embedding weights (view), split by table

    +
    +
    Returns:
    +

    A list of weights. Length = the number of tables

    +
    +
    -split_optimizer_states() List[List[Tensor]][source]¶
    -

    Returns a list of states, split by table

    +split_optimizer_states() List[List[Tensor]][source]¶ +

    Returns a list of optimizer states (view), split by table

    +
    +
    Returns:
    +

    A list of list of states. Shape = (the number of tables, the number +of states).

    +

    The following shows the list of states (in the returned order) for +each optimizer:

    +
      +
1. ADAM: momentum1, momentum2

2. EXACT_ADAGRAD: momentum1

3. EXACT_ROWWISE_ADAGRAD: momentum1 (rowwise), prev_iter (rowwise; only when using WeightDecayMode = COUNTER or COWCLIP or global_weight_decay is not None), row_counter (rowwise; only when using WeightDecayMode = COUNTER or COWCLIP)

4. EXACT_SGD: no states

5. LAMB: momentum1, momentum2

6. LARS_SGD: momentum1

7. PARTIAL_ROWWISE_ADAM: momentum1, momentum2 (rowwise)

8. PARTIAL_ROWWISE_LAMB: momentum1, momentum2 (rowwise)

9. ENSEMBLE_ROWWISE_ADAGRAD: momentum2 (rowwise), momentum1, prev_iter (rowwise), row_counter (rowwise)

10. NONE: no states (raises an error)
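As a hedged sketch, for a two-table TBE constructed with optimizer=EmbOptimType.EXACT_ROWWISE_ADAGRAD and the default weight decay mode, item 3 above implies a single rowwise momentum1 state per table:

>>> states = tbe.split_optimizer_states()
>>> len(states), len(states[0])
(2, 1)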
    +

    +
    +
    update_hyper_parameters(params_dict: Dict[str, float]) None[source]¶

    Sets hyper-parameters from external control flow.

    +
    +
    Parameters:
    +

params_dict (Dict[str, float]) – The dict that contains the hyper-parameter names and their values

    +
    +
    @@ -832,17 +831,11 @@

    Table Batched Embedding (TBE) Operators

    FBGEMM_GPU Python API

    diff --git a/general/documentation/Overview.html b/general/documentation/Overview.html index 8d165e992..4ebb96b84 100644 --- a/general/documentation/Overview.html +++ b/general/documentation/Overview.html @@ -339,8 +339,9 @@

    FBGEMM_GPU Python API

    diff --git a/general/documentation/Python.html b/general/documentation/Python.html index 9ddc745c7..7f1a29cb7 100644 --- a/general/documentation/Python.html +++ b/general/documentation/Python.html @@ -339,8 +339,9 @@

    FBGEMM_GPU Python API

    diff --git a/general/documentation/Sphinx.html b/general/documentation/Sphinx.html index 382b0ae41..fc141614f 100644 --- a/general/documentation/Sphinx.html +++ b/general/documentation/Sphinx.html @@ -339,8 +339,9 @@

    FBGEMM_GPU Python API

    diff --git a/genindex.html b/genindex.html index e14ddba3e..a463ceb7f 100644 --- a/genindex.html +++ b/genindex.html @@ -336,8 +336,9 @@

    FBGEMM_GPU Python API

    @@ -513,6 +514,8 @@

    B

    @@ -534,6 +537,8 @@

    D