
Commit

Deploying to gh-pages from @ 20a46c3 🚀
facebook-github-bot committed Mar 19, 2024
1 parent 2524ae0 commit c5da214
Showing 3 changed files with 81 additions and 2 deletions.
@@ -1228,6 +1228,8 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         )

         self.step = 0
+        self.last_reported_step = 0
+        self.last_reported_uvm_stats: List[float] = []

         # Check whether to use TBE v2
         is_experimental = False
@@ -1327,6 +1329,23 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         ), "We should not be here. AsyncTimer only happens with reporter present."
         self.stats_reporter.report_duration(it_step, event_name, dur_ms)

+    @torch.jit.ignore
+    def _report_fwd_input(self, indices: Tensor) -> None:
+        if self.stats_reporter is None:
+            return None
+        stats_reporter: TBEStatsReporter = self.stats_reporter
+        if stats_reporter.should_report(self.step):
+            stats_reporter.report_data_amount(
+                iteration_step=self.step,
+                event_name="tbe.fwd_input_size",
+                data_bytes=indices.element_size() * indices.numel(),
+            )
+            stats_reporter.report_data_amount(
+                iteration_step=self.step,
+                event_name="tbe.fwd_input_count",
+                data_bytes=indices.numel(),
+            )
+
     def _generate_vbe_metadata(
         self,
         offsets: Tensor,
@@ -1482,6 +1501,8 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         self._vbe_max_B = vbe_metadata.max_B

         self.step += 1
+        self._report_fwd_input(indices)
+
         if len(self.timesteps_prefetched) == 0:
             self._prefetch(indices, offsets, vbe_metadata)

@@ -1689,6 +1710,7 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training

     @torch.jit.ignore
     def print_uvm_cache_stats(self, use_local_cache: bool = False) -> None:
+        # TODO: Create a separate reporter class to unify the stdlog reporting
         uvm_cache_stats: List[float] = self._get_uvm_cache_print_state(use_local_cache)
         N = max(1, uvm_cache_stats[0])
         m = {
@@ -1722,6 +1744,44 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         )
         self.log(f"uvm_cache_stats={m}")

+    @torch.jit.ignore
+    def _report_uvm_cache_stats(self) -> None:
+        if self.stats_reporter is None:
+            return
+        stats_reporter: TBEStatsReporter = self.stats_reporter
+        passed_steps = self.step - self.last_reported_step
+        if passed_steps == 0:
+            return
+        if not stats_reporter.should_report(self.step):
+            return
+
+        uvm_cache_stats: List[float] = self.get_uvm_cache_stats(
+            use_local_cache=False
+        ).tolist()
+        self.last_reported_step = self.step
+
+        if len(self.last_reported_uvm_stats) == 0:
+            self.last_reported_uvm_stats = [0.0] * len(uvm_cache_stats)
+        uvm_cache_stats_delta: List[float] = [0.0] * len(uvm_cache_stats)
+        for i in range(len(uvm_cache_stats)):
+            uvm_cache_stats_delta[i] = (
+                uvm_cache_stats[i] - self.last_reported_uvm_stats[i]
+            )
+        self.last_reported_uvm_stats = uvm_cache_stats
+
+        element_size = self.lxu_cache_weights.element_size()
+        for stat_index in UVMCacheStatsIndex:
+            stats_reporter.report_data_amount(
+                iteration_step=self.step,
+                event_name=f"tbe.prefetch.cache_stats_by_data_size.{stat_index.name.lower()}",
+                data_bytes=int(
+                    uvm_cache_stats_delta[stat_index.value]
+                    * element_size
+                    * self.max_D_cache
+                    / passed_steps
+                ),
+            )
+
     def prefetch(
         self,
         indices: Tensor,
@@ -1742,6 +1802,23 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         if forward_stream is not None:
             self._prefetch_tensors_record_stream(forward_stream)

+    @torch.jit.ignore
+    def _report_prefetch_input(self, indices: Tensor) -> None:
+        if self.stats_reporter is None:
+            return None
+        stats_reporter: TBEStatsReporter = self.stats_reporter
+        if stats_reporter.should_report(self.step):
+            stats_reporter.report_data_amount(
+                iteration_step=self.step,
+                event_name="tbe.prefetch_input_size",
+                data_bytes=indices.element_size() * indices.numel(),
+            )
+            stats_reporter.report_data_amount(
+                iteration_step=self.step,
+                event_name="tbe.prefetch_input_count",
+                data_bytes=indices.numel(),
+            )
+
     def _prefetch(
         self,
         indices: Tensor,
@@ -1758,6 +1835,7 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
         # forward step
         if self.gather_uvm_cache_stats:
             self.local_uvm_cache_stats.zero_()
+        self._report_prefetch_input(indices)

         linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
             self.cache_hash_size_cumsum,
@@ -1842,6 +1920,7 @@ Source code for fbgemm_gpu.split_table_batched_embeddings_ops_training
             self.uvm_cache_stats = torch.add(
                 self.uvm_cache_stats, self.local_uvm_cache_stats
             )
+            self._report_uvm_cache_stats()
             if self.should_log():
                 self.print_uvm_cache_stats(use_local_cache=False)

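Taken together, the hooks fire on every forward/prefetch call but only emit when the reporter's should_report gate passes, and _report_uvm_cache_stats averages over the steps since the last emission. The loop below is a hypothetical illustration of that cadence (interval and iteration counts invented), not FBGEMM code:

REPORT_INTERVAL = 100   # assumed gating interval; the real value comes from the reporter

def should_report(step: int) -> bool:
    return step % REPORT_INTERVAL == 0

step = 0
last_reported_step = 0
for _ in range(250):        # pretend forward() runs 250 times
    step += 1               # mirrors `self.step += 1` before _report_fwd_input(indices)
    if should_report(step):
        passed_steps = step - last_reported_step
        last_reported_step = step
        print(f"report at step {step}, averaging over {passed_steps} steps")
# -> reports at steps 100 and 200, each averaging over the preceding 100 steps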