diff --git a/config_src/drivers/nuopc_cap/mom_cap.F90 b/config_src/drivers/nuopc_cap/mom_cap.F90
index 778a3486b9..3574943918 100644
--- a/config_src/drivers/nuopc_cap/mom_cap.F90
+++ b/config_src/drivers/nuopc_cap/mom_cap.F90
@@ -16,7 +16,7 @@ module MOM_cap_mod
 use MOM_domains,              only: MOM_infra_init, MOM_infra_end
 use MOM_file_parser,          only: get_param, log_version, param_file_type, close_param_file
 use MOM_get_input,            only: get_MOM_input, directories
-use MOM_domains,              only: pass_var
+use MOM_domains,              only: pass_var, pe_here
 use MOM_error_handler,        only: MOM_error, FATAL, is_root_pe
 use MOM_grid,                 only: ocean_grid_type, get_global_grid_size
 use MOM_ocean_model_nuopc,    only: ice_ocean_boundary_type
@@ -29,6 +29,7 @@ module MOM_cap_mod
 use MOM_cap_methods,          only: med2mod_areacor, state_diagnose
 use MOM_cap_methods,          only: ChkErr
 use MOM_ensemble_manager,     only: ensemble_manager_init
+use MOM_coms,                 only: sum_across_PEs
 
 #ifdef CESMCOUPLED
 use shr_log_mod,             only: shr_log_setLogUnit
@@ -842,6 +843,7 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
   type(ocean_grid_type)        , pointer     :: ocean_grid
   type(ocean_internalstate_wrapper)          :: ocean_internalstate
   integer                                    :: npet, ntiles
+  integer                                    :: npes ! number of PEs (from FMS).
   integer                                    :: nxg, nyg, cnt
   integer                                    :: isc,iec,jsc,jec
   integer, allocatable                       :: xb(:),xe(:),yb(:),ye(:),pe(:)
@@ -868,6 +870,8 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
   integer                                    :: lsize
   integer                                    :: ig,jg, ni,nj,k
   integer, allocatable                       :: gindex(:) ! global index space
+  integer, allocatable                       :: gindex_ocn(:) ! global index space for ocean cells (excl. masked cells)
+  integer, allocatable                       :: gindex_elim(:) ! global index space for eliminated cells
   character(len=128)                         :: fldname
   character(len=256)                         :: cvalue
   character(len=256)                         :: frmt    ! format specifier for several error msgs
@@ -891,6 +895,11 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
   real(ESMF_KIND_R8)                         :: min_areacor_glob(2)
   real(ESMF_KIND_R8)                         :: max_areacor_glob(2)
   character(len=*), parameter                :: subname='(MOM_cap:InitializeRealize)'
+  integer                                    :: niproc, njproc
+  integer                                    :: ip, jp, pe_ix
+  integer                                    :: num_elim_blocks ! number of blocks to be eliminated
+  integer                                    :: num_elim_cells_global, num_elim_cells_local, num_elim_cells_remaining
+  integer, allocatable                       :: cell_mask(:,:)
   real(8)                                    :: MPI_Wtime, timeirls
   !--------------------------------
 
@@ -937,19 +946,19 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
     rc = ESMF_FAILURE
     call ESMF_LogWrite(subname//' ntiles must be 1', ESMF_LOGMSG_ERROR)
   endif
-  ntiles = mpp_get_domain_npes(ocean_public%domain)
-  write(tmpstr,'(a,1i6)') subname//' ntiles = ',ntiles
+  npes = mpp_get_domain_npes(ocean_public%domain)
+  write(tmpstr,'(a,1i6)') subname//' npes = ',npes
   call ESMF_LogWrite(trim(tmpstr), ESMF_LOGMSG_INFO)
 
   !---------------------------------
   ! get start and end indices of each tile and their PET
   !---------------------------------
 
-  allocate(xb(ntiles),xe(ntiles),yb(ntiles),ye(ntiles),pe(ntiles))
+  allocate(xb(npes),xe(npes),yb(npes),ye(npes),pe(npes))
   call mpp_get_compute_domains(ocean_public%domain, xbegin=xb, xend=xe, ybegin=yb, yend=ye)
   call mpp_get_pelist(ocean_public%domain, pe)
   if (dbug > 1) then
-    do n = 1,ntiles
+    do n = 1,npes
       write(tmpstr,'(a,6i6)') subname//' tiles ',n,pe(n),xb(n),xe(n),yb(n),ye(n)
       call ESMF_LogWrite(trim(tmpstr), ESMF_LOGMSG_INFO)
     enddo
@@ -971,17 +980,102 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
     call get_global_grid_size(ocean_grid, ni, nj)
     lsize = ( ocean_grid%iec - ocean_grid%isc + 1 ) * ( ocean_grid%jec - ocean_grid%jsc + 1 )
 
-    ! Create the global index space for the computational domain
-    allocate(gindex(lsize))
-    k = 0
-    do j = ocean_grid%jsc, ocean_grid%jec
-      jg = j + ocean_grid%jdg_offset
-      do i = ocean_grid%isc, ocean_grid%iec
-        ig = i + ocean_grid%idg_offset
-        k = k + 1 ! Increment position within gindex
-        gindex(k) = ni * (jg - 1) + ig
+    num_elim_blocks = 0
+    num_elim_cells_global = 0
+    num_elim_cells_local = 0
+    num_elim_cells_remaining = 0
+
+    ! Compute the number of eliminated blocks (specified in MOM_mask_table)
+    if (associated(ocean_grid%Domain%maskmap)) then
+      njproc = size(ocean_grid%Domain%maskmap, 1)
+      niproc = size(ocean_grid%Domain%maskmap, 2)
+
+      do ip = 1, niproc
+        do jp = 1, njproc
+          if (.not. ocean_grid%Domain%maskmap(jp,ip)) then
+            num_elim_blocks = num_elim_blocks+1
+          endif
+        enddo
       enddo
-    enddo
+    endif
+
+    ! Apply land block elimination to ESMF gindex
+    ! (Here we assume that each processor gets assigned a single tile. If multi-tile implementation is to be added
+    ! in MOM6 NUOPC cap in the future, below code must be updated accordingly.)
+    if (num_elim_blocks>0) then
+
+      allocate(cell_mask(ni, nj), source=0)
+      allocate(gindex_ocn(lsize))
+      k = 0
+      do j = ocean_grid%jsc, ocean_grid%jec
+        jg = j + ocean_grid%jdg_offset
+        do i = ocean_grid%isc, ocean_grid%iec
+          ig = i + ocean_grid%idg_offset
+          k = k + 1 ! Increment position within gindex
+          gindex_ocn(k) = ni * (jg - 1) + ig
+          cell_mask(ig, jg) = 1
+        enddo
+      enddo
+      call sum_across_PEs(cell_mask, ni*nj)
+
+      if (maxval(cell_mask) /= 1 ) then
+        call MOM_error(FATAL, "Encountered cells shared by multiple PEs while attempting to determine masked cells.")
+      endif
+
+      num_elim_cells_global = ni * nj - sum(cell_mask)
+      num_elim_cells_local = num_elim_cells_global / npes
+
+      if (pe_here() == pe(npes)) then
+        ! assign all remaining cells to the last PE.
+        num_elim_cells_remaining = num_elim_cells_global - num_elim_cells_local * npes
+        allocate(gindex_elim(num_elim_cells_local+num_elim_cells_remaining))
+      else
+        allocate(gindex_elim(num_elim_cells_local))
+      endif
+
+      ! Zero-based PE index.
+      pe_ix = pe_here() - pe(1)
+
+      k = 0
+      do jg = 1, nj
+        do ig = 1, ni
+          if (cell_mask(ig, jg) == 0) then
+            k = k + 1
+            if (k > pe_ix * num_elim_cells_local .and.  &
+                    k <= ((pe_ix+1) * num_elim_cells_local + num_elim_cells_remaining)) then
+              gindex_elim(k - pe_ix * num_elim_cells_local) = ni * (jg -1) + ig
+            endif
+          endif
+        enddo
+      enddo
+
+      allocate(gindex(lsize + num_elim_cells_local + num_elim_cells_remaining))
+      do k = 1, lsize
+        gindex(k) = gindex_ocn(k)
+      enddo
+      do k = 1, num_elim_cells_local + num_elim_cells_remaining
+        gindex(k+lsize) = gindex_elim(k)
+      enddo
+
+      deallocate(cell_mask)
+      deallocate(gindex_ocn)
+      deallocate(gindex_elim)
+
+    else ! no eliminated land blocks
+
+      ! Create the global index space for the computational domain
+      allocate(gindex(lsize))
+      k = 0
+      do j = ocean_grid%jsc, ocean_grid%jec
+        jg = j + ocean_grid%jdg_offset
+        do i = ocean_grid%isc, ocean_grid%iec
+          ig = i + ocean_grid%idg_offset
+          k = k + 1 ! Increment position within gindex
+          gindex(k) = ni * (jg - 1) + ig
+        enddo
+      enddo
+
+    endif
 
     DistGrid = ESMF_DistGridCreate(arbSeqIndexList=gindex, rc=rc)
     if (ChkErr(rc,__LINE__,u_FILE_u)) return
@@ -1005,6 +1099,10 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
     call ESMF_MeshGet(Emesh, spatialDim=spatialDim, numOwnedElements=numOwnedElements, rc=rc)
     if (ChkErr(rc,__LINE__,u_FILE_u)) return
 
+    if (lsize /= numOwnedElements - num_elim_cells_local - num_elim_cells_remaining) then
+      call MOM_error(FATAL, "Discrepancy detected between ESMF mesh and internal MOM6 domain sizes. Check mask table.")
+    endif
+
     allocate(ownedElemCoords(spatialDim*numOwnedElements))
     allocate(lonMesh(numOwnedElements), lon(numOwnedElements))
     allocate(latMesh(numOwnedElements), lat(numOwnedElements))
@@ -1036,7 +1134,7 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
     end do
 
     eps_omesh = get_eps_omesh(ocean_state)
-    do n = 1,numOwnedElements
+    do n = 1,lsize
       diff_lon = abs(mod(lonMesh(n) - lon(n),360.0))
       if (diff_lon > eps_omesh) then
         frmt = "('ERROR: Difference between ESMF Mesh and MOM6 domain coords is "//&
@@ -1140,11 +1238,11 @@ subroutine InitializeRealize(gcomp, importState, exportState, clock, rc)
 
     ! generate delayout and dist_grid
 
-    allocate(deBlockList(2,2,ntiles))
-    allocate(petMap(ntiles))
-    allocate(deLabelList(ntiles))
+    allocate(deBlockList(2,2,npes))
+    allocate(petMap(npes))
+    allocate(deLabelList(npes))
 
-    do n = 1, ntiles
+    do n = 1, npes
       deLabelList(n) = n
       deBlockList(1,1,n) = xb(n)
       deBlockList(1,2,n) = xe(n)
@@ -1727,7 +1825,7 @@ subroutine ModelAdvance(gcomp, rc)
         rpointer_filename = 'rpointer.ocn'//trim(inst_suffix)
 
         write(restartname,'(A,".mom6.r.",I4.4,"-",I2.2,"-",I2.2,"-",I5.5)') &
-             trim(casename), year, month, day, seconds
+             trim(casename), year, month, day, hour * 3600 + minute * 60 + seconds
         call ESMF_LogWrite("MOM_cap: Writing restart :  "//trim(restartname), ESMF_LOGMSG_INFO)
         ! write restart file(s)
         call ocean_model_restart(ocean_state, restartname=restartname, num_rest_files=num_rest_files)
diff --git a/config_src/drivers/nuopc_cap/mom_cap_methods.F90 b/config_src/drivers/nuopc_cap/mom_cap_methods.F90
index f41c98b112..125bae5748 100644
--- a/config_src/drivers/nuopc_cap/mom_cap_methods.F90
+++ b/config_src/drivers/nuopc_cap/mom_cap_methods.F90
@@ -853,7 +853,7 @@ subroutine State_SetExport(state, fldname, isc, iec, jsc, jec, input, ocean_grid
 
   ! local variables
   type(ESMF_StateItem_Flag)     :: itemFlag
-  integer                       :: n, i, j, i1, j1, ig,jg
+  integer                       :: n, i, j, k, i1, j1, ig,jg
   integer                       :: lbnd1,lbnd2
   real(ESMF_KIND_R8), pointer   :: dataPtr1d(:)
   real(ESMF_KIND_R8), pointer   :: dataPtr2d(:,:)
@@ -889,6 +889,13 @@ subroutine State_SetExport(state, fldname, isc, iec, jsc, jec, input, ocean_grid
         enddo
       end if
 
+      ! if a maskmap is provided, set exports of all eliminated cells to zero.
+      if (associated(ocean_grid%Domain%maskmap)) then
+        do k = n+1, size(dataPtr1d)
+          dataPtr1d(k) = 0.0
+        enddo
+      endif
+
     else if (geomtype == ESMF_GEOMTYPE_GRID) then
 
       call state_getfldptr(state, trim(fldname), dataptr2d, rc)
diff --git a/config_src/infra/FMS1/MOM_domain_infra.F90 b/config_src/infra/FMS1/MOM_domain_infra.F90
index 2c97a0bb31..1de9a6d658 100644
--- a/config_src/infra/FMS1/MOM_domain_infra.F90
+++ b/config_src/infra/FMS1/MOM_domain_infra.F90
@@ -16,7 +16,7 @@ module MOM_domain_infra
 use mpp_domains_mod, only : mpp_create_group_update, mpp_do_group_update
 use mpp_domains_mod, only : mpp_reset_group_update_field, mpp_group_update_initialized
 use mpp_domains_mod, only : mpp_start_group_update, mpp_complete_group_update
-use mpp_domains_mod, only : mpp_compute_block_extent
+use mpp_domains_mod, only : mpp_compute_block_extent, mpp_compute_extent
 use mpp_domains_mod, only : mpp_broadcast_domain, mpp_redistribute, mpp_global_field
 use mpp_domains_mod, only : AGRID, BGRID_NE, CGRID_NE, SCALAR_PAIR, BITWISE_EXACT_SUM
 use mpp_domains_mod, only : CYCLIC_GLOBAL_DOMAIN, FOLD_NORTH_EDGE
@@ -40,7 +40,7 @@ module MOM_domain_infra
 public :: domain2D, domain1D, group_pass_type
 ! These interfaces are actually implemented or have explicit interfaces in this file.
 public :: create_MOM_domain, clone_MOM_domain, get_domain_components, get_domain_extent
-public :: deallocate_MOM_domain, get_global_shape, compute_block_extent
+public :: deallocate_MOM_domain, get_global_shape, compute_block_extent, compute_extent
 public :: pass_var, pass_vector, fill_symmetric_edges, rescale_comp_data
 public :: pass_var_start, pass_var_complete, pass_vector_start, pass_vector_complete
 public :: create_group_pass, do_group_pass, start_group_pass, complete_group_pass
@@ -1945,6 +1945,17 @@ subroutine compute_block_extent(isg, ieg, ndivs, ibegin, iend)
   call mpp_compute_block_extent(isg, ieg, ndivs, ibegin, iend)
 end subroutine compute_block_extent
 
+!> Get the array ranges in one dimension for the divisions of a global index space
+subroutine compute_extent(isg, ieg, ndivs, ibegin, iend)
+  integer,               intent(in)  :: isg    !< The starting index of the global index space
+  integer,               intent(in)  :: ieg    !< The ending index of the global index space
+  integer,               intent(in)  :: ndivs  !< The number of divisions
+  integer, dimension(:), intent(out) :: ibegin !< The starting index of each division
+  integer, dimension(:), intent(out) :: iend   !< The ending index of each division
+
+  call mpp_compute_extent(isg, ieg, ndivs, ibegin, iend)
+end subroutine compute_extent
+
 !> Broadcast a 2-d domain from the root PE to the other PEs
 subroutine broadcast_domain(domain)
   type(domain2d),  intent(inout) :: domain !< The domain2d type that will be shared across PEs.
diff --git a/config_src/infra/FMS2/MOM_coms_infra.F90 b/config_src/infra/FMS2/MOM_coms_infra.F90
index cf9a724734..06a9b9f343 100644
--- a/config_src/infra/FMS2/MOM_coms_infra.F90
+++ b/config_src/infra/FMS2/MOM_coms_infra.F90
@@ -42,6 +42,7 @@ module MOM_coms_infra
 interface sum_across_PEs
   module procedure sum_across_PEs_int4_0d
   module procedure sum_across_PEs_int4_1d
+  module procedure sum_across_PEs_int4_2d
   module procedure sum_across_PEs_int8_0d
   module procedure sum_across_PEs_int8_1d
   module procedure sum_across_PEs_int8_2d
@@ -357,6 +358,15 @@ subroutine sum_across_PEs_int4_1d(field, length, pelist)
   call mpp_sum(field, length, pelist)
 end subroutine sum_across_PEs_int4_1d
 
+!> Find the sum of the values in corresponding positions of field across PEs, and return these sums in field.
+subroutine sum_across_PEs_int4_2d(field, length, pelist)
+  integer(kind=int32), dimension(:,:), intent(inout) :: field     !< The values to add, the sums upon return
+  integer,                             intent(in)    :: length    !< Number of elements in field to add
+  integer,                 optional,   intent(in)    :: pelist(:) !< List of PEs to work with
+
+  call mpp_sum(field, length, pelist)
+end subroutine sum_across_PEs_int4_2d
+
 !> Find the sum of field across PEs, and return this sum in field.
 subroutine sum_across_PEs_int8_0d(field, pelist)
   integer(kind=int64), intent(inout) :: field     !< Value on this PE, and the sum across PEs upon return
diff --git a/config_src/infra/FMS2/MOM_domain_infra.F90 b/config_src/infra/FMS2/MOM_domain_infra.F90
index ff1d888c47..95159f7fe1 100644
--- a/config_src/infra/FMS2/MOM_domain_infra.F90
+++ b/config_src/infra/FMS2/MOM_domain_infra.F90
@@ -16,7 +16,7 @@ module MOM_domain_infra
 use mpp_domains_mod, only : mpp_create_group_update, mpp_do_group_update
 use mpp_domains_mod, only : mpp_reset_group_update_field, mpp_group_update_initialized
 use mpp_domains_mod, only : mpp_start_group_update, mpp_complete_group_update
-use mpp_domains_mod, only : mpp_compute_block_extent
+use mpp_domains_mod, only : mpp_compute_block_extent, mpp_compute_extent
 use mpp_domains_mod, only : mpp_broadcast_domain, mpp_redistribute, mpp_global_field
 use mpp_domains_mod, only : AGRID, BGRID_NE, CGRID_NE, SCALAR_PAIR, BITWISE_EXACT_SUM
 use mpp_domains_mod, only : CYCLIC_GLOBAL_DOMAIN, FOLD_NORTH_EDGE
@@ -38,7 +38,7 @@ module MOM_domain_infra
 public :: domain2D, domain1D, group_pass_type
 ! These interfaces are actually implemented or have explicit interfaces in this file.
 public :: create_MOM_domain, clone_MOM_domain, get_domain_components, get_domain_extent
-public :: deallocate_MOM_domain, get_global_shape, compute_block_extent
+public :: deallocate_MOM_domain, get_global_shape, compute_block_extent, compute_extent
 public :: pass_var, pass_vector, fill_symmetric_edges, rescale_comp_data
 public :: pass_var_start, pass_var_complete, pass_vector_start, pass_vector_complete
 public :: create_group_pass, do_group_pass, start_group_pass, complete_group_pass
@@ -1936,7 +1936,7 @@ subroutine get_global_shape(domain, niglobal, njglobal)
   njglobal = domain%njglobal
 end subroutine get_global_shape
 
-!> Get the array ranges in one dimension for the divisions of a global index space
+!> Get the array ranges in one dimension for the divisions of a global index space (alternative to compute_extent)
 subroutine compute_block_extent(isg, ieg, ndivs, ibegin, iend)
   integer,               intent(in)  :: isg    !< The starting index of the global index space
   integer,               intent(in)  :: ieg    !< The ending index of the global index space
@@ -1947,6 +1947,17 @@ subroutine compute_block_extent(isg, ieg, ndivs, ibegin, iend)
   call mpp_compute_block_extent(isg, ieg, ndivs, ibegin, iend)
 end subroutine compute_block_extent
 
+!> Get the array ranges in one dimension for the divisions of a global index space
+subroutine compute_extent(isg, ieg, ndivs, ibegin, iend)
+  integer,               intent(in)  :: isg    !< The starting index of the global index space
+  integer,               intent(in)  :: ieg    !< The ending index of the global index space
+  integer,               intent(in)  :: ndivs  !< The number of divisions
+  integer, dimension(:), intent(out) :: ibegin !< The starting index of each division
+  integer, dimension(:), intent(out) :: iend   !< The ending index of each division
+
+  call mpp_compute_extent(isg, ieg, ndivs, ibegin, iend)
+end subroutine compute_extent
+
 !> Broadcast a 2-d domain from the root PE to the other PEs
 subroutine broadcast_domain(domain)
   type(domain2d),  intent(inout) :: domain !< The domain2d type that will be shared across PEs.
diff --git a/src/core/MOM.F90 b/src/core/MOM.F90
index bcba4d37c7..a9c8c5cd9e 100644
--- a/src/core/MOM.F90
+++ b/src/core/MOM.F90
@@ -2539,12 +2539,12 @@ subroutine initialize_MOM(Time, Time_init, param_file, dirs, CS, &
   G_in => CS%G_in
 #ifdef STATIC_MEMORY_
   call MOM_domains_init(G_in%domain, param_file, symmetric=symmetric, &
-            static_memory=.true., NIHALO=NIHALO_, NJHALO=NJHALO_, &
-            NIGLOBAL=NIGLOBAL_, NJGLOBAL=NJGLOBAL_, NIPROC=NIPROC_, &
-            NJPROC=NJPROC_)
+                        static_memory=.true., NIHALO=NIHALO_, NJHALO=NJHALO_, &
+                        NIGLOBAL=NIGLOBAL_, NJGLOBAL=NJGLOBAL_, NIPROC=NIPROC_, &
+                        NJPROC=NJPROC_, US=US)
 #else
   call MOM_domains_init(G_in%domain, param_file, symmetric=symmetric, &
-                        domain_name="MOM_in")
+                        domain_name="MOM_in", US=US)
 #endif
 
   ! Copy input grid (G_in) domain to active grid G
diff --git a/src/framework/MOM_domains.F90 b/src/framework/MOM_domains.F90
index a0f3855d19..d937ed7b0c 100644
--- a/src/framework/MOM_domains.F90
+++ b/src/framework/MOM_domains.F90
@@ -20,10 +20,13 @@ module MOM_domains
 use MOM_domain_infra,     only : AGRID, BGRID_NE, CGRID_NE, SCALAR_PAIR
 use MOM_domain_infra,     only : CORNER, CENTER, NORTH_FACE, EAST_FACE
 use MOM_domain_infra,     only : To_East, To_West, To_North, To_South, To_All, Omit_Corners
-use MOM_error_handler,    only : MOM_error, MOM_mesg, NOTE, WARNING, FATAL
+use MOM_domain_infra,     only : compute_extent
+use MOM_error_handler,    only : MOM_error, MOM_mesg, NOTE, WARNING, FATAL, is_root_pe
 use MOM_file_parser,      only : get_param, log_param, log_version, param_file_type
-use MOM_io_infra,         only : file_exists
+use MOM_io_infra,         only : file_exists, read_field, open_ASCII_file, close_file, WRITEONLY_FILE
 use MOM_string_functions, only : slasher
+use MOM_cpu_clock,        only : cpu_clock_id, cpu_clock_begin, cpu_clock_end, CLOCK_ROUTINE
+use MOM_unit_scaling,     only : unit_scale_type
 
 implicit none ; private
 
@@ -62,7 +65,7 @@ module MOM_domains
 !! properties of the domain type.
 subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
                             NIHALO, NJHALO, NIGLOBAL, NJGLOBAL, NIPROC, NJPROC, &
-                            min_halo, domain_name, include_name, param_suffix)
+                            min_halo, domain_name, include_name, param_suffix, US)
   type(MOM_domain_type),           pointer       :: MOM_dom      !< A pointer to the MOM_domain_type
                                                                  !! being defined here.
   type(param_file_type),           intent(in)    :: param_file   !< A structure to parse for
@@ -95,9 +98,11 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
                                                                  !! "MOM_memory.h" if missing.
   character(len=*),      optional, intent(in)    :: param_suffix !< A suffix to apply to
                                                                  !! layout-specific parameters.
+  type(unit_scale_type), optional, pointer       :: US           !< A dimensional unit scaling type
 
   ! Local variables
   integer, dimension(2) :: layout    ! The number of logical processors in the i- and j- directions
+  integer, dimension(2) :: auto_layout ! The layout determined by the auto masking routine
   integer, dimension(2) :: io_layout ! The layout of logical processors for input and output
   !$ integer :: ocean_nthreads       ! Number of openMP threads
   !$ logical :: ocean_omp_hyper_thread ! If true use openMP hyper-threads
@@ -112,7 +117,10 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
   logical :: nonblocking    ! If true, nonblocking halo updates will be used.
   logical :: thin_halos     ! If true, If true, optional arguments may be used to specify the
                             ! width of the halos that are updated with each call.
+  logical            :: auto_mask_table ! Runtime flag that turns on automatic mask table generator
+  integer            :: auto_io_layout_fac ! Used to compute IO layout when auto_mask_table is True.
   logical            :: mask_table_exists ! True if there is a mask table file
+  logical :: is_MOM_domain  ! True if this domain is being set for MOM, and not another component like SIS2.
   character(len=128) :: inputdir   ! The directory in which to find the diag table
   character(len=200) :: mask_table ! The file name and later the full path to the diag table
   character(len=64)  :: inc_nm     ! The name of the memory include file
@@ -122,6 +130,10 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
   character(len=8) :: char_xsiz, char_ysiz, char_niglobal, char_njglobal
   character(len=40) :: nihalo_nm, njhalo_nm, layout_nm, io_layout_nm, masktable_nm
   character(len=40) :: niproc_nm, njproc_nm
+  character(len=200) :: topo_config
+  integer :: id_clock_auto_mask
+  character(len=:), allocatable :: masktable_desc
+  character(len=:), allocatable :: auto_mask_table_fname ! Auto-generated mask table file name
   ! This include declares and sets the variable "version".
 # include "version_variable.h"
   character(len=40)  :: mdl ! This module's name.
@@ -277,18 +289,61 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
   call get_param(param_file, mdl, "INPUTDIR", inputdir, do_not_log=.true., default=".")
   inputdir = slasher(inputdir)
 
-  call get_param(param_file, mdl, trim(masktable_nm), mask_table, &
-                 "A text file to specify n_mask, layout and mask_list. This feature masks out "//&
-                 "processors that contain only land points. The first line of mask_table is the "//&
-                 "number of regions to be masked out. The second line is the layout of the "//&
-                 "model and must be consistent with the actual model layout. The following "//&
-                 "(n_mask) lines give the logical positions of the processors that are masked "//&
-                 "out. The mask_table can be created by tools like check_mask. The following "//&
-                 "example of mask_table masks out 2 processors, (1,2) and (3,6), out of the 24 "//&
-                 "in a 4x6 layout: \n 2\n 4,6\n 1,2\n 3,6\n", default="MOM_mask_table", &
-                 layoutParam=.true.)
-  mask_table = trim(inputdir)//trim(mask_table)
-  mask_table_exists = file_exists(mask_table)
+  is_MOM_domain = .true.
+  if (present(domain_name)) then
+    is_MOM_domain = (index(domain_name, "MOM") > 1)
+  endif
+
+  if (is_MOM_domain) then
+    call get_param(param_file, mdl, "TOPO_CONFIG", topo_config, do_not_log=.true., fail_if_missing=.true.)
+  else ! SIS2 has a default value for TOPO_CONFIG.
+    call get_param(param_file, mdl, "TOPO_CONFIG", topo_config, default="file", do_not_log=.true.)
+  endif
+
+  auto_mask_table = .false.
+  if (.not. present(param_suffix) .and. .not. is_static .and. trim(topo_config) == 'file') then
+     call get_param(param_file, mdl, 'AUTO_MASKTABLE', auto_mask_table, &
+                 "Turn on automatic mask table generation to eliminate land blocks.", &
+                 default=.false., layoutParam=.true.)
+  endif
+
+  masktable_desc = "A text file to specify n_mask, layout and mask_list. This feature masks out "//&
+      "processors that contain only land points. The first line of mask_table is the "//&
+      "number of regions to be masked out. The second line is the layout of the "//&
+      "model and must be consistent with the actual model layout. The following "//&
+      "(n_mask) lines give the logical positions of the processors that are masked "//&
+      "out. The mask_table can be created by tools like check_mask. The following "//&
+      "example of mask_table masks out 2 processors, (1,2) and (3,6), out of the 24 "//&
+      "in a 4x6 layout: \n 2\n 4,6\n 1,2\n 3,6\n"
+
+  if (auto_mask_table) then
+    id_clock_auto_mask = cpu_clock_id('(Ocean gen_auto_mask_table)', grain=CLOCK_ROUTINE)
+    auto_mask_table_fname = "MOM_auto_mask_table"
+
+    ! Auto-generate a mask file and determine the layout
+    call cpu_clock_begin(id_clock_auto_mask)
+    if (is_root_PE()) then
+      call gen_auto_mask_table(n_global, reentrant, tripolar_N, PEs_used, param_file, inputdir, &
+                               auto_mask_table_fname, auto_layout, US)
+    endif
+    call broadcast(auto_layout, length=2)
+    call cpu_clock_end(id_clock_auto_mask)
+
+    mask_table = auto_mask_table_fname
+    call log_param(param_file, mdl, trim(masktable_nm), mask_table, masktable_desc, &
+                   default="MOM_mask_table", layoutParam=.true.)
+  else
+    call get_param(param_file, mdl, trim(masktable_nm), mask_table, masktable_desc, &
+                   default="MOM_mask_table", layoutParam=.true.)
+  endif
+
+  ! First, check the run directory for the mask_table input file.
+  mask_table_exists = file_exists(trim(mask_table))
+  ! If not found, check the input directory
+  if (.not. mask_table_exists) then
+    mask_table = trim(inputdir)//trim(mask_table)
+    mask_table_exists = file_exists(mask_table)
+  endif
 
   if (is_static) then
     layout(1) = NIPROC ; layout(2) = NJPROC
@@ -317,6 +372,16 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
                     "Shift to using "//trim(layout_nm)//" instead.")
     endif
 
+    if (auto_mask_table) then
+        if (layout(1) /= 0 .and. layout(1) /= auto_layout(1)) then
+          call MOM_error(FATAL, "Cannot set LAYOUT or NIPROC when AUTO_MASKTABLE is enabled.")
+        endif
+        if (layout(2) /= 0 .and. layout(2) /= auto_layout(2)) then
+          call MOM_error(FATAL, "Cannot set LAYOUT or NJPROC when AUTO_MASKTABLE is enabled.")
+        endif
+        layout(:) = auto_layout(:)
+    endif
+
     if ( (layout(1) == 0) .and. (layout(2) == 0) ) &
       call MOM_define_layout(n_global, PEs_used, layout)
     if ( (layout(1) /= 0) .and. (layout(2) == 0) ) layout(2) = PEs_used / layout(1)
@@ -351,9 +416,28 @@ subroutine MOM_domains_init(MOM_dom, param_file, symmetric, static_memory, &
   ! Set up the I/O layout, it will be checked later that it uses an even multiple of the number of
   ! PEs in each direction.
   io_layout(:) = (/ 1, 1 /)
-  call get_param(param_file, mdl, trim(io_layout_nm), io_layout, &
-                 "The processor layout to be used, or 0,0 to automatically set the io_layout "//&
-                 "to be the same as the layout.", default=1, layoutParam=.true.)
+
+  ! Compute a valid IO layout if auto_mask_table is on. Otherwise, read in IO_LAYOUT parameter,
+  if (auto_mask_table) then
+    call get_param(param_file, mdl, "AUTO_IO_LAYOUT_FAC", auto_io_layout_fac, &
+            "When AUTO_MASKTABLE is enabled, io layout is calculated by performing integer "//&
+            "division of the runtime-determined domain layout with this factor. If the factor "//&
+            "is set to 0 (default), the io layout is set to 1,1.", &
+            default=0, layoutParam=.true.)
+    if (auto_io_layout_fac>0) then
+      io_layout(1) = max(layout(1)/auto_io_layout_fac, 1)
+      io_layout(2) = max(layout(2)/auto_io_layout_fac, 1)
+    elseif (auto_io_layout_fac<0) then
+      call MOM_error(FATAL, 'AUTO_IO_LAYOUT_FAC must be a nonnegative integer.')
+    endif
+    call log_param(param_file, mdl, trim(io_layout_nm), io_layout, &
+                   "The processor layout to be used, or 0,0 to automatically set the io_layout "//&
+                   "to be the same as the layout.", layoutParam=.true.)
+  else
+    call get_param(param_file, mdl, trim(io_layout_nm), io_layout, &
+                   "The processor layout to be used, or 0,0 to automatically set the io_layout "//&
+                   "to be the same as the layout.", default=1, layoutParam=.true.)
+  endif
 
   call create_MOM_domain(MOM_dom, n_global, n_halo, reentrant, tripolar_N, layout, &
                          io_layout=io_layout, domain_name=domain_name, mask_table=mask_table, &
@@ -387,4 +471,221 @@ subroutine MOM_define_layout(n_global, ndivs, layout)
   layout = (/ idiv, jdiv /)
 end subroutine MOM_define_layout
 
+!> Given a desired number of active npes, generate a layout and mask_table
+subroutine gen_auto_mask_table(n_global, reentrant, tripolar_N, npes, param_file, inputdir, filename, layout, US)
+  integer, dimension(2), intent(in)         :: n_global   !< The total number of gridpoints in 2 directions
+  logical, dimension(2), intent(in)         :: reentrant  !< True if the x- and y- directions are periodic.
+  logical,               intent(in)         :: tripolar_N !< A flag indicating whether there is n. tripolar connectivity
+  integer,               intent(in)         :: npes       !< The desired number of active PEs.
+  type(param_file_type), intent(in)         :: param_file !< A structure to parse for run-time parameters
+  character(len=128),    intent(in)         :: inputdir   !< INPUTDIR parameter
+  character(len=:), allocatable, intent(in) :: filename   !< Mask table file path (to be auto-generated.)
+  integer, dimension(2), intent(out)        :: layout     !< The generated layout of PEs (incl. masked blocks)
+  type(unit_scale_type), optional, pointer  :: US         !< A dimensional unit scaling type
+
+  ! Local variables
+  real, dimension(n_global(1), n_global(2)) :: D        ! Bathymetric depth (to be read in from TOPO_FILE) [Z ~> m]
+  integer, dimension(:,:), allocatable :: mask          ! Cell masks (based on D and MINIMUM_DEPTH)
+  character(len=200) :: topo_filepath, topo_file        ! Strings for file/path
+  character(len=200) :: topo_varname                    ! Variable name in file
+  character(len=200) :: topo_config
+  character(len=40)  :: mdl = "gen_auto_mask_table"      ! This subroutine's name.
+  integer :: i, j, p
+  real :: Dmask          ! The depth for masking in the same units as D             [Z ~> m]
+  real :: min_depth      ! The minimum ocean depth in the same units as D           [Z ~> m]
+  real :: mask_depth     ! The depth shallower than which to mask a point as land.  [Z ~> m]
+  real :: glob_ocn_frac  ! ratio of ocean points to total number of points          [nondim]
+  real :: r_p            ! aspect ratio for division count p.                       [nondim]
+  real :: m_to_Z         ! A conversion factor from m to height units           [Z m-1 ~> 1]
+  integer :: nx, ny      ! global domain sizes
+  integer, parameter :: ibuf=2, jbuf=2
+  real, parameter :: r_extreme = 4.0 ! aspect ratio limit (>1) for a layout to be considered [nondim]
+  integer :: num_masked_blocks
+  integer, allocatable :: mask_table(:,:)
+
+  m_to_Z = 1.0 ; if (present(US)) m_to_Z = US%m_to_Z
+
+  ! Read in params necessary for auto-masking
+  call get_param(param_file, mdl, "MINIMUM_DEPTH", min_depth, &
+                 units="m", default=0.0, scale=m_to_Z, do_not_log=.true.)
+  call get_param(param_file, mdl, "MASKING_DEPTH", mask_depth, &
+                 units="m", default=-9999.0, scale=m_to_Z, do_not_log=.true.)
+  call get_param(param_file, mdl, "TOPO_CONFIG", topo_config, default="file", do_not_log=.true.)
+  call get_param(param_file, mdl, "TOPO_FILE", topo_file, do_not_log=.true., default="topog.nc")
+  call get_param(param_file, mdl, "TOPO_VARNAME", topo_varname, do_not_log=.true., default="depth")
+  topo_filepath = trim(inputdir)//trim(topo_file)
+
+  ! Sanity checks
+  if (.not. is_root_pe()) then
+    call MOM_error(FATAL, 'gen_auto_mask_table should only be called by the root PE.')
+  endif
+  if (trim(topo_config) /= "file") then
+    call MOM_error(FATAL, 'Auto mask table only works with TOPO_CONFIG="file"')
+  endif
+  if (.not.file_exists(topo_filepath)) then
+    call MOM_error(FATAL, " gen_auto_mask_table: Unable to open "//trim(topo_filepath))
+  endif
+
+  nx = n_global(1)
+  ny = n_global(2)
+
+  ! Read in bathymetric depth.
+  D(:,:) = -9.0e30 * m_to_Z ! Initializing to a very large negative depth (tall mountains) everywhere.
+  call read_field(topo_filepath, trim(topo_varname), D, start=(/1, 1/), nread=n_global, no_domain=.true., &
+                  scale=m_to_Z)
+
+  allocate(mask(nx+2*ibuf, ny+2*jbuf), source=0)
+
+  ! Determine cell masks
+  Dmask = mask_depth
+  if (mask_depth == -9999.0*m_to_Z) Dmask = min_depth
+  do i=1,nx ; do j=1,ny
+    if (D(i,j) <= Dmask) then
+      mask(i+ibuf,j+jbuf) = 0
+    else
+      mask(i+ibuf,j+jbuf) = 1
+    endif
+  enddo ; enddo
+
+  ! fill in buffer cells
+
+  if (reentrant(1)) then ! REENTRANT_X
+    mask(1:ibuf, :) = mask(nx+1:nx+ibuf, :)
+    mask(ibuf+nx+1:nx+2*ibuf, :) = mask(ibuf+1:2*ibuf, :)
+  endif
+
+  if (reentrant(2)) then ! REENTRANT_Y
+    mask(:, 1:jbuf) = mask(:, ny+1:ny+jbuf)
+    mask(:, jbuf+ny+1:ny+2*jbuf) = mask(:, jbuf+1:2*jbuf)
+  endif
+
+  if (tripolar_N) then ! TRIPOLAR_N
+    do i=1,nx+2*ibuf
+      do j=1,jbuf
+        mask(i, jbuf+ny+j) = mask(nx+2*ibuf+1-i, jbuf+ny+1-j)
+      enddo
+    enddo
+  endif
+
+  ! Tripolar Stitch Fix: In cases where masking is asymmetrical across the tripolar stitch, there's a possibility
+  ! that certain unmasked blocks won't be able to obtain grid metrics from the halo points. This occurs when the
+  ! neighboring block on the opposite side of the tripolar stitch is masked. As a consequence, certain metrics like
+  ! dxT and dyT may be calculated through extrapolation (refer to extrapolate_metric), potentially leading to the
+  ! generation of non-positive values. This can result in divide-by-zero errors elsewhere, e.g., in MOM_hor_visc.F90.
+  ! Currently, the safest and most general solution is to prohibit masking along the tripolar stitch:
+  if (tripolar_N) then
+    mask(:, jbuf+ny) = 1
+  endif
+
+  glob_ocn_frac = real(sum(mask(1+ibuf:nx+ibuf, 1+jbuf:ny+jbuf))) / (nx * ny)
+
+  ! Iteratively check for all possible division counts starting from the upper bound of npes/glob_ocn_frac,
+  ! which is over-optimistic for realistic domains, but may be satisfied with idealized domains.
+  do p = ceiling(npes/glob_ocn_frac), npes, -1
+
+    ! compute the layout for the current division count, p
+    call MOM_define_layout(n_global, p, layout)
+
+    ! don't bother checking this p if the aspect ratio is extreme
+    r_p = (real(nx)/layout(1)) / (real(ny)/layout(2))
+    if ( r_p * r_extreme < 1 .or. r_extreme < r_p ) cycle
+
+    ! Get the number of masked_blocks for this particular division count
+    call determine_land_blocks(mask, nx, ny, layout(1), layout(2), ibuf, jbuf, num_masked_blocks)
+
+    ! If we can eliminate enough blocks to reach the target npes, adopt
+    ! this p (and the associated layout) and terminate the iteration.
+    if (p-num_masked_blocks <= npes) then
+      call MOM_error(NOTE, "Found the optimum layout for auto-masking. Terminating iteration...")
+      exit
+    endif
+  enddo
+
+  if (num_masked_blocks == 0) then
+    call MOM_error(FATAL, "Couldn't auto-eliminate any land blocks. Try to increase the number "//&
+        "of MOM6 PEs or set AUTO_MASKTABLE to False.")
+  endif
+
+  ! Call determine_land_blocks once again, this time to retrieve and write out the mask_table.
+  allocate(mask_table(num_masked_blocks,2))
+  call determine_land_blocks(mask, nx, ny, layout(1), layout(2), ibuf, jbuf, num_masked_blocks, mask_table)
+  call write_auto_mask_file(mask_table, layout, npes, filename)
+  deallocate(mask_table)
+  deallocate(mask)
+
+end subroutine gen_auto_mask_table
+
+!> Given a number of domain divisions, compute the max number of land blocks that can be eliminated,
+!! and return the resulting mask table if requested.
+subroutine determine_land_blocks(mask, nx, ny, idiv, jdiv, ibuf, jbuf, num_masked_blocks, mask_table)
+  integer, dimension(:,:), intent(in)   :: mask     !< cell masks based on depth and MINIMUM_DEPTH
+  integer, intent(in)                   :: nx       !< Total number of gridpoints in x-dir (global)
+  integer, intent(in)                   :: ny       !< Total number of gridpoints in y-dir (global)
+  integer, intent(in)                   :: idiv     !< number of divisions along x-dir
+  integer, intent(in)                   :: jdiv     !< number of divisions along y-dir
+  integer, intent(in)                   :: ibuf     !< number of buffer cells in x-dir.
+                                                    !! (not necessarily the same as NIHALO)
+  integer, intent(in)                   :: jbuf     !< number of buffer cells in y-dir.
+                                                    !! (not necessarily the same as NJHALO)
+  integer, intent(out)                  :: num_masked_blocks !< the final number of masked blocks
+  integer, intent(out), optional        :: mask_table(:,:) !< the resulting array of mask_table
+  ! integer
+  integer, dimension(idiv) :: ibegin   !< The starting index of each division along x axis
+  integer, dimension(idiv) :: iend     !< The ending index of each division along x axis
+  integer, dimension(jdiv) :: jbegin   !< The starting index of each division along y axis
+  integer, dimension(jdiv) :: jend     !< The ending index of each division along y axis
+  integer :: i, j, ib, ie, jb,je
+
+  call compute_extent(1, nx, idiv, ibegin, iend)
+  call compute_extent(1, ny, jdiv, jbegin, jend)
+
+  num_masked_blocks = 0
+
+  do i=1,idiv
+    ib = ibegin(i)
+    ie = iend(i) + 2 * ibuf
+    do j=1,jdiv
+      jb = jbegin(j)
+      je = jend(j) + 2 * jbuf
+
+      if (any(mask(ib:ie,jb:je)==1)) cycle
+
+      num_masked_blocks = num_masked_blocks + 1
+
+      if (present(mask_table)) then
+        if ( num_masked_blocks > size(mask_table, dim=1)) then
+          call MOM_error(FATAL, "The mask_table argument passed to determine_land_blocks() has insufficient size.")
+        endif
+
+        mask_table(num_masked_blocks,1) = i
+        mask_table(num_masked_blocks,2) = j
+      endif
+    enddo
+  enddo
+
+end subroutine determine_land_blocks
+
+!> Write out the auto-generated mask information to a file in the run directory.
+subroutine write_auto_mask_file(mask_table, layout, npes, filename)
+  integer, intent(in) :: mask_table(:,:)      !> mask table array to be written out.
+  integer, dimension(2), intent(in) :: layout !> PE layout
+  integer, intent(in) :: npes                 !> Number of divisions (incl. eliminated ones)
+  character(len=:), allocatable, intent(in) :: filename !> file name for the mask_table to be written
+  ! local
+  integer :: file_ascii= -1  !< The unit number of the auto-generated mask_file file.
+  integer :: true_num_masked_blocks
+  integer :: p
+
+  ! Eliminate only enough blocks to ensure that the number of active blocks precisely matches the target npes.
+  true_num_masked_blocks = layout(1) * layout(2) - npes
+
+  call open_ASCII_file(file_ascii, trim(filename), action=WRITEONLY_FILE)
+  write(file_ascii, '(I0)'), true_num_masked_blocks
+  write(file_ascii, '(I0,",",I0)'), layout(1), layout(2)
+  do p = 1, true_num_masked_blocks
+    write(file_ascii, '(I0,",",I0)'), mask_table(p,1), mask_table(p,2)
+  enddo
+  call close_file(file_ascii)
+end subroutine write_auto_mask_file
+
 end module MOM_domains
diff --git a/src/framework/MOM_netcdf.F90 b/src/framework/MOM_netcdf.F90
index 95e6aa7bb7..73f276aba9 100644
--- a/src/framework/MOM_netcdf.F90
+++ b/src/framework/MOM_netcdf.F90
@@ -217,6 +217,8 @@ function register_netcdf_field(handle, label, axes, longname, units) &
   allocate(dimids(size(axes)))
   dimids(:) = [(axes(i)%dimid, i = 1, size(axes))]
 
+  field%label = label
+
   ! Determine the corresponding netCDF data type
   ! TODO: Support a `pack`-like argument
   select case (kind(1.0))
@@ -225,7 +227,7 @@ function register_netcdf_field(handle, label, axes, longname, units) &
     case (real64)
       xtype = NF90_DOUBLE
     case default
-      call MOM_error(FATAL, "register_netcdf_axis: Unknown kind(real).")
+      call MOM_error(FATAL, "register_netcdf_field: Unknown kind(real).")
   end select
 
   ! Register the field variable
@@ -293,6 +295,8 @@ function register_netcdf_axis(handle, label, units, longname, points, &
         "Axis must either have explicit points or be a time axis ('T').")
   endif
 
+  axis%label = label
+
   if (present(points)) then
     axis_size = size(points)
     allocate(axis%points(axis_size))
diff --git a/src/ice_shelf/MOM_ice_shelf.F90 b/src/ice_shelf/MOM_ice_shelf.F90
index b435b0a677..b9640502d2 100644
--- a/src/ice_shelf/MOM_ice_shelf.F90
+++ b/src/ice_shelf/MOM_ice_shelf.F90
@@ -1323,7 +1323,7 @@ subroutine initialize_ice_shelf(param_file, ocn_grid, Time, CS, diag, Time_init,
   wd_halos(:)=0
   allocate(CS%Grid)
   call MOM_domains_init(CS%Grid%domain, param_file, min_halo=wd_halos, symmetric=GRID_SYM_,&
-       domain_name='MOM_Ice_Shelf_in')
+                        domain_name='MOM_Ice_Shelf_in', US=CS%US)
   !allocate(CS%Grid_in%HI)
   !call hor_index_init(CS%Grid%Domain, CS%Grid%HI, param_file, &
   !     local_indexing=.not.global_indexing)
diff --git a/src/ocean_data_assim/MOM_oda_driver.F90 b/src/ocean_data_assim/MOM_oda_driver.F90
index 875051b6c7..6e24b9faee 100644
--- a/src/ocean_data_assim/MOM_oda_driver.F90
+++ b/src/ocean_data_assim/MOM_oda_driver.F90
@@ -291,7 +291,7 @@ subroutine init_oda(Time, G, GV, US, diag_CS, CS)
   CS%G => G
   allocate(CS%Grid)
   ! params NIHALO_ODA, NJHALO_ODA set the DA halo size
-  call MOM_domains_init(CS%Grid%Domain,PF,param_suffix='_ODA')
+  call MOM_domains_init(CS%Grid%Domain, PF, param_suffix='_ODA', US=CS%US)
   allocate(HI)
   call hor_index_init(CS%Grid%Domain, HI, PF)
   call verticalGridInit( PF, CS%GV, CS%US )
diff --git a/src/parameterizations/lateral/MOM_hor_visc.F90 b/src/parameterizations/lateral/MOM_hor_visc.F90
index e3249afb73..1b73a7ec12 100644
--- a/src/parameterizations/lateral/MOM_hor_visc.F90
+++ b/src/parameterizations/lateral/MOM_hor_visc.F90
@@ -75,6 +75,8 @@ module MOM_hor_visc
                              !! Ah is the background. Leithy = Leith+E
   real    :: c_K             !< Fraction of energy dissipated by the biharmonic term
                              !! that gets backscattered in the Leith+E scheme. [nondim]
+  logical :: smooth_Ah       !< If true (default), then Ah and m_leithy are smoothed.
+                             !! This smoothing requires a lot of blocking communication.
   logical :: use_QG_Leith_visc    !< If true, use QG Leith nonlinear eddy viscosity.
                              !! KH is the background value.
   logical :: bound_Coriolis  !< If true & SMAGORINSKY_AH is used, the biharmonic
@@ -271,16 +273,14 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     vort_xy_dy, & ! y-derivative of vertical vorticity (d/dy(dv/dx - du/dy)) [L-1 T-1 ~> m-1 s-1]
     vort_xy_dy_smooth, & ! y-derivative of smoothed vertical vorticity [L-1 T-1 ~> m-1 s-1]
     div_xx_dx, &  ! x-derivative of horizontal divergence (d/dx(du/dx + dv/dy)) [L-1 T-1 ~> m-1 s-1]
-    ubtav, &      ! zonal barotropic velocity averaged over a baroclinic time-step [L T-1 ~> m s-1]
-    u_smooth      ! Zonal velocity, smoothed with a spatial low-pass filter [L T-1 ~> m s-1]
+    ubtav         ! zonal barotropic velocity averaged over a baroclinic time-step [L T-1 ~> m s-1]
   real, dimension(SZI_(G),SZJB_(G)) :: &
     Del2v, &      ! The v-component of the Laplacian of velocity [L-1 T-1 ~> m-1 s-1]
     h_v, &        ! Thickness interpolated to v points [H ~> m or kg m-2].
     vort_xy_dx, & ! x-derivative of vertical vorticity (d/dx(dv/dx - du/dy)) [L-1 T-1 ~> m-1 s-1]
     vort_xy_dx_smooth, & ! x-derivative of smoothed vertical vorticity [L-1 T-1 ~> m-1 s-1]
     div_xx_dy, &  ! y-derivative of horizontal divergence (d/dy(du/dx + dv/dy)) [L-1 T-1 ~> m-1 s-1]
-    vbtav, &      ! meridional barotropic velocity averaged over a baroclinic time-step [L T-1 ~> m s-1]
-    v_smooth      ! Meridional velocity, smoothed with a spatial low-pass filter [L T-1 ~> m s-1]
+    vbtav         ! meridional barotropic velocity averaged over a baroclinic time-step [L T-1 ~> m s-1]
   real, dimension(SZI_(G),SZJ_(G)) :: &
     dudx_bt, dvdy_bt, & ! components in the barotropic horizontal tension [T-1 ~> s-1]
     div_xx, &     ! Estimate of horizontal divergence at h-points [T-1 ~> s-1]
@@ -298,8 +298,9 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     dudx, dvdy, &    ! components in the horizontal tension [T-1 ~> s-1]
     dudx_smooth, dvdy_smooth, & ! components in the horizontal tension from smoothed velocity [T-1 ~> s-1]
     GME_effic_h, &  ! The filtered efficiency of the GME terms at h points [nondim]
-    htot, &       ! The total thickness of all layers [H ~> m or kg m-2]
-    m_leithy      ! Kh=m_leithy*Ah in Leith+E parameterization [L-2 ~> m-2]
+    m_leithy, &   ! Kh=m_leithy*Ah in Leith+E parameterization [L-2 ~> m-2]
+    Ah_sq, &      ! The square of the biharmonic viscosity [L8 T-2 ~> m8 s-2]
+    htot          ! The total thickness of all layers [H ~> m or kg m-2]
   real :: Del2vort_h ! Laplacian of vorticity at h-points [L-2 T-1 ~> m-2 s-1]
   real :: grad_vel_mag_bt_h ! Magnitude of the barotropic velocity gradient tensor squared at h-points [T-2 ~> s-2]
   real :: boundary_mask_h ! A mask that zeroes out cells with at least one land edge [nondim]
@@ -322,9 +323,9 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     grad_vort_mag_q_2d, & ! Magnitude of 2d vorticity gradient at q-points [L-1 T-1 ~> m-1 s-1]
     Del2vort_q, & ! Laplacian of vorticity at q-points [L-2 T-1 ~> m-2 s-1]
     grad_div_mag_q, &  ! Magnitude of divergence gradient at q-points [L-1 T-1 ~> m-1 s-1]
-    hq, &         ! harmonic mean of the harmonic means of the u- & v point thicknesses [H ~> m or kg m-2]
-                  ! This form guarantees that hq/hu < 4.
-    GME_effic_q   ! The filtered efficiency of the GME terms at q points [nondim]
+    hq, &          ! harmonic mean of the harmonic means of the u- & v point thicknesses [H ~> m or kg m-2]
+                   ! This form guarantees that hq/hu < 4.
+    GME_effic_q    ! The filtered efficiency of the GME terms at q points [nondim]
   real :: grad_vel_mag_bt_q ! Magnitude of the barotropic velocity gradient tensor squared at q-points [T-2 ~> s-2]
   real :: boundary_mask_q ! A mask that zeroes out cells with at least one land edge [nondim]
 
@@ -351,7 +352,10 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     grid_Re_Kh, &    ! Grid Reynolds number for Laplacian horizontal viscosity at h points [nondim]
     grid_Re_Ah, &    ! Grid Reynolds number for Biharmonic horizontal viscosity at h points [nondim]
     GME_coeff_h      ! GME coefficient at h-points [L2 T-1 ~> m2 s-1]
-
+  real, dimension(SZIB_(G),SZJ_(G),SZK_(GV)) :: &
+    u_smooth         ! Zonal velocity, smoothed with a spatial low-pass filter [L T-1 ~> m s-1]
+  real, dimension(SZI_(G),SZJB_(G),SZK_(GV)) :: &
+    v_smooth         ! Meridional velocity, smoothed with a spatial low-pass filter [L T-1 ~> m s-1]
   real :: AhSm       ! Smagorinsky biharmonic viscosity [L4 T-1 ~> m4 s-1]
   real :: AhLth      ! 2D Leith biharmonic viscosity [L4 T-1 ~> m4 s-1]
   real :: AhLthy     ! 2D Leith+E biharmonic viscosity [L4 T-1 ~> m4 s-1]
@@ -391,6 +395,8 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
   logical :: apply_OBC = .false.
   logical :: use_MEKE_Ku
   logical :: use_MEKE_Au
+  integer :: is_vort, ie_vort, js_vort, je_vort  ! Loop ranges for vorticity terms
+  integer :: is_Kh, ie_Kh, js_Kh, je_Kh  ! Loop ranges for thickness point viscosities
   integer :: is, ie, js, je, Isq, Ieq, Jsq, Jeq, nz
   integer :: i, j, k, n
   real :: inv_PI3, inv_PI2, inv_PI6 ! Powers of the inverse of pi [nondim]
@@ -457,6 +463,22 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
                           "RES_SCALE_MEKE_VISC is True.")
   endif
 
+  ! Set the halo sizes used for the thickness-point viscosities.
+  if (CS%use_Leithy) then
+    js_Kh = js-1 ; je_Kh = je+1 ; is_Kh = is-1 ; ie_Kh = ie+1
+  else
+    js_Kh = Jsq ; je_Kh = je+1 ; is_Kh = Isq ; ie_Kh = ie+1
+  endif
+
+  ! Set the halo sizes used for the vorticity calculations.
+  if ((CS%Leith_Kh) .or. (CS%Leith_Ah) .or. (CS%use_Leithy)) then
+    js_vort = js_Kh-2 ; je_vort = Jeq+2 ; is_vort = is_Kh-2 ; ie_vort = Ieq+2
+    if ((G%isc-G%isd < 3) .or. (G%isc-G%isd < 3)) call MOM_error(FATAL, &
+          "The minimum halo size is 3 when a Leith viscosity is being used.")
+  else
+    js_vort = js-2 ; je_vort = Jeq+1 ; is_vort = is-2 ; ie_vort = Ieq+1
+  endif
+
   legacy_bound = (CS%Smagorinsky_Kh .or. CS%Leith_Kh) .and. &
                  (CS%bound_Kh .and. .not.CS%better_bound_Kh)
 
@@ -475,7 +497,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     call pass_var(h, G%domain, halo=2)
 
     ! Calculate the barotropic horizontal tension
-    do J=js-2,je+2 ; do I=is-2,ie+2
+    do j=js-2,je+2 ; do i=is-2,ie+2
       dudx_bt(i,j) = CS%DY_dxT(i,j)*(G%IdyCu(I,j) * ubtav(I,j) - &
                                      G%IdyCu(I-1,j) * ubtav(I-1,j))
       dvdy_bt(i,j) = CS%DX_dyT(i,j)*(G%IdxCv(i,J) * vbtav(i,J) - &
@@ -494,11 +516,11 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     enddo ; enddo
 
     if (CS%no_slip) then
-      do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+      do J=js-2,je+1 ; do I=is-2,ie+1
         sh_xy_bt(I,J) = (2.0-G%mask2dBu(I,J)) * ( dvdx_bt(I,J) + dudy_bt(I,J) )
       enddo ; enddo
     else
-      do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+      do J=js-2,je+1 ; do I=is-2,ie+1
         sh_xy_bt(I,J) = G%mask2dBu(I,J) * ( dvdx_bt(I,J) + dudy_bt(I,J) )
       enddo ; enddo
     endif
@@ -549,12 +571,24 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
   endif ! use_GME
 
+  if (CS%use_Leithy) then
+    ! Smooth the velocity. Right now it happens twice. In the future
+    ! one might make the number of smoothing cycles a user-specified parameter
+    do k=1,nz
+      ! One call applies the filter twice
+      u_smooth(:,:,k) = u(:,:,k)
+      v_smooth(:,:,k) = v(:,:,k)
+      call smooth_x9_uv(G, u_smooth(:,:,k), v_smooth(:,:,k), zero_land=.false.)
+    enddo
+    call pass_vector(u_smooth, v_smooth, G%Domain)
+  endif
+
   !$OMP parallel do default(none) &
   !$OMP shared( &
   !$OMP   CS, G, GV, US, OBC, VarMix, MEKE, u, v, h, &
-  !$OMP   is, ie, js, je, Isq, Ieq, Jsq, Jeq, nz, &
-  !$OMP   apply_OBC, rescale_Kh, legacy_bound, find_FrictWork, &
-  !$OMP   use_MEKE_Ku, use_MEKE_Au, &
+  !$OMP   is, ie, js, je, Isq, Ieq, Jsq, Jeq, nz, is_vort, ie_vort, js_vort, je_vort, &
+  !$OMP   is_Kh, ie_Kh, js_Kh, je_Kh, apply_OBC, rescale_Kh, legacy_bound, find_FrictWork, &
+  !$OMP   use_MEKE_Ku, use_MEKE_Au, u_smooth, v_smooth, &
   !$OMP   backscat_subround, GME_effic_h, GME_effic_q, &
   !$OMP   h_neglect, h_neglect3, inv_PI3, inv_PI6, &
   !$OMP   diffu, diffv, Kh_h, Kh_q, Ah_h, Ah_q, FrictWork, FrictWork_GME, &
@@ -577,8 +611,8 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
   !$OMP   h2uq, h2vq, hu, hv, hq, FatH, RoScl, GME_coeff, &
   !$OMP   dudx_smooth, dudy_smooth, dvdx_smooth, dvdy_smooth, &
   !$OMP   vort_xy_smooth, vort_xy_dx_smooth, vort_xy_dy_smooth, &
-  !$OMP   sh_xx_smooth, sh_xy_smooth, u_smooth, v_smooth, &
-  !$OMP   vert_vort_mag_smooth, m_leithy, AhLthy &
+  !$OMP   sh_xx_smooth, sh_xy_smooth, &
+  !$OMP   vert_vort_mag_smooth, m_leithy, Ah_sq, AhLthy &
   !$OMP )
   do k=1,nz
 
@@ -602,37 +636,32 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     enddo ; enddo
 
     ! Components for the shearing strain
-    do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+    do J=js_vort,je_vort ; do I=is_vort,ie_vort
       dvdx(I,J) = CS%DY_dxBu(I,J)*(v(i+1,J,k)*G%IdyCv(i+1,J) - v(i,J,k)*G%IdyCv(i,J))
       dudy(I,J) = CS%DX_dyBu(I,J)*(u(I,j+1,k)*G%IdxCu(I,j+1) - u(I,j,k)*G%IdxCu(I,j))
     enddo ; enddo
 
     if (CS%use_Leithy) then
-      ! Smooth the velocity. Right now it happens twice. In the future
-      ! one might make the number of smoothing cycles a user-specified parameter
-      u_smooth(:,:) = u(:,:,k)
-      v_smooth(:,:) = v(:,:,k)
-      call smooth_x9(CS, G, field_u=u_smooth,field_v=v_smooth) ! one call applies the filter twice
       ! Calculate horizontal tension from smoothed velocity
-      do j=Jsq-1,Jeq+2 ; do i=Isq-1,Ieq+2
-        dudx_smooth(i,j) = CS%DY_dxT(i,j)*(G%IdyCu(I,j) * u_smooth(I,j) - &
-                                           G%IdyCu(I-1,j) * u_smooth(I-1,j))
-        dvdy_smooth(i,j) = CS%DX_dyT(i,j)*(G%IdxCv(i,J) * v_smooth(i,J) - &
-                                           G%IdxCv(i,J-1) * v_smooth(i,J-1))
+      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        dudx_smooth(i,j) = CS%DY_dxT(i,j)*(G%IdyCu(I,j) * u_smooth(I,j,k) - &
+                                           G%IdyCu(I-1,j) * u_smooth(I-1,j,k))
+        dvdy_smooth(i,j) = CS%DX_dyT(i,j)*(G%IdxCv(i,J) * v_smooth(i,J,k) - &
+                                           G%IdxCv(i,J-1) * v_smooth(i,J-1,k))
         sh_xx_smooth(i,j) = dudx_smooth(i,j) - dvdy_smooth(i,j)
       enddo ; enddo
 
       ! Components for the shearing strain from smoothed velocity
-      do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+      do J=js_Kh-1,je_Kh ; do I=is_Kh-1,ie_Kh
         dvdx_smooth(I,J) = CS%DY_dxBu(I,J) * &
-                         (v_smooth(i+1,J)*G%IdyCv(i+1,J) - v_smooth(i,J)*G%IdyCv(i,J))
+                         (v_smooth(i+1,J,k)*G%IdyCv(i+1,J) - v_smooth(i,J,k)*G%IdyCv(i,J))
         dudy_smooth(I,J) = CS%DX_dyBu(I,J) * &
-                         (u_smooth(I,j+1)*G%IdxCu(I,j+1) - u_smooth(I,j)*G%IdxCu(I,j))
+                         (u_smooth(I,j+1,k)*G%IdxCu(I,j+1) - u_smooth(I,j,k)*G%IdxCu(I,j))
       enddo ; enddo
-    end if ! use Leith+E
+    endif ! use Leith+E
 
     if (CS%id_normstress > 0) then
-      do j=Jsq-1,Jeq+2 ; do i=Isq-1,Ieq+2
+      do j=js,je ; do i=is,ie
         NoSt(i,j,k) = sh_xx(i,j)
       enddo ; enddo
     endif
@@ -643,17 +672,17 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     ! even with OBCs if the accelerations are zeroed at OBC points, in which
     ! case the j-loop for h_u could collapse to j=js=1,je+1. -RWH
     if (CS%use_land_mask) then
-      do j=js-2,je+2 ; do I=Isq-1,Ieq+1
+      do j=js-2,je+2 ; do I=is-2,Ieq+1
         h_u(I,j) = 0.5 * (G%mask2dT(i,j)*h(i,j,k) + G%mask2dT(i+1,j)*h(i+1,j,k))
       enddo ; enddo
-      do J=Jsq-1,Jeq+1 ; do i=is-2,ie+2
+      do J=js-2,Jeq+1 ; do i=is-2,ie+2
         h_v(i,J) = 0.5 * (G%mask2dT(i,j)*h(i,j,k) + G%mask2dT(i,j+1)*h(i,j+1,k))
       enddo ; enddo
     else
-      do j=js-2,je+2 ; do I=Isq-1,Ieq+1
+      do j=js-2,je+2 ; do I=is-2,Ieq+1
         h_u(I,j) = 0.5 * (h(i,j,k) + h(i+1,j,k))
       enddo ; enddo
-      do J=Jsq-1,Jeq+1 ; do i=is-2,ie+2
+      do J=js-2,Jeq+1 ; do i=is-2,ie+2
         h_v(i,J) = 0.5 * (h(i,j,k) + h(i,j+1,k))
       enddo ; enddo
     endif
@@ -663,8 +692,8 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     if (apply_OBC) then ; do n=1,OBC%number_of_segments
       J = OBC%segment(n)%HI%JsdB ; I = OBC%segment(n)%HI%IsdB
       if (OBC%zero_strain .or. OBC%freeslip_strain .or. OBC%computed_strain) then
-        if (OBC%segment(n)%is_N_or_S .and. (J >= js-2) .and. (J <= Jeq+1)) then
-          do I=OBC%segment(n)%HI%IsdB,OBC%segment(n)%HI%IedB
+        if (OBC%segment(n)%is_N_or_S .and. (J >= Js_vort) .and. (J <= Je_vort)) then
+          do I = max(OBC%segment(n)%HI%IsdB,Is_vort), min(OBC%segment(n)%HI%IedB,Ie_vort)
             if (OBC%zero_strain) then
               dvdx(I,J) = 0. ; dudy(I,J) = 0.
             elseif (OBC%freeslip_strain) then
@@ -684,9 +713,13 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
                 dudy(I,J) = CS%DX_dyBu(I,J)*OBC%segment(n)%tangential_grad(I,J,k)*G%IdxCu(I,j+1)*G%dxBu(I,J)
               endif
             endif
+            if (CS%use_Leithy) then
+              dvdx_smooth(I,J) = dvdx(I,J)
+              dudy_smooth(I,J) = dudy(I,J)
+            endif
           enddo
-        elseif (OBC%segment(n)%is_E_or_W .and. (I >= is-2) .and. (I <= Ieq+1)) then
-          do J=OBC%segment(n)%HI%JsdB,OBC%segment(n)%HI%JedB
+        elseif (OBC%segment(n)%is_E_or_W .and. (I >= is_vort) .and. (I <= ie_vort)) then
+          do J = max(OBC%segment(n)%HI%JsdB,js_vort), min(OBC%segment(n)%HI%JedB,je_vort)
             if (OBC%zero_strain) then
               dvdx(I,J) = 0. ; dudy(I,J) = 0.
             elseif (OBC%freeslip_strain) then
@@ -706,6 +739,10 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
                 dvdx(I,J) = CS%DY_dxBu(I,J)*OBC%segment(n)%tangential_grad(I,J,k)*G%IdyCv(i+1,J)*G%dxBu(I,J)
               endif
             endif
+            if (CS%use_Leithy) then
+              dvdx_smooth(I,J) = dvdx(I,J)
+              dudy_smooth(I,J) = dudy(I,J)
+            endif
           enddo
         endif
       endif
@@ -715,25 +752,25 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
         ! OBC projections, but they might not be necessary if the accelerations
         ! are always zeroed out at OBC points, in which case the i-loop below
         ! becomes do i=is-1,ie+1. -RWH
-        if ((J >= Jsq-1) .and. (J <= Jeq+1)) then
+        if ((J >= js-2) .and. (J <= Jeq+1)) then
           do i = max(is-2,OBC%segment(n)%HI%isd), min(ie+2,OBC%segment(n)%HI%ied)
             h_v(i,J) = h(i,j,k)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_S) then
-        if ((J >= Jsq-1) .and. (J <= Jeq+1)) then
+        if ((J >= js-2) .and. (J <= Jeq+1)) then
           do i = max(is-2,OBC%segment(n)%HI%isd), min(ie+2,OBC%segment(n)%HI%ied)
             h_v(i,J) = h(i,j+1,k)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_E) then
-        if ((I >= Isq-1) .and. (I <= Ieq+1)) then
+        if ((I >= is-2) .and. (I <= Ieq+1)) then
           do j = max(js-2,OBC%segment(n)%HI%jsd), min(je+2,OBC%segment(n)%HI%jed)
             h_u(I,j) = h(i,j,k)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_W) then
-        if ((I >= Isq-1) .and. (I <= Ieq+1)) then
+        if ((I >= is-2) .and. (I <= Ieq+1)) then
           do j = max(js-2,OBC%segment(n)%HI%jsd), min(je+2,OBC%segment(n)%HI%jed)
             h_u(I,j) = h(i+1,j,k)
           enddo
@@ -745,25 +782,25 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       J = OBC%segment(n)%HI%JsdB ; I = OBC%segment(n)%HI%IsdB
       if (OBC%segment(n)%direction == OBC_DIRECTION_N) then
         if ((J >= js-2) .and. (J <= je)) then
-          do I = max(Isq-1,OBC%segment(n)%HI%IsdB), min(Ieq+1,OBC%segment(n)%HI%IedB)
+          do I = max(is-2,OBC%segment(n)%HI%IsdB), min(Ieq+1,OBC%segment(n)%HI%IedB)
             h_u(I,j+1) = h_u(I,j)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_S) then
         if ((J >= js-1) .and. (J <= je+1)) then
-          do I = max(Isq-1,OBC%segment(n)%HI%isd), min(Ieq+1,OBC%segment(n)%HI%ied)
+          do I = max(is-2,OBC%segment(n)%HI%isd), min(Ieq+1,OBC%segment(n)%HI%ied)
             h_u(I,j) = h_u(I,j+1)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_E) then
         if ((I >= is-2) .and. (I <= ie)) then
-          do J = max(Jsq-1,OBC%segment(n)%HI%jsd), min(Jeq+1,OBC%segment(n)%HI%jed)
+          do J = max(js-2,OBC%segment(n)%HI%jsd), min(Jeq+1,OBC%segment(n)%HI%jed)
             h_v(i+1,J) = h_v(i,J)
           enddo
         endif
       elseif (OBC%segment(n)%direction == OBC_DIRECTION_W) then
         if ((I >= is-1) .and. (I <= ie+1)) then
-          do J = max(Jsq-1,OBC%segment(n)%HI%jsd), min(Jeq+1,OBC%segment(n)%HI%jed)
+          do J = max(js-2,OBC%segment(n)%HI%jsd), min(Jeq+1,OBC%segment(n)%HI%jed)
             h_v(i,J) = h_v(i+1,J)
           enddo
         endif
@@ -788,11 +825,11 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! Shearing strain (including no-slip boundary conditions at the 2-D land-sea mask).
       ! dudy_smooth and dvdx_smooth do not (yet) include modifications at OBCs from above.
       if (CS%no_slip) then
-        do J=js-2,Jeq+1 ; do I=is-2,Ieq+1
+        do J=js-1,Jeq ; do I=is-1,Ieq
           sh_xy_smooth(I,J) = (2.0-G%mask2dBu(I,J)) * ( dvdx_smooth(I,J) + dudy_smooth(I,J) )
         enddo ; enddo
       else
-        do J=js-2,Jeq+1 ; do I=is-2,Ieq+1
+        do J=js-1,Jeq ; do I=is-1,Ieq
           sh_xy_smooth(I,J) = G%mask2dBu(I,J) * ( dvdx_smooth(I,J) + dudy_smooth(I,J) )
         enddo ; enddo
       endif
@@ -825,55 +862,53 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     endif
 
     ! Vorticity
-    if (CS%no_slip) then
-      do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
-        vort_xy(I,J) = (2.0-G%mask2dBu(I,J)) * ( dvdx(I,J) - dudy(I,J) )
-      enddo ; enddo
-    else
-      do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
-        vort_xy(I,J) = G%mask2dBu(I,J) * ( dvdx(I,J) - dudy(I,J) )
-      enddo ; enddo
+    if ((CS%Leith_Kh) .or. (CS%Leith_Ah) .or. (CS%use_Leithy) .or. (CS%id_vort_xy_q>0)) then
+      if (CS%no_slip) then
+        do J=js_vort,je_vort ; do I=is_vort,ie_vort
+          vort_xy(I,J) = (2.0-G%mask2dBu(I,J)) * ( dvdx(I,J) - dudy(I,J) )
+        enddo ; enddo
+      else
+        do J=js_vort,je_vort ; do I=is_vort,ie_vort
+          vort_xy(I,J) = G%mask2dBu(I,J) * ( dvdx(I,J) - dudy(I,J) )
+        enddo ; enddo
+      endif
     endif
 
     if (CS%use_Leithy) then
       if (CS%no_slip) then
-        do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+        do J=js_Kh-1,je_Kh ; do I=is_Kh-1,ie_Kh
           vort_xy_smooth(I,J) = (2.0-G%mask2dBu(I,J)) * ( dvdx_smooth(I,J) - dudy_smooth(I,J) )
         enddo ; enddo
       else
-        do J=Jsq-2,Jeq+2 ; do I=Isq-2,Ieq+2
+        do J=js_Kh-1,je_Kh ; do I=is_Kh-1,ie_Kh
           vort_xy_smooth(I,J) = G%mask2dBu(I,J) * ( dvdx_smooth(I,J) - dudy_smooth(I,J) )
         enddo ; enddo
       endif
     endif
 
-    ! Divergence
-    do j=Jsq-1,Jeq+2 ; do i=Isq-1,Ieq+2
-      div_xx(i,j) = dudx(i,j) + dvdy(i,j)
-    enddo ; enddo
 
     if ((CS%Leith_Kh) .or. (CS%Leith_Ah) .or. (CS%use_Leithy)) then
 
       ! Vorticity gradient
-      do J=Jsq-1,Jeq+1 ; do i=Isq-1,Ieq+2
+      do J=js-2,je_Kh ; do i=is_Kh-1,ie_Kh+1
         DY_dxBu = G%dyBu(I,J) * G%IdxBu(I,J)
         vort_xy_dx(i,J) = DY_dxBu * (vort_xy(I,J) * G%IdyCu(I,j) - vort_xy(I-1,J) * G%IdyCu(I-1,j))
       enddo ; enddo
 
-      do j=Jsq-1,Jeq+2 ; do I=Isq-1,Ieq+1
+      do j=js_Kh-1,je_Kh+1 ; do I=is-2,ie_Kh
         DX_dyBu = G%dxBu(I,J) * G%IdyBu(I,J)
         vort_xy_dy(I,j) = DX_dyBu * (vort_xy(I,J) * G%IdxCv(i,J) - vort_xy(I,J-1) * G%IdxCv(i,J-1))
       enddo ; enddo
 
       if (CS%use_Leithy) then
         ! Gradient of smoothed vorticity
-        do J=Jsq-1,Jeq+1 ; do i=Isq-1,Ieq+2
+        do J=js_Kh-1,je_Kh ; do i=is_Kh,ie_Kh
           DY_dxBu = G%dyBu(I,J) * G%IdxBu(I,J)
           vort_xy_dx_smooth(i,J) = DY_dxBu * &
                       (vort_xy_smooth(I,J) * G%IdyCu(I,j) - vort_xy_smooth(I-1,J) * G%IdyCu(I-1,j))
         enddo ; enddo
 
-        do j=Jsq-1,Jeq+2 ; do I=Isq-1,Ieq+1
+        do j=js_Kh,je_Kh ; do I=is_Kh-1,ie_Kh
           DX_dyBu = G%dxBu(I,J) * G%IdyBu(I,J)
           vort_xy_dy_smooth(I,j) = DX_dyBu * &
                       (vort_xy_smooth(I,J) * G%IdxCv(i,J) - vort_xy_smooth(I,J-1) * G%IdxCv(i,J-1))
@@ -881,46 +916,53 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       endif ! If Leithy
 
       ! Laplacian of vorticity
-      do J=Jsq-1,Jeq+1 ; do I=Isq-1,Ieq+1
+      ! if (CS%Leith_Ah .or. CS%use_Leithy) then
+      do J=js_Kh-1,je_Kh ; do I=is_Kh-1,ie_Kh
         DY_dxBu = G%dyBu(I,J) * G%IdxBu(I,J)
         DX_dyBu = G%dxBu(I,J) * G%IdyBu(I,J)
 
         Del2vort_q(I,J) = DY_dxBu * (vort_xy_dx(i+1,J) * G%IdyCv(i+1,J) - vort_xy_dx(i,J) * G%IdyCv(i,J)) + &
                           DX_dyBu * (vort_xy_dy(I,j+1) * G%IdyCu(I,j+1) - vort_xy_dy(I,j) * G%IdyCu(I,j))
       enddo ; enddo
+      ! endif
 
       if (CS%modified_Leith) then
 
+        ! Divergence
+        do j=js_Kh-1,je_Kh+1 ; do i=is_Kh-1,ie_Kh+1
+          div_xx(i,j) = dudx(i,j) + dvdy(i,j)
+        enddo ; enddo
+
         ! Divergence gradient
-        do j=Jsq-1,Jeq+2 ; do I=Isq-1,Ieq+1
+        do j=js-1,je+1 ; do I=is_Kh-1,ie_Kh
           div_xx_dx(I,j) = G%IdxCu(I,j)*(div_xx(i+1,j) - div_xx(i,j))
         enddo ; enddo
-        do J=Jsq-1,Jeq+1 ; do i=Isq-1,Ieq+2
+        do J=js_Kh-1,je_Kh ; do i=is-1,ie+1
           div_xx_dy(i,J) = G%IdyCv(i,J)*(div_xx(i,j+1) - div_xx(i,j))
         enddo ; enddo
 
         ! Magnitude of divergence gradient
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           grad_div_mag_h(i,j) = sqrt((0.5*(div_xx_dx(I,j) + div_xx_dx(I-1,j)))**2 + &
                                      (0.5*(div_xx_dy(i,J) + div_xx_dy(i,J-1)))**2)
         enddo ; enddo
-        do j=Jsq-1,Jeq+1 ; do i=Isq-1,Ieq+1
+        do J=js-1,Jeq ; do I=is-1,Ieq
           grad_div_mag_q(I,J) = sqrt((0.5*(div_xx_dx(I,j) + div_xx_dx(I,j+1)))**2 + &
                                      (0.5*(div_xx_dy(i,J) + div_xx_dy(i+1,J)))**2)
         enddo ; enddo
 
       else
 
-        do j=Jsq-1,Jeq+2 ; do I=is-2,Ieq+1
+        do j=js-1,je+1 ; do I=is_Kh-1,ie_Kh
           div_xx_dx(I,j) = 0.0
         enddo ; enddo
-        do J=Jsq-1,Jeq+1 ; do i=Isq-1,Ieq+2
+        do J=js_Kh-1,je_Kh ; do i=is-1,ie+1
           div_xx_dy(i,J) = 0.0
         enddo ; enddo
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           grad_div_mag_h(i,j) = 0.0
         enddo ; enddo
-        do J=Jsq-1,Jeq+1 ; do I=Isq-1,Ieq+1
+        do J=js-1,Jeq ; do I=is-1,Ieq
           grad_div_mag_q(I,J) = 0.0
         enddo ; enddo
 
@@ -928,17 +970,17 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
       ! Add in beta for the Leith viscosity
       if (CS%use_beta_in_Leith) then
-        do J=js-2,Jeq+1 ; do i=is-1,Ieq+1
+        do J=js-2,Jeq+1 ; do i=is-1,ie+1
           vort_xy_dx(i,J) = vort_xy_dx(i,J) + 0.5 * ( G%dF_dx(i,j) + G%dF_dx(i,j+1))
         enddo ; enddo
-        do j=js-1,Jeq+1 ; do I=is-2,Ieq+1
+        do j=js-1,je+1 ; do I=is-2,Ieq+1
           vort_xy_dy(I,j) = vort_xy_dy(I,j) + 0.5 * ( G%dF_dy(i,j) + G%dF_dy(i+1,j))
         enddo ; enddo
       endif ! CS%use_beta_in_Leith
 
       if (CS%use_QG_Leith_visc) then
 
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           grad_vort_mag_h_2d(i,j) = SQRT((0.5*(vort_xy_dx(i,J) + vort_xy_dx(i,J-1)))**2 + &
                                          (0.5*(vort_xy_dy(I,j) + vort_xy_dy(I-1,j)))**2 )
         enddo ; enddo
@@ -953,7 +995,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
       endif
 
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         grad_vort_mag_h(i,j) = SQRT((0.5*(vort_xy_dx(i,J) + vort_xy_dx(i,J-1)))**2 + &
                                     (0.5*(vort_xy_dy(I,j) + vort_xy_dy(I-1,j)))**2 )
       enddo ; enddo
@@ -963,7 +1005,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       enddo ; enddo
 
       if (CS%use_Leithy) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           vert_vort_mag_smooth(i,j) = SQRT((0.5*(vort_xy_dx_smooth(i,J) + &
                                                  vort_xy_dx_smooth(i,J-1)))**2 + &
                                            (0.5*(vort_xy_dy_smooth(I,j) + &
@@ -974,7 +1016,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     endif ! CS%Leith_Kh
 
     if ((CS%Smagorinsky_Kh) .or. (CS%Smagorinsky_Ah)) then
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         sh_xx_sq = sh_xx(i,j)**2
         sh_xy_sq = 0.25 * ( (sh_xy(I-1,J-1)**2 + sh_xy(I,J)**2) &
                           + (sh_xy(I-1,J)**2 + sh_xy(I,J-1)**2) )
@@ -983,13 +1025,13 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
     endif
 
     if (CS%better_bound_Ah .or. CS%better_bound_Kh) then
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         h_min = min(h_u(I,j), h_u(I-1,j), h_v(i,J), h_v(i,J-1))
         hrat_min(i,j) = min(1.0, h_min / (h(i,j,k) + h_neglect))
       enddo ; enddo
 
       if (CS%better_bound_Kh) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           visc_bound_rem(i,j) = 1.0
         enddo ; enddo
       endif
@@ -1000,28 +1042,28 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! largest value from several parameterizations. Also get
       ! the Laplacian component of str_xx.
 
-      if ((CS%Leith_Kh) .or. (CS%Leith_Ah)) then
+      if ((CS%Leith_Kh) .or. (CS%Leith_Ah) .or. (CS%use_Leithy)) then
         if (CS%use_QG_Leith_visc) then
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             grad_vort = grad_vort_mag_h(i,j) + grad_div_mag_h(i,j)
             grad_vort_qg = 3. * grad_vort_mag_h_2d(i,j)
             vert_vort_mag(i,j) = min(grad_vort, grad_vort_qg)
           enddo ; enddo
         else
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             vert_vort_mag(i,j) = grad_vort_mag_h(i,j) + grad_div_mag_h(i,j)
           enddo ; enddo
         endif
       endif
 
       ! Static (pre-computed) background viscosity
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         Kh(i,j) = CS%Kh_bg_xx(i,j)
       enddo ; enddo
 
       ! NOTE: The following do-block can be decomposed and vectorized after the
       !   stack size has been reduced.
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         if (CS%add_LES_viscosity) then
           if (CS%Smagorinsky_Kh) &
             Kh(i,j) = Kh(i,j) + CS%Laplac2_const_xx(i,j) * Shear_mag(i,j)
@@ -1038,38 +1080,38 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! All viscosity contributions above are subject to resolution scaling
 
       if (rescale_Kh) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Kh(i,j) = VarMix%Res_fn_h(i,j) * Kh(i,j)
         enddo ; enddo
       endif
 
       if (legacy_bound) then
         ! Older method of bounding for stability
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Kh(i,j) = min(Kh(i,j), CS%Kh_Max_xx(i,j))
         enddo ; enddo
       endif
 
       ! Place a floor on the viscosity, if desired.
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         Kh(i,j) = max(Kh(i,j), CS%Kh_bg_min)
       enddo ; enddo
 
       if (use_MEKE_Ku) then
         ! *Add* the MEKE contribution (which might be negative)
         if (CS%res_scale_MEKE) then
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Kh(i,j) = Kh(i,j) + MEKE%Ku(i,j) * VarMix%Res_fn_h(i,j)
           enddo ; enddo
         else
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Kh(i,j) = Kh(i,j) + MEKE%Ku(i,j)
           enddo ; enddo
         endif
       endif
 
       if (CS%anisotropic) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           ! *Add* the tension component of anisotropic viscosity
           Kh(i,j) = Kh(i,j) + CS%Kh_aniso * (1. - CS%n1n2_h(i,j)**2)
         enddo ; enddo
@@ -1077,7 +1119,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
       ! Newer method of bounding for stability
       if (CS%better_bound_Kh) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           if (Kh(i,j) >= hrat_min(i,j) * CS%Kh_Max_xx(i,j)) then
             visc_bound_rem(i,j) = 0.0
             Kh(i,j) = hrat_min(i,j) * CS%Kh_Max_xx(i,j)
@@ -1090,19 +1132,19 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! In Leith+E parameterization Kh is computed after Ah in the biharmonic loop.
       ! The harmonic component of str_xx is added in the biharmonic loop.
       if (CS%use_Leithy) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Kh(i,j) = 0.
         enddo ; enddo
-      end if
+      endif
 
       if (CS%id_Kh_h>0 .or. CS%debug) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Kh_h(i,j,k) = Kh(i,j)
         enddo ; enddo
       endif
 
       if (CS%id_grid_Re_Kh>0) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js,je ; do i=is,ie
           KE = 0.125*((u(I,j,k)+u(I-1,j,k))**2 + (v(i,J,k)+v(i,J-1,k))**2)
           grid_Kh = max(Kh(i,j), CS%min_grid_Kh)
           grid_Re_Kh(i,j,k) = (sqrt(KE) * sqrt(CS%grid_sp_h2(i,j))) / grid_Kh
@@ -1110,13 +1152,13 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       endif
 
       if (CS%id_div_xx_h>0) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
-          div_xx_h(i,j,k) = div_xx(i,j)
+        do j=js,je ; do i=is,ie
+          div_xx_h(i,j,k) = dudx(i,j) + dvdy(i,j)
         enddo ; enddo
       endif
 
       if (CS%id_sh_xx_h>0) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js,je ; do i=is,ie
           sh_xx_h(i,j,k) = sh_xx(i,j)
         enddo ; enddo
       endif
@@ -1143,21 +1185,21 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! Determine the biharmonic viscosity at h points, using the
       ! largest value from several parameterizations. Also get the
       ! biharmonic component of str_xx.
-      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
         Ah(i,j) = CS%Ah_bg_xx(i,j)
       enddo ; enddo
 
       if ((CS%Smagorinsky_Ah) .or. (CS%Leith_Ah) .or. (CS%use_Leithy)) then
         if (CS%Smagorinsky_Ah) then
           if (CS%bound_Coriolis) then
-            do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+           do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
               AhSm = Shear_mag(i,j) * (CS%Biharm_const_xx(i,j) &
                   + CS%Biharm_const2_xx(i,j) * Shear_mag(i,j) &
               )
               Ah(i,j) = max(Ah(i,j), AhSm)
             enddo ; enddo
           else
-            do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+            do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
               AhSm = CS%Biharm_const_xx(i,j) * Shear_mag(i,j)
               Ah(i,j) = max(Ah(i,j), AhSm)
             enddo ; enddo
@@ -1165,7 +1207,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
         endif
 
         if (CS%Leith_Ah) then
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Del2vort_h = 0.25 * ((Del2vort_q(I,J) + Del2vort_q(I-1,J-1)) + &
                                  (Del2vort_q(I-1,J) + Del2vort_q(I,J-1)))
             AhLth = CS%Biharm6_const_xx(i,j) * abs(Del2vort_h) * inv_PI6
@@ -1175,7 +1217,8 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
         if (CS%use_Leithy) then
           ! Get m_leithy
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          if (CS%smooth_Ah) m_leithy(:,:) = 0.0 ! This is here to initialize domain edge halo values.
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Del2vort_h = 0.25 * ((Del2vort_q(I,J) + Del2vort_q(I-1,J-1)) + &
                                  (Del2vort_q(I-1,J) + Del2vort_q(I,J-1)))
             AhLth  = CS%Biharm6_const_xx(i,j) * inv_PI6 * abs(Del2vort_h)
@@ -1189,30 +1232,44 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
               endif
             endif
           enddo ; enddo
-          ! Smooth m_leithy
-          call smooth_x9(CS, G, field_h=m_leithy, zero_land=.true.)
+
+          if (CS%smooth_Ah) then
+            ! Smooth m_leithy.  A single call smoothes twice.
+            call pass_var(m_leithy, G%Domain, halo=2)
+            call smooth_x9_h(G, m_leithy, zero_land=.true.)
+            call pass_var(m_leithy, G%Domain)
+          endif
           ! Get Ah
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Del2vort_h = 0.25 * ((Del2vort_q(I,J) + Del2vort_q(I-1,J-1)) + &
                                  (Del2vort_q(I-1,J) + Del2vort_q(I,J-1)))
             AhLthy = CS%Biharm6_const_xx(i,j) * inv_PI6 * &
                     sqrt(max(0.,Del2vort_h**2 - m_leithy(i,j)*vert_vort_mag_smooth(i,j)**2))
             Ah(i,j) = max(CS%Ah_bg_xx(i,j), AhLthy)
           enddo ; enddo
-          ! Smooth Ah before applying upper bound
-          ! square, then smooth, then square root
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
-            Ah_h(i,j,k) = Ah(i,j)**2
-          enddo ; enddo
-          call smooth_x9(CS, G, field_h=Ah_h(:,:,k))
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
-            Ah_h(i,j,k) = sqrt(Ah_h(i,j,k))
-            Ah(i,j)     = Ah_h(i,j,k)
-          enddo ; enddo
+          if (CS%smooth_Ah) then
+            ! Smooth Ah before applying upper bound.  Square Ah, then smooth, then take its square root.
+            Ah_sq(:,:) = 0.0 ! This is here to initialize domain edge halo values.
+            do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
+              Ah_sq(i,j) = Ah(i,j)**2
+            enddo ; enddo
+            call pass_var(Ah_sq, G%Domain, halo=2)
+            ! A single call smoothes twice.
+            call smooth_x9_h(G, Ah_sq, zero_land=.false.)
+            call pass_var(Ah_sq, G%Domain)
+            do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
+              Ah_h(i,j,k) = max(CS%Ah_bg_xx(i,j), sqrt(max(0., Ah_sq(i,j))))
+              Ah(i,j)     = Ah_h(i,j,k)
+            enddo ; enddo
+          else
+            do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
+              Ah_h(i,j,k) = Ah(i,j)
+            enddo ; enddo
+          endif
         endif
 
         if (CS%bound_Ah .and. .not. CS%better_bound_Ah) then
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Ah(i,j) = min(Ah(i,j), CS%Ah_Max_xx(i,j))
           enddo ; enddo
         endif
@@ -1220,13 +1277,13 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
       if (use_MEKE_Au) then
         ! *Add* the MEKE contribution
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Ah(i,j) = Ah(i,j) + MEKE%Au(i,j)
         enddo ; enddo
       endif
 
       if (CS%Re_Ah > 0.0) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           KE = 0.125*((u(I,j,k)+u(I-1,j,k))**2 + (v(i,J,k)+v(i,J-1,k))**2)
           Ah(i,j) = sqrt(KE) * CS%Re_Ah_const_xx(i,j)
         enddo ; enddo
@@ -1234,18 +1291,18 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
       if (CS%better_bound_Ah) then
         if (CS%better_bound_Kh) then
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Ah(i,j) = min(Ah(i,j), visc_bound_rem(i,j) * hrat_min(i,j) * CS%Ah_Max_xx(i,j))
           enddo ; enddo
         else
-          do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+          do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
             Ah(i,j) = min(Ah(i,j), hrat_min(i,j) * CS%Ah_Max_xx(i,j))
           enddo ; enddo
         endif
       endif
 
-      if ((CS%id_Ah_h>0) .or. CS%debug) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      if ((CS%id_Ah_h>0) .or. CS%debug .or. CS%use_Leithy) then
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
           Ah_h(i,j,k) = Ah(i,j)
         enddo ; enddo
       endif
@@ -1253,14 +1310,14 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       if (CS%use_Leithy) then
         ! Compute Leith+E Kh after bounds have been applied to Ah
         ! and after it has been smoothed. Kh = -m_leithy * Ah
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
-            Kh(i,j) = -m_leithy(i,j) * Ah(i,j)
-            Kh_h(i,j,k) = Kh(i,j)
+        do j=js_Kh,je_Kh ; do i=is_Kh,ie_Kh
+          Kh(i,j) = -m_leithy(i,j) * Ah(i,j)
+          Kh_h(i,j,k) = Kh(i,j)
         enddo ; enddo
       endif
 
       if (CS%id_grid_Re_Ah>0) then
-        do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+        do j=js,je ; do i=is,ie
           KE = 0.125 * ((u(I,j,k) + u(I-1,j,k))**2 + (v(i,J,k) + v(i,J-1,k))**2)
           grid_Ah = max(Ah(i,j), CS%min_grid_Ah)
           grid_Re_Ah(i,j,k) = (sqrt(KE) * CS%grid_sp_h3(i,j)) / grid_Ah
@@ -1462,7 +1519,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
         ! Leith+E doesn't recompute Kh at q points, it just interpolates it from h to q points
         if (CS%use_Leithy) then
-          Kh(I,J) = Kh_h(i+1,j+1,k)
+          Kh(I,J) = 0.25 * ((Kh_h(i,j,k) + Kh_h(i+1,j+1,k)) + (Kh_h(i,j+1,k) + Kh_h(i+1,j,k)))
         end if
 
         if (CS%id_Kh_q>0 .or. CS%debug) &
@@ -1569,7 +1626,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
       ! Leith+E doesn't recompute Ah at q points, it just interpolates it from h to q points
       if (CS%use_Leithy) then
         do J=js-1,Jeq ; do I=is-1,Ieq
-           Ah(I,J) = Ah_h(i+1,j+1,k)
+          Ah(I,J) = 0.25 * ((Ah_h(i,j,k) + Ah_h(i+1,j+1,k)) + (Ah_h(i,j+1,k) + Ah_h(i+1,j,k)))
         enddo ; enddo
       end if
 
@@ -1633,7 +1690,7 @@ subroutine horizontal_viscosity(u, v, h, diffu, diffv, MEKE, VarMix, G, GV, US,
 
     else ! .not. use_GME
       ! This changes the units of str_xx from [L2 T-2 ~> m2 s-2] to [H L2 T-2 ~> m3 s-2 or kg s-2].
-      do J=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+      do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
         str_xx(i,j) = str_xx(i,j) * (h(i,j,k) * CS%reduction_xx(i,j))
       enddo ; enddo
 
@@ -2185,7 +2242,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
   if (.not.CS%Laplacian) CS%use_Kh_bg_2d = .false.
   call get_param(param_file, mdl, "KH_BG_2D_BUG", CS%Kh_bg_2d_bug, &
                  "If true, retain an answer-changing horizontal indexing bug in setting "//&
-                 "the corner-point viscosities when USE_KH_BG_2D=True. This is"//&
+                 "the corner-point viscosities when USE_KH_BG_2D=True.  This is "//&
                  "not recommended.", default=.false., do_not_log=.not.CS%use_Kh_bg_2d)
 
   call get_param(param_file, mdl, "USE_GME", CS%use_GME, &
@@ -2195,13 +2252,17 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
                  "Use the split time stepping if true.", default=.true., do_not_log=.true.)
   if (CS%use_Leithy) then
     if (.not.(CS%biharmonic .and. CS%Laplacian)) then
-                   call MOM_error(FATAL, "MOM_hor_visc.F90, hor_visc_init:"//&
+                   call MOM_error(FATAL, "MOM_hor_visc.F90, hor_visc_init: "//&
                    "LAPLACIAN and BIHARMONIC must both be True when USE_LEITHY=True.")
     endif
-    call get_param(param_file, mdl, "LEITHY_CK", CS%c_K, &
-                   "Fraction of biharmonic dissipation that gets backscattered, "//&
-                   "in Leith+E.", units="nondim", default=1.0)
   endif
+  call get_param(param_file, mdl, "LEITHY_CK", CS%c_K, &
+                 "Fraction of biharmonic dissipation that gets backscattered, "//&
+                 "in Leith+E.", units="nondim", default=1.0, do_not_log=.not.CS%use_Leithy)
+  call get_param(param_file, mdl, "SMOOTH_AH", CS%smooth_Ah, &
+                 "If true, Ah and m_leithy are smoothed within Leith+E.  This requires "//&
+                 "lots of blocking communications, which can be expensive", &
+                 default=.true., do_not_log=.not.CS%use_Leithy)
 
   if (CS%use_GME .and. .not.split) call MOM_error(FATAL,"ERROR: Currently, USE_GME = True "// &
                                            "cannot be used with SPLIT=False.")
@@ -2338,7 +2399,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
     CS%dx2q(I,J) = G%dxBu(I,J)*G%dxBu(I,J) ; CS%dy2q(I,J) = G%dyBu(I,J)*G%dyBu(I,J)
     CS%DX_dyBu(I,J) = G%dxBu(I,J)*G%IdyBu(I,J) ; CS%DY_dxBu(I,J) = G%dyBu(I,J)*G%IdxBu(I,J)
   enddo ; enddo
-  do j=Jsq-1,Jeq+2 ; do i=Isq-1,Ieq+2
+  do j=js-2,Jeq+2 ; do i=is-2,Ieq+2
     CS%dx2h(i,j) = G%dxT(i,j)*G%dxT(i,j) ; CS%dy2h(i,j) = G%dyT(i,j)*G%dyT(i,j)
     CS%DX_dyT(i,j) = G%dxT(i,j)*G%IdyT(i,j) ; CS%DY_dxT(i,j) = G%dyT(i,j)*G%IdxT(i,j)
   enddo ; enddo
@@ -2379,7 +2440,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
     ! Calculate and store the background viscosity at h-points
 
     min_grid_sp_h2 = huge(1.)
-    do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+    do j=js-1,Jeq+1 ; do i=is-1,Ieq+1
       ! Static factors in the Smagorinsky and Leith schemes
       grid_sp_h2 = (2.0*CS%dx2h(i,j)*CS%dy2h(i,j)) / (CS%dx2h(i,j) + CS%dy2h(i,j))
       CS%grid_sp_h2(i,j) = grid_sp_h2
@@ -2438,11 +2499,11 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
     enddo ; enddo
   endif
   if (CS%biharmonic) then
-    do j=js-1,Jeq+1 ; do I=Isq-1,Ieq+1
+    do j=js-1,Jeq+1 ; do I=is-2,Ieq+1
       CS%Idx2dyCu(I,j) = (G%IdxCu(I,j)*G%IdxCu(I,j)) * G%IdyCu(I,j)
       CS%Idxdy2u(I,j) = G%IdxCu(I,j) * (G%IdyCu(I,j)*G%IdyCu(I,j))
     enddo ; enddo
-    do J=Jsq-1,Jeq+1 ; do i=is-1,Ieq+1
+    do J=js-2,Jeq+1 ; do i=is-1,Ieq+1
       CS%Idx2dyCv(i,J) = (G%IdxCv(i,J)*G%IdxCv(i,J)) * G%IdyCv(i,J)
       CS%Idxdy2v(i,J) = G%IdxCv(i,J) * (G%IdyCv(i,J)*G%IdyCv(i,J))
     enddo ; enddo
@@ -2454,7 +2515,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
       BoundCorConst = 1.0 / (5.0*(bound_Cor_vel*bound_Cor_vel))
 
     min_grid_sp_h4 = huge(1.)
-    do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+    do j=js-1,Jeq+1 ; do i=is-1,Ieq+1
       grid_sp_h2 = (2.0*CS%dx2h(i,j)*CS%dy2h(i,j)) / (CS%dx2h(i,j)+CS%dy2h(i,j))
       grid_sp_h3 = grid_sp_h2*sqrt(grid_sp_h2)
       CS%grid_sp_h3(i,j) = grid_sp_h3
@@ -2512,7 +2573,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
   endif
   ! The Laplacian bounds should avoid overshoots when CS%bound_coef < 1.
   if (CS%Laplacian .and. CS%better_bound_Kh) then
-    do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+    do j=js-1,Jeq+1 ; do i=is-1,Ieq+1
       denom = max( &
          (CS%dy2h(i,j) * CS%DY_dxT(i,j) * (G%IdyCu(I,j) + G%IdyCu(I-1,j)) * &
           max(G%IdyCu(I,j)*G%IareaCu(I,j), G%IdyCu(I-1,j)*G%IareaCu(I-1,j)) ), &
@@ -2540,7 +2601,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
   ! The biharmonic bounds should avoid overshoots when CS%bound_coef < 0.5, but
   ! empirically work for CS%bound_coef <~ 1.0
   if (CS%biharmonic .and. CS%better_bound_Ah) then
-    do j=js-1,Jeq+1 ; do I=Isq-1,Ieq+1
+    do j=js-1,Jeq+1 ; do I=is-2,Ieq+1
       u0u(I,j) = (CS%Idxdy2u(I,j)*(CS%dy2h(i+1,j)*CS%DY_dxT(i+1,j)*(G%IdyCu(I+1,j) + G%IdyCu(I,j))   + &
                                    CS%dy2h(i,j) * CS%DY_dxT(i,j) * (G%IdyCu(I,j) + G%IdyCu(I-1,j)) ) + &
                  CS%Idx2dyCu(I,j)*(CS%dx2q(I,J) * CS%DX_dyBu(I,J) * (G%IdxCu(I,j+1) + G%IdxCu(I,j)) + &
@@ -2550,7 +2611,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
                  CS%Idx2dyCu(I,j)*(CS%dx2q(I,J) * CS%DY_dxBu(I,J) * (G%IdyCv(i+1,J) + G%IdyCv(i,J))   + &
                                    CS%dx2q(I,J-1)*CS%DY_dxBu(I,J-1)*(G%IdyCv(i+1,J-1) + G%IdyCv(i,J-1)) ) )
     enddo ; enddo
-    do J=Jsq-1,Jeq+1 ; do i=is-1,Ieq+1
+    do J=js-2,Jeq+1 ; do i=is-1,Ieq+1
       v0u(i,J) = (CS%Idxdy2v(i,J)*(CS%dy2q(I,J) * CS%DX_dyBu(I,J) * (G%IdxCu(I,j+1) + G%IdxCu(I,j))       + &
                                    CS%dy2q(I-1,J)*CS%DX_dyBu(I-1,J)*(G%IdxCu(I-1,j+1) + G%IdxCu(I-1,j)) ) + &
                  CS%Idx2dyCv(i,J)*(CS%dx2h(i,j+1)*CS%DY_dxT(i,j+1)*(G%IdyCu(I,j+1) + G%IdyCu(I-1,j+1))   + &
@@ -2560,7 +2621,7 @@ subroutine hor_visc_init(Time, G, GV, US, param_file, diag, CS, ADp)
                  CS%Idx2dyCv(i,J)*(CS%dx2h(i,j+1)*CS%DX_dyT(i,j+1)*(G%IdxCv(i,J+1) + G%IdxCv(i,J))   + &
                                    CS%dx2h(i,j) * CS%DX_dyT(i,j) * (G%IdxCv(i,J) + G%IdxCv(i,J-1)) ) )
     enddo ; enddo
-    do j=Jsq,Jeq+1 ; do i=Isq,Ieq+1
+    do j=js-1,Jeq+1 ; do i=is-1,Ieq+1
       denom = max( &
          (CS%dy2h(i,j) * &
           (CS%DY_dxT(i,j)*(G%IdyCu(I,j)*u0u(I,j) + G%IdyCu(I-1,j)*u0u(I-1,j))  + &
@@ -2839,112 +2900,113 @@ subroutine smooth_GME(CS, G, GME_flux_h, GME_flux_q)
   enddo ! s-loop
 end subroutine smooth_GME
 
-!> Apply a 9-point smoothing filter twice to reduce horizontal two-grid-point noise
-!! Note that this subroutine does not conserve mass or angular momentum, so don't use it
-!! in situations where you need conservation. Also can't apply it to Ah and Kh in the
-!! horizontal_viscosity subroutine because they are not supposed to be halo-updated.
-!! But you _can_ apply them to Kh_h and Ah_h.
-subroutine smooth_x9(CS, G, field_h, field_u, field_v, field_q, zero_land)
-  type(hor_visc_CS),                            intent(in)    :: CS        !< Control structure
-  type(ocean_grid_type),                        intent(in)    :: G         !< Ocean grid
-  real, dimension(SZI_(G),SZJ_(G)), optional,   intent(inout) :: field_h   !< field to be smoothed
-                                                              !! at h points
-  real, dimension(SZIB_(G),SZJ_(G)), optional,  intent(inout) :: field_u   !< field to be smoothed
-                                                              !! at u points
-  real, dimension(SZI_(G),SZJB_(G)), optional,  intent(inout) :: field_v   !< field to be smoothed
-                                                              !! at v points
-  real, dimension(SZIB_(G),SZJB_(G)), optional, intent(inout) :: field_q   !< field to be smoothed
-                                                              !! at q points
-  logical, optional, intent(in)                               :: zero_land !< An optional argument
-                                                              !! indicating whether to set values
-                                                              !! on land to zero (.true.) or
-                                                              !! whether to ignore land values
-                                                              !! (.false. or not present)
-  ! local variables. It would be good to make the _original variables allocatable.
-  real, dimension(SZI_(G),SZJ_(G))   :: field_h_original
-  real, dimension(SZIB_(G),SZJ_(G))  :: field_u_original
-  real, dimension(SZI_(G),SZJB_(G))  :: field_v_original
-  real, dimension(SZIB_(G),SZJB_(G)) :: field_q_original
-  real, dimension(3,3) :: weights, local_weights ! averaging weights for smoothing, nondimensional
-  logical :: zero_land_val ! actual value of zero_land optional argument
-  integer :: i, j, s
-  integer :: is, ie, js, je, Isq, Ieq, Jsq, Jeq
+!> Apply a 9-point smoothing filter twice to a field staggered at a thickness point to reduce
+!! horizontal two-grid-point noise.
+!! Note that this subroutine does not conserve mass, so don't use it in situations where you
+!! need conservation.  Also note that it assumes that the input field has valid values in the
+!! first two halo points upon entry.
+subroutine smooth_x9_h(G, field_h, zero_land)
+  type(ocean_grid_type),            intent(in)    :: G         !< Ocean grid
+  real, dimension(SZI_(G),SZJ_(G)), intent(inout) :: field_h   !< h-point field to be smoothed [arbitrary]
+  logical,                optional, intent(in)    :: zero_land !< If present and false, return the average
+                                                               !! of the surrounding ocean points when
+                                                               !! smoothing, otherwise use a value of 0 for
+                                                               !! land points and include them in the averages.
 
+  ! Local variables
+  real :: fh_prev(SZI_(G),SZJ_(G))  ! The value of the h-point field at the previous iteration [arbitrary]
+  real :: Iwts             ! The inverse of the sum of the weights [nondim]
+  logical :: zero_land_val ! The value of the zero_land optional argument or .true. if it is absent.
+  integer :: i, j, s, is, ie, js, je
 
   is  = G%isc  ; ie  = G%iec  ; js  = G%jsc  ; je  = G%jec
-  Isq = G%IscB ; Ieq = G%IecB ; Jsq = G%JscB ; Jeq = G%JecB
 
-  weights = reshape([1., 2., 1., 2., 4., 2., 1., 2., 1.],shape(weights))/16.
+  zero_land_val = .true. ; if (present(zero_land)) zero_land_val = zero_land
+
+  do s=1,0,-1
+    fh_prev(:,:) = field_h(:,:)
+    ! apply smoothing on field_h using rotationally symmetric expressions.
+    do j=js-s,je+s ; do i=is-s,ie+s ; if (G%mask2dT(i,j) > 0.0) then
+      Iwts = 0.0625
+      if (.not. zero_land_val) &
+        Iwts = 1.0 / ( (4.0*G%mask2dT(i,j) + &
+                        ( 2.0*((G%mask2dT(i-1,j) + G%mask2dT(i+1,j)) + &
+                               (G%mask2dT(i,j-1) + G%mask2dT(i,j+1))) + &
+                         ((G%mask2dT(i-1,j-1) + G%mask2dT(i+1,j+1)) + &
+                          (G%mask2dT(i-1,j+1) + G%mask2dT(i+1,j-1))) ) ) + 1.0e-16 )
+      field_h(i,j) = Iwts * ( 4.0*G%mask2dT(i,j) * fh_prev(i,j) &
+                            + (2.0*((G%mask2dT(i-1,j) * fh_prev(i-1,j) + G%mask2dT(i+1,j) * fh_prev(i+1,j)) + &
+                                    (G%mask2dT(i,j-1) * fh_prev(i,j-1) + G%mask2dT(i,j+1) * fh_prev(i,j+1))) &
+                              + ((G%mask2dT(i-1,j-1) * fh_prev(i-1,j-1) + G%mask2dT(i+1,j+1) * fh_prev(i+1,j+1)) + &
+                                 (G%mask2dT(i-1,j+1) * fh_prev(i-1,j+1) + G%mask2dT(i+1,j-1) * fh_prev(i-1,j-1))) ))
+    endif ; enddo ; enddo
+  enddo
+
+end subroutine smooth_x9_h
+
+!> Apply a 9-point smoothing filter twice to a pair of velocity components to reduce
+!! horizontal two-grid-point noise.
+!! Note that this subroutine does not conserve angular momentum, so don't use it
+!! in situations where you need conservation.  Also note that it assumes that the
+!! input fields have valid values in the first two halo points upon entry.
+subroutine smooth_x9_uv(G, field_u, field_v, zero_land)
+  type(ocean_grid_type),             intent(in)    :: G         !< Ocean grid
+  real, dimension(SZIB_(G),SZJ_(G)), intent(inout) :: field_u   !< u-point field to be smoothed[arbitrary]
+  real, dimension(SZI_(G),SZJB_(G)), intent(inout) :: field_v   !< v-point field to be smoothed [arbitrary]
+  logical,                 optional, intent(in)    :: zero_land !< If present and false, return the average
+                                                                !! of the surrounding ocean points when
+                                                                !! smoothing, otherwise use a value of 0 for
+                                                                !! land points and include them in the averages.
+
+  ! Local variables.
+  real :: fu_prev(SZIB_(G),SZJ_(G))  ! The value of the u-point field at the previous iteration [arbitrary]
+  real :: fv_prev(SZI_(G),SZJB_(G))  ! The value of the v-point field at the previous iteration [arbitrary]
+  real :: Iwts             ! The inverse of the sum of the weights [nondim]
+  logical :: zero_land_val ! The value of the zero_land optional argument or .true. if it is absent.
+  integer :: i, j, s, is, ie, js, je, Isq, Ieq, Jsq, Jeq
 
-  if (present(zero_land)) then
-    zero_land_val = zero_land
-  else
-    zero_land_val = .false.
-  endif
-
-  if (present(field_h)) then
-    call pass_var(field_h, G%Domain, halo=2) ! Halo size 2 ensures that you can smooth twice
-    do s=1,0,-1
-      field_h_original(:,:) = field_h(:,:)
-      ! apply smoothing on field_h
-      do j=js-s,je+s ; do i=is-s,ie+s
-        ! skip land points
-        if (G%mask2dT(i,j)==0.) cycle
-        ! compute local weights
-        local_weights = weights*G%mask2dT(i-1:i+1,j-1:j+1)
-        if (zero_land_val) local_weights = local_weights/(sum(local_weights) + 1.E-16)
-        field_h(i,j) =  sum(local_weights*field_h_original(i-1:i+1,j-1:j+1))
-      enddo ; enddo
-    enddo
-    call pass_var(field_h, G%Domain)
-  endif
-
-  if (present(field_u)) then
-    call pass_vector(field_u, field_v, G%Domain, halo=2)
-    do s=1,0,-1
-      field_u_original(:,:) = field_u(:,:)
-      ! apply smoothing on field_u
-      do j=js-s,je+s ; do I=Isq-s,Ieq+s
-        ! skip land points
-        if (G%mask2dCu(I,j)==0.) cycle
-        ! compute local weights
-        local_weights = weights*G%mask2dCu(I-1:I+1,j-1:j+1)
-        if (zero_land_val) local_weights = local_weights/(sum(local_weights) + 1.E-16)
-        field_u(I,j) =  sum(local_weights*field_u_original(I-1:I+1,j-1:j+1))
-      enddo ; enddo
-
-      field_v_original(:,:) = field_v(:,:)
-      ! apply smoothing on field_v
-      do J=Jsq-s,Jeq+s ; do i=is-s,ie+s
-        ! skip land points
-        if (G%mask2dCv(i,J)==0.) cycle
-        ! compute local weights
-        local_weights = weights*G%mask2dCv(i-1:i+1,J-1:J+1)
-        if (zero_land_val) local_weights = local_weights/(sum(local_weights) + 1.E-16)
-        field_v(i,J) =  sum(local_weights*field_v_original(i-1:i+1,J-1:J+1))
-      enddo ; enddo
-    enddo
-    call pass_vector(field_u, field_v, G%Domain)
-  endif
-
-  if (present(field_q)) then
-    call pass_var(field_q, G%Domain, halo=2, position=CORNER)
-    do s=1,0,-1
-      field_q_original(:,:) = field_q(:,:)
-      ! apply smoothing on field_q
-      do J=Jsq-s,Jeq+s ; do I=Isq-s,Ieq+s
-        ! skip land points
-        if (G%mask2dBu(I,J)==0.) cycle
-        ! compute local weights
-        local_weights = weights*G%mask2dBu(I-1:I+1,J-1:J+1)
-        if (zero_land_val) local_weights = local_weights/(sum(local_weights) + 1.E-16)
-        field_q(I,J) =  sum(local_weights*field_q_original(I-1:I+1,J-1:J+1))
-      enddo ; enddo
-    enddo
-    call pass_var(field_q, G%Domain, position=CORNER)
-  endif
+  is  = G%isc  ; ie  = G%iec  ; js  = G%jsc  ; je  = G%jec
+  Isq = G%IscB ; Ieq = G%IecB ; Jsq = G%JscB ; Jeq = G%JecB
 
-end subroutine smooth_x9
+  zero_land_val = .true. ; if (present(zero_land)) zero_land_val = zero_land
+
+  do s=1,0,-1
+    fu_prev(:,:) = field_u(:,:)
+    ! apply smoothing on field_u using the original non-rotationally symmetric expressions.
+    do j=js-s,je+s ; do I=Isq-s,Ieq+s ; if (G%mask2dCu(I,j) > 0.0) then
+      Iwts = 0.0625
+      if (.not. zero_land_val) &
+        Iwts = 1.0 / ( (4.0*G%mask2dCu(I,j) + &
+                        ( 2.0*((G%mask2dCu(I-1,j) + G%mask2dCu(I+1,j)) + &
+                               (G%mask2dCu(I,j-1) + G%mask2dCu(I,j+1))) + &
+                         ((G%mask2dCu(I-1,j-1) + G%mask2dCu(I+1,j+1)) + &
+                          (G%mask2dCu(I-1,j+1) + G%mask2dCu(I+1,j-1))) ) ) + 1.0e-16 )
+      field_u(I,j) = Iwts * ( 4.0*G%mask2dCu(I,j) * fu_prev(I,j) &
+                            + (2.0*((G%mask2dCu(I-1,j) * fu_prev(I-1,j) + G%mask2dCu(I+1,j) * fu_prev(I+1,j)) + &
+                                    (G%mask2dCu(I,j-1) * fu_prev(I,j-1) + G%mask2dCu(I,j+1) * fu_prev(I,j+1))) &
+                              + ((G%mask2dCu(I-1,j-1) * fu_prev(I-1,j-1) + G%mask2dCu(I+1,j+1) * fu_prev(I+1,j+1)) + &
+                                 (G%mask2dCu(I-1,j+1) * fu_prev(I-1,j+1) + G%mask2dCu(I+1,j-1) * fu_prev(I-1,j-1))) ))
+    endif ; enddo ; enddo
+
+    fv_prev(:,:) = field_v(:,:)
+    ! apply smoothing on field_v using the original non-rotationally symmetric expressions.
+    do J=Jsq-s,Jeq+s ; do i=is-s,ie+s ; if (G%mask2dCv(i,J) > 0.0) then
+      Iwts = 0.0625
+      if (.not. zero_land_val) &
+        Iwts = 1.0 / ( (4.0*G%mask2dCv(i,J) + &
+                        ( 2.0*((G%mask2dCv(i-1,J) + G%mask2dCv(i+1,J)) + &
+                               (G%mask2dCv(i,J-1) + G%mask2dCv(i,J+1))) + &
+                         ((G%mask2dCv(i-1,J-1) + G%mask2dCv(i+1,J+1)) + &
+                          (G%mask2dCv(i-1,J+1) + G%mask2dCv(i+1,J-1))) ) ) + 1.0e-16 )
+      field_v(i,J) = Iwts * ( 4.0*G%mask2dCv(i,J) * fv_prev(i,J) &
+                            + (2.0*((G%mask2dCv(i-1,J) * fv_prev(i-1,J) + G%mask2dCv(i+1,J) * fv_prev(i+1,J)) + &
+                                    (G%mask2dCv(i,J-1) * fv_prev(i,J-1) + G%mask2dCv(i,J+1) * fv_prev(i,J+1))) &
+                              + ((G%mask2dCv(i-1,J-1) * fv_prev(i-1,J-1) + G%mask2dCv(i+1,J+1) * fv_prev(i+1,J+1)) + &
+                                 (G%mask2dCv(i-1,J+1) * fv_prev(i-1,J+1) + G%mask2dCv(i+1,J-1) * fv_prev(i-1,J-1))) ))
+    endif ; enddo ; enddo
+  enddo
+
+end subroutine smooth_x9_uv
 
 !> Deallocates any variables allocated in hor_visc_init.
 subroutine hor_visc_end(CS)