diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..700707c
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 2386f1a..89c916e 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -27,12 +27,12 @@ jobs:
           - os: macOS-latest
             arch: x86
     steps:
-      - uses: actions/checkout@v2
-      - uses: julia-actions/setup-julia@v1
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
         env:
           cache-name: cache-artifacts
         with:
@@ -45,6 +45,6 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v1
+      - uses: codecov/codecov-action@v3
         with:
           file: lcov.info
diff --git a/.github/workflows/Invalidations.yml b/.github/workflows/Invalidations.yml
index 4d0004e..66c86a3 100644
--- a/.github/workflows/Invalidations.yml
+++ b/.github/workflows/Invalidations.yml
@@ -16,15 +16,15 @@ jobs:
     if: github.base_ref == github.event.repository.default_branch
     runs-on: ubuntu-latest
     steps:
-    - uses: julia-actions/setup-julia@v1
+    - uses: julia-actions/setup-julia@v2
       with:
         version: '1'
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - uses: julia-actions/julia-buildpkg@v1
     - uses: julia-actions/julia-invalidations@v1
       id: invs_pr
 
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         ref: ${{ github.event.repository.default_branch }}
     - uses: julia-actions/julia-buildpkg@v1
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba39cc5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Manifest.toml
diff --git a/Project.toml b/Project.toml
index 9791ee2..6a2c5a9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,19 +1,19 @@
 name = "SortingAlgorithms"
 uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
-version = "1.0.1"
+version = "1.2.1"
 
 [deps]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 
 [compat]
-julia = "1"
 DataStructures = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18"
-StatsBase = "0.33"
+julia = "1"
 
 [extras]
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Random", "StatsBase", "Test"]
+test = ["Aqua", "Random", "StatsBase", "Test"]
diff --git a/README.md b/README.md
index d0d69e9..17067d7 100644
--- a/README.md
+++ b/README.md
@@ -1,126 +1,90 @@
 # Sorting Algorithms
 
-  [![Build status](https://github.com/JuliaLang/SortingAlgorithms.jl/workflows/CI/badge.svg)](https://github.com/JuliaLang/SortingAlgorithms.jl/actions?query=workflow%3ACI+branch%3Amaster)
+[![Build status](https://github.com/JuliaLang/SortingAlgorithms.jl/workflows/CI/badge.svg)](https://github.com/JuliaLang/SortingAlgorithms.jl/actions?query=workflow%3ACI+branch%3Amaster)
 [![Coverage Status](https://coveralls.io/repos/JuliaLang/SortingAlgorithms.jl/badge.svg)](https://coveralls.io/r/JuliaLang/SortingAlgorithms.jl)
+[![deps](https://juliahub.com/docs/SortingAlgorithms/deps.svg)](https://juliahub.com/ui/Packages/SortingAlgorithms/6dCmw?t=2)
 
-The `SortingAlgorithms` package provides three sorting algorithms that can be used with Julia's [standard sorting API](https://docs.julialang.org/en/v1/base/sort/):
+The `SortingAlgorithms` package provides four sorting algorithms that can be used with Julia's [standard sorting API](https://docs.julialang.org/en/v1/base/sort/):
 
 - [HeapSort] – an unstable, general purpose, in-place, O(n log n) comparison sort that works by heapifying an array and repeatedly taking the maximal element from the heap.
 - [TimSort] – a stable, general purpose, hybrid, O(n log n) comparison sort that adapts to different common patterns of partially ordered input data.
-- [RadixSort] – a stable, special case, O(n) non-comparison sort that works by sorting data with fixed size, one digit at a time.
+- [CombSort] – an unstable, general purpose, in-place, O(n log n) comparison sort with O(n^2) pathological cases that can attain good efficiency through SIMD instructions and instruction level parallelism on modern hardware.
+- [PagedMergeSort] – a stable, general purpose, O(n log n) time and O(sqrt n) space comparison sort.
 
-[HeapSort]:  http://en.wikipedia.org/wiki/Heapsort
-[TimSort]:   http://en.wikipedia.org/wiki/Timsort
-[RadixSort]: http://en.wikipedia.org/wiki/Radix_sort
+[HeapSort]: https://en.wikipedia.org/wiki/Heapsort
+[TimSort]:  https://en.wikipedia.org/wiki/Timsort
+[CombSort]: https://en.wikipedia.org/wiki/Comb_sort
+[PagedMergeSort]: https://link.springer.com/chapter/10.1007/BFb0016253
 
 ## Usage
 
 ```jl
-	julia> using SortingAlgorithms
+julia> using SortingAlgorithms
 
-	julia> words = map(chomp,[readlines(open("/usr/share/dict/words"))...])
-	235886-element Array{ASCIIString,1}:
-	 "A"
-	 "a"
-	 "aa"
-	 "aal"
-	 "aalii"
-	 ⋮
-	 "zythem"
-	 "Zythia"
-	 "zythum"
-	 "Zyzomys"
-	 "Zyzzogeton"
+julia> words = map(chomp,[readlines(open("/usr/share/dict/words"))...])
+235886-element Array{ASCIIString,1}:
+ "A"
+ "a"
+ "aa"
+ ⋮
+ "zythum"
+ "Zyzomys"
+ "Zyzzogeton"
 
-	julia> sort!(words, alg=TimSort)
-	235886-element Array{ASCIIString,1}:
-	 "A"
-	 "Aani"
-	 "Aaron"
-	 "Aaronic"
-	 "Aaronical"
-	 ⋮
-	 "zymotize"
-	 "zymotoxic"
-	 "zymurgy"
-	 "zythem"
-	 "zythum"
+julia> sort!(words, alg=TimSort)
+235886-element Array{ASCIIString,1}:
+ "A"
+ "Aani"
+ "Aaron"
+ ⋮
+ "zymurgy"
+ "zythem"
+ "zythum"
 
-	julia> sort!(words, alg=TimSort, by=length)
-	235886-element Array{ASCIIString,1}:
-	 "A"
-	 "B"
-	 "C"
-	 "D"
-	 "E"
-	 ⋮
-	 "formaldehydesulphoxylate"
-	 "pathologicopsychological"
-	 "scientificophilosophical"
-	 "tetraiodophenolphthalein"
-	 "thyroparathyroidectomize"
+julia> sort!(words, alg=TimSort, by=length)
+235886-element Array{ASCIIString,1}:
+ "A"
+ "B"
+ "C"
+ ⋮
+ "scientificophilosophical"
+ "tetraiodophenolphthalein"
+ "thyroparathyroidectomize"
 
-	julia> sort!(words, alg=HeapSort)
-	235886-element Array{ASCIIString,1}:
-	 "A"
-	 "Aani"
-	 "Aaron"
-	 "Aaronic"
-	 "Aaronical"
-	 ⋮
-	 "zymotize"
-	 "zymotoxic"
-	 "zymurgy"
-	 "zythem"
-	 "zythum"
+julia> sort!(words, alg=HeapSort)
+235886-element Array{ASCIIString,1}:
+ "A"
+ "Aani"
+ "Aaron"
+ ⋮
+ "zymurgy"
+ "zythem"
+ "zythum"
 
-	julia> sort!(words, alg=HeapSort, by=length)
-	235886-element Array{ASCIIString,1}:
-	 "L"
-	 "p"
-	 "U"
-	 "I"
-	 "q"
-	 ⋮
-	 "pathologicopsychological"
-	 "formaldehydesulphoxylate"
-	 "scientificophilosophical"
-	 "tetraiodophenolphthalein"
-	 "thyroparathyroidectomize"
+julia> sort!(words, alg=HeapSort, by=length)
+235886-element Array{ASCIIString,1}:
+ "L"
+ "p"
+ "U"
+ ⋮
+ "scientificophilosophical"
+ "tetraiodophenolphthalein"
+ "thyroparathyroidectomize"
 
-	julia> sort!(words, alg=RadixSort)
-	ERROR: Radix sort only sorts bits types (got ASCIIString)
-	 in error at error.jl:21
-	 in sort! at /Users/stefan/.julia/SortingAlgorithms/src/SortingAlgorithms.jl:54
-	 in sort! at sort.jl:328
-	 in sort! at sort.jl:329
-
-	julia> floats = randn(1000)
-	1000-element Array{Float64,1}:
-	  1.729
-	  0.907196
-	  0.461481
-	 -0.204763
-	 -0.16022
-	  ⋮
-	  0.700683
-	 -0.236204
-	 -2.15634
-	 -0.316188
-	 -0.171478
-
-	julia> sort!(floats, alg=RadixSort)
-	1000-element Array{Float64,1}:
-	 -2.86255
-	 -2.72041
-	 -2.58234
-	 -2.57259
-	 -2.53046
-	  ⋮
-	  3.08307
-	  3.12902
-	  3.15075
-	  3.20058
-	  3.23942
+julia> sort!(randn(1000), alg=CombSort)
+1000-element Array{Float64,1}:
+ -2.86255
+ -2.72041
+ -2.58234
+  ⋮
+  3.15075
+  3.20058
+  3.23942
 ```
 
+## Other packages that provide sorting algorithms
+
+While SortingAlgorithms.jl is the most widely used sorting package in the Julia ecosystem, other packages are available:
+- https://github.com/xiaodaigh/SortingLab.jl
+- https://github.com/JeffreySarnoff/SortingNetworks.jl
+- https://github.com/nlw0/ChipSort.jl
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..69cb760
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1 @@
+comment: false
diff --git a/docs/pagedMerge_130_130.gif b/docs/pagedMerge_130_130.gif
new file mode 100644
index 0000000..6d8d310
Binary files /dev/null and b/docs/pagedMerge_130_130.gif differ
diff --git a/src/SortingAlgorithms.jl b/src/SortingAlgorithms.jl
index 3c54ba8..f6e1a8a 100644
--- a/src/SortingAlgorithms.jl
+++ b/src/SortingAlgorithms.jl
@@ -9,16 +9,77 @@ using Base.Order
 import Base.Sort: sort!
 import DataStructures: heapify!, percolate_down!
 
-export HeapSort, TimSort, RadixSort
+export HeapSort, TimSort, RadixSort, CombSort, PagedMergeSort
 
 struct HeapSortAlg  <: Algorithm end
 struct TimSortAlg   <: Algorithm end
 struct RadixSortAlg <: Algorithm end
+struct CombSortAlg  <: Algorithm end
+struct PagedMergeSortAlg  <: Algorithm end
 
-const HeapSort  = HeapSortAlg()
-const TimSort   = TimSortAlg()
+function maybe_optimize(x::Algorithm)
+    isdefined(Base.Sort, :InitialOptimizations) ? Base.Sort.InitialOptimizations(x) : x
+end
+const HeapSort  = maybe_optimize(HeapSortAlg())
+const TimSort   = maybe_optimize(TimSortAlg())
+# Whenever InitialOptimizations is defined, RadixSort falls
+# back to Base.DEFAULT_STABLE which already includes them.
 const RadixSort = RadixSortAlg()
 
+"""
+    CombSort
+
+Indicates that a sorting function should use the comb sort
+algorithm. Comb sort traverses the collection multiple times
+ordering pairs of elements with a given interval between them.
+The interval decreases exponentially until it becomes 1, then
+it switches to insertion sort on the whole input.
+
+Characteristics:
+ - *not stable* does not preserve the ordering of elements which
+   compare equal (e.g. "a" and "A" in a sort of letters which
+   ignores case).
+ - *in-place* in memory.
+ - *parallelizable* suitable for vectorization with SIMD instructions
+   because it performs many independent comparisons.
+ - *pathological inputs* such as `repeat(1:5.0, 4^8)` can make this algorithm perform very poorly.
+ - *`n log n` average runtime* measured for random inputs of length up to 100 million, but theoretical runtime of `Θ(n^2)` for extremely long inputs.
+
+## References
+- Dobosiewicz, Wlodzimierz, (1980). "An efficient variation of bubble sort", Information Processing Letters, 11(1), pp. 5-6, https://doi.org/10.1016/0020-0190(80)90022-8.
+ - Werneck, N. L., (2020). "ChipSort: a SIMD and cache-aware sorting module. JuliaCon Proceedings, 1(1), 12, https://doi.org/10.21105/jcon.00012
+ - H. Inoue, T. Moriyama, H. Komatsu and T. Nakatani, "AA-Sort: A New Parallel Sorting Algorithm for Multi-Core SIMD Processors," 16th International Conference on Parallel Architecture and Compilation Techniques (PACT 2007), 2007, pp. 189-198, doi: 10.1109/PACT.2007.4336211.
+"""
+const CombSort  = maybe_optimize(CombSortAlg())
+
+"""
+    PagedMergeSort
+
+Indicates that a sorting function should use the paged merge sort
+algorithm. Paged merge sort uses is a merge sort, that uses different
+merge routines to achieve stable sorting with a scratch space of size O(√n).
+The merge routine for merging large subarrays merges
+pages of size O(√n) almost in place, before reordering them using a page table.
+At deeper recursion levels, where the scratch space is big enough,
+normal merging is used, where one input is copied into the scratch space.
+When the scratch space is large enough to hold the complete subarray,
+the input is merged interleaved from both sides, which increases performance
+for random data.
+
+Characteristics:
+ - *stable*: does preserve the ordering of elements which
+   compare equal (e.g. "a" and "A" in a sort of letters which
+   ignores case).
+ - *`O(√n)`* auxilary memory usage.
+ - *`O(n log n)`* garuanteed runtime.
+
+## References
+ - Dvořák, S., Ďurian, B. (1986). Towards an efficient merging. In: Gruska, J., Rovan, B., Wiedermann,
+   J. (eds) Mathematical Foundations of Computer Science 1986. MFCS 1986. Lecture Notes in Computer Science, vol 233.
+   Springer, Berlin, Heidelberg. https://doi.org/10.1007/BFb0016253
+ - https://max-arbuzov.blogspot.com/2021/10/merge-sort-with-osqrtn-auxiliary-memory.html
+"""
+const PagedMergeSort  = maybe_optimize(PagedMergeSortAlg())
 
 ## Heap sort
 
@@ -41,105 +102,6 @@ end
 
 Base.:+(x::Int, y::Int) = Int(Int128(x) + Int128(y))
 
-## Radix sort
-
-# Map a bits-type to an unsigned int, maintaining sort order
-uint_mapping(::ForwardOrdering, x::Unsigned) = x
-for (signedty, unsignedty) in ((Int8, UInt8), (Int16, UInt16), (Int32, UInt32), (Int64, UInt64), (Int128, UInt128))
-    # In Julia 0.4 we can just use unsigned() here
-    @eval uint_mapping(::ForwardOrdering, x::$signedty) = reinterpret($unsignedty, xor(x, typemin(typeof(x))))
-end
-uint_mapping(::ForwardOrdering, x::Float32)  = (y = reinterpret(Int32, x); reinterpret(UInt32, ifelse(y < 0, ~y, xor(y, typemin(Int32)))))
-uint_mapping(::ForwardOrdering, x::Float64)  = (y = reinterpret(Int64, x); reinterpret(UInt64, ifelse(y < 0, ~y, xor(y, typemin(Int64)))))
-
-uint_mapping(::Sort.Float.Left, x::Float16)  = ~reinterpret(Int16, x)
-uint_mapping(::Sort.Float.Right, x::Float16)  = reinterpret(Int16, x)
-uint_mapping(::Sort.Float.Left, x::Float32)  = ~reinterpret(Int32, x)
-uint_mapping(::Sort.Float.Right, x::Float32)  = reinterpret(Int32, x)
-uint_mapping(::Sort.Float.Left, x::Float64)  = ~reinterpret(Int64, x)
-uint_mapping(::Sort.Float.Right, x::Float64)  = reinterpret(Int64, x)
-
-uint_mapping(rev::ReverseOrdering, x) = ~uint_mapping(rev.fwd, x)
-uint_mapping(::ReverseOrdering{ForwardOrdering}, x::Real) = ~uint_mapping(Forward, x) # maybe unnecessary; needs benchmark
-
-uint_mapping(o::By,   x     ) = uint_mapping(Forward, o.by(x))
-uint_mapping(o::Perm, i::Int) = uint_mapping(o.order, o.data[i])
-uint_mapping(o::Lt,   x     ) = error("uint_mapping does not work with general Lt Orderings")
-
-const RADIX_SIZE = 11
-const RADIX_MASK = 0x7FF
-
-function sort!(vs::AbstractVector{T}, lo::Int, hi::Int, ::RadixSortAlg, o::Ordering, ts::AbstractVector{T}=similar(vs)) where T
-    # Input checking
-    if lo >= hi;  return vs;  end
-
-    # Make sure we're sorting a bits type
-    OT = Base.Order.ordtype(o, vs)
-    if !isbitstype(OT)
-        error("Radix sort only sorts bits types (got $OT)")
-    end
-
-    # Init
-    iters = ceil(Integer, sizeof(OT)*8/RADIX_SIZE)
-    bin = zeros(UInt32, 2^RADIX_SIZE, iters)
-    if lo > 1;  bin[1,:] .= lo-1;  end
-
-    # Histogram for each element, radix
-    for i = lo:hi
-        v = uint_mapping(o, vs[i])
-        for j = 1:iters
-            idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
-            @inbounds bin[idx,j] += 1
-        end
-    end
-
-    # Sort!
-    swaps = 0
-    len = hi-lo+1
-    for j = 1:iters
-        # Unroll first data iteration, check for degenerate case
-        v = uint_mapping(o, vs[hi])
-        idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
-
-        # are all values the same at this radix?
-        if bin[idx,j] == len;  continue;  end
-
-        cbin = cumsum(bin[:,j])
-        ci = cbin[idx]
-        ts[ci] = vs[hi]
-        cbin[idx] -= 1
-
-        # Finish the loop...
-        @inbounds for i in hi-1:-1:lo
-            v = uint_mapping(o, vs[i])
-            idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
-            ci = cbin[idx]
-            ts[ci] = vs[i]
-            cbin[idx] -= 1
-        end
-        vs,ts = ts,vs
-        swaps += 1
-    end
-
-    if isodd(swaps)
-        vs,ts = ts,vs
-        for i = lo:hi
-            vs[i] = ts[i]
-        end
-    end
-    vs
-end
-
-function Base.Sort.Float.fpsort!(v::AbstractVector, ::RadixSortAlg, o::Ordering)
-    @static if VERSION >= v"1.7.0-DEV"
-        lo, hi = Base.Sort.Float.specials2end!(v, RadixSort, o)
-    else
-        lo, hi = Base.Sort.Float.nans2end!(v, o)
-    end
-    sort!(v, lo, hi, RadixSort, o)
-end
-
-
 # Implementation of TimSort based on the algorithm description at:
 #
 #   http://svn.python.org/projects/python/trunk/Objects/listsort.txt
@@ -147,6 +109,27 @@ end
 #
 # Original author: @kmsquire
 
+@static if v"1.9.0-alpha" <= VERSION <= v"1.9.1"
+    function Base.getindex(v::Base.Sort.WithoutMissingVector, i::UnitRange)
+        out = Vector{eltype(v)}(undef, length(i))
+        out .= v.data[i]
+        out
+    end
+
+    # skip MissingOptimization due to JuliaLang/julia#50171
+    const _FIVE_ARG_SAFE_DEFAULT_STABLE = Base.DEFAULT_STABLE.next
+
+    # Explicitly define conversion from _sort!(v, alg, order, kw) to sort!(v, lo, hi, alg, order)
+    # To avoid excessively strict dispatch loop detection
+    function Base.Sort._sort!(v::AbstractVector, a::Union{HeapSortAlg, TimSortAlg, RadixSortAlg, CombSortAlg}, o::Base.Order.Ordering, kw)
+        Base.Sort.@getkw lo hi scratch
+        sort!(v, lo, hi, a, o)
+        scratch
+    end
+else
+    const _FIVE_ARG_SAFE_DEFAULT_STABLE = Base.DEFAULT_STABLE
+end
+
 const Run = UnitRange{Int}
 
 const MIN_GALLOP = 7
@@ -558,7 +541,7 @@ function sort!(v::AbstractVector, lo::Int, hi::Int, ::TimSortAlg, o::Ordering)
             # Make a run of length minrun
             count = min(minrun, hi-i+1)
             run_range = i:i+count-1
-            sort!(v, i, i+count-1, DEFAULT_STABLE, o)
+            sort!(v, i, i+count-1, _FIVE_ARG_SAFE_DEFAULT_STABLE, o)
         else
             if !issorted(run_range)
                 run_range = last(run_range):first(run_range)
@@ -579,4 +562,375 @@ function sort!(v::AbstractVector, lo::Int, hi::Int, ::TimSortAlg, o::Ordering)
     return v
 end
 
+
+function sort!(v::AbstractVector, lo::Int, hi::Int, ::CombSortAlg, o::Ordering)
+    interval = (3 * (hi-lo+1)) >> 2
+
+    while interval > 1
+        @inbounds for j in lo:hi-interval
+            a, b = v[j], v[j+interval]
+            v[j], v[j+interval] = lt(o, b, a) ? (b, a) : (a, b)
+        end
+        interval = (3 * interval) >> 2
+    end
+
+    return sort!(v, lo, hi, InsertionSort, o)
+end
+
+
+## Radix sort
+@static if VERSION >= v"1.9.0-DEV.482" # Base introduced radixsort in 1.9
+    function sort!(vs::AbstractVector{T}, lo::Int, hi::Int, ::RadixSortAlg, o::Ordering, ts::Union{Nothing, AbstractVector{T}}=nothing) where T
+        sort!(vs, lo, hi, Base.DEFAULT_STABLE, o)
+    end
+    function sort!(vs::AbstractVector{T}, lo::Int, hi::Int, ::RadixSortAlg, o::Ordering, ts::Vector{T}) where T # disambiguate
+        sort!(vs, lo, hi, Base.DEFAULT_STABLE, o)
+    end
+else
+
+    # Map a bits-type to an unsigned int, maintaining sort order
+    uint_mapping(::ForwardOrdering, x::Unsigned) = x
+    for (signedty, unsignedty) in ((Int8, UInt8), (Int16, UInt16), (Int32, UInt32), (Int64, UInt64), (Int128, UInt128))
+        # In Julia 0.4 we can just use unsigned() here
+        @eval uint_mapping(::ForwardOrdering, x::$signedty) = reinterpret($unsignedty, xor(x, typemin(typeof(x))))
+    end
+    uint_mapping(::ForwardOrdering, x::Float32)  = (y = reinterpret(Int32, x); reinterpret(UInt32, ifelse(y < 0, ~y, xor(y, typemin(Int32)))))
+    uint_mapping(::ForwardOrdering, x::Float64)  = (y = reinterpret(Int64, x); reinterpret(UInt64, ifelse(y < 0, ~y, xor(y, typemin(Int64)))))
+
+    uint_mapping(::Sort.Float.Left, x::Float16)  = ~reinterpret(Int16, x)
+    uint_mapping(::Sort.Float.Right, x::Float16)  = reinterpret(Int16, x)
+    uint_mapping(::Sort.Float.Left, x::Float32)  = ~reinterpret(Int32, x)
+    uint_mapping(::Sort.Float.Right, x::Float32)  = reinterpret(Int32, x)
+    uint_mapping(::Sort.Float.Left, x::Float64)  = ~reinterpret(Int64, x)
+    uint_mapping(::Sort.Float.Right, x::Float64)  = reinterpret(Int64, x)
+
+    uint_mapping(rev::ReverseOrdering, x) = ~uint_mapping(rev.fwd, x)
+    uint_mapping(::ReverseOrdering{ForwardOrdering}, x::Real) = ~uint_mapping(Forward, x) # maybe unnecessary; needs benchmark
+
+    uint_mapping(o::By,   x     ) = uint_mapping(Forward, o.by(x))
+    uint_mapping(o::Perm, i::Int) = uint_mapping(o.order, o.data[i])
+    uint_mapping(o::Lt,   x     ) = error("uint_mapping does not work with general Lt Orderings")
+
+    const RADIX_SIZE = 11
+    const RADIX_MASK = 0x7FF
+
+    function sort!(vs::AbstractVector{T}, lo::Int, hi::Int, ::RadixSortAlg, o::Ordering, ts::AbstractVector{T}=similar(vs)) where T
+        # Input checking
+        if lo >= hi;  return vs;  end
+
+        # Make sure we're sorting a bits type
+        OT = Base.Order.ordtype(o, vs)
+        if !isbitstype(OT)
+            error("Radix sort only sorts bits types (got $OT)")
+        end
+
+        # Init
+        iters = ceil(Integer, sizeof(OT)*8/RADIX_SIZE)
+        bin = zeros(UInt32, 2^RADIX_SIZE, iters)
+        if lo > 1;  bin[1,:] .= lo-1;  end
+
+        # Histogram for each element, radix
+        for i = lo:hi
+            v = uint_mapping(o, vs[i])
+            for j = 1:iters
+                idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
+                @inbounds bin[idx,j] += 1
+            end
+        end
+
+        # Sort!
+        swaps = 0
+        len = hi-lo+1
+        for j = 1:iters
+            # Unroll first data iteration, check for degenerate case
+            v = uint_mapping(o, vs[hi])
+            idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
+
+            # are all values the same at this radix?
+            if bin[idx,j] == len;  continue;  end
+
+            cbin = cumsum(bin[:,j])
+            ci = cbin[idx]
+            ts[ci] = vs[hi]
+            cbin[idx] -= 1
+
+            # Finish the loop...
+            @inbounds for i in hi-1:-1:lo
+                v = uint_mapping(o, vs[i])
+                idx = Int((v >> ((j-1)*RADIX_SIZE)) & RADIX_MASK) + 1
+                ci = cbin[idx]
+                ts[ci] = vs[i]
+                cbin[idx] -= 1
+            end
+            vs,ts = ts,vs
+            swaps += 1
+        end
+
+        if isodd(swaps)
+            vs,ts = ts,vs
+            for i = lo:hi
+                vs[i] = ts[i]
+            end
+        end
+        vs
+    end
+
+    function Base.Sort.Float.fpsort!(v::AbstractVector, ::RadixSortAlg, o::Ordering)
+        @static if VERSION >= v"1.7.0-DEV"
+            lo, hi = Base.Sort.Float.specials2end!(v, RadixSort, o)
+        else
+            lo, hi = Base.Sort.Float.nans2end!(v, o)
+        end
+        sort!(v, lo, hi, RadixSort, o)
+    end
+end
+
+###
+# PagedMergeSort
+###
+
+# unsafe version of copyto!
+# as workaround for https://github.com/JuliaLang/julia/issues/50900
+function _unsafe_copyto!(dest, doffs, src, soffs, n)
+    @inbounds for i in 0:n-1
+        dest[doffs + i] = src[soffs + i]
+    end
+    dest
+end
+
+function _unsafe_copyto!(dest::Array, doffs, src::Array, soffs, n)
+    unsafe_copyto!(dest, doffs, src, soffs, n)
+end
+
+# merge v[lo:m] and v[m+1:hi] ([A;B]) using scratch[1:1+hi-lo]
+# This is faster than merge! but requires twice as much auxiliary memory.
+function twoended_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where T
+    @assert lo ≤ m ≤ hi
+    @assert abs((m-lo) - (hi-(m+1))) ≤ 1 "twoended_merge! only supports balanced merges"
+    len = 1 + hi - lo
+    # input array indices
+    a_lo = lo
+    a_hi = m
+    b_lo = m + 1
+    b_hi = hi
+    # output array indices
+    k_lo = 1
+    k_hi = len
+    @inbounds begin
+        # two ended merge
+        while k_lo <= len ÷ 2
+            if lt(o, v[b_lo], v[a_lo])
+                scratch[k_lo] = v[b_lo]
+                b_lo += 1
+            else
+                scratch[k_lo] = v[a_lo]
+                a_lo += 1
+            end
+            k_lo +=1
+            if !lt(o, v[b_hi], v[a_hi])
+                scratch[k_hi] = v[b_hi]
+                b_hi -= 1
+            else
+                scratch[k_hi] = v[a_hi]
+                a_hi -= 1
+            end
+            k_hi -=1
+        end
+        # if the input length is odd,
+        # one item remains
+        if a_lo <= a_hi
+            scratch[k_lo] = v[a_lo]
+        elseif b_lo <= b_hi
+            scratch[k_lo] = v[b_lo]
+        end
+        # copy back from t to v
+        offset = lo-1
+        for i = 1:len
+            v[offset+i] = scratch[i]
+        end
+    end
+end
+
+# core merging loop used throughout PagedMergeSort
+Base.@propagate_inbounds function merge!(f::Function,
+    target::AbstractVector{T}, source_a::AbstractVector{T}, source_b::AbstractVector{T},
+    o::Ordering, a::Integer, b::Integer, k::Integer) where T
+    @inbounds while f(a,b,k)
+        if lt(o, source_b[b], source_a[a])
+            target[k] = source_b[b]
+            b += 1
+        else
+            target[k] = source_a[a]
+            a += 1
+        end
+        k += 1
+    end
+    a,b,k
+end
+
+# merge v[lo:m] and v[m+1:hi] using scratch[1:1+m-lo]
+# based on Base.Sort MergeSort
+function merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where {T}
+    _unsafe_copyto!(scratch, 1, v, lo, m - lo + 1)
+    f(_, b, k) = k < b <= hi
+    a, b, k = merge!(f, v, scratch, v, o, 1, m + 1, lo)
+    _unsafe_copyto!(v, k, scratch, a, b - k)
+end
+
+struct Pages
+    current::Int       # current page being merged into
+    currentNumber::Int # number of current page (=index in pageLocations)
+    nextA::Int         # next possible page in A
+    nextB::Int         # next possible page in B
+end
+
+next_page_A(pages::Pages) = Pages(pages.nextA, pages.currentNumber + 1, pages.nextA + 1, pages.nextB)
+next_page_B(pages::Pages) = Pages(pages.nextB, pages.currentNumber + 1, pages.nextA, pages.nextB + 1)
+
+Base.@propagate_inbounds function next_page!(pageLocations, pages, pagesize, lo, a)
+    if a > pages.nextA * pagesize + lo
+        pages = next_page_A(pages)
+    else
+        pages = next_page_B(pages)
+    end
+    pageLocations[pages.currentNumber] = pages.current
+    pages
+end
+
+Base.@propagate_inbounds function permute_pages!(f, v, pageLocations, page_offset, pagesize, page)
+    while f(page)
+        plc = pageLocations[page-3] # plc has data belonging to page
+        pageLocations[page-3] = page
+        _unsafe_copyto!(v, page_offset(page) + 1, v, page_offset(plc) + 1, pagesize)
+        page = plc
+    end
+    page
+end
+
+# merge v[lo:m] (A) and v[m+1:hi] (B) using scratch[] in O(sqrt(n)) space
+function paged_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}, pageLocations::AbstractVector{<:Integer}) where {T}
+    @assert lo < m < hi
+    lenA = 1 + m - lo
+    lenB = hi - m
+
+    # this function only supports merges with length(A) <= length(B),
+    # which is guaranteed by pagedmergesort!
+    @assert lenA <= lenB
+
+    # regular merge if scratch is big enough
+    lenA <= length(scratch) && return merge!(v, lo, m, hi, o, scratch)
+
+    len = lenA + lenB
+    pagesize = isqrt(len)
+    nPages = len ÷ pagesize # a partial page at the end does not count
+    @assert length(scratch) >= 3pagesize
+    @assert length(pageLocations) >= nPages - 3
+
+    @inline page_offset(page) = (page - 1) * pagesize + lo - 1
+
+    @inbounds begin
+        ##################
+        # merge
+        ##################
+        # merge the first 3 pages into scratch
+        a, b, _ = merge!((_, _, k) -> k <= 3pagesize, scratch, v, v, o, lo, m + 1, 1)
+        # initialize variables for merging into pages
+        pages = Pages(-17, 0, 1, (m - lo) ÷ pagesize + 2) # first argument is unused
+        # more efficient loop while more than pagesize elements of A and B are remaining
+        while_condition1(offset) = (_, _, k) -> k <= offset + pagesize
+        while a < m - pagesize && b < hi - pagesize
+            pages = next_page!(pageLocations, pages, pagesize, lo, a)
+            offset = page_offset(pages.current)
+            a, b, _ = merge!(while_condition1(offset), v, v, v, o, a, b, offset + 1)
+        end
+        # merge until either A or B is empty or the last page is reached
+        k, offset = nothing, nothing
+        while_condition2(offset) = (a, b, k) -> k <= offset + pagesize && a <= m && b <= hi
+        while a <= m && b <= hi && pages.currentNumber + 3 < nPages
+            pages = next_page!(pageLocations, pages, pagesize, lo, a)
+            offset = page_offset(pages.current)
+            a, b, k = merge!(while_condition2(offset), v, v, v, o, a, b, offset + 1)
+        end
+        # if the last page is reached, merge the remaining elements into the final partial page
+        if pages.currentNumber + 3 == nPages && a <= m && b <= hi
+            a, b, k = merge!((a, b, _) -> a <= m && b <= hi, v, v, v, o, a, b, nPages * pagesize + lo)
+            _unsafe_copyto!(v, k, v, a <= m ? a : b, hi - k + 1)
+        else
+            use_a = a <= m
+            # copy the incomplete page
+            partial_page_size = offset + pagesize - k + 1
+            _unsafe_copyto!(v, k, v, use_a ? a : b, partial_page_size)
+            use_a && (a += partial_page_size)
+            use_a || (b += partial_page_size)
+            # copy the remaining full pages
+            while use_a ? a <= m - pagesize + 1 : b <= hi - pagesize + 1
+                pages = next_page!(pageLocations, pages, pagesize, lo, a)
+                offset = page_offset(pages.current)
+                _unsafe_copyto!(v, offset + 1, v, use_a ? a : b, pagesize)
+                use_a && (a += pagesize)
+                use_a || (b += pagesize)
+            end
+            # copy the final partial page only if sourcing from A.
+            # If sourcing from B, it is already in place.
+            use_a && _unsafe_copyto!(v, hi - m + a, v, a, m - a + 1)
+        end
+
+        ##################
+        # rearrange pages
+        ##################
+        # copy pages belonging to the 3 permutation chains ending with a page in the scratch space
+        nextA, nextB = pages.nextA, pages.nextB
+
+        for _ in 1:3
+            page = (nextB > nPages ? (nextA += 1) : (nextB += 1)) - 1
+            page = permute_pages!(>(3), v, pageLocations, page_offset, pagesize, page)
+            _unsafe_copyto!(v, page_offset(page) + 1, scratch, (page - 1) * pagesize + 1, pagesize)
+        end
+
+        # copy remaining permutation cycles
+        for donePageIndex = 5:nPages
+            # linear scan through pageLocations to make sure no cycle is missed
+            page = pageLocations[donePageIndex-3]
+            page == donePageIndex && continue
+
+            # copy the data belonging to donePageIndex into scratch
+            _unsafe_copyto!(scratch, 1, v, page_offset(page) + 1, pagesize)
+
+            # follow the cycle starting with the newly freed page
+            permute_pages!(!=(donePageIndex), v, pageLocations, page_offset, pagesize, page)
+            _unsafe_copyto!(v, page_offset(donePageIndex) + 1, scratch, 1, pagesize)
+        end
+    end
+end
+
+# midpoint was added to Base.sort in version 1.4 and later moved to Base
+# -> redefine for compatibility with earlier versions
+midpoint(lo::Integer, hi::Integer) = lo + ((hi - lo) >>> 0x01)
+
+function pagedmergesort!(v::AbstractVector{T}, lo::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}, pageLocations) where {T}
+    len = hi + 1 - lo
+    if len <= Base.SMALL_THRESHOLD
+        return Base.Sort.sort!(v, lo, hi, Base.Sort.InsertionSortAlg(), o)
+    end
+    m = midpoint(lo, hi - 1) # hi-1: ensure midpoint is rounded down. OK, because lo < hi is satisfied here
+    pagedmergesort!(v, lo, m, o, scratch, pageLocations)
+    pagedmergesort!(v, m + 1, hi, o, scratch, pageLocations)
+    if len <= length(scratch)
+        twoended_merge!(v, lo, m, hi, o, scratch)
+    else
+        paged_merge!(v, lo, m, hi, o, scratch, pageLocations)
+    end
+    return v
+end
+
+function sort!(v::AbstractVector, lo::Integer, hi::Integer, ::PagedMergeSortAlg, o::Ordering)
+    lo >= hi && return v
+    n = hi + 1 - lo
+    pagesize = isqrt(n)
+    scratch = Vector{eltype(v)}(undef, 3pagesize)
+    nPages = n ÷ pagesize
+    pageLocations = Vector{Int}(undef, max(0, nPages - 3))
+    pagedmergesort!(v, lo, hi, o, scratch, pageLocations)
+    return v
+end
 end # module
diff --git a/test/aqua.jl b/test/aqua.jl
new file mode 100644
index 0000000..2fcd2a2
--- /dev/null
+++ b/test/aqua.jl
@@ -0,0 +1,4 @@
+using Aqua
+Aqua.test_all(SortingAlgorithms, deps_compat=false)
+# It's okay to use the latest version of test deps:
+Aqua.test_deps_compat(SortingAlgorithms, check_extras=false)
diff --git a/test/runtests.jl b/test/runtests.jl
index 52d8904..aab5738 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,9 +3,13 @@ using Test
 using StatsBase
 using Random
 
+stable_algorithms = [TimSort, RadixSort, PagedMergeSort]
+unstable_algorithms = [HeapSort, CombSort]
+
 a = rand(1:10000, 1000)
+am = [rand() < .9 ? i : missing for i in a]
 
-for alg in [TimSort, HeapSort, RadixSort]
+for alg in [stable_algorithms; unstable_algorithms; SortingAlgorithms.TimSortAlg()]
     b = sort(a, alg=alg)
     @test issorted(b)
     ix = sortperm(a, alg=alg)
@@ -13,6 +17,9 @@ for alg in [TimSort, HeapSort, RadixSort]
     @test issorted(b)
     @test a[ix] == b
 
+    # legacy 3-argument calling convention
+    @test b == sort!(copy(a), alg, Base.Order.Forward)
+
     b = sort(a, alg=alg, rev=true)
     @test issorted(b, rev=true)
     ix = sortperm(a, alg=alg, rev=true)
@@ -34,9 +41,26 @@ for alg in [TimSort, HeapSort, RadixSort]
     invpermute!(c, ix)
     @test c == a
 
-    if alg != RadixSort  # RadixSort does not work with Lt orderings
+    if alg != RadixSort  # RadixSort does not work with Lt orderings or missing
         c = sort(a, alg=alg, lt=(>))
         @test b == c
+
+        # Issue https://github.com/JuliaData/DataFrames.jl/issues/3340
+        bm1 = sort(am, alg=alg)
+        @test issorted(bm1)
+        @test count(ismissing, bm1) == count(ismissing, am)
+
+        bm2 = am[sortperm(am, alg=alg)]
+        @test issorted(bm2)
+        @test count(ismissing, bm2) == count(ismissing, am)
+
+        bm3 = am[sortperm!(collect(eachindex(am)), am, alg=alg)]
+        @test issorted(bm3)
+        @test count(ismissing, bm3) == count(ismissing, am)
+
+        if alg == TimSort # Stable
+            @test all(bm1 .=== bm2 .=== bm3)
+        end
     end
 
     c = sort(a, alg=alg, by=x->1/x)
@@ -73,8 +97,7 @@ for n in [0:10..., 100, 101, 1000, 1001]
         invpermute!(c, pi)
         @test c == v
 
-        # stable algorithms
-        for alg in [TimSort, RadixSort]
+        for alg in stable_algorithms
             p = sortperm(v, alg=alg, order=ord)
             @test p == pi
             s = copy(v)
@@ -84,8 +107,7 @@ for n in [0:10..., 100, 101, 1000, 1001]
             @test s == v
         end
 
-        # unstable algorithms
-        for alg in [HeapSort]
+        for alg in unstable_algorithms
             p = sortperm(v, alg=alg, order=ord)
             @test isperm(p)
             @test v[p] == si
@@ -99,10 +121,14 @@ for n in [0:10..., 100, 101, 1000, 1001]
 
     v = randn_with_nans(n,0.1)
     for ord in [Base.Order.Forward, Base.Order.Reverse],
-        alg in [TimSort, HeapSort, RadixSort]
+        alg in [stable_algorithms; unstable_algorithms]
         # test float sorting with NaNs
         s = sort(v, alg=alg, order=ord)
         @test issorted(s, order=ord)
+
+        # This tests that NaNs (which compare equivalent) are treated stably
+        # even when the underlying algorithm is unstable. That it happens to
+        # pass is not a part of the public API:
         @test reinterpret(UInt64, v[map(isnan, v)]) == reinterpret(UInt64, s[map(isnan, s)])
 
         # test float permutation with NaNs
@@ -113,3 +139,24 @@ for n in [0:10..., 100, 101, 1000, 1001]
         @test reinterpret(UInt64,vp) == reinterpret(UInt64,s)
     end
 end
+
+for T in (Float64, Int, UInt8)
+    for alg in stable_algorithms
+        for ord in [Base.Order.By(identity), Base.Order.By(_ -> 0), Base.Order.By(Base.Fix2(÷, 100))]
+            for n in vcat(0:31, 40:11:100, 110:51:1000)
+                v = rand(T, n)
+                # use MergeSort to guarantee stable sorting in Julia 1.0
+                @test sort(v, alg=alg, order=ord) == sort(v, alg=MergeSort, order=ord)
+            end
+        end
+    end
+end
+
+# PagedMergeSort with small input without InitialOptimizations
+# (https://github.com/JuliaCollections/SortingAlgorithms.jl/pull/71#discussion_r1292774352)
+if isdefined(Base.Sort, :InitialOptimizations)
+    v = [0,1]
+    @test sort(v, alg=SortingAlgorithms.PagedMergeSortAlg()) == sort(v, alg=MergeSort)
+end
+
+include("aqua.jl")