diff --git a/CMakeLists.txt b/CMakeLists.txt
index 545ec65ad..e4d8a936c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,11 +168,11 @@ option(GUNROCK_GENCODE_SM20
 
 option(GUNROCK_GENCODE_SM30
   "ON to generate code for Compute Capability 3.0 devices (e.g. Tesla K10)"
-  ON)
+  OFF)
 
 option(GUNROCK_GENCODE_SM35
   "ON to generate code for Compute Capability 3.5 devices (e.g. Tesla K20)"
-  OFF)
+  ON)
 
 option(GUNROCK_GENCODE_SM37
   "ON to generate code for Compute Capability 3.7 devices (e.g. Tesla K80)"
diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu
index 01a034cbb..d1d0e4b90 100644
--- a/gunrock/app/bc/bc_app.cu
+++ b/gunrock/app/bc/bc_app.cu
@@ -240,18 +240,18 @@ void runBC(GRGraph* output, BC_Parameter *parameter)
 void dispatchBC(
     GRGraph*        grapho,
     const GRGraph*  graphi,
-    const GRSetup   config,
+    const GRSetup*  config,
     const GRTypes   data_t,
     ContextPtr*     context,
     cudaStream_t*   streams)
 {
     BC_Parameter* parameter = new BC_Parameter;
     parameter->src = (long long*)malloc(sizeof(long long));
-    parameter->g_quiet  = config.quiet;
+    parameter->g_quiet  = config -> quiet;
     parameter->context  = context;
     parameter->streams  = streams;
-    parameter->num_gpus = config.num_devices;
-    parameter->gpu_idx  = config.device_list;
+    parameter->num_gpus = config -> num_devices;
+    parameter->gpu_idx  = config -> device_list;
 
     switch (data_t.VTXID_TYPE)
     {
@@ -286,7 +286,7 @@ void dispatchBC(
                 parameter->graph = &csr;
 
                 // determine source vertex to start
-                switch (config.source_mode)
+                switch (config -> source_mode)
                 {
                 case randomize:
                 {
@@ -301,7 +301,7 @@ void dispatchBC(
                 }
                 case manually:
                 {
-                    parameter->src[0] = config.source_vertex[0];
+                    parameter->src[0] = config -> source_vertex[0];
                     break;
                 }
                 default:
@@ -341,7 +341,7 @@ void dispatchBC(
 void gunrock_bc(
     GRGraph       *grapho,
     const GRGraph *graphi,
-    const GRSetup  config,
+    const GRSetup *config,
     const GRTypes  data_t)
 {
     // GPU-related configurations
@@ -350,20 +350,20 @@ void gunrock_bc(
     ContextPtr    *context = NULL;
     cudaStream_t  *streams = NULL;
 
-    num_gpus = config.num_devices;
+    num_gpus = config -> num_devices;
     gpu_idx  = new int [num_gpus];
     for (int i = 0; i < num_gpus; ++i)
     {
-        gpu_idx[i] = config.device_list[i];
+        gpu_idx[i] = config -> device_list[i];
     }
 
     // Create streams and MordernGPU context for each GPU
     streams = new cudaStream_t[num_gpus * num_gpus * 2];
     context = new ContextPtr[num_gpus * num_gpus];
-    if (!config.quiet) { printf(" using %d GPUs:", num_gpus); }
+    if (!config -> quiet) { printf(" using %d GPUs:", num_gpus); }
     for (int gpu = 0; gpu < num_gpus; ++gpu)
     {
-        if (!config.quiet) { printf(" %d ", gpu_idx[gpu]); }
+        if (!config -> quiet) { printf(" %d ", gpu_idx[gpu]); }
         util::SetDevice(gpu_idx[gpu]);
         for (int i = 0; i < num_gpus * 2; ++i)
         {
@@ -378,7 +378,7 @@ void gunrock_bc(
             }
         }
     }
-    if (!config.quiet) { printf("\n"); }
+    if (!config -> quiet) { printf("\n"); }
 
     dispatchBC(grapho, graphi, config, data_t, context, streams);
 }
@@ -406,8 +406,8 @@ void bc(
     data_t.SIZET_TYPE = SIZET_INT;    // integer graph size type
     data_t.VALUE_TYPE = VALUE_FLOAT;  // float attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);  // primitive-specific configures
-    config.source_vertex[0] = source;        // source vertex to start
+    struct GRSetup *config = InitSetup(1, NULL);  // primitive-specific configures
+    config -> source_vertex[0] = source;        // source vertex to start
 
     struct GRGraph *grapho = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graphi = (struct GRGraph*)malloc(sizeof(struct GRGraph));
diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh
index 81e7230ed..d777b55b9 100644
--- a/gunrock/app/bc/bc_enactor.cuh
+++ b/gunrock/app/bc/bc_enactor.cuh
@@ -2226,6 +2226,7 @@ public:
             else if (traversal_mode == "LB_LIGHT_CULL")
                  return MODE_SWITCH<SizeT, gunrock::oprtr::advance::LB_LIGHT_CULL>
                     ::Init(*this, context, problem, max_grid_size);
+            else printf("Traversal mode %s is not supported by BC at the moment\n", traversal_mode.c_str());
         }
 
         //to reduce compile time, get rid of other architecture for now
diff --git a/gunrock/app/bc/bc_functor.cuh b/gunrock/app/bc/bc_functor.cuh
index 193fb871e..c85869d5c 100644
--- a/gunrock/app/bc/bc_functor.cuh
+++ b/gunrock/app/bc/bc_functor.cuh
@@ -16,6 +16,7 @@
 
 #include <gunrock/app/problem_base.cuh>
 #include <gunrock/app/bc/bc_problem.cuh>
+#include <gunrock/util/device_intrinsics.cuh>
 
 namespace gunrock {
 namespace app {
@@ -92,7 +93,7 @@ struct ForwardFunctor {
             //VertexId label_d;
             //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
             //    label_d, d_data_slice->labels + d_id);
-            //label_d = __ldg(d_data_slice -> labels + d_id);
+            //label_d = _ldg(d_data_slice -> labels + d_id);
             //if (label_d == label /*+ 1*/) {
                 //Accumulate sigma value
                 atomicAdd(d_data_slice->sigmas + d_id, d_data_slice->sigmas[s_id]);
@@ -229,10 +230,10 @@ struct BackwardFunctor {
         VertexId d_label;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    s_label, d_data_slice->labels + s_id);
-        s_label = __ldg(d_data_slice -> labels + s_id);
+        s_label = _ldg(d_data_slice -> labels + s_id);
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    d_label, d_data_slice->labels + d_id);
-        d_label = __ldg(d_data_slice -> labels + d_id);
+        d_label = _ldg(d_data_slice -> labels + d_id);
         return (d_label == s_label + 1);
     }
 
@@ -265,17 +266,17 @@ struct BackwardFunctor {
         Value from_sigma;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    from_sigma, d_data_slice->sigmas + s_id);
-        from_sigma = __ldg(d_data_slice -> sigmas + s_id);
+        from_sigma = _ldg(d_data_slice -> sigmas + s_id);
 
         Value to_sigma;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    to_sigma, d_data_slice->sigmas + d_id);
-        to_sigma = __ldg(d_data_slice -> sigmas + d_id);
+        to_sigma = _ldg(d_data_slice -> sigmas + d_id);
 
         Value to_delta;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    to_delta, d_data_slice->deltas + d_id);
-        to_delta = __ldg(d_data_slice -> deltas + d_id);
+        to_delta = _ldg(d_data_slice -> deltas + d_id);
 
         Value result = from_sigma / to_sigma * (1.0 + to_delta);
 
diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index d4ff55a7b..dae82b2da 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -36,12 +36,14 @@ struct BFS_Parameter : gunrock::app::TestParameter_Base
 public:
     bool   mark_predecessors ;  // mark src-distance vs. parent vertices
     bool   enable_idempotence;  // enable idempotence operation
+    bool   direction_optimized; // enable direction optimization
     double max_queue_sizing1 ;  // maximum queue sizing factor
 
     BFS_Parameter()
     {
         mark_predecessors  = false;
         enable_idempotence = false;
+        direction_optimized = false;
         max_queue_sizing1  = -1.0f;
     }
 
@@ -173,10 +175,12 @@ float runBFS(GRGraph* output, BFS_Parameter *parameter)
     float         partition_factor     = parameter -> partition_factor;
     int           partition_seed       = parameter -> partition_seed;
     bool          g_stream_from_host   = parameter -> g_stream_from_host;
-    int           traversal_mode       = parameter -> traversal_mode;
+    std::string   traversal_mode       = parameter -> traversal_mode;
     bool          instrument           = parameter -> instrumented;
     bool          debug                = parameter -> debug;
     bool          size_check           = parameter -> size_check;
+    bool          undirected           = parameter -> g_undirected;
+    bool          direction_optimized  = parameter -> direction_optimized;
     size_t       *org_size             = new size_t  [num_gpus];
     // Allocate host-side label array
     VertexId     *h_labels             = new VertexId[graph->nodes];
@@ -193,7 +197,7 @@ float runBFS(GRGraph* output, BFS_Parameter *parameter)
         cudaSetDevice(gpu_idx[gpu]);
         cudaMemGetInfo(&(org_size[gpu]), &dummy);
     }
-    Problem *problem = new Problem;  // Allocate problem on GPU
+    Problem *problem = new Problem(direction_optimized, undirected);  // Allocate problem on GPU
 
     util::GRError( problem->Init(
         g_stream_from_host,
@@ -210,7 +214,7 @@ float runBFS(GRGraph* output, BFS_Parameter *parameter)
         "Problem BFS Initialization Failed", __FILE__, __LINE__);
 
     Enactor *enactor = new Enactor(
-        num_gpus, gpu_idx, instrument, debug, size_check);  // BFS enactor map
+        num_gpus, gpu_idx, instrument, debug, size_check, direction_optimized);  // BFS enactor map
     util::GRError(
         enactor->Init(context, problem, max_grid_size, traversal_mode),
         "BFS Enactor init failed", __FILE__, __LINE__);
@@ -276,21 +280,21 @@ float runBFS(GRGraph* output, BFS_Parameter *parameter)
 float dispatch_bfs(
     GRGraph*       grapho,
     const GRGraph* graphi,
-    const GRSetup  config,
+    const GRSetup* config,
     const GRTypes  data_t,
     ContextPtr*    context,
     cudaStream_t*  streams)
 {
     BFS_Parameter *parameter = new BFS_Parameter;
-    parameter->iterations = config.num_iters;
-    parameter->src = (long long*)malloc(sizeof(long long)*config.num_iters);
+    parameter->iterations = config -> num_iters;
+    parameter->src = (long long*)malloc(sizeof(long long)*config -> num_iters);
     parameter->context  = context;
     parameter->streams  = streams;
-    parameter->g_quiet  = config.quiet;
-    parameter->num_gpus = config.num_devices;
-    parameter->gpu_idx  = config.device_list;
-    parameter->mark_predecessors  = config.mark_predecessors;
-    parameter->enable_idempotence = config.enable_idempotence;
+    parameter->g_quiet  = config -> quiet;
+    parameter->num_gpus = config -> num_devices;
+    parameter->gpu_idx  = config -> device_list;
+    parameter->mark_predecessors  = config -> mark_predecessors;
+    parameter->enable_idempotence = config -> enable_idempotence;
 
     float elapsed_time;
 
@@ -315,7 +319,7 @@ float dispatch_bfs(
                 parameter->graph = &csr;
 
                 // determine source vertex to start
-                switch (config.source_mode)
+                switch (config -> source_mode)
                 {
                 case randomize:
                 {
@@ -329,7 +333,7 @@ float dispatch_bfs(
                 {
                     int max_deg = 0;
                     int node_id = csr.GetNodeWithHighestDegree(max_deg);
-                    for (int i = 0; i < config.num_iters; ++i)
+                    for (int i = 0; i < config -> num_iters; ++i)
                     {
                         parameter->src[i] = node_id;
                     }
@@ -339,7 +343,7 @@ float dispatch_bfs(
                 {
                     for (int i = 0; i < parameter->iterations; ++i)
                     {
-                        parameter->src[i] = config.source_vertex[i];
+                        parameter->src[i] = config -> source_vertex[i];
                     }
                     break;
                 }
@@ -355,7 +359,7 @@ float dispatch_bfs(
                 if (!parameter->g_quiet)
                 {
                     printf(" source: %lld", (long long) parameter->src[0]);
-                    for (int i = 1; i < config.num_iters; ++i)
+                    for (int i = 1; i < config -> num_iters; ++i)
                     {
                         printf(",%lld", (long long) parameter->src[i]);
                     }
@@ -403,7 +407,7 @@ float dispatch_bfs(
 float gunrock_bfs(
     GRGraph*       grapho,
     const GRGraph* graphi,
-    const GRSetup  config,
+    const GRSetup* config,
     const GRTypes  data_t)
 {
     // GPU-related configurations
@@ -412,20 +416,20 @@ float gunrock_bfs(
     ContextPtr    *context = NULL;
     cudaStream_t  *streams = NULL;
 
-    num_gpus = config.num_devices;
+    num_gpus = config -> num_devices;
     gpu_idx  = new int [num_gpus];
     for (int i = 0; i < num_gpus; ++i)
     {
-        gpu_idx[i] = config.device_list[i];
+        gpu_idx[i] = config -> device_list[i];
     }
 
     // Create streams and MordernGPU context for each GPU
     streams = new cudaStream_t[num_gpus * num_gpus * 2];
     context = new ContextPtr[num_gpus * num_gpus];
-    if (!config.quiet) { printf(" using %d GPUs:", num_gpus); }
+    if (!config -> quiet) { printf(" using %d GPUs:", num_gpus); }
     for (int gpu = 0; gpu < num_gpus; ++gpu)
     {
-        if (!config.quiet) { printf(" %d ", gpu_idx[gpu]); }
+        if (!config -> quiet) { printf(" %d ", gpu_idx[gpu]); }
         util::SetDevice(gpu_idx[gpu]);
         for (int i = 0; i < num_gpus * 2; ++i)
         {
@@ -440,7 +444,7 @@ float gunrock_bfs(
             }
         }
     }
-    if (!config.quiet) { printf("\n"); }
+    if (!config -> quiet) { printf("\n"); }
 
     return dispatch_bfs(grapho, graphi, config, data_t, context, streams);
 }
@@ -476,10 +480,10 @@ float bfs(
     data_t.SIZET_TYPE = SIZET_INT;  // integer graph size type
     data_t.VALUE_TYPE = VALUE_INT;  // integer attributes type
 
-    struct GRSetup config = InitSetup(num_iters, source);  // primitive-specific configures
-    config.mark_predecessors  = mark_predecessors;    // do not mark predecessors
-    config.enable_idempotence = enable_idempotence;    // wether enable idempotence
-    config.source_mode = source_mode;
+    struct GRSetup* config = InitSetup(num_iters, source);  // primitive-specific configures
+    config -> mark_predecessors  = mark_predecessors;    // do not mark predecessors
+    config -> enable_idempotence = enable_idempotence;    // wether enable idempotence
+    config -> source_mode = source_mode;
 
     struct GRGraph *grapho = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graphi = (struct GRGraph*)malloc(sizeof(struct GRGraph));
diff --git a/gunrock/app/bfs/bfs_enactor.cuh b/gunrock/app/bfs/bfs_enactor.cuh
index f77b2dbb6..2d47b7c74 100644
--- a/gunrock/app/bfs/bfs_enactor.cuh
+++ b/gunrock/app/bfs/bfs_enactor.cuh
@@ -19,7 +19,6 @@
 #include <gunrock/util/multithread_utils.cuh>
 #include <gunrock/util/kernel_runtime_stats.cuh>
 #include <gunrock/util/test_utils.cuh>
-#include <gunrock/util/device_intrinsics.cuh>
 
 #include <gunrock/oprtr/advance/kernel.cuh>
 #include <gunrock/oprtr/filter/kernel.cuh>
@@ -128,7 +127,7 @@ struct LoadLabel<long long, SizeT>
     static __device__ __forceinline__ long long Load
         (long long *&d_labels, SizeT &pos)
     {
-        return __ldg(d_labels + pos);
+        return _ldg(d_labels + pos);
     }
 };
 
@@ -170,7 +169,7 @@ __global__ void Expand_Incoming_Kernel(
         //MaskT tex_mask_byte;
         if (x < num_elements)
         {
-            key = __ldg(d_keys_in + x);
+            key = _ldg(d_keys_in + x);
             if (KernelPolicy::Problem::ENABLE_IDEMPOTENCE)
             {
                 mask_pos = (key & KernelPolicy::LOAD_BALANCED_CULL::ELEMENT_ID_MASK) >> (2+sizeof(MaskT));
@@ -200,7 +199,7 @@ __global__ void Expand_Incoming_Kernel(
             //if (to_process)
             //{
             //    if (tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  key + 1) == tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  key))
-                //if (__ldg(d_row_offsets + key) == __ldg(d_row_offsets + (key+1)))
+                //if (_ldg(d_row_offsets + key) == _ldg(d_row_offsets + (key+1)))
             //        to_process = false;
             //}
         } else to_process = false;
@@ -317,7 +316,7 @@ __global__ void From_Unvisited_Queue_IDEM(
                 if (key >= num_nodes) break;
 
                 //if (tex1Dfetch(gunrock::oprtr::cull_filter::LabelsTex<VertexId>::labels, key) != util::MaxValue<VertexId>())
-                if (__ldg(d_labels + key) != util::MaxValue<VertexId>())
+                if (_ldg(d_labels + key) != util::MaxValue<VertexId>())
                 {
                     //to_process = false;
                     mask_byte |= mask_bit;
@@ -327,7 +326,7 @@ __global__ void From_Unvisited_Queue_IDEM(
                 //if (to_process)
                 { // only works for undirected graph
                     //if (tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets, key) == tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets, key+1)) to_process = false;
-                    if (__ldg(d_row_offsets + key) == __ldg(d_row_offsets + (key+1)))
+                    if (_ldg(d_row_offsets + key) == _ldg(d_row_offsets + (key+1)))
                         continue;
                 }
                 //if (to_process)
@@ -506,7 +505,7 @@ __global__ void Inverse_Expand(
 
         if (x < num_unvisited_vertices)
         {
-            key = __ldg(d_unvisited_key_in + x);
+            key = _ldg(d_unvisited_key_in + x);
         } else to_process = false;
 
         if (to_process && Problem::ENABLE_IDEMPOTENCE)
@@ -537,9 +536,9 @@ __global__ void Inverse_Expand(
         if (to_process)
         {
             SizeT edge_start = //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets, key);
-                __ldg(d_inverse_row_offsets + key);
+                _ldg(d_inverse_row_offsets + key);
             SizeT edge_end = //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets, key+1);
-                __ldg(d_inverse_row_offsets + (key+1));
+                _ldg(d_inverse_row_offsets + (key+1));
             for (SizeT edge_id = edge_start; edge_id < edge_end; edge_id++)
             {
                 VertexId neighbor = d_inverse_column_indices[edge_id];
@@ -618,7 +617,7 @@ __global__ void Update_Mask_Kernel(
 
     while ( x < end_mask_pos)
     {
-        MaskT mask = __ldg(visited_masks + x);
+        MaskT mask = _ldg(visited_masks + x);
         VertexId v = (x << 3) * sizeof(MaskT);
         bool has_change = false;
         #pragma unroll
@@ -627,7 +626,7 @@ __global__ void Update_Mask_Kernel(
             MaskT mask_bit = 1 << i;
             if ((!(mask & mask_bit)) && (v < num_nodes))
             {
-                if (__ldg(labels + v) != util::MaxValue<VertexId>())
+                if (_ldg(labels + v) != util::MaxValue<VertexId>())
                 {
                     mask |= mask_bit;
                     has_change = true;
@@ -682,7 +681,7 @@ __global__ void Combind_Masks(
             {
                 MaskT in_mask = 0;
                 if (d_out_key_length[gpu] != 0)
-                    in_mask = __ldg(s_mask_ins[gpu] + x);
+                    in_mask = _ldg(s_mask_ins[gpu] + x);
                 //printf("(%d, %d) : in_mask = %#x\n", blockIdx.x, threadIdx.x, in_mask);
                 new_mask |= in_mask;
             }
@@ -698,7 +697,7 @@ __global__ void Combind_Masks(
                     MaskT mask_bit = 1 << i;
                     if ((update_mask & mask_bit) && (v < num_nodes))
                     {
-                        if (__ldg(d_partition_table + v) == 0)
+                        if (_ldg(d_partition_table + v) == 0)
                         {
                             l_vertices[l_vertex_counter] = v;
                             l_vertex_counter ++;
diff --git a/gunrock/app/bfs/bfs_functor.cuh b/gunrock/app/bfs/bfs_functor.cuh
index d807b10d5..0422dfa33 100644
--- a/gunrock/app/bfs/bfs_functor.cuh
+++ b/gunrock/app/bfs/bfs_functor.cuh
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <gunrock/util/track_utils.cuh>
+#include <gunrock/util/device_intrinsics.cuh>
 #include <gunrock/app/problem_base.cuh>
 #include <gunrock/app/bfs/bfs_problem.cuh>
 
diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu
index 7b011c84f..ca406e4d9 100644
--- a/gunrock/app/cc/cc_app.cu
+++ b/gunrock/app/cc/cc_app.cu
@@ -94,6 +94,7 @@ void runCC(GRGraph* output, CC_Parameter *parameter)
     bool          instrument         = parameter -> instrumented;
     bool          debug              = parameter -> debug;
     bool          size_check         = parameter -> size_check;
+    std::string   traversal_mode     = parameter -> traversal_mode;
     size_t       *org_size           = new size_t  [num_gpus];
     // Allocate host-side label array
     VertexId    *h_component_ids     = new VertexId[graph->nodes];
@@ -105,7 +106,7 @@ void runCC(GRGraph* output, CC_Parameter *parameter)
         cudaMemGetInfo(&(org_size[gpu]), &dummy);
     }
 
-    Problem* problem = new Problem(false);  // Allocate problem on GPU
+    Problem* problem = new Problem;  // Allocate problem on GPU
     util::GRError(
         problem->Init(
             g_stream_from_host,
@@ -124,7 +125,7 @@ void runCC(GRGraph* output, CC_Parameter *parameter)
     Enactor* enactor = new Enactor(
         num_gpus, gpu_idx, instrument, debug, size_check);  // CC enactor map
     util::GRError(
-        enactor->Init(context, problem, max_grid_size),
+        enactor->Init(context, problem, traversal_mode, max_grid_size),
         "CC Enactor Init failed", __FILE__, __LINE__);
 
     // Perform CC
@@ -176,7 +177,7 @@ void runCC(GRGraph* output, CC_Parameter *parameter)
 void dispatch_cc(
     GRGraph*       grapho,
     const GRGraph* graphi,
-    const GRSetup  config,
+    const GRSetup* config,
     const GRTypes  data_t,
     ContextPtr*    context,
     cudaStream_t*  streams)
@@ -184,9 +185,9 @@ void dispatch_cc(
     CC_Parameter *parameter = new CC_Parameter;
     parameter->context  = context;
     parameter->streams  = streams;
-    parameter->g_quiet  = config.quiet;
-    parameter->num_gpus = config.num_devices;
-    parameter->gpu_idx  = config.device_list;
+    parameter->g_quiet  = config -> quiet;
+    parameter->num_gpus = config -> num_devices;
+    parameter->gpu_idx  = config -> device_list;
 
     switch (data_t.VTXID_TYPE)
     {
@@ -245,7 +246,7 @@ void dispatch_cc(
 void gunrock_cc(
     GRGraph       *grapho,
     const GRGraph *graphi,
-    const GRSetup  config,
+    const GRSetup *config,
     const GRTypes  data_t)
 {
     // GPU-related configurations
@@ -254,20 +255,20 @@ void gunrock_cc(
     ContextPtr    *context = NULL;
     cudaStream_t  *streams = NULL;
 
-    num_gpus = config.num_devices;
+    num_gpus = config -> num_devices;
     gpu_idx  = new int [num_gpus];
     for (int i = 0; i < num_gpus; ++i)
     {
-        gpu_idx[i] = config.device_list[i];
+        gpu_idx[i] = config -> device_list[i];
     }
 
     // Create streams and MordernGPU context for each GPU
     streams = new cudaStream_t[num_gpus * num_gpus * 2];
     context = new ContextPtr[num_gpus * num_gpus];
-    if (!config.quiet) { printf(" using %d GPUs:", num_gpus); }
+    if (!config -> quiet) { printf(" using %d GPUs:", num_gpus); }
     for (int gpu = 0; gpu < num_gpus; ++gpu)
     {
-        if (!config.quiet) { printf(" %d ", gpu_idx[gpu]); }
+        if (!config -> quiet) { printf(" %d ", gpu_idx[gpu]); }
         util::SetDevice(gpu_idx[gpu]);
         for (int i = 0; i < num_gpus * 2; ++i)
         {
@@ -282,7 +283,7 @@ void gunrock_cc(
             }
         }
     }
-    if (!config.quiet) { printf("\n"); }
+    if (!config -> quiet) { printf("\n"); }
 
     dispatch_cc(grapho, graphi, config, data_t, context, streams);
 }
@@ -309,7 +310,7 @@ int cc(
     data_t.SIZET_TYPE = SIZET_INT;  // integer graph size type
     data_t.VALUE_TYPE = VALUE_INT;  // integer attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);  // primitive-specific configures
+    struct GRSetup *config = InitSetup(1, NULL);  // primitive-specific configures
 
     struct GRGraph *grapho = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graphi = (struct GRGraph*)malloc(sizeof(struct GRGraph));
diff --git a/gunrock/app/cc/cc_enactor.cuh b/gunrock/app/cc/cc_enactor.cuh
index 67066c8c6..cfeca6a9c 100644
--- a/gunrock/app/cc/cc_enactor.cuh
+++ b/gunrock/app/cc/cc_enactor.cuh
@@ -49,9 +49,9 @@ __global__ void Expand_Incoming_Kernel(
 
     while (x < num_elements)
     {
-        VertexId key      = __ldg(keys_in + x);
-        VertexId new_pred = __ldg(vertex_associate_in + x);
-        VertexId old_pred = __ldg(vertex_associate_org + key);
+        VertexId key      = _ldg(keys_in + x);
+        VertexId new_pred = _ldg(vertex_associate_in + x);
+        VertexId old_pred = _ldg(vertex_associate_org + key);
         if (new_pred != old_pred)
         {
             if (new_pred < old_pred) vertex_associate_org[old_pred] = new_pred;
@@ -140,14 +140,14 @@ __global__ void Make_Output_Kernel(
         VertexId old_cid = 0, new_cid = 0, min_cid = 0;
         if (x < num_vertices)
         {
-            old_cid = __ldg(old_component_ids + x);
-            new_cid = __ldg(component_ids     + x);
+            old_cid = _ldg(old_component_ids + x);
+            new_cid = _ldg(component_ids     + x);
             min_cid = min(new_cid, old_cid);
             if (old_cid == min_cid)
                 to_process = false;
             else {
                 old_component_ids[x] = min_cid;
-                VertexId old_grandparent = __ldg(component_ids + old_cid);
+                VertexId old_grandparent = _ldg(component_ids + old_cid);
                 if (min_cid != old_grandparent)
                 {
                     //printf("%d\t Make_Output : not updated, old_cid = %d, min_cid = %d, old_grandparent = %d\n",
diff --git a/gunrock/app/cc/cc_functor.cuh b/gunrock/app/cc/cc_functor.cuh
index f5ead0d0f..8b87f3324 100644
--- a/gunrock/app/cc/cc_functor.cuh
+++ b/gunrock/app/cc/cc_functor.cuh
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <gunrock/app/problem_base.cuh>
 #include <gunrock/app/cc/cc_problem.cuh>
+#include <gunrock/util/device_intrinsics.cuh>
 
 namespace gunrock {
 namespace app {
@@ -253,10 +254,10 @@ struct HookMinFunctor {
             VertexId to_node;
             util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
                 from_node, d_data_slice->froms + node);
-            //from_node = __ldg(d_data_slice -> froms + node);
+            //from_node = _ldg(d_data_slice -> froms + node);
             util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
                 to_node, d_data_slice->tos + node);
-            //to_node = __ldg(d_data_slice -> tos + node);
+            //to_node = _ldg(d_data_slice -> tos + node);
             VertexId parent_from;
             VertexId parent_to;
             util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
@@ -343,18 +344,18 @@ struct HookMaxFunctor {
             VertexId to_node;
             util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
                 from_node, d_data_slice->froms + node);
-            //from_node = __ldg(d_data_slice -> froms + node);
+            //from_node = _ldg(d_data_slice -> froms + node);
             util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
                 to_node, d_data_slice->tos + node);
-            //to_node = __ldg(d_data_slice -> tos + node);
+            //to_node = _ldg(d_data_slice -> tos + node);
             VertexId parent_from;
             VertexId parent_to;
             //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
             //    parent_from, d_data_slice -> component_ids + from_node);
-            parent_from = __ldg(d_data_slice -> component_ids + from_node);
+            parent_from = _ldg(d_data_slice -> component_ids + from_node);
             //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
             //    parent_to  , d_data_slice -> component_ids + to_node);
-            parent_to = __ldg(d_data_slice -> component_ids + to_node);
+            parent_to = _ldg(d_data_slice -> component_ids + to_node);
             //VertexId max_node = parent_from > parent_to ? parent_from : parent_to;
             //VertexId min_node = parent_from + parent_to - max_node;
             //if (max_node == min_node) 
@@ -405,7 +406,7 @@ struct HookMaxFunctor {
         util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
             mark, d_data_slice -> marks + edge_id);
         if (mark) return false;
-        //if (__ldg(d_data_slice -> marks + edge_id)) return false;
+        //if (_ldg(d_data_slice -> marks + edge_id)) return false;
 
         //VertexId from_node; = s_id
         //VertexId to_node; = d_id
@@ -516,11 +517,11 @@ struct PtrJumpFunctor {
         VertexId parent;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    parent, d_data_slice -> component_ids + node);
-        parent = __ldg(d_data_slice -> component_ids + node);
+        parent = _ldg(d_data_slice -> component_ids + node);
         VertexId grand_parent;
         //util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
         //    grand_parent, d_data_slice -> component_ids + parent);
-        grand_parent = __ldg(d_data_slice -> component_ids + parent);
+        grand_parent = _ldg(d_data_slice -> component_ids + parent);
         if (parent != grand_parent) {
             util::io::ModifiedStore<Problem::QUEUE_WRITE_MODIFIER>::St(
                 0, d_data_slice ->vertex_flag + 0);
diff --git a/gunrock/app/dobfs/dobfs_enactor.cuh b/gunrock/app/dobfs/dobfs_enactor.cuh
index 4aa69e333..ca9685942 100644
--- a/gunrock/app/dobfs/dobfs_enactor.cuh
+++ b/gunrock/app/dobfs/dobfs_enactor.cuh
@@ -276,6 +276,7 @@ public:
                     enactor_stats[0],
                     frontier_attribute[0],
                     enactor_stats->iteration+1,
+                    data_slice,
                     d_data_slice,
                     (VertexId*)NULL,
                     (bool*    )NULL,
@@ -494,10 +495,11 @@ public:
                     enactor_stats -> nodes_queued[0] += frontier_attribute -> queue_length;
                     // Edge Map
                     gunrock::oprtr::advance::LaunchKernel
-                        <BackwardAdvanceKernelPolicy, Problem, RBFSFunctor>(
+                        <BackwardAdvanceKernelPolicy, Problem, RBFSFunctor, gunrock::oprtr::advance::V2V>(
                         enactor_stats[0],
                         frontier_attribute[0],
                         enactor_stats->iteration+1,
+                        data_slice,
                         d_data_slice,
                         data_slice->d_index_queue,
                         data_slice->d_frontier_map_in,
@@ -689,9 +691,10 @@ public:
                     enactor_stats -> nodes_queued[0] += frontier_attribute -> queue_length;
                     // Edge Map
                     gunrock::oprtr::advance::LaunchKernel
-                        <AdvanceKernelPolicy, Problem, BfsFunctor>(
+                        <AdvanceKernelPolicy, Problem, BfsFunctor, gunrock::oprtr::advance::V2V>(
                         enactor_stats[0],
                         frontier_attribute[0],
+                        data_slice,
                         d_data_slice,
                         (VertexId*)NULL,
                         (bool*    )NULL,
diff --git a/gunrock/app/dobfs/dobfs_functor.cuh b/gunrock/app/dobfs/dobfs_functor.cuh
index 60b024f5d..3b8b0e789 100644
--- a/gunrock/app/dobfs/dobfs_functor.cuh
+++ b/gunrock/app/dobfs/dobfs_functor.cuh
@@ -41,10 +41,10 @@ namespace dobfs {
  * @tparam ProblemData         Problem data type which contains data slice for BFS problem
  *
  */
-template<typename VertexId, typename SizeT, typename Value, typename ProblemData, typename _LabelT = VertexId>
+template<typename VertexId, typename SizeT, typename Value, typename Problem, typename _LabelT = VertexId>
 struct PrepareInputFrontierMapFunctor
 {
-    typedef typename ProblemData::DataSlice DataSlice;
+    typedef typename Problem::DataSlice DataSlice;
     typedef _LabelT LabelT;
 
     /**
@@ -57,7 +57,14 @@ struct PrepareInputFrontierMapFunctor
      *
      * \return Whether to load the apply function for the node and include it in the outgoing vertex frontier.
      */
-    static __device__ __forceinline__ bool CondFilter(VertexId node, DataSlice *problem, Value v =0, SizeT nid=0)
+    static __device__ __forceinline__ bool CondFilter(//VertexId node, DataSlice *problem, Value v =0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
        return true; 
     }
@@ -65,10 +72,17 @@ struct PrepareInputFrontierMapFunctor
     /**
      * @brief Vertex mapping apply function. Set frontier_map_in
      */
-    static __device__ __forceinline__ void ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ void ApplyFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
-        util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-            true, problem->d_frontier_map_in + node);
+        util::io::ModifiedStore<Problem::QUEUE_WRITE_MODIFIER>::St(
+            true, d_data_slice->d_frontier_map_in + node);
     }
 };
 
@@ -80,10 +94,10 @@ struct PrepareInputFrontierMapFunctor
  * @tparam ProblemData         Problem data type which contains data slice for BFS problem
  *
  */
-template<typename VertexId, typename SizeT, typename Value, typename ProblemData, typename _LabelT = VertexId>
+template<typename VertexId, typename SizeT, typename Value, typename Problem, typename _LabelT = VertexId>
 struct PrepareUnvisitedQueueFunctor
 {
-    typedef typename ProblemData::DataSlice DataSlice;
+    typedef typename Problem::DataSlice DataSlice;
     typedef _LabelT LabelT;
 
     /**
@@ -96,18 +110,32 @@ struct PrepareUnvisitedQueueFunctor
      *
      * \return Whether to load the apply function for the node and include it in the outgoing vertex frontier.
      */
-    static __device__ __forceinline__ bool CondFilter(VertexId node, DataSlice *problem, Value v =0, SizeT nid=0)
+    static __device__ __forceinline__ bool CondFilter(//VertexId node, DataSlice *problem, Value v =0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
-        VertexId label;
-        util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
-            label, problem->labels + node);
-        return (label == -1 || label == util::MaxValue<Value>()-1);
+        VertexId new_label;
+        util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
+            new_label, d_data_slice->labels + node);
+        return (new_label == -1 || new_label == util::MaxValue<Value>()-1);
     }
 
     /**
      * @brief Vertex mapping apply function. Doing nothing.
      */
-    static __device__ __forceinline__ void ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ void ApplyFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
         // Doing nothing here
     }
@@ -126,10 +154,10 @@ struct PrepareUnvisitedQueueFunctor
  * @tparam ProblemData         Problem data type which contains data slice for BFS problem
  *
  */
-template<typename VertexId, typename SizeT, typename Value, typename ProblemData, typename _LabelT = VertexId>
+template<typename VertexId, typename SizeT, typename Value, typename Problem, typename _LabelT = VertexId>
 struct ReverseBFSFunctor
 {
-    typedef typename ProblemData::DataSlice DataSlice;
+    typedef typename Problem::DataSlice DataSlice;
     typedef _LabelT LabelT;
     /**
      * @brief Forward Edge Mapping condition function. Check if the destination node
@@ -143,13 +171,21 @@ struct ReverseBFSFunctor
      *
      * \return Whether to load the apply function for the edge and include the destination node in the next frontier.
      */
-    static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
+    static __device__ __forceinline__ bool CondEdge(//VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
+        VertexId s_id,
+        VertexId d_id,
+        DataSlice *d_data_slice,
+        SizeT    edge_id   ,
+        VertexId input_item,
+        LabelT   label     ,
+        SizeT    input_pos ,
+        SizeT   &output_pos)
     {
         // Check if the destination node has been claimed as someone's child
         //return (atomicCAS(&problem->d_preds[d_id], -2, s_id) == -2) ? true : false;
-        if (ProblemData::MARK_PREDECESSORS)
-            util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-                    s_id, problem->preds + d_id);
+        if (Problem::MARK_PREDECESSORS)
+            util::io::ModifiedStore<Problem::QUEUE_WRITE_MODIFIER>::St(
+                    s_id, d_data_slice->preds + d_id);
         return true; 
     }
 
@@ -165,15 +201,23 @@ struct ReverseBFSFunctor
      * @param[in] e_id_in input edge id
      *
      */
-    static __device__ __forceinline__ void ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
+    static __device__ __forceinline__ void ApplyEdge(//VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
+        VertexId s_id,
+        VertexId d_id,
+        DataSlice *d_data_slice,
+        SizeT    edge_id   ,
+        VertexId input_item,
+        LabelT   label     ,
+        SizeT    input_pos ,
+        SizeT   &output_pos)
     {
         //set d_labels[d_id] to be d_labels[s_id]+1
         VertexId label = s_id;
-        if (ProblemData::MARK_PREDECESSORS)
-        util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
-            label, problem->labels + s_id);
-        util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-            label+1, problem->labels + d_id);
+        if (Problem::MARK_PREDECESSORS)
+        util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
+            label, d_data_slice->labels + s_id);
+        util::io::ModifiedStore<Problem::QUEUE_WRITE_MODIFIER>::St(
+            label+1, d_data_slice->labels + d_id);
         
         //printf("src:%d, dst:%d, label:%d\n", s_id, d_id, problem->d_labels[d_id]);
     }
@@ -188,7 +232,14 @@ struct ReverseBFSFunctor
      *
      * \return Whether to load the apply function for the node and include it in the outgoing vertex frontier.
      */
-    static __device__ __forceinline__ bool CondFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ bool CondFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
         return (node != -1);
     }
@@ -196,7 +247,14 @@ struct ReverseBFSFunctor
     /**
      * @brief Vertex mapping apply function. Doing nothing.
      */
-    static __device__ __forceinline__ void ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ void ApplyFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
         // Doing nothing here
     }
@@ -214,10 +272,10 @@ struct ReverseBFSFunctor
  * @tparam ProblemData         Problem data type which contains data slice for BFS problem
  *
  */
-template<typename VertexId, typename SizeT, typename Value, typename ProblemData, typename _LabelT = VertexId>
+template<typename VertexId, typename SizeT, typename Value, typename Problem, typename _LabelT = VertexId>
 struct SwitchToNormalFunctor
 {
-    typedef typename ProblemData::DataSlice DataSlice; 
+    typedef typename Problem::DataSlice DataSlice; 
     typedef _LabelT LabelT;
 
     /**
@@ -230,18 +288,32 @@ struct SwitchToNormalFunctor
      *
      * \return Whether to load the apply function for the node and include it in the outgoing vertex frontier.
      */
-    static __device__ __forceinline__ bool CondFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ bool CondFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
         bool flag;
-        util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
-            flag, problem->d_frontier_map_out + node);
+        util::io::ModifiedLoad<Problem::COLUMN_READ_MODIFIER>::Ld(
+            flag, d_data_slice->d_frontier_map_out + node);
         return (flag);
     }
 
     /**
      * @brief Vertex mapping apply function. Doing nothing.
      */
-    static __device__ __forceinline__ void ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+    static __device__ __forceinline__ void ApplyFilter(//VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
+        VertexId   v,  
+        VertexId   node,
+        DataSlice *d_data_slice,
+        SizeT      nid  ,
+        LabelT     label,
+        SizeT      input_pos,
+        SizeT      output_pos)
     {
         // Doing nothing here
     }
diff --git a/gunrock/app/dobfs/dobfs_problem.cuh b/gunrock/app/dobfs/dobfs_problem.cuh
index bdfce1b7e..0749d073c 100644
--- a/gunrock/app/dobfs/dobfs_problem.cuh
+++ b/gunrock/app/dobfs/dobfs_problem.cuh
@@ -55,6 +55,7 @@ struct DOBFSProblem : ProblemBase<VertexId, SizeT, Value,
         MARK_PREDECESSORS, ENABLE_IDEMPOTENCE> BaseProblem; 
     typedef DataSliceBase<VertexId, SizeT, Value,
         MAX_NUM_VERTEX_ASSOCIATES, MAX_NUM_VALUE__ASSOCIATES> BaseDataSlice;
+    typedef unsigned char MaskT;
 
     //Helper structures
 
diff --git a/gunrock/app/enactor_loop.cuh b/gunrock/app/enactor_loop.cuh
index 6dc995fe9..7e9b23a79 100644
--- a/gunrock/app/enactor_loop.cuh
+++ b/gunrock/app/enactor_loop.cuh
@@ -100,12 +100,12 @@ void Iteration_Loop(
                  *scanned_edges_       =   NULL;
     int           peer, peer_, peer__, gpu_, i, iteration_, wait_count;
     bool          over_sized;
-    int           communicate_latency  =   enactor -> communicate_latency;
+    SizeT         communicate_latency  =   enactor -> communicate_latency;
     float         communicate_multipy  =   enactor -> communicate_multipy;
-    int           expand_latency       =   enactor -> expand_latency;
-    int           subqueue_latency     =   enactor -> subqueue_latency;
-    int           fullqueue_latency    =   enactor -> fullqueue_latency;
-    int           makeout_latency      =   enactor -> makeout_latency;
+    SizeT         expand_latency       =   enactor -> expand_latency;
+    SizeT         subqueue_latency     =   enactor -> subqueue_latency;
+    SizeT         fullqueue_latency    =   enactor -> fullqueue_latency;
+    SizeT         makeout_latency      =   enactor -> makeout_latency;
 
     if (enactor -> debug)
     {
diff --git a/gunrock/app/pr/pr_app.cu b/gunrock/app/pr/pr_app.cu
index 2e9ffdf9b..526d4c260 100644
--- a/gunrock/app/pr/pr_app.cu
+++ b/gunrock/app/pr/pr_app.cu
@@ -141,7 +141,7 @@ void runPageRank(GRGraph *output, PR_Parameter *parameter)
     Value         delta              = parameter -> delta;
     Value         error              = parameter -> error;
     SizeT         max_iter           = parameter -> max_iter;
-    int           traversal_mode     = parameter -> traversal_mode;
+    std::string   traversal_mode     = parameter -> traversal_mode;
     bool          instrument         = parameter -> instrumented;
     bool          debug              = parameter -> debug;
     bool          size_check         = parameter -> size_check;
@@ -167,6 +167,7 @@ void runPageRank(GRGraph *output, PR_Parameter *parameter)
             gpu_idx,
             partition_method,
             streams,
+            context,
             max_queue_sizing,
             max_in_sizing,
             partition_factor,
@@ -233,7 +234,7 @@ void runPageRank(GRGraph *output, PR_Parameter *parameter)
 void dispatchPageRank(
     GRGraph       *grapho,
     const GRGraph *graphi,
-    const GRSetup  config,
+    const GRSetup *config,
     const GRTypes  data_t,
     ContextPtr*    context,
     cudaStream_t*  streams)
@@ -243,13 +244,13 @@ void dispatchPageRank(
     parameter->src[0] = -1;
     parameter->context      =  context;
     parameter->streams      =  streams;
-    parameter->g_quiet      = config.quiet;
-    parameter->num_gpus     = config.num_devices;
-    parameter->gpu_idx      = config.device_list;
-    parameter->delta        = config.pagerank_delta;
-    parameter->error        = config.pagerank_error;
-    parameter->max_iter     = config.max_iters;
-    parameter->normalized   = config.pagerank_normalized;
+    parameter->g_quiet      = config -> quiet;
+    parameter->num_gpus     = config -> num_devices;
+    parameter->gpu_idx      = config -> device_list;
+    parameter->delta        = config -> pagerank_delta;
+    parameter->error        = config -> pagerank_error;
+    parameter->max_iter     = config -> max_iters;
+    parameter->normalized   = config -> pagerank_normalized;
     parameter->g_undirected = true;
 
     switch (data_t.VTXID_TYPE)
@@ -310,7 +311,7 @@ void dispatchPageRank(
 void gunrock_pagerank(
     GRGraph       *grapho,
     const GRGraph *graphi,
-    const GRSetup  config,
+    const GRSetup *config,
     const GRTypes  data_t)
 {
     // GPU-related configurations
@@ -319,20 +320,20 @@ void gunrock_pagerank(
     ContextPtr    *context = NULL;
     cudaStream_t  *streams = NULL;
 
-    num_gpus = config.num_devices;
+    num_gpus = config -> num_devices;
     gpu_idx  = new int [num_gpus];
     for (int i = 0; i < num_gpus; ++i)
     {
-        gpu_idx[i] = config.device_list[i];
+        gpu_idx[i] = config -> device_list[i];
     }
 
     // Create streams and MordernGPU context for each GPU
     streams = new cudaStream_t[num_gpus * num_gpus * 2];
     context = new ContextPtr[num_gpus * num_gpus];
-    if (!config.quiet) { printf(" using %d GPUs:", num_gpus); }
+    if (!config -> quiet) { printf(" using %d GPUs:", num_gpus); }
     for (int gpu = 0; gpu < num_gpus; ++gpu)
     {
-        if (!config.quiet) { printf(" %d ", gpu_idx[gpu]); }
+        if (!config -> quiet) { printf(" %d ", gpu_idx[gpu]); }
         util::SetDevice(gpu_idx[gpu]);
         for (int i = 0; i < num_gpus * 2; ++i)
         {
@@ -347,7 +348,7 @@ void gunrock_pagerank(
             }
         }
     }
-    if (!config.quiet) { printf("\n"); }
+    if (!config -> quiet) { printf("\n"); }
 
     dispatchPageRank(grapho, graphi, config, data_t, context, streams);
 }
@@ -377,9 +378,9 @@ void pagerank(
     data_t.SIZET_TYPE = SIZET_INT;    // integer graph size type
     data_t.VALUE_TYPE = VALUE_FLOAT;  // float attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);  // primitive-specific configures
-    config.top_nodes      = 10;           // number of top nodes
-    config.pagerank_normalized     = normalized;
+    struct GRSetup *config = InitSetup(1, NULL);  // primitive-specific configures
+    config -> top_nodes      = 10;           // number of top nodes
+    config -> pagerank_normalized     = normalized;
 
     struct GRGraph *grapho = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graphi = (struct GRGraph*)malloc(sizeof(struct GRGraph));
diff --git a/gunrock/app/problem_base.cuh b/gunrock/app/problem_base.cuh
index 099cc8075..1318ee69d 100644
--- a/gunrock/app/problem_base.cuh
+++ b/gunrock/app/problem_base.cuh
@@ -1282,7 +1282,7 @@ public:
     float         partition_factor  ; // Partition factor
     int           partition_seed    ; // Partition seed
     int           iterations        ; // Number of repeats
-    int           traversal_mode    ; // Load-balanced or Dynamic cooperative
+    std::string   traversal_mode    ; // Load-balanced or Dynamic cooperative
 
     /**
      * @brief TestParameter_Base constructor
@@ -1310,7 +1310,7 @@ public:
         partition_factor   = -1;
         partition_seed     = -1;
         iterations         = 1;
-        traversal_mode     = -1;
+        traversal_mode     = "LB";
     }  // end TestParameter_Base()
 
     /**
diff --git a/gunrock/app/sssp/sssp_app.cu b/gunrock/app/sssp/sssp_app.cu
index bad85572e..ab4e45572 100644
--- a/gunrock/app/sssp/sssp_app.cu
+++ b/gunrock/app/sssp/sssp_app.cu
@@ -139,7 +139,7 @@ void runSSSP(GRGraph* output, SSSP_Parameter *parameter)
     int           partition_seed     = parameter -> partition_seed;
     bool          g_stream_from_host = parameter -> g_stream_from_host;
     int           delta_factor       = parameter -> delta_factor;
-    int           traversal_mode     = parameter -> traversal_mode;
+    std::string   traversal_mode     = parameter -> traversal_mode;
     bool          instrument         = parameter -> instrumented;
     bool          debug              = parameter -> debug;
     bool          size_check         = parameter -> size_check;
@@ -147,6 +147,7 @@ void runSSSP(GRGraph* output, SSSP_Parameter *parameter)
     // Allocate host-side distance arrays
     Value    *h_distances = new Value[graph->nodes];
     VertexId *h_preds  = MARK_PREDECESSORS ? new VertexId[graph->nodes] : NULL;
+    if (max_queue_sizing < 1.2) max_queue_sizing=1.2;
 
     for (int gpu = 0; gpu < num_gpus; gpu++)
     {
@@ -227,7 +228,7 @@ void runSSSP(GRGraph* output, SSSP_Parameter *parameter)
 void dispatchSSSP(
     GRGraph*       grapho,
     const GRGraph* graphi,
-    const GRSetup  config,
+    const GRSetup* config,
     const GRTypes  data_t,
     ContextPtr*    context,
     cudaStream_t*  streams)
@@ -236,12 +237,12 @@ void dispatchSSSP(
     parameter->src = (long long*)malloc(sizeof(long long));
     parameter->context  = context;
     parameter->streams  = streams;
-    parameter->g_quiet  = config.quiet;
-    parameter->num_gpus = config.num_devices;
-    parameter->gpu_idx  = config.device_list;
-    parameter->delta_factor = config.delta_factor;
-    parameter->traversal_mode = config.traversal_mode;
-    parameter->mark_predecessors  = config.mark_predecessors;
+    parameter->g_quiet  = config -> quiet;
+    parameter->num_gpus = config -> num_devices;
+    parameter->gpu_idx  = config -> device_list;
+    parameter->delta_factor = config -> delta_factor;
+    parameter->traversal_mode = std::string(config -> traversal_mode);
+    parameter->mark_predecessors  = config -> mark_predecessors;
 
     switch (data_t.VTXID_TYPE)
     {
@@ -264,7 +265,7 @@ void dispatchSSSP(
                 parameter->graph = &csr;
 
                 // determine source vertex to start
-                switch (config.source_mode)
+                switch (config -> source_mode)
                 {
                 case randomize:
                 {
@@ -279,7 +280,7 @@ void dispatchSSSP(
                 }
                 case manually:
                 {
-                    parameter->src[0] = config.source_vertex[0];
+                    parameter->src[0] = config -> source_vertex[0];
                     break;
                 }
                 default:
@@ -335,7 +336,7 @@ void dispatchSSSP(
 void gunrock_sssp(
     GRGraph*       grapho,
     const GRGraph* graphi,
-    const GRSetup  config,
+    const GRSetup* config,
     const GRTypes  data_t)
 {
     // GPU-related configurations
@@ -344,20 +345,20 @@ void gunrock_sssp(
     ContextPtr    *context = NULL;
     cudaStream_t  *streams = NULL;
 
-    num_gpus = config.num_devices;
+    num_gpus = config -> num_devices;
     gpu_idx  = new int [num_gpus];
     for (int i = 0; i < num_gpus; ++i)
     {
-        gpu_idx[i] = config.device_list[i];
+        gpu_idx[i] = config -> device_list[i];
     }
 
     // Create streams and MordernGPU context for each GPU
     streams = new cudaStream_t[num_gpus * num_gpus * 2];
     context = new ContextPtr[num_gpus * num_gpus];
-    if (!config.quiet) { printf(" using %d GPUs:", num_gpus); }
+    if (!config -> quiet) { printf(" using %d GPUs:", num_gpus); }
     for (int gpu = 0; gpu < num_gpus; ++gpu)
     {
-        if (!config.quiet) { printf(" %d ", gpu_idx[gpu]); }
+        if (!config -> quiet) { printf(" %d ", gpu_idx[gpu]); }
         util::SetDevice(gpu_idx[gpu]);
         for (int i = 0; i < num_gpus * 2; ++i)
         {
@@ -372,7 +373,7 @@ void gunrock_sssp(
             }
         }
     }
-    if (!config.quiet) { printf("\n"); }
+    if (!config -> quiet) { printf("\n"); }
 
     dispatchSSSP(grapho, graphi, config, data_t, context, streams);
 }
@@ -401,9 +402,9 @@ void sssp(
     data_t.SIZET_TYPE = SIZET_INT;  // integer graph size type
     data_t.VALUE_TYPE = VALUE_INT;  // integer attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);  // primitive-specific configures
-    config.source_vertex[0]     = source;    // source vertex to start
-    config.mark_predecessors = false;     // do not mark predecessors
+    struct GRSetup *config = InitSetup(1, NULL);  // primitive-specific configures
+    config -> source_vertex[0]     = source;    // source vertex to start
+    config -> mark_predecessors = false;     // do not mark predecessors
 
     struct GRGraph *grapho = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graphi = (struct GRGraph*)malloc(sizeof(struct GRGraph));
diff --git a/gunrock/app/sssp/sssp_enactor.cuh b/gunrock/app/sssp/sssp_enactor.cuh
index d9e67d18f..a5aafb3d5 100644
--- a/gunrock/app/sssp/sssp_enactor.cuh
+++ b/gunrock/app/sssp/sssp_enactor.cuh
@@ -721,7 +721,7 @@ struct SSSPIteration : public IterationBase <
                     over_sized, thread_num, iteration, peer_, false)) return;
             if (enactor_stats->retval =
                 Check_Size</*true,*/ SizeT, VertexId > (
-                    true, "queue3", graph_slice->nodes+2,
+                    true, "queue3", graph_slice->nodes * 1.2 + 2,
                     &frontier_queue->keys  [selector  ],
                     over_sized, thread_num, iteration, peer_, true )) return;
             if (enactor -> problem -> use_double_buffer)
@@ -733,7 +733,7 @@ struct SSSPIteration : public IterationBase <
                         over_sized, thread_num, iteration, peer_, false)) return;
                 if (enactor_stats->retval =
                     Check_Size</*true,*/ SizeT, Value> (
-                        true, "queue3", graph_slice->nodes+2,
+                        true, "queue3", graph_slice->nodes * 1.2 + 2,
                         &frontier_queue->values[selector  ],
                         over_sized, thread_num, iteration, peer_, true )) return;
             }
@@ -1318,7 +1318,7 @@ public:
             else if (traversal_mode == "LB_LIGHT_CULL")
                  return MODE_SWITCH<SizeT, gunrock::oprtr::advance::LB_LIGHT_CULL>
                     ::Init(*this, context, problem, max_grid_size);
-
+            else printf("Traversal_mode %s is undefined for SSSP\n", traversal_mode.c_str());
 //            if (traversal_mode == 0)
 //                return InitSSSP< LBAdvanceKernelPolicy, FilterKernelPolicy>(
 //                    context, problem, max_grid_size);
diff --git a/gunrock/graphio/grmat.cuh b/gunrock/graphio/grmat.cuh
index eb5e4cb96..17324dacf 100644
--- a/gunrock/graphio/grmat.cuh
+++ b/gunrock/graphio/grmat.cuh
@@ -150,6 +150,7 @@ __global__ void Rmat_Kernel(
     }
 }
 
+template <typename SizeT>
 __global__ void Rand_Init(
     unsigned int seed,
     curandState *d_states)
@@ -245,6 +246,7 @@ cudaError_t BuildRmatGraph(
         if (retval = rand_states[gpu].Allocate(block_size * grid_size, util::DEVICE))
             return retval;
         Rand_Init
+            <SizeT>
             <<<grid_size, block_size, 0, streams[gpu]>>>
             (seed_, rand_states[gpu].GetPointer(util::DEVICE));
 
@@ -389,6 +391,7 @@ cudaError_t BuildMetaRmatGraph(
         if (retval = rand_states[gpu].Allocate(block_size * grid_size, util::DEVICE))
             return retval;
         Rand_Init
+            <SizeT>
             <<<grid_size, block_size, 0, streams[gpu]>>>
             (seed_, rand_states[gpu].GetPointer(util::DEVICE));
 
diff --git a/gunrock/graphio/utils.cuh b/gunrock/graphio/utils.cuh
index 44d222fe3..541744cc6 100644
--- a/gunrock/graphio/utils.cuh
+++ b/gunrock/graphio/utils.cuh
@@ -100,9 +100,10 @@ void RemoveStandaloneNodes(
         if (thread_num == 0) block_offsets = new SizeT[num_threads + 1];
         #pragma omp barrier
 
-        displacements[node_start] = 0;
+        if (node_end > node_start) displacements[node_start] = 0;
         for (VertexId node = node_start; node < node_end - 1; node++)
             displacements[node + 1] = displacements[node] + 1 - marker[node];
+        #pragma omp barrier
         if (node_end != 0)
             block_offsets[thread_num + 1] = displacements[node_end - 1] + 1 - marker[node_end - 1];
         else block_offsets[thread_num + 1] = 1 - marker[0];
@@ -119,10 +120,18 @@ void RemoveStandaloneNodes(
         {
             if (marker[node] == 0) continue;
             VertexId node_ = node - block_offsets[thread_num] - displacements[node];
+            //printf("thread_num = %d, node = %d, block_offsets[] = %d, displacements[] = %d, node_ = %d\n",
+            //    thread_num, node, block_offsets[thread_num], displacements[node], node_);
             new_nodes  [node ] = node_;
             new_offsets[node_] = row_offsets[node];
             if (values != NULL) new_values[node_] = values[node];
         }
+
+        //#pragma omp barrier
+        //for (SizeT edge = edge_start; edge < edge_end; edge++)
+        //{
+        //    column_indices[edge] = new_nodes[column_indices[edge]];
+        //}
     }
 
     for (SizeT edge = 0; edge < edges; edge++)
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index 165f57b34..58f13ea89 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -17,6 +17,7 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdio.h>
+#include <string.h>
 
 /**
  * @brief VertexId data type enumerators.
@@ -103,7 +104,7 @@ struct GRSetup
     float     pagerank_error;  // PageRank specific value
     bool pagerank_normalized;  // PageRank specific flag
     float   max_queue_sizing;  // Setting frontier queue size
-    int       traversal_mode;  // Traversal mode: 0 for LB, 1 TWC
+    char* traversal_mode;  // Traversal mode: 0 for LB, 1 TWC
     enum SrcMode source_mode;  // Source mode rand/largest_degree
 };
 
@@ -117,12 +118,12 @@ struct GRSetup
 // Link mentions is an issue with C99, not a clang specific issue
 static
 #endif
-inline struct GRSetup InitSetup(int num_iters, int* source)
+inline struct GRSetup* InitSetup(int num_iters, int* source)
 {
-    struct GRSetup configurations;
-    configurations.quiet = true;
-    configurations.mark_predecessors = true;
-    configurations.enable_idempotence = false;
+    struct GRSetup *configurations = (struct GRSetup*)malloc(sizeof(struct GRSetup));
+    configurations -> quiet = true;
+    configurations -> mark_predecessors = true;
+    configurations -> enable_idempotence = false;
     int* sources = (int*)malloc(sizeof(int)*num_iters);
     int i;
     if (source == NULL)
@@ -132,20 +133,22 @@ inline struct GRSetup InitSetup(int num_iters, int* source)
     {
         for (i = 0; i < num_iters; ++i) sources[i] = source[i];
     }
-    configurations.source_vertex = sources;
-    configurations.delta_factor = 32;
-    configurations.num_devices = 1;
-    configurations.max_iters = 50;
-    configurations.num_iters = num_iters;
-    configurations.top_nodes = 10;
-    configurations.pagerank_delta = 0.85f;
-    configurations.pagerank_error = 0.01f;
-    configurations.pagerank_normalized = false;
-    configurations.max_queue_sizing = 1.0;
-    configurations.traversal_mode = 0;
-    configurations.source_mode = manually;
+    configurations -> source_vertex = sources;
+    configurations -> delta_factor = 32;
+    configurations -> num_devices = 1;
+    configurations -> max_iters = 50;
+    configurations -> num_iters = num_iters;
+    configurations -> top_nodes = 10;
+    configurations -> pagerank_delta = 0.85f;
+    configurations -> pagerank_error = 0.01f;
+    configurations -> pagerank_normalized = false;
+    configurations -> max_queue_sizing = 1.0;
+    configurations -> traversal_mode = (char*)malloc(sizeof(char) * 3);
+    strcpy(configurations -> traversal_mode, "LB");
+    configurations -> traversal_mode[2] = '\0';
+    configurations -> source_mode = manually;
     int* gpu_idx = (int*)malloc(sizeof(int)); gpu_idx[0] = 0;
-    configurations.device_list = gpu_idx;
+    configurations -> device_list = gpu_idx;
     return configurations;
 }
 
@@ -164,7 +167,7 @@ extern "C" {
 float gunrock_bfs(
     struct GRGraph*       grapho,   // Output graph / results
     const struct GRGraph* graphi,   // Input graph structure
-    const struct GRSetup  config,   // Flag configurations
+    const struct GRSetup* config,   // Flag configurations
     const struct GRTypes  data_t);  // Data type Configurations
 
 /*
@@ -204,7 +207,7 @@ float bfs(
 void gunrock_bc(
     struct GRGraph*       grapho,   // Output graph / results
     const struct GRGraph* graphi,   // Input graph structure
-    const struct GRSetup  config,   // Flag configurations
+    const struct GRSetup* config,   // Flag configurations
     const struct GRTypes  data_t);  // Data type Configurations
 
 /**
@@ -236,7 +239,7 @@ void bc(
 void gunrock_cc(
     struct GRGraph*       grapho,   // Output graph / results
     const struct GRGraph* graphi,   // Input graph structure
-    const struct GRSetup  config,   // Flag configurations
+    const struct GRSetup* config,   // Flag configurations
     const struct GRTypes  data_t);  // Data type Configurations
 
 /**
@@ -268,7 +271,7 @@ int cc(
 void gunrock_sssp(
     struct GRGraph*       grapho,   // Output graph / results
     const struct GRGraph* graphi,   // Input graph structure
-    const struct GRSetup  config,   // Flag configurations
+    const struct GRSetup* config,   // Flag configurations
     const struct GRTypes  data_t);  // Data type Configurations
 
 /**
@@ -302,7 +305,7 @@ void sssp(
 void gunrock_pagerank(
     struct GRGraph*       grapho,   // Output graph / results
     const struct GRGraph* graphi,   // Input graph structure
-    const struct GRSetup  config,   // Flag configurations
+    const struct GRSetup* config,   // Flag configurations
     const struct GRTypes  data_t);  // Data type Configurations
 
 /**
diff --git a/gunrock/oprtr/all_edges_advance/kernel.cuh b/gunrock/oprtr/all_edges_advance/kernel.cuh
index 34e8b1420..db78eae23 100644
--- a/gunrock/oprtr/all_edges_advance/kernel.cuh
+++ b/gunrock/oprtr/all_edges_advance/kernel.cuh
@@ -14,6 +14,7 @@
  */
 
 #pragma once
+#include <gunrock/util/device_intrinsics.cuh>
 
 #include <gunrock/oprtr/edge_map_partitioned/cta.cuh>
 #include <gunrock/oprtr/advance/kernel_policy.cuh>
@@ -78,7 +79,7 @@ struct Dispatch<KernelPolicy, Problem, Functor,
             SizeT mid_point = (lower_bound + upper_bound) >> 1;
             //printf("(%d, %d) looking for %d, current range [%d, %d], mid_point = %d\n",
             //    blockIdx.x, threadIdx.x, item_to_find, lower_bound, upper_bound, data[mid_point]);
-            if (__ldg(data + mid_point) < item_to_find)
+            if (_ldg(data + mid_point) < item_to_find)
                 lower_bound = mid_point + 1;
             else upper_bound = mid_point;
         }
@@ -87,7 +88,7 @@ struct Dispatch<KernelPolicy, Problem, Functor,
         if (upper_bound == lower_bound)
         {
             /*if (data[upper_bound] == item_to_find) retval = upper_bound;
-            else*/ if (item_to_find < __ldg(data + upper_bound)) retval = upper_bound -1;
+            else*/ if (item_to_find < _ldg(data + upper_bound)) retval = upper_bound -1;
             else retval = upper_bound;
         } else retval = util::InvalidValue<SizeT>();
 
@@ -139,7 +140,7 @@ struct Dispatch<KernelPolicy, Problem, Functor,
             if (x < input_queue_length)
             {
                 edge_id = (d_edges_in == NULL) ? x : d_edges_in[x];
-                des = __ldg(d_column_indices + edge_id);
+                des = _ldg(d_column_indices + edge_id);
                 src = Binary_Search(d_row_offsets, edge_id, 0, nodes);
 
                 //printf("(%d, %d) : Edge %d, %d -> %d, input_queue_len = %d\n",
diff --git a/gunrock/oprtr/edge_map_partitioned/kernel.cuh b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
index 0ba854592..01a45648b 100644
--- a/gunrock/oprtr/edge_map_partitioned/kernel.cuh
+++ b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
@@ -17,6 +17,7 @@
 #include <gunrock/util/cta_work_distribution.cuh>
 #include <gunrock/util/cta_work_progress.cuh>
 #include <gunrock/util/kernel_runtime_stats.cuh>
+#include <gunrock/util/device_intrinsics.cuh>
 
 #include <gunrock/oprtr/edge_map_partitioned/cta.cuh>
 
@@ -97,11 +98,11 @@ struct Dispatch<KernelPolicy, Problem, Functor,
         SizeT first  = /*(d_vertex_id >= max_vertex) ?
             max_edge :*/ //d_row_offsets[d_vertex_id];
             //tex1Dfetch(RowOffsetsTex<SizeT>::row_offsets,  vertex_id);
-            __ldg(d_row_offsets + vertex_id);
+            _ldg(d_row_offsets + vertex_id);
         SizeT second = /*(d_vertex_id + 1 >= max_vertex) ?
             max_edge :*/ //d_row_offsets[d_vertex_id+1];
             //tex1Dfetch(RowOffsetsTex<SizeT>::row_offsets,  vertex_id + 1);
-            __ldg(d_row_offsets + (vertex_id + 1));
+            _ldg(d_row_offsets + (vertex_id + 1));
 
         //printf(" d_vertex_id = %d, max_vertex = %d, max_edge = %d, first = %d, second = %d\n",
         //       d_vertex_id, max_vertex, max_edge, first, second);
@@ -267,7 +268,7 @@ struct Dispatch<KernelPolicy, Problem, Functor,
                     if (input_item >= 0)
                         smem_storage.row_offset[threadIdx.x]= //row_offsets[input_item];
                             //tex1Dfetch(RowOffsetsTex<SizeT>::row_offsets,  input_item);
-                            __ldg(row_offsets + input_item);
+                            _ldg(row_offsets + input_item);
                     else smem_storage.row_offset[threadIdx.x] = util::MaxValue<SizeT>();
                 }
                 else if (ADVANCE_TYPE == gunrock::oprtr::advance::E2V ||
diff --git a/gunrock/oprtr/edge_map_partitioned_cull/kernel.cuh b/gunrock/oprtr/edge_map_partitioned_cull/kernel.cuh
index c2cbeb114..fa9c4fa73 100644
--- a/gunrock/oprtr/edge_map_partitioned_cull/kernel.cuh
+++ b/gunrock/oprtr/edge_map_partitioned_cull/kernel.cuh
@@ -17,6 +17,7 @@
 #include <gunrock/util/cta_work_distribution.cuh>
 #include <gunrock/util/cta_work_progress.cuh>
 #include <gunrock/util/kernel_runtime_stats.cuh>
+#include <gunrock/util/device_intrinsics.cuh>
 
 #include <gunrock/oprtr/edge_map_partitioned/kernel.cuh>
 #include <gunrock/oprtr/cull_filter/cta.cuh>
@@ -52,7 +53,7 @@ struct LoadRowOffset<VertexId, long long>
     static __device__ __forceinline__ long long Load 
         (long long *&d_row_offsets, VertexId &pos)
     {    
-        return __ldg(d_row_offsets + pos);
+        return _ldg(d_row_offsets + pos);
     }    
 };
 
@@ -170,12 +171,12 @@ struct Dispatch<KernelPolicy, Problem, Functor,
             /*(d_vertex_id >= max_vertex) ?
             max_edge :*/ //d_row_offsets[d_vertex_id];
             //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  d_vertex_id);
-            //__ldg(d_row_offsets + (d_vertex_id));
+            //_ldg(d_row_offsets + (d_vertex_id));
         SizeT second = LoadRowOffset<VertexId, SizeT>::Load(d_row_offsets, d_vertex_id + 1);
             /*(d_vertex_id + 1 >= max_vertex) ?
             max_edge :*/ //d_row_offsets[d_vertex_id+1];
             //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  d_vertex_id + 1);
-            //__ldg(d_row_offsets + (d_vertex_id+1));
+            //_ldg(d_row_offsets + (d_vertex_id+1));
 
         //printf(" d_vertex_id = %d, max_vertex = %d, max_edge = %d, first = %d, second = %d\n",
         //       d_vertex_id, max_vertex, max_edge, first, second);
@@ -449,12 +450,12 @@ struct Dispatch<KernelPolicy, Problem, Functor,
                         //smem_storage.vertices [threadIdx.x] = input_item;
                         if (input_item >= 0)
                             smem_storage.row_offset[threadIdx.x]= (output_inverse_graph) ?
-                                __ldg(d_inverse_row_offsets + input_item) :
+                                _ldg(d_inverse_row_offsets + input_item) :
                                 LoadRowOffset<VertexId, SizeT>::Load(d_row_offsets, input_item);
                             
                                 //row_offsets[input_item];
                                 //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  input_item);
-                                //__ldg(((output_inverse_graph) ? d_inverse_row_offsets :
+                                //_ldg(((output_inverse_graph) ? d_inverse_row_offsets :
                                 //     d_row_offsets) + input_item);
                         else smem_storage.row_offset[threadIdx.x] = util::MaxValue<SizeT>();
                     }
@@ -772,11 +773,11 @@ struct Dispatch<KernelPolicy, Problem, Functor,
                         //smem_storage.vertices[threadIdx.x] = input_item;
                         if (input_item >= 0)
                             smem_storage.row_offset[threadIdx.x] =  (output_inverse_graph) ?
-                                __ldg(d_inverse_row_offsets + input_item) :
+                                _ldg(d_inverse_row_offsets + input_item) :
                                 LoadRowOffset<VertexId, SizeT>::Load(d_row_offsets, input_item);
                                 //row_offsets[input_item];
                                 //tex1Dfetch(gunrock::oprtr::edge_map_partitioned::RowOffsetsTex<SizeT>::row_offsets,  input_item);
-                                //__ldg(row_offsets + input_item);
+                                //_ldg(row_offsets + input_item);
                         else smem_storage.row_offset[threadIdx.x] = util::MaxValue<SizeT>();
                     } else if (ADVANCE_TYPE == gunrock::oprtr::advance::E2V ||
                         ADVANCE_TYPE == gunrock::oprtr::advance::E2E)
diff --git a/gunrock/util/device_intrinsics.cuh b/gunrock/util/device_intrinsics.cuh
index dad0f8c9b..b58e1a118 100644
--- a/gunrock/util/device_intrinsics.cuh
+++ b/gunrock/util/device_intrinsics.cuh
@@ -12,7 +12,10 @@
  * @brief Common device intrinsics (potentially specialized by architecture)
  */
 
-#pragma once
+//#pragma once
+
+#ifndef DEVICE_INTRINSICS_CUH
+#define DEVICE_INTRINSICS_CUH
 
 #include <gunrock/util/cuda_properties.cuh>
 #include <gunrock/util/types.cuh>
@@ -49,13 +52,32 @@ __device__ static long long atomicAdd(long long *addr, long long val)
         (unsigned long long )val);
 }
 
+#if __GR_CUDA_ARCH__ <= 300
 // TODO: only works if both *addr and val are non-negetive
-//__device__ static long long atomicMin(long long *addr, long long val)
-//{
-//    return (long long)atomicMin(
-//        (unsigned long long*)addr,
-//        (unsigned long long )val);
-//}
+/*__device__ static signed long long int atomicMin(signed long long int* addr, signed long long int val)
+{
+    unsigned long long int pre_value = (unsigned long long int)val;
+    unsigned long long int old_value = (unsigned long long int)val;
+    while (true)
+    {
+        old_value = atomicCAS((unsigned long long int*)addr, pre_value, (unsigned long long int)val);
+        if (old_value <= (unsigned long long int)val) break;
+        if (old_value == pre_value) break;
+        pre_value = old_value;
+    }
+    return old_value;
+}*/
+#endif
+
+template <typename T>
+__device__ __forceinline__ T _ldg(T* addr)
+{
+#if __GR_CUDA_ARCH__ >= 350
+    return __ldg(addr);
+#else 
+    return *addr;
+#endif
+}
 
 namespace gunrock {
 namespace util {
@@ -159,6 +181,7 @@ __device__ int BinarySearch(KeyType i, ArrayType *queue)
 } // namespace util
 } // namespace gunrock
 
+#endif
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/util/latency_utils.cuh b/gunrock/util/latency_utils.cuh
index dfa6a2822..8c3fd105e 100644
--- a/gunrock/util/latency_utils.cuh
+++ b/gunrock/util/latency_utils.cuh
@@ -21,9 +21,10 @@ namespace latency {
 #define NUM_BLOCKS 120
 #define BLOCK_SIZE 1024
 
+template <typename SizeT>
 __global__ void Load_Kernel(
-    long long num_repeats,
-    long long num_elements,
+    SizeT num_repeats,
+    SizeT num_elements,
     int *d_data)
 {
     int pos = blockDim.x * blockIdx.x + threadIdx.x;
@@ -45,11 +46,12 @@ __global__ void Load_Kernel(
     }
 }
 
+template <typename SizeT>
 cudaError_t Get_BaseLine(
     //int num_blocks,
     //int block_size,
-    long long num_repeats,
-    long long num_elements,
+    SizeT num_repeats,
+    SizeT num_elements,
     cudaStream_t stream,
     float &elapsed_ms,
     int *d_data)
@@ -92,11 +94,12 @@ cudaError_t Test_BaseLine(
     return retval;
 }
 
+template <typename SizeT>
 cudaError_t Insert_Latency(
     //int num_blocks,
     //int block_size,
-    long long num_repeats,
-    long long num_elements,
+    SizeT num_repeats,
+    SizeT num_elements,
     cudaStream_t stream,
     int *d_data)
 {
diff --git a/shared_lib_tests/shared_lib_bc.c b/shared_lib_tests/shared_lib_bc.c
index c3bf257de..5ee445cbc 100644
--- a/shared_lib_tests/shared_lib_bc.c
+++ b/shared_lib_tests/shared_lib_bc.c
@@ -14,7 +14,7 @@ int main(int argc, char* argv[])
     data_t.SIZET_TYPE = SIZET_INT;         // graph size type
     data_t.VALUE_TYPE = VALUE_FLOAT;       // attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);   // gunrock configurations
+    struct GRSetup *config = InitSetup(1, NULL);   // gunrock configurations
 
     int num_nodes = 7, num_edges = 26;
     int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
diff --git a/shared_lib_tests/shared_lib_bfs.c b/shared_lib_tests/shared_lib_bfs.c
index 728e29e6f..dbc0fdc59 100644
--- a/shared_lib_tests/shared_lib_bfs.c
+++ b/shared_lib_tests/shared_lib_bfs.c
@@ -15,7 +15,7 @@ int main(int argc, char* argv[])
     data_t.VALUE_TYPE = VALUE_INT;         // attributes type
     int srcs[1] = {1};
 
-    struct GRSetup config = InitSetup(1, srcs);   // gunrock configurations
+    struct GRSetup *config = InitSetup(1, srcs);   // gunrock configurations
 
     int num_nodes = 7, num_edges = 15;  // number of nodes and edges
     int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
diff --git a/shared_lib_tests/shared_lib_cc.c b/shared_lib_tests/shared_lib_cc.c
index 337e49e55..1a2e5cc0c 100644
--- a/shared_lib_tests/shared_lib_cc.c
+++ b/shared_lib_tests/shared_lib_cc.c
@@ -14,7 +14,7 @@ int main(int argc, char* argv[])
     data_t.SIZET_TYPE = SIZET_INT;         // graph size type
     data_t.VALUE_TYPE = VALUE_INT;         // attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);   // gunrock configurations
+    struct GRSetup *config = InitSetup(1, NULL);   // gunrock configurations
 
     int num_nodes = 7, num_edges = 26;
     int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
diff --git a/shared_lib_tests/shared_lib_pr.c b/shared_lib_tests/shared_lib_pr.c
index 6d43249b1..8c05e0bc6 100644
--- a/shared_lib_tests/shared_lib_pr.c
+++ b/shared_lib_tests/shared_lib_pr.c
@@ -14,7 +14,7 @@ int main(int argc, char* argv[])
     data_t.SIZET_TYPE = SIZET_INT;         // graph size type
     data_t.VALUE_TYPE = VALUE_FLOAT;       // attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);   // gunrock configurations
+    struct GRSetup *config = InitSetup(1, NULL);   // gunrock configurations
 
     int num_nodes = 7, num_edges = 26;
     int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
@@ -35,7 +35,7 @@ int main(int argc, char* argv[])
     float *top_ranks = (float*)malloc(sizeof(float) * graphi->num_nodes);
     top_nodes = (  int*)grapho->node_value2;
     top_ranks = (float*)grapho->node_value1;
-    int node; for (node = 0; node < config.top_nodes; ++node)
+    int node; for (node = 0; node < config -> top_nodes; ++node)
         printf("Node_ID [%d] : Score: [%f]\n", top_nodes[node], top_ranks[node]);
 
     if (graphi) free(graphi);
diff --git a/shared_lib_tests/shared_lib_sssp.c b/shared_lib_tests/shared_lib_sssp.c
index f7c07ad40..60eb589bd 100644
--- a/shared_lib_tests/shared_lib_sssp.c
+++ b/shared_lib_tests/shared_lib_sssp.c
@@ -14,7 +14,7 @@ int main(int argc, char* argv[])
     data_t.SIZET_TYPE = SIZET_INT;         // graph size type
     data_t.VALUE_TYPE = VALUE_INT;         // attributes type
 
-    struct GRSetup config = InitSetup(1, NULL);   // gunrock configurations
+    struct GRSetup *config = InitSetup(1, NULL);   // gunrock configurations
 
     int num_nodes = 7, num_edges = 15;  // number of nodes and edges
     int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
diff --git a/tests/bc/CMakeLists.txt b/tests/bc/CMakeLists.txt
index 35c6db4f8..416714b79 100644
--- a/tests/bc/CMakeLists.txt
+++ b/tests/bc/CMakeLists.txt
@@ -12,6 +12,12 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
+if (cub_INCLUDE_DIRS)
+  include_directories(${cub_INCLUDE_DIRS})
+else()
+  message(SEND_ERROR "CUB include directory not set.")
+endif()
+
 CUDA_ADD_EXECUTABLE(betweenness_centrality
   test_bc.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
diff --git a/tests/bc/test_bc.cu b/tests/bc/test_bc.cu
index 4b933ddd7..bf66bed3e 100644
--- a/tests/bc/test_bc.cu
+++ b/tests/bc/test_bc.cu
@@ -442,6 +442,7 @@ cudaError_t RunTests(Info<VertexId, SizeT, Value> *info)
     int      fullqueue_latency      = info->info["fullqueue_latency" ].get_int ();
     int      makeout_latency        = info->info["makeout_latency"   ].get_int ();
     std::string traversal_mode      = info->info["traversal_mode"    ].get_str ();
+    if (traversal_mode == "TWC") traversal_mode = "LB";
     if (communicate_multipy > 1) max_in_sizing *= communicate_multipy;
 
     CpuTimer cpu_timer;
diff --git a/tests/bfs/CMakeLists.txt b/tests/bfs/CMakeLists.txt
index f8f5f5e1c..1b671e53d 100644
--- a/tests/bfs/CMakeLists.txt
+++ b/tests/bfs/CMakeLists.txt
@@ -14,6 +14,12 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
+if (cub_INCLUDE_DIRS)
+  include_directories(${cub_INCLUDE_DIRS})
+else()
+  message(SEND_ERROR "CUB include directory not set.")
+endif()
+
 CUDA_ADD_EXECUTABLE(breadth_first_search
   test_bfs.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
diff --git a/tests/bfs/test_bfs.cu b/tests/bfs/test_bfs.cu
index 521cd96af..3db23a1d3 100644
--- a/tests/bfs/test_bfs.cu
+++ b/tests/bfs/test_bfs.cu
@@ -426,7 +426,9 @@ cudaError_t RunTests(Info<VertexId, SizeT, Value> *info)
     if (!quiet_mode)
         printf("Using traversal-mode %s\n", traversal_mode.c_str());
 
-    json_spirit::mArray source_list = info->info["source_list"].get_array();
+    json_spirit::mArray source_list;
+    if (src_type == "list")
+        source_list = info->info["source_list"].get_array();
     for (int iter = 0; iter < iterations; ++iter)
     {
         if (src_type == "random2")
@@ -440,12 +442,13 @@ cudaError_t RunTests(Info<VertexId, SizeT, Value> *info)
             }
         } else if (src_type == "list")
         {
-            if (source_list.size() == 0) {
+            if (source_list.size() == 0) 
+            {
                 if (!quiet_mode)
                     printf("No source list found. Use 0 as source.\n");
                 src = 0;
             } else {
-            src = source_list[iter].get_int();
+                src = source_list[iter].get_int();
             }
         }
 
@@ -875,7 +878,7 @@ template <
     typename VertexId>
 int main_SizeT(CommandLineArgs *args)
 {
-// disabled to reduce compile time
+// can be disabled to reduce compile time
     if (args -> CheckCmdLineFlag("64bit-SizeT") || sizeof(VertexId) > 4)
         return main_Value<VertexId, long long>(args);
     else
@@ -884,9 +887,17 @@ int main_SizeT(CommandLineArgs *args)
 
 int main_VertexId(CommandLineArgs *args)
 {
-// disabled, because oprtr::filter::KernelPolicy::SmemStorage is too large for 64bit VertexId
+// can be disabled to reduce compile time
+// atomicMin(long long) is only available for compute capability 3.5 or higher
     if (args -> CheckCmdLineFlag("64bit-VertexId"))
+#if __GR_CUDA_ARCH__ <= 300
+    {
+        printf("64bit-VertexId disabled, because atomicMin(long long) is only supported by compute capability 3.5 or higher\n");
+        return 1;
+    }
+#else
         return main_SizeT<long long>(args);
+#endif
     else
         return main_SizeT<int      >(args);
 }
diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt
index 2eb15b781..8b10119db 100644
--- a/tests/cc/CMakeLists.txt
+++ b/tests/cc/CMakeLists.txt
@@ -12,6 +12,12 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
+if (cub_INCLUDE_DIRS)
+  include_directories(${cub_INCLUDE_DIRS})
+else()
+  message(SEND_ERROR "CUB include directory not set.")
+endif()
+
 CUDA_ADD_EXECUTABLE(connected_component
   test_cc.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
diff --git a/tests/sssp/CMakeLists.txt b/tests/sssp/CMakeLists.txt
index 849c6a176..9bbb80037 100644
--- a/tests/sssp/CMakeLists.txt
+++ b/tests/sssp/CMakeLists.txt
@@ -12,6 +12,12 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
+if (cub_INCLUDE_DIRS)
+  include_directories(${cub_INCLUDE_DIRS})
+else()
+  message(SEND_ERROR "CUB include directory not set.")
+endif()
+
 CUDA_ADD_EXECUTABLE(single_source_shortest_path
   test_sssp.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
diff --git a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu
index 001795130..667b36fbb 100644
--- a/tests/sssp/test_sssp.cu
+++ b/tests/sssp/test_sssp.cu
@@ -331,6 +331,7 @@ cudaError_t RunTests(Info<VertexId, SizeT, Value> *info)
 
     CpuTimer    cpu_timer;
     cudaError_t retval              = cudaSuccess;
+    if (max_queue_sizing < 1.2) max_queue_sizing=1.2;
 
     cpu_timer.Start();
     json_spirit::mArray device_list = info->info["device_list"].get_array();