diff --git a/src/llama.cpp b/src/llama.cpp
index fc683237..b8d9dbad 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17746,6 +17746,11 @@ static int llama_decode_internal(
             }
         }
 
+        if (i > 0) {
+            // ensure ggml_backend_tensor_get_async of the previous subgraph has finished
+            ggml_backend_sched_synchronize(lctx.sched[i - 1]);
+        }
+
         llama_set_inputs(lctx, ubatch);
 
         { // compute graph
@@ -17769,9 +17774,15 @@ static int llama_decode_internal(
 
         // send the result to the next node (or the master)
        if (n_world == 1 || (my_rank == 0 && is_last_l)) {
-            size_t buf_size = sub_gf_out->ne[0]*sub_gf_out->ne[1]*sizeof(float);
-            float * embd_buf = is_last_l ? ubatch.out_embd : ubatch.backend_embd;
-            ggml_backend_tensor_get(sub_gf_out, embd_buf, 0, buf_size);
+            size_t buf_size = sub_gf_out->ne[0] * sub_gf_out->ne[1] * sizeof(float);
+            float * embd_buf = is_last_l ? ubatch.out_embd : ubatch.backend_embd;
+            ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(lctx.sched[i], sub_gf_out);
+
+            GGML_ASSERT(buf_size <= ggml_nbytes(sub_gf_out));
+            GGML_ASSERT(backend != nullptr);
+            GGML_ASSERT(embd_buf != nullptr);
+
+            ggml_backend_tensor_get_async(backend, sub_gf_out, embd_buf, 0, buf_size);
        } else {
            input_tensors tensors;
            tensors.sub_gf_out = sub_gf_out;
@@ -21800,7 +21811,9 @@ int32_t llama_decode(
 }
 
 void llama_synchronize(struct llama_context * ctx) {
-    ggml_backend_sched_synchronize(ctx->sched.at(0)); // todo.
+    for (ggml_backend_sched_t sched : ctx->sched) {
+        ggml_backend_sched_synchronize(sched);
+    }
 
     // FIXME: if multiple single tokens are evaluated without a synchronization,
     // the stats will be added to the prompt evaluation stats
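
Note on the pattern in the patch above (not part of the diff itself): ggml_backend_tensor_get_async starts a non-blocking device-to-host copy, so the next subgraph must not be fed new inputs until the previous scheduler has been synchronized, and llama_synchronize has to drain every scheduler rather than only sched[0]. The standalone C++ sketch below models that ordering with std::future; the stage loop, staging buffer, and prev_copy future are illustrative stand-ins and not the ggml API.

    // Minimal sketch of "async read per stage + synchronize before reuse".
    // Assumptions: staging plays the role of ubatch.backend_embd, prev_copy
    // plays the role of the in-flight ggml_backend_tensor_get_async.
    #include <cstdio>
    #include <future>
    #include <vector>

    int main() {
        const int n_stages = 4;
        std::vector<float> staging(8, 0.0f); // shared buffer reused by every stage
        std::future<void> prev_copy;         // models the in-flight async tensor read

        for (int i = 0; i < n_stages; ++i) {
            if (i > 0) {
                // like ggml_backend_sched_synchronize(lctx.sched[i - 1]): wait for the
                // previous stage's async read before the staging buffer is reused
                prev_copy.wait();
            }

            // placeholder for llama_set_inputs + graph compute of stage i
            for (float & x : staging) x += 1.0f;

            // like ggml_backend_tensor_get_async: read the stage output without blocking
            prev_copy = std::async(std::launch::async, [&staging, i]() {
                std::printf("stage %d copied, first value = %f\n", i, (double) staging[0]);
            });
        }

        // like the updated llama_synchronize: wait for every outstanding copy
        prev_copy.wait();
        return 0;
    }

The same reasoning motivates the GGML_ASSERT checks: an asynchronous copy only reports errors later (if at all), so the destination size, backend, and host buffer are validated up front before the copy is queued.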