diff --git a/rtl_lib/arbitrary_width_memory.py b/rtl_lib/arbitrary_width_memory.py
index 12253fe5b6056b1be3926e50b4d0c322bfe0561b..c81bbf7a31dd2c93d9c1e0bbe4ef31ad4ce09e88 100644
--- a/rtl_lib/arbitrary_width_memory.py
+++ b/rtl_lib/arbitrary_width_memory.py
@@ -108,6 +108,7 @@ class ArbitraryWidthMemory(Elaboratable):
             unwrapped_bit_index = Signal(range(self.backing_memory_length*self.backing_memory_data_width)) #probably cannot be made shorter.
 
             LS_bit_index = Signal(self.backing_memory_data_width)
+            LS_bit_index_internal = Signal(self.backing_memory_data_width)
             MS_bit_index = Signal(self.backing_memory_data_width)
             end_bit_pseudo_index = Signal(range(self.backing_memory_length*self.backing_memory_data_width)) # can be made shorter
             additional_words = Signal(self.backing_memory_address_width) # can also be amde shorter
@@ -171,23 +172,26 @@ class ArbitraryWidthMemory(Elaboratable):
                         # We start our cut at the unwrapped bit index modulo the memory data width.
                         # since the memory data width is a power of two, this is the K-least-significant-bits
                         # of the unwrapped bit index
-                        m.d.comb += LS_bit_index.eq(unwrapped_bit_index[:self.backing_memory_data_width_bits])
+                        m.d.comb += LS_bit_index_internal.eq(unwrapped_bit_index[:self.backing_memory_data_width_bits])
+                        m.d.sync += LS_bit_index.eq(unwrapped_bit_index[:self.backing_memory_data_width_bits])
+
 
                         # Here's where they start trying to trick you. We need to handle the case where the end of the
                         # fake word goes beyond a real memory word.
-                        m.d.comb += end_bit_pseudo_index.eq(LS_bit_index + self.fake_data_width - 1)
+                        m.d.comb += end_bit_pseudo_index.eq(LS_bit_index_internal + self.fake_data_width - 1)
 
                         # So here we determine if there's any need for additional memory words:
                         m.d.comb += additional_words.eq(end_bit_pseudo_index[self.backing_memory_data_width_bits:])
 
                         m.d.sync += shreg.eq(current_slice)
-
                         with m.If(additional_words == 0):
                             # No additional words, calculate which bits we need from the sole word we're fetching:
-                            m.d.comb += MS_bit_index.eq(end_bit_pseudo_index[:self.backing_memory_data_width_bits])
+                            m.d.sync += MS_bit_index.eq(end_bit_pseudo_index[:self.backing_memory_data_width_bits])
+                            m.d.sync += last_r_data_valid.eq(1)
+
                         with m.Else():
                             # Additional words needed, so we fetch the entire remaining part of this word
-                            m.d.comb += MS_bit_index.eq(self.backing_memory_data_width-1)
+                            m.d.sync += MS_bit_index.eq(self.backing_memory_data_width-1)
                             # and we register state for the next cycle[s]
                             m.d.sync += next_address.eq(fetch_address + 1)
                             # rather than keeping track of the next address and the final address,
@@ -195,6 +199,8 @@ class ArbitraryWidthMemory(Elaboratable):
                             # maybe that saves a few LUTs by avoiding wide compares, who knows.
                             m.d.sync += additional_words_regd.eq(additional_words)
                             m.d.sync += end_bit_pseudo_index_regd.eq(end_bit_pseudo_index)
+                            m.d.sync += last_r_data_valid.eq(0)
+
                             m.next="ADD"
 
 
@@ -205,15 +211,16 @@ class ArbitraryWidthMemory(Elaboratable):
                     # we handle both the full-word fetches and the final (potentially partial word) fetch here
                     with m.If(additional_words_regd == 1):
                         # We start from zero...
-                        m.d.comb += LS_bit_index.eq(0)
+                        m.d.sync += LS_bit_index.eq(0)
                         # But this is the last word, so we may not have to include the whole word!
-                        m.d.comb += MS_bit_index.eq(end_bit_pseudo_index_regd[:self.backing_memory_data_width_bits])
+                        m.d.sync += MS_bit_index.eq(end_bit_pseudo_index_regd[:self.backing_memory_data_width_bits])
                         m.d.comb += fetch_address.eq(next_address)
+                        m.d.sync += last_r_data_valid.eq(1)
                         m.next = "STALL"
                     with m.Else():
                         # non-special case, fetch the whole word
-                        m.d.comb += LS_bit_index.eq(0)
-                        m.d.comb += MS_bit_index.eq(self.backing_memory_data_width-1)
+                        m.d.sync += LS_bit_index.eq(0)
+                        m.d.sync += MS_bit_index.eq(self.backing_memory_data_width-1)
                         # and increment the address and decrement the remaining words counter
                         m.d.sync += next_address.eq(next_address + 1)
                         m.d.sync += additional_words_regd.eq(additional_words_regd - 1)
@@ -266,8 +273,8 @@ class DummyPlug(Elaboratable):
     def elaborate(self, platform):
         m = Module()
 
-        m.submodules.FakeAWMem = FakeAWMem = ArbitraryWidthMemory(fake_data_width=4,
-                            fake_address_width=8, initial_data=refolder([10,9,8,7,6,5,4,3,2,1],4, 8),
+        m.submodules.FakeAWMem = FakeAWMem = ArbitraryWidthMemory(fake_data_width=16,
+                            fake_address_width=8, initial_data=[0xAB, 0xCD], # refolder([10,9,8,7,6,5,4,3,2,1],4, 8),
                             backing_memory_data_width=8, backing_memory_address_width=8)
         counter = Signal(8, reset=0)