diff --git a/rtl_lib/arbitrary_width_memory.py b/rtl_lib/arbitrary_width_memory.py
index c81bbf7a31dd2c93d9c1e0bbe4ef31ad4ce09e88..95c39bdf7e5e61465652ca0aa6e0a1454328aec0 100644
--- a/rtl_lib/arbitrary_width_memory.py
+++ b/rtl_lib/arbitrary_width_memory.py
@@ -139,22 +139,32 @@ class ArbitraryWidthMemory(Elaboratable):
             valid_shreg_bits = Signal(range(self.fake_data_width))
             shreg_new_bits = Signal(range(self.backing_memory_data_width+1))
 
+            lingering_txn = Signal(1)
+
+            # This is the slicer: it takes the word from the memory read port and keeps only
+            # bits MS_bit_index down to LS_bit_index (both inclusive) of it. If we want a whole
+            # byte, for example, we set MS_bit_index=7 and LS_bit_index=0. If we want only the
+            # bit just above the least-significant bit, we set MS_bit_index=LS_bit_index=1.
+
+            # Note that this leaves the top bits zero, which is excellent because it means that
+            # we can shift it into the shift register without any trouble.
+
             m.d.comb += lower_bits_cut.eq(read_port.data>>LS_bit_index)
             m.d.comb += top_cut.eq(self.backing_memory_data_width-MS_bit_index-1+LS_bit_index)
             m.d.comb += current_slice.eq(((lower_bits_cut<<top_cut)&0xff)>>top_cut)
+            #                                                        ^ FIXME: this hard-coded 0xff mask is only 8 bits wide; it needs to be generated at synthesis time as all-ones as wide as the memory word, e.g. (1 << self.backing_memory_data_width) - 1!
 
+            # Here we calculate the number of bits in the sliced word, which determines the
+            # shift that needs to be done on the shift register:
             m.d.comb += shreg_new_bits.eq(MS_bit_index-LS_bit_index+1)
 
             with m.FSM() as fsm:
                 with m.State("RESET"):
                     m.next ="READY"
 
-                with m.State("READY"):
+                with m.State("SINGLE"):
                     m.d.comb += bus.ready_out.eq(1)
 
-                    with m.If(last_r_data_valid == 1):
-                        m.d.comb += bus.valid_out.eq(1)
-                        m.d.comb += bus.r_data.eq(last_r_data)
 
                     with m.If(bus.valid_in == 1):
                         # the memory address and bit-index computation goes as follows: