From 16e930975c9596961a37ce018817f166e6683be2 Mon Sep 17 00:00:00 2001 From: dhothem Date: Mon, 20 Jan 2025 15:49:08 -0800 Subject: [PATCH 001/141] Adding in crosstalk-free GST eDesigns. --- .../crosstalkfreeexperimentdesign.py | 486 ++++++++++++++++++ 1 file changed, 486 insertions(+) create mode 100644 pygsti/protocols/crosstalkfreeexperimentdesign.py diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py new file mode 100644 index 000000000..0e714dac2 --- /dev/null +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -0,0 +1,486 @@ +import numpy as np +from pygsti.protocols import CircuitListsDesign, CombinedExperimentDesign, HasProcessorSpec +import copy + +def find_neighbors(vertices: list, edges: list) -> dict: + """ + Find the neighbors of each vertex in a graph. + + This function takes a list of vertices and a dictionary of edges, + where edges are represented as tuples. It returns a dictionary + mapping each vertex to a list of its neighbors. + + Parameters: + vertices (list): A list of vertices in the graph. + edges (dict): A symmetric list of the edges in the graph [e.g., (v1, v2) and (v2, v1) are elements]. + + Returns: + dict: A dictionary where each key is a vertex and the value is a + list of neighboring vertices. + """ + neighbors = {v: [] for v in vertices} + for e in edges: + neighbors[e[0]].append(e[1]) + return neighbors + +# # NOTE: Ignore this function. I'm pretty sure it is not needed. +# def generate_edge_colorings(vertices: list, edges: list) -> list: +# """ +# Generate a set of edge colorings for a graph until all edges are colored. + +# This function takes an edge set of a simple undirected graph and repeatedly +# applies the Misra & Gries edge coloring algorithm until every edge is +# contained in some edge coloring. It returns a dictionary mapping colors +# to the edges colored with that color. + +# Parameters: +# vertices (list): A list of vertices in the graph. 
+# edges (list): A list of edges represented as tuples (u, v) where u and v +# are vertices in the graph. + +# Returns: +# list: A list of edge colorings (dictionaries whose keys are colors and items are lists colored edges) +# """ +# list_of_edge_colorings = [] +# uncolored_edges = set(edges) + +# while uncolored_edges: +# # Determine which vertices are neighbors in a graph with only uncolored edges +# # Could call find_neighbors here... +# updated_neighbors = {v: [] for v in vertices} +# for u, v in uncolored_edges: +# updated_neighbors[u].append(v) + +# # Calculate the maximum degree of the graph +# deg = max(len(updated_neighbors[v]) for v in vertices) + +# # Find an edge coloring +# new_color_patches = find_edge_coloring(deg, vertices, list(uncolored_edges), updated_neighbors) + +# # Update color patches and remove newly colored edges from uncolored_edges +# list_of_edge_colorings.append(new_color_patches) +# for _, edge_list in new_color_patches.items(): +# uncolored_edges.difference_update(edge_list) +# uncolored_edges.difference_update([(v,u) for u, v in edge_list]) # need to symmetrize + +# return list_of_edge_colorings + +# # NOTE: This class is superfluous. Keeping it around in case I realize that it isn't - Daniel H. 
+# class CrosstalkFreeCombinedExperimentDesign(CombinedExperimentDesign, HasProcessorSpec): +# def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, seed = None, interleave = False): + +# HasProcessorSpec.__init__(self, processor_spec) + +# randstate = np.random.RandomState(seed) +# self.interleave = interleave +# self.oneq_gstdesign = oneq_gstdesign +# self.twoq_gstdesign = twoq_gstdesign +# self.vertices = self.processor_spec.qubit_labels +# self.edges = self.processor_spec.compute_2Q_connectivity().edges() +# self.neighbors = find_neighbors(self.vertices, self.edges) +# self.deg = max([len(self.neighbors[v]) for v in self.vertices]) + + +# # Generate the sub-experiment designs +# self.edge_colorings = generate_edge_colorings(self.vertices, self.edges) +# self.sub_designs = [CrosstalkFreeSubExperimentDesign(self.processor_spec, +# self.oneq_gstdesign, +# self.twoq_gstdesign, +# edge_coloring, +# randstate) for edge_coloring in self.edge_colorings] +# CombinedExperimentDesign.__init__(self, sub_designs = self.sub_designs, qubit_labels = self.vertices, interleave = self.interleave) + + +def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, + oneq_gstdesign, twoq_gstdesign, randstate: int) -> tuple: + ''' + Generate crosstalk-free GST circuits by stitching together 1Q and 2Q GST circuits for + each color patch. + + This function combines 1Q and 2Q GST circuits based on the specified color patches. + For each germ power L, it randomizes the order of the 2Q GST circuits and the 1Q GST + circuits for each edge and unused qubit. The circuits are then stitched together to + form the final circuit lists. + + Parameters: + color_patches (dict): A dictionary mapping color patches to their corresponding edge sets. + A 'color patch' is a set of similarly colored edges in an edge coloring. + vertices (list): A list of vertices in the graph. + oneq_gstdesign: A GST edesign containing the 1Q GST circuits. 
+ twoq_gstdesign: An GST edesign containing the 2Q GST circuit. + randstate: A random state object used for randomization. + + Returns: + tuple: A tuple containing: + - circuit_lists (list): A list of crosstalk-free GST circuits for each germ power. + - aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices. + ''' + + circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] + aux_info = {} + + for patch, edge_set in color_patches.items(): + used_qubits = np.flatten(np.array(edge_set)) + unused_qubits = np.setdiff1d(np.array(vertices), used_qubits) + + for L, (oneq_circuits, twoq_circuits) in enumerate(zip(oneq_gstdesign.circuit_lists, twoq_gstdesign.circuit_lists)): # assumes that they use the same L + oneq_len = len(oneq_circuits) + twoq_len = len(twoq_circuits) + + max_len = max(oneq_len, twoq_len) + min_len = min(oneq_len, twoq_len) + num_batches = (max_len // min_len) + 1 + + if oneq_len <= twoq_len: + # 2Q GST circuit list is longer + edge_permutations = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge + vertex_permutations = [[] for _ in unused_qubits] + for _ in range(num_batches): + for perm in vertex_permutations: + perm.extend([randstate.permutation(min_len)]) + vertex_permutations = [mp[:max_len] for mp in vertex_permutations] # Randomize the order in which we place 1Q GST circuits on each isolated qubit + else: + # 1Q GST circuit list is longer + vertex_permutations = [randstate.permutation(max_len) for _ in unused_qubits] + edge_permutations = [[] for _ in edge_set] + for _ in range(num_batches): + for perm in edge_permutations: + perm.extend([randstate.permutation(min_len)]) + edge_permutations = [mp[:max_len] for mp in edge_permutations] + + edge_permutations = np.array(edge_permutations) + vertex_permutations = np.array(vertex_permutations) + + for j in range(max_len): # range(max(edge_permutations.shape[1], 
vertex_permutations.shape[1])): + # Pick the initial subcircuit + if len(edge_permutations): + c = twoq_circuits[edge_permutations[0,j]] + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, edge_set[0])} + c = c.map_state_space_labels(map_dict) + edge_start = 1 + vertex_start = 0 + else: + c = oneq_circuits[vertex_permutations[0,j]] + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, (unused_qubits[0],))} + c = c.map_state_space_labels(map_dict) + edge_start = 0 + vertex_start = 1 + + # Tensor together the other subcircuits + for i in range(edge_start, edge_permutations.shape[0]): + c2 = twoq_circuits[edge_permutations[i,j]] # Fix col + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, edge_set[i])} + c2 = c2.map_state_space_labels(map_dict) + c.tensor_circuit_inplace(c2) # c is already a copy due to map_state_space_labels above + + for i in range(vertex_start, vertex_permutations.shape[0]): + c2 = twoq_circuits[vertex_permutations[i,j]] # Fix col + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, (unused_qubits[i],))} + c2 = c2.map_state_space_labels(map_dict) + c.tensor_circuit_inplace(c2) # c is already a copy due to map_state_space_labels above + + circuit_lists[L].append(c) + + aux_info[c] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO + + return circuit_lists, aux_info + +class CrosstalkFreeExperimentDesign(CircuitListsDesign, HasProcessorSpec): + ''' + This class initializes a crosstalk-free GST experiment design by combining + 1Q and 2Q GST designs based on a specified edge coloring. It assumes that + the GST designs share the same germ powers (Ls) and utilizes a specified + circuit stitcher to generate the final circuit lists. + + Attributes: + processor_spec: Specification of the processor, including qubit labels and connectivity. + oneq_gstdesign: The design for one-qubit GST circuits. + twoq_gstdesign: The design for two-qubit GST circuits. 
+ edge_coloring (dict): A dictionary mapping color patches to their corresponding edge sets. + circuit_stitcher (callable): A function to stitch circuits together (default: stitch_circuits_by_germ_power_only). + seed (int, optional): Seed for random number generation. + + circuit_lists (list): The generated list of stitched circuits. + aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices. + ''' + def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring, + circuit_stitcher = stitch_circuits_by_germ_power_only, seed = None): + ''' + Assume that the GST designs have the same Ls. + + TODO: Update the init function so that it handles different circuit stitchers better (i.e., by using stitcher_kwargs, etc.) + ''' + HasProcessorSpec.__init__(self, processor_spec) + + randstate = np.random.RandomState(seed) + self.oneq_gstdesign = oneq_gstdesign + self.twoq_gstdesign = twoq_gstdesign + self.vertices = self.processor_spec.qubit_labels + self.edges = self.processor_spec.compute_2Q_connectivity().edges() + self.neighbors = find_neighbors(self.vertices, self.edges) + self.deg = max([len(self.neighbors[v]) for v in self.vertices]) + self.color_patches = edge_coloring + self.circuit_stitcher = circuit_stitcher + + self.circuit_lists, self.aux_info = circuit_stitcher(self.color_patches, self.vertices, + self.oneq_gstdesign, self.twoq_gstdesign, + randstate, + ) + + CircuitListsDesign.__init__(self, self.circuit_lists, qubit_labels=self.vertices) + +''' +Everything below is used to find an edge coloring of a graph. +''' +import copy + +def order(u, v): + """ + Return a tuple containing the two input values in sorted order. + + This function takes two values, `u` and `v`, and returns them as a + tuple in ascending order. The smaller value will be the first element + of the tuple. + + Parameters: + u: The first value to be ordered. + v: The second value to be ordered. 
+ + Returns: + tuple: A tuple containing the values `u` and `v` in sorted order. + """ + a = [u, v] + a.sort() + return tuple(a) + +def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, free_colors: dict) -> list: + ''' + Selects candidate vertices to be added to a fan. + + This function returns vertices connected to the anchor vertex `u` + where the edge (u, v) is colored with a color that is free on the + last vertex in the fan. + + Parameters: + fan (list): A list of vertices representing the current fan. + u (int): The anchor vertex of the fan. + vertices (list): A list of all vertices in the graph. + edge_colors (dict): A dictionary mapping edges to their current colors. + free_colors (dict): A dictionary mapping vertices to their available colors. + + Returns: + list: A list of candidate vertices that can be colored with free colors + available to the last vertex in the fan. + ''' + last_vertex = fan[-1] + free_vertex_colors = free_colors[last_vertex] + return [v for v in vertices if edge_colors[(u, v)] in free_vertex_colors] + +def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, + free_colors: dict, edge_colors: dict) -> list: + ''' + Construct a maximal fan of vertex u starting with vertex v. + + A fan is a sequence of distinct neighbors of u that satisfies the following: + 1. The edge (u, v) is uncolored. + 2. Each subsequent edge (u, F[i+1]) is free on F[i] for 1 <= i < k. + + Parameters: + u (int): The vertex from which the fan is built. + v (int): The first vertex in the fan. + vertex_neighbors (dict): A dictionary mapping vertices to their neighbors. + free_colors (dict): A dictionary mapping vertices to their available colors. + edge_colors (dict): A dictionary mapping edges to their current colors. + + Returns: + list: A list representing the maximal fan of vertex u. 
+ ''' + u_neighbors = copy.deepcopy(vertex_neighbors[u]) + fan = [v] + u_neighbors.remove(v) + + candidate_vertices = find_fan_candidates(fan, u, u_neighbors, edge_colors, free_colors) + while len(candidate_vertices) != 0: + fan.append(candidate_vertices[0]) + u_neighbors.remove(candidate_vertices[0]) + candidate_vertices = find_fan_candidates(fan, u, u_neighbors, edge_colors, free_colors) + return fan + +def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge_colors: dict): + ''' + Finds, if it exists, the next vertex in a cd_u path. It does so by finding the neighbor + of the current vertex which is attached by an edge of the right color. + + Parameters: + current_vertex (int): The last vertext added to the cd_u path. + color (int): The desired color of the next possible edge in the cd_u path. + neighbors (dict): A dictionary mapping each vertex to its neighboring vertices. + edge_colors (dict): A dictionary mapping edges to their curren colors. + + Returns: + int or None: The next vertex in the cd_u path that is connected by an edge of the specified color, + or None if no such vertex exists. + ''' + + for vertex in neighbors[current_vertex]: + if edge_colors[(current_vertex, vertex)] == color: + return vertex + return None + +def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors: dict) -> list: + ''' + Finds the cd_u path. + + The cd_u path is a path passing through u of edges whose colors alternate between c and d. + Every cd_u path in the Misra & Gries algorithm starts at u with an edge of color 'd', + because 'c' was chosen to be free on u. Assuming that a cd_u path exists. + + Parameters: + u (int): The starting vertex of the path. + v (int): The target vertex (not used in path finding). + c (int): The color that is free on vertex `u`. + d (int): The color that is initially used for the first edge from `u`. + neighbors (dict): A dictionary mapping each vertex to its neighboring vertices. 
+ edge_colors (dict): A dictionary mapping edges to their current colors. + + Returns: + list: A list of tuples representing the edges in the cd_u path. + ''' + cdu_path = [] + current_color = d + current_vertex = u + next_vertex = find_next_path_vertex(u, d, neighbors, edge_colors) + next_color = {c: d, d: c} + + while next_vertex is not None: + cdu_path.append((current_vertex, next_vertex)) + current_vertex = next_vertex + current_color = next_color[current_color] + next_vertex = find_next_path_vertex(current_vertex, current_color, neighbors, edge_colors) + return cdu_path + +def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_patches: dict): + ''' + Rotate the colors in a fan of vertices connected to a specified vertex. + + This function shifts the colors in the fan over by one position, updating the + edge colorings, free colors for each vertex, and the associated color patches. + After rotation, the edge connected to the specified vertex `u` and the first + vertex in the fan receives the color of the next vertex in the fan, while the + color of the last vertex in the fan is removed. + + Parameters: + fan (list): A list of vertices representing the fan to be rotated. + u (int): The vertex anchoring the fan that is being rotated. + edge_colors (dict): A dictionary mapping edges to their current colors. + free_colors (dict): A dictionary mapping vertices to their available colors. + color_patches (dict): A dictionary mapping colors to lists of edges colored with that color. + + Returns: + tuple: Updated dictionaries for edge_colors, free_colors, and color_patches after rotation. 
+ ''' + for i in range(len(fan) - 1): + current_vertex = fan[i] + next_vertex = fan[i+1] + next_color = edge_colors[(u, next_vertex)] + + edge_colors[(u, current_vertex)] = next_color + edge_colors[(current_vertex, u)] = next_color + edge_colors[(u, next_vertex)] = -1 + edge_colors[(next_vertex, u)] = -1 + + free_colors[current_vertex].remove(next_color) + free_colors[next_vertex].append(next_color) + + color_patches[next_color].append(order(u, current_vertex)) + color_patches[next_color].remove(order(u, next_vertex)) + + return edge_colors, free_colors, color_patches + + +def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) -> dict: + ''' + Implements Misra & Gries' edge coloring algorithm for a simple undirected graph. + + This function colors the edges of a simple undirected graph using at most + d or d+1 colors, where d is the maximum degree of the graph. The algorithm + is optimal (or off by 1) for all simple, undirected graphs, as stated by + Vizing's theorem. + + Parameters: + deg (int): The maximum degree of the graph. + vertices (list): A list of vertices in the graph. + edges (list): A list of edges represented as tuples of vertices [assumed to be symmetric, i.e., (u,v) and (v,u) are elements]. + neighbors (dict): A dictionary mapping each vertex to its neighboring vertices. + + Returns: + color_patches (dict): A dictionary mapping each color to a list of edges colored with that color. 
+ Unlike with edges, the items in color_patches are NOT symmetric [i.e., it only contains (v1, v2) for v1 < v2] + ''' + + edges = copy.deepcopy(edges) + free_colors = {u: [i for i in range(deg+1)] for u in vertices} # Keeps track of which colors are free on each vertex + color_patches = {i: [] for i in range(deg+1)} # Keeps track of the edges (items) that have been assigned a color (keys) + + edge_colors = {edge: -1 for edge in edges} # Keeps track of which color (item) an edge has been assigned to (key) + edges = [list(edge) for edge in edges] + + for edge in edges: + edge.sort() + edges = list(set([tuple(edge) for edge in edges])) + + # Loop the edges in G. + # You will color a new edge each time. + for edge in edges: + # Find a maximal fan F of vertex 'u' with F[1] = 'v.' + u, v = edge + max_fan = build_maximal_fan(u, v, neighbors, free_colors, edge_colors) + + # Pick free colors c and d on u and k, the last vertex in the fan. + # Find the cd_u path, i.e., the maximal path through u of edges whose colors alternate between c and d. + k = max_fan[-1] + c, d = free_colors[u][-1], free_colors[k][-1] # c is free on u, while d is free on the last entry in the fan + cdu_path = find_color_path(u, k, c, d, neighbors, edge_colors) + + # invert the cd_u path + for i in range(len(cdu_path)): + path_edge = cdu_path[i] + # path should be colored as d, c, d, c, etc... 
because c was free on u + current_color = [d, c][i%2] + other_color = [d, c][(i+1)%2] + if order(path_edge[0], path_edge[1]) in color_patches[current_color]: + color_patches[current_color].remove(order(path_edge[0], path_edge[1])) + #color_patches[current_color].remove((path_edge[1], path_edge[0])) + color_patches[other_color].append(order(path_edge[0], path_edge[1])) + #color_patches[other_color].append((path_edge[1], path_edge[0])) + edge_colors[path_edge] = other_color + edge_colors[(path_edge[1], path_edge[0])] = other_color + if len(cdu_path) > 0: + free_colors[u].remove(c) + free_colors[u].append(d) + final_color, final_vertex = edge_colors[cdu_path[-1]], cdu_path[-1][-1] + free_colors[final_vertex].remove(final_color) + free_colors[final_vertex].append(list(np.setdiff1d([c, d], [final_color]))[0]) + + # Find a subfan of u, F' = F[1:w] for which the color d is free on w. + w_index = 0 + for i in range(len(max_fan)): + if d in free_colors[max_fan[i]]: w_index = i + w, sub_fan = max_fan[w_index], max_fan[:w_index + 1] + + # Rotate the subfan. If it exists, then + # you have now colored the edge (u,v) with whatever color was on (u, F[2]) + if len(sub_fan) > 1: # rotate the subfan + edge_colors, free_colors, color_patches = rotate_fan(sub_fan, u, edge_colors, free_colors, color_patches) + + # Set the color of (u, w) to d. 
+ edge_colors[(u, w)] = d + edge_colors[(w, u)] = d + color_patches[d].append(order(u, w)) + if d in free_colors[u]: + free_colors[u].remove(d) + if d in free_colors[w]: + free_colors[w].remove(d) + + return color_patches + \ No newline at end of file From 56242ee2a7543e34176a69ba0048ec567e4b2434 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 21 Jan 2025 08:38:18 -0500 Subject: [PATCH 002/141] style --- .../crosstalkfreeexperimentdesign.py | 92 +++---------------- pygsti/protocols/unused_xfgst.py | 66 +++++++++++++ 2 files changed, 81 insertions(+), 77 deletions(-) create mode 100644 pygsti/protocols/unused_xfgst.py diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 0e714dac2..bdb5a179e 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -1,7 +1,8 @@ import numpy as np -from pygsti.protocols import CircuitListsDesign, CombinedExperimentDesign, HasProcessorSpec +from pygsti.protocols import CircuitListsDesign, HasProcessorSpec import copy + def find_neighbors(vertices: list, edges: list) -> dict: """ Find the neighbors of each vertex in a graph. @@ -23,73 +24,6 @@ def find_neighbors(vertices: list, edges: list) -> dict: neighbors[e[0]].append(e[1]) return neighbors -# # NOTE: Ignore this function. I'm pretty sure it is not needed. -# def generate_edge_colorings(vertices: list, edges: list) -> list: -# """ -# Generate a set of edge colorings for a graph until all edges are colored. - -# This function takes an edge set of a simple undirected graph and repeatedly -# applies the Misra & Gries edge coloring algorithm until every edge is -# contained in some edge coloring. It returns a dictionary mapping colors -# to the edges colored with that color. - -# Parameters: -# vertices (list): A list of vertices in the graph. -# edges (list): A list of edges represented as tuples (u, v) where u and v -# are vertices in the graph. 
- -# Returns: -# list: A list of edge colorings (dictionaries whose keys are colors and items are lists colored edges) -# """ -# list_of_edge_colorings = [] -# uncolored_edges = set(edges) - -# while uncolored_edges: -# # Determine which vertices are neighbors in a graph with only uncolored edges -# # Could call find_neighbors here... -# updated_neighbors = {v: [] for v in vertices} -# for u, v in uncolored_edges: -# updated_neighbors[u].append(v) - -# # Calculate the maximum degree of the graph -# deg = max(len(updated_neighbors[v]) for v in vertices) - -# # Find an edge coloring -# new_color_patches = find_edge_coloring(deg, vertices, list(uncolored_edges), updated_neighbors) - -# # Update color patches and remove newly colored edges from uncolored_edges -# list_of_edge_colorings.append(new_color_patches) -# for _, edge_list in new_color_patches.items(): -# uncolored_edges.difference_update(edge_list) -# uncolored_edges.difference_update([(v,u) for u, v in edge_list]) # need to symmetrize - -# return list_of_edge_colorings - -# # NOTE: This class is superfluous. Keeping it around in case I realize that it isn't - Daniel H. 
-# class CrosstalkFreeCombinedExperimentDesign(CombinedExperimentDesign, HasProcessorSpec): -# def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, seed = None, interleave = False): - -# HasProcessorSpec.__init__(self, processor_spec) - -# randstate = np.random.RandomState(seed) -# self.interleave = interleave -# self.oneq_gstdesign = oneq_gstdesign -# self.twoq_gstdesign = twoq_gstdesign -# self.vertices = self.processor_spec.qubit_labels -# self.edges = self.processor_spec.compute_2Q_connectivity().edges() -# self.neighbors = find_neighbors(self.vertices, self.edges) -# self.deg = max([len(self.neighbors[v]) for v in self.vertices]) - - -# # Generate the sub-experiment designs -# self.edge_colorings = generate_edge_colorings(self.vertices, self.edges) -# self.sub_designs = [CrosstalkFreeSubExperimentDesign(self.processor_spec, -# self.oneq_gstdesign, -# self.twoq_gstdesign, -# edge_coloring, -# randstate) for edge_coloring in self.edge_colorings] -# CombinedExperimentDesign.__init__(self, sub_designs = self.sub_designs, qubit_labels = self.vertices, interleave = self.interleave) - def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, oneq_gstdesign, twoq_gstdesign, randstate: int) -> tuple: @@ -185,6 +119,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, return circuit_lists, aux_info + class CrosstalkFreeExperimentDesign(CircuitListsDesign, HasProcessorSpec): ''' This class initializes a crosstalk-free GST experiment design by combining @@ -229,10 +164,10 @@ def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring CircuitListsDesign.__init__(self, self.circuit_lists, qubit_labels=self.vertices) + ''' Everything below is used to find an edge coloring of a graph. ''' -import copy def order(u, v): """ @@ -249,9 +184,8 @@ def order(u, v): Returns: tuple: A tuple containing the values `u` and `v` in sorted order. 
""" - a = [u, v] - a.sort() - return tuple(a) + return (min(u, v), max(u, v)) + def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, free_colors: dict) -> list: ''' @@ -276,6 +210,7 @@ def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, fr free_vertex_colors = free_colors[last_vertex] return [v for v in vertices if edge_colors[(u, v)] in free_vertex_colors] + def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, free_colors: dict, edge_colors: dict) -> list: ''' @@ -306,6 +241,7 @@ def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, candidate_vertices = find_fan_candidates(fan, u, u_neighbors, edge_colors, free_colors) return fan + def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge_colors: dict): ''' Finds, if it exists, the next vertex in a cd_u path. It does so by finding the neighbor @@ -327,6 +263,7 @@ def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge return vertex return None + def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors: dict) -> list: ''' Finds the cd_u path. @@ -359,6 +296,7 @@ def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors next_vertex = find_next_path_vertex(current_vertex, current_color, neighbors, edge_colors) return cdu_path + def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_patches: dict): ''' Rotate the colors in a fan of vertices connected to a specified vertex. @@ -380,19 +318,19 @@ def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_pa tuple: Updated dictionaries for edge_colors, free_colors, and color_patches after rotation. 
''' for i in range(len(fan) - 1): - current_vertex = fan[i] + curr_vertex = fan[i] next_vertex = fan[i+1] next_color = edge_colors[(u, next_vertex)] - edge_colors[(u, current_vertex)] = next_color - edge_colors[(current_vertex, u)] = next_color + edge_colors[(u, curr_vertex)] = next_color + edge_colors[(curr_vertex, u)] = next_color edge_colors[(u, next_vertex)] = -1 edge_colors[(next_vertex, u)] = -1 - free_colors[current_vertex].remove(next_color) + free_colors[curr_vertex].remove(next_color) free_colors[next_vertex].append(next_color) - color_patches[next_color].append(order(u, current_vertex)) + color_patches[next_color].append(order(u, curr_vertex)) color_patches[next_color].remove(order(u, next_vertex)) return edge_colors, free_colors, color_patches diff --git a/pygsti/protocols/unused_xfgst.py b/pygsti/protocols/unused_xfgst.py new file mode 100644 index 000000000..5a4963372 --- /dev/null +++ b/pygsti/protocols/unused_xfgst.py @@ -0,0 +1,66 @@ +# # NOTE: Ignore this function. I'm pretty sure it is not needed. +# def generate_edge_colorings(vertices: list, edges: list) -> list: +# """ +# Generate a set of edge colorings for a graph until all edges are colored. + +# This function takes an edge set of a simple undirected graph and repeatedly +# applies the Misra & Gries edge coloring algorithm until every edge is +# contained in some edge coloring. It returns a dictionary mapping colors +# to the edges colored with that color. + +# Parameters: +# vertices (list): A list of vertices in the graph. +# edges (list): A list of edges represented as tuples (u, v) where u and v +# are vertices in the graph. + +# Returns: +# list: A list of edge colorings (dictionaries whose keys are colors and items are lists colored edges) +# """ +# list_of_edge_colorings = [] +# uncolored_edges = set(edges) + +# while uncolored_edges: +# # Determine which vertices are neighbors in a graph with only uncolored edges +# # Could call find_neighbors here... 
+# updated_neighbors = {v: [] for v in vertices} +# for u, v in uncolored_edges: +# updated_neighbors[u].append(v) + +# # Calculate the maximum degree of the graph +# deg = max(len(updated_neighbors[v]) for v in vertices) + +# # Find an edge coloring +# new_color_patches = find_edge_coloring(deg, vertices, list(uncolored_edges), updated_neighbors) + +# # Update color patches and remove newly colored edges from uncolored_edges +# list_of_edge_colorings.append(new_color_patches) +# for _, edge_list in new_color_patches.items(): +# uncolored_edges.difference_update(edge_list) +# uncolored_edges.difference_update([(v,u) for u, v in edge_list]) # need to symmetrize + +# return list_of_edge_colorings + +# # NOTE: This class is superfluous. Keeping it around in case I realize that it isn't - Daniel H. +# class CrosstalkFreeCombinedExperimentDesign(CombinedExperimentDesign, HasProcessorSpec): +# def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, seed = None, interleave = False): + +# HasProcessorSpec.__init__(self, processor_spec) + +# randstate = np.random.RandomState(seed) +# self.interleave = interleave +# self.oneq_gstdesign = oneq_gstdesign +# self.twoq_gstdesign = twoq_gstdesign +# self.vertices = self.processor_spec.qubit_labels +# self.edges = self.processor_spec.compute_2Q_connectivity().edges() +# self.neighbors = find_neighbors(self.vertices, self.edges) +# self.deg = max([len(self.neighbors[v]) for v in self.vertices]) + + +# # Generate the sub-experiment designs +# self.edge_colorings = generate_edge_colorings(self.vertices, self.edges) +# self.sub_designs = [CrosstalkFreeSubExperimentDesign(self.processor_spec, +# self.oneq_gstdesign, +# self.twoq_gstdesign, +# edge_coloring, +# randstate) for edge_coloring in self.edge_colorings] +# CombinedExperimentDesign.__init__(self, sub_designs = self.sub_designs, qubit_labels = self.vertices, interleave = self.interleave) From d249c80f7f1a82ff4375552256178d2f01fe5098 Mon Sep 17 00:00:00 2001 From: 
Riley Murray Date: Thu, 23 Jan 2025 11:17:44 -0500 Subject: [PATCH 003/141] restore commented-out line with update to function name --- pygsti/circuits/circuit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 7624d1f98..e4bd19c36 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -181,7 +181,7 @@ def to_label(x): """ if isinstance(x, _Label): return x # # do this manually when desired, as it "boxes" a circuit being inserted - #elif isinstance(x,Circuit): return x.to_circuit_label() + elif isinstance(x,Circuit): return x.to_label() else: return _Label(x) From dc964ad345738fb4210785627e5839af63775ece Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 23 Jan 2025 11:18:44 -0500 Subject: [PATCH 004/141] whitespace and docstring --- pygsti/models/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygsti/models/model.py b/pygsti/models/model.py index 6dd0016d8..d24ef5090 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -1829,10 +1829,10 @@ def complete_circuits(self, circuits, prep_lbl_to_prepend=None, povm_lbl_to_appe already has a prep label this argument will be ignored. povm_lbl_to_append : Label, optional (default None) - Optional user specified prep label to prepend. If not + Optional user specified povm label to prepend. If not specified will use the default value as given by :meth:_default_primitive_prep_layer_lbl. If the circuit - already has a prep label this argument will be ignored. + already has a povm label this argument will be ignored. return_split : bool, optional (default False) If True we additionally return a list of tuples of the form: @@ -1859,7 +1859,7 @@ def complete_circuits(self, circuits, prep_lbl_to_prepend=None, povm_lbl_to_appe #precompute unique default povm labels. 
unique_sslbls = set([ckt._line_labels for ckt in circuits]) - default_povm_labels = {sslbls:(self._default_primitive_povm_layer_lbl(sslbls),) for sslbls in unique_sslbls} + default_povm_labels = {sslbls: (self._default_primitive_povm_layer_lbl(sslbls),) for sslbls in unique_sslbls} comp_circuits = [] if return_split: From 8018e69b444bb88bc8ad9ad8eac5562569e43d56 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 23 Jan 2025 11:24:29 -0500 Subject: [PATCH 005/141] add some utility functions to CircuitList class --- pygsti/circuits/circuitlist.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pygsti/circuits/circuitlist.py b/pygsti/circuits/circuitlist.py index 666802382..555b5990a 100644 --- a/pygsti/circuits/circuitlist.py +++ b/pygsti/circuits/circuitlist.py @@ -202,6 +202,39 @@ def __setstate__(self, state_dict): if 'uuid' not in state_dict: # backward compatibility self.uuid = _uuid.uuid4() # create a new uuid + def permuted_subcircuitlist(self, perm, new_name=None): + perm = _np.atleast_1d(perm) + circuits = (self[i] for i in perm) + name = self.name if new_name is None else new_name + other = CircuitList(circuits, self.op_label_aliases, self.circuit_rules, self.circuit_weights, name) + return other + + def map_line_labels(self, mapper, new_name=None): + """ + Create a CircuitList where the line labels of each constituent circuit are updated according to `mapper`. + Parameters + ---------- + mapper : dict or function + A dictionary whose keys are the existing self.line_labels values + and whose value are the new labels, or a function which takes a + single (existing line-label) argument and returns a new line-label. + Returns + ------- + CircuitList + """ + circuits = tuple(c.map_state_space_labels(mapper) for c in self._circuits) + # ^ The function Circuit.map_state_space_labels actually maps LINE LABELS. 
+ other = CircuitList(circuits, name=new_name) + return other + + def tensor_circuits(self, other_circuitlist, new_name=None): + assert len(self) == len(other_circuitlist) + circuits = [] + for c1,c2 in zip(self._circuits, other_circuitlist._circuits): + circuits.append(c1.tensor_circuit(c2)) + out = CircuitList(circuits, name=new_name) + return out + def elementvec_to_array(self, elementvec, layout, mergeop="sum"): """ Form an array of values corresponding to this CircuitList from an element vector. From 1d29ddb73d3f10bd3f6faabcb00ecae03e6772e9 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 23 Jan 2025 11:28:16 -0500 Subject: [PATCH 006/141] bugfixes in stitch_circuits_by_germ_power_only. Remove HasProcessorSpec parent class of CrosstalkFreeExperimentDesign. --- .../crosstalkfreeexperimentdesign.py | 91 +++++++++++-------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index bdb5a179e..a9191fba8 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -1,5 +1,7 @@ import numpy as np from pygsti.protocols import CircuitListsDesign, HasProcessorSpec +from pygsti.circuits.circuitlist import CircuitList +from pygsti.circuits.circuit import Circuit import copy @@ -54,8 +56,10 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, aux_info = {} for patch, edge_set in color_patches.items(): - used_qubits = np.flatten(np.array(edge_set)) + # This might be broken when edge_set is empty. + used_qubits = np.array(edge_set).ravel() unused_qubits = np.setdiff1d(np.array(vertices), used_qubits) + assert len(oneq_gstdesign.circuit_lists) == len(twoq_gstdesign.circuit_lists), "Not implemented."
for L, (oneq_circuits, twoq_circuits) in enumerate(zip(oneq_gstdesign.circuit_lists, twoq_gstdesign.circuit_lists)): # assumes that they use the same L oneq_len = len(oneq_circuits) @@ -63,55 +67,67 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, max_len = max(oneq_len, twoq_len) min_len = min(oneq_len, twoq_len) - num_batches = (max_len // min_len) + 1 - - if oneq_len <= twoq_len: - # 2Q GST circuit list is longer - edge_permutations = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge - vertex_permutations = [[] for _ in unused_qubits] - for _ in range(num_batches): - for perm in vertex_permutations: - perm.extend([randstate.permutation(min_len)]) - vertex_permutations = [mp[:max_len] for mp in vertex_permutations] # Randomize the order in which we place 1Q GST circuits on each isolated qubit - else: - # 1Q GST circuit list is longer - vertex_permutations = [randstate.permutation(max_len) for _ in unused_qubits] - edge_permutations = [[] for _ in edge_set] - for _ in range(num_batches): - for perm in edge_permutations: - perm.extend([randstate.permutation(min_len)]) - edge_permutations = [mp[:max_len] for mp in edge_permutations] - + num_batches = int(np.ceil(max_len / min_len)) + + if oneq_len > twoq_len: + # vertex_permutations = [randstate.permutation(max_len) for _ in unused_qubits] + # edge_permutations = [[] for _ in edge_set] + # for _ in range(num_batches): + # for perm in edge_permutations: + # perm.extend([randstate.permutation(min_len)]) + # edge_permutations = [mp[:max_len] for mp in edge_permutations] + raise NotImplementedError() + + # 2Q GST circuit list is longer + edge_permutations = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge + vertex_permutations = [[] for _ in unused_qubits] + for _ in range(num_batches): + for perm in vertex_permutations: + 
perm.extend([randstate.permutation(min_len)]) + vertex_permutations = [mp[:max_len] for mp in vertex_permutations] # Randomize the order in which we place 1Q GST circuits on each isolated qubit + # ^ Before this line executes, len(vertex_permutations[i]) == num_batches for all i, + # and num_batches = ceil(max_len/min_len) <= max_len, so the indexing mp[:max_len] has no effect. + edge_permutations = np.array(edge_permutations) vertex_permutations = np.array(vertex_permutations) + # ^ vertex_permutations.shape is either () or (len(unused_qubits), num_batches, min_len) + - for j in range(max_len): # range(max(edge_permutations.shape[1], vertex_permutations.shape[1])): + """ + NOTE: I was able to infer that the twoq_gstdesign.sslabels should really be qubit labels by seeing how (oldq, newq) + were in the iterator that zip'd (twoq_gstdesign.sslabels, other_thing). + """ + # for j in range(max_len): # range(max(edge_permutations.shape[1], vertex_permutations.shape[1])): + dim1 = 0 if (edge_permutations.ndim < 2) else edge_permutations.shape[1] + dim2 = 0 if (vertex_permutations.ndim < 2) else vertex_permutations.shape[1] + for j in range(max(dim1, dim2)): # Pick the initial subcircuit if len(edge_permutations): - c = twoq_circuits[edge_permutations[0,j]] - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, edge_set[0])} - c = c.map_state_space_labels(map_dict) + c = twoq_circuits.permuted_subcircuitlist( edge_permutations[0,j] ) + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} + c = c.map_line_labels(map_dict) edge_start = 1 vertex_start = 0 else: - c = oneq_circuits[vertex_permutations[0,j]] - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, (unused_qubits[0],))} - c = c.map_state_space_labels(map_dict) + # The second component of vertex_permutations should range from + c = oneq_circuits.permuted_subcircuitlist( vertex_permutations[0,j] ) + map_dict = {oldq: newq for oldq, newq in 
zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} + c = c.map_line_labels(map_dict) edge_start = 0 vertex_start = 1 # Tensor together the other subcircuits for i in range(edge_start, edge_permutations.shape[0]): - c2 = twoq_circuits[edge_permutations[i,j]] # Fix col - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, edge_set[i])} - c2 = c2.map_state_space_labels(map_dict) - c.tensor_circuit_inplace(c2) # c is already a copy due to map_state_space_labels above + c2 = twoq_circuits.permuted_subcircuitlist( edge_permutations[i,j] ) # Fix col + map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} + c2 = c2.map_line_labels(map_dict) + c = c.tensor_circuits(c2) # c is already a copy due to map_line_labels above for i in range(vertex_start, vertex_permutations.shape[0]): - c2 = twoq_circuits[vertex_permutations[i,j]] # Fix col - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.sslbls, (unused_qubits[i],))} - c2 = c2.map_state_space_labels(map_dict) - c.tensor_circuit_inplace(c2) # c is already a copy due to map_state_space_labels above + c2 = oneq_circuits.permuted_subcircuitlist( vertex_permutations[i,j] ) # Fix col + map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} + c2 = c2.map_line_labels(map_dict) + c = c.tensor_circuits(c2) # c is already a copy due to map_line_labels above circuit_lists[L].append(c) @@ -120,7 +136,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, return circuit_lists, aux_info -class CrosstalkFreeExperimentDesign(CircuitListsDesign, HasProcessorSpec): +class CrosstalkFreeExperimentDesign(CircuitListsDesign): ''' This class initializes a crosstalk-free GST experiment design by combining 1Q and 2Q GST designs based on a specified edge coloring. 
It assumes that @@ -145,9 +161,8 @@ def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring TODO: Update the init function so that it handles different circuit stitchers better (i.e., by using stitcher_kwargs, etc.) ''' - HasProcessorSpec.__init__(self, processor_spec) - randstate = np.random.RandomState(seed) + self.processor_spec = processor_spec self.oneq_gstdesign = oneq_gstdesign self.twoq_gstdesign = twoq_gstdesign self.vertices = self.processor_spec.qubit_labels From 19fcf3cc0465a386e54c97e82250d522f47cf608 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 23 Jan 2025 20:31:32 -0500 Subject: [PATCH 007/141] undo changes to CircuitList. Add tqdm logging. Fix crosstalkfreeexperimentdesign functions --- pygsti/circuits/circuitlist.py | 25 ------ pygsti/layouts/prefixtable.py | 11 +-- .../crosstalkfreeexperimentdesign.py | 77 ++++++++++--------- pygsti/tools/tqdm.py | 7 ++ 4 files changed, 54 insertions(+), 66 deletions(-) create mode 100644 pygsti/tools/tqdm.py diff --git a/pygsti/circuits/circuitlist.py b/pygsti/circuits/circuitlist.py index 555b5990a..d36391964 100644 --- a/pygsti/circuits/circuitlist.py +++ b/pygsti/circuits/circuitlist.py @@ -201,31 +201,6 @@ def __setstate__(self, state_dict): self.__dict__.update(state_dict) if 'uuid' not in state_dict: # backward compatibility self.uuid = _uuid.uuid4() # create a new uuid - - def permuted_subcircuitlist(self, perm, new_name=None): - perm = _np.atleast_1d(perm) - circuits = (self[i] for i in perm) - name = self.name if new_name is None else new_name - other = CircuitList(circuits, self.op_label_aliases, self.circuit_rules, self.circuit_weights, name) - return other - - def map_line_labels(self, mapper, new_name=None): - """ - Create a CircuitList where the line labels of each constituent circuit are updated according to `mapper`. 
- Parameters - ---------- - mapper : dict or function - A dictionary whose keys are the existing self.line_labels values - and whose value are the new labels, or a function which takes a - single (existing line-label) argument and returns a new line-label. - Returns - ------- - CircuitList - """ - circuits = tuple(c.map_state_space_labels(mapper) for c in self._circuits) - # ^ The function Circuit.map_state_space_labels actually maps LINE LABELS. - other = CircuitList(circuits, name=new_name) - return other def tensor_circuits(self, other_circuitlist, new_name=None): assert len(self) == len(other_circuitlist) diff --git a/pygsti/layouts/prefixtable.py b/pygsti/layouts/prefixtable.py index 7508fb7c4..1194debef 100644 --- a/pygsti/layouts/prefixtable.py +++ b/pygsti/layouts/prefixtable.py @@ -16,6 +16,7 @@ from math import ceil from pygsti.baseobjs import Label as _Label from pygsti.circuits.circuit import SeparatePOVMCircuit as _SeparatePOVMCircuit +from pygsti.tools.tqdm import our_tqdm class PrefixTable(object): @@ -686,11 +687,10 @@ def _cache_hits(circuit_reps, circuit_lengths): # Not: this logic could be much better, e.g. computing a cost savings for each # potentially-cached item and choosing the best ones, and proper accounting # for chains of cached items. 
- cacheIndices = [] # indices into circuits_to_evaluate of the results to cache cache_hits = [0]*len(circuit_reps) - for i in range(len(circuit_reps)): + for i in our_tqdm(range(len(circuit_reps)), 'Prefix table : _cache_hits '): circuit = circuit_reps[i] L = circuit_lengths[i] # can be a Circuit or a label tuple for cached_index in reversed(cacheIndices): @@ -708,10 +708,11 @@ def _build_table(sorted_circuits_to_evaluate, cache_hits, max_cache_size, circui # Build prefix table: construct list, only caching items with hits > 0 (up to max_cache_size) cacheIndices = [] # indices into circuits_to_evaluate of the results to cache - table_contents = [None]*len(sorted_circuits_to_evaluate) + num_sorted_circuits = len(sorted_circuits_to_evaluate) + table_contents = [None]*num_sorted_circuits curCacheSize = 0 - for j, (i, _) in zip(orig_indices,enumerate(sorted_circuits_to_evaluate)): - + for i in our_tqdm(range(num_sorted_circuits), 'Prefix table : _build_table '): + j = orig_indices[i] circuit_rep = circuit_reps[i] L = circuit_lengths[i] diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index a9191fba8..0c19af324 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -55,6 +55,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] aux_info = {} + num_lines = -1 for patch, edge_set in color_patches.items(): # This might be broken when edge_set is empty. 
used_qubits = np.array(edge_set).ravel() @@ -70,67 +71,71 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, num_batches = int(np.ceil(max_len / min_len)) if oneq_len > twoq_len: - # vertex_permutations = [randstate.permutation(max_len) for _ in unused_qubits] - # edge_permutations = [[] for _ in edge_set] + # node_perms = [randstate.permutation(max_len) for _ in unused_qubits] + # edge_perms = [[] for _ in edge_set] # for _ in range(num_batches): - # for perm in edge_permutations: + # for perm in edge_perms: # perm.extend([randstate.permutation(min_len)]) - # edge_permutations = [mp[:max_len] for mp in edge_permutations] + # edge_perms = [mp[:max_len] for mp in edge_perms] raise NotImplementedError() # 2Q GST circuit list is longer - edge_permutations = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge - vertex_permutations = [[] for _ in unused_qubits] - for _ in range(num_batches): - for perm in vertex_permutations: - perm.extend([randstate.permutation(min_len)]) - vertex_permutations = [mp[:max_len] for mp in vertex_permutations] # Randomize the order in which we place 1Q GST circuits on each isolated qubit - # ^ Before this line executes, len(vertex_permutations[i]) == num_batches for all i, - # and num_batches = ceil(max_len/min_len) <= max_len, so the indexing mp[:max_len] has no effect. 
+ edge_perms = [randstate.permutation(max_len) for _ in edge_set] # Randomize the order in which we place 2Q GST circuits on each edge + node_perms = [] + for i in range(unused_qubits.size): + perms = [randstate.permutation(min_len) for _ in range(num_batches)] + node_perms.append(np.concatenate(perms)[:max_len]) + edge_perms = np.array(edge_perms) + node_perms = np.array(node_perms) - edge_permutations = np.array(edge_permutations) - vertex_permutations = np.array(vertex_permutations) - # ^ vertex_permutations.shape is either () or (len(unused_qubits), num_batches, min_len) + # Check invariants + edge_line_contributions = 2*edge_perms.shape[0] if edge_perms.size > 0 else 0 + node_line_contributions = node_perms.shape[0] if node_perms.size > 0 else 0 + curr_num_lines = edge_line_contributions + node_line_contributions + if num_lines < 0: + num_lines = curr_num_lines + assert num_lines == curr_num_lines + if edge_perms.size > 0 and node_perms.size > 0: + assert edge_perms.shape[1] == node_perms.shape[1] """ NOTE: I was able to infer that the twoq_gstdesign.sslabels should really be qubit labels by seeing how (oldq, newq) were in the iterator that zip'd (twoq_gstdesign.sslabels, other_thing). 
""" - # for j in range(max_len): # range(max(edge_permutations.shape[1], vertex_permutations.shape[1])): - dim1 = 0 if (edge_permutations.ndim < 2) else edge_permutations.shape[1] - dim2 = 0 if (vertex_permutations.ndim < 2) else vertex_permutations.shape[1] - for j in range(max(dim1, dim2)): + for j in range(max_len): # Pick the initial subcircuit - if len(edge_permutations): - c = twoq_circuits.permuted_subcircuitlist( edge_permutations[0,j] ) + if len(edge_perms): + c = twoq_circuits[edge_perms[0,j]] map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} - c = c.map_line_labels(map_dict) + c = c.map_state_space_labels(map_dict) edge_start = 1 - vertex_start = 0 + node_start = 0 else: - # The second component of vertex_permutations should range from - c = oneq_circuits.permuted_subcircuitlist( vertex_permutations[0,j] ) + # The second component of node_perms should range from + c = oneq_circuits[node_perms[0,j]] map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} - c = c.map_line_labels(map_dict) + c = c.map_state_space_labels(map_dict) edge_start = 0 - vertex_start = 1 - + node_start = 1 + # Tensor together the other subcircuits - for i in range(edge_start, edge_permutations.shape[0]): - c2 = twoq_circuits.permuted_subcircuitlist( edge_permutations[i,j] ) # Fix col + for i in range(edge_start, edge_perms.shape[0]): + c2 = twoq_circuits[ edge_perms[i,j] ] # Fix col map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} - c2 = c2.map_line_labels(map_dict) - c = c.tensor_circuits(c2) # c is already a copy due to map_line_labels above + c2 = c2.map_state_space_labels(map_dict) + c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above - for i in range(vertex_start, vertex_permutations.shape[0]): - c2 = oneq_circuits.permuted_subcircuitlist( vertex_permutations[i,j] ) # Fix col + for i in range(node_start, node_perms.shape[0]): + c2 = 
oneq_circuits[ node_perms[i,j] ] # Fix col map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} - c2 = c2.map_line_labels(map_dict) - c = c.tensor_circuits(c2) # c is already a copy due to map_line_labels above + c2 = c2.map_state_space_labels(map_dict) + c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above + # By this point, should have len(c._line_labels) == [some constant, number of ] circuit_lists[L].append(c) + aux_info[c] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO return circuit_lists, aux_info diff --git a/pygsti/tools/tqdm.py b/pygsti/tools/tqdm.py new file mode 100644 index 000000000..f3d66fe8d --- /dev/null +++ b/pygsti/tools/tqdm.py @@ -0,0 +1,7 @@ +try: + from tqdm import tqdm + def our_tqdm(iterator, message): + return tqdm(iterator, message) +except ImportError: + def our_tqdm(iterator, ignore): + return iterator From 35e35fae247158a832540462ea0763271add1fcf Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 27 Jan 2025 13:37:25 -0500 Subject: [PATCH 008/141] add PermutationOperator class. Remove outdated part of a docstring. 
--- pygsti/modelmembers/operations/__init__.py | 1 + .../modelmembers/operations/permutationop.py | 47 +++++++++++++++++++ pygsti/models/model.py | 4 +- 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 pygsti/modelmembers/operations/permutationop.py diff --git a/pygsti/modelmembers/operations/__init__.py b/pygsti/modelmembers/operations/__init__.py index 4cf342590..0c0c19a87 100644 --- a/pygsti/modelmembers/operations/__init__.py +++ b/pygsti/modelmembers/operations/__init__.py @@ -31,6 +31,7 @@ from .linearop import finite_difference_deriv_wrt_params, finite_difference_hessian_wrt_params from .lpdenseop import LinearlyParamArbitraryOp from .opfactory import OpFactory, EmbeddedOpFactory, EmbeddingOpFactory, ComposedOpFactory +from .permutationop import PermutationOperator from .repeatedop import RepeatedOp from .staticarbitraryop import StaticArbitraryOp from .staticcliffordop import StaticCliffordOp diff --git a/pygsti/modelmembers/operations/permutationop.py b/pygsti/modelmembers/operations/permutationop.py new file mode 100644 index 000000000..ee9bbbca8 --- /dev/null +++ b/pygsti/modelmembers/operations/permutationop.py @@ -0,0 +1,47 @@ +from pygsti.modelmembers.operations import DenseOperator +from pygsti.baseobjs.basisconstructors import pp_labels +import numpy as _np + +class PermutationOperator(DenseOperator): + + def __init__(self, perm: _np.ndarray): + dim = perm.size + mx = _np.eye(dim) + mx = mx[perm,:] + super().__init__(mx, 'pp', 'densitymx') + self._perm = perm + + @property + def num_params(self): + return 0 + + def to_vector(self): + return _np.array([]) + + def from_vector(self, v, close=False, dirty_value=True): + if v.size > 0: + raise ValueError() + return + + def transform(self, S): + raise NotImplementedError("PermutationOperator cannot be transformed!") + + def inverse_operator(self): + iperm = self._perm.copy() + iperm[iperm] = _np.arange(self.dim) + return PermutationOperator(iperm) + + @staticmethod + def 
pp_braiding_operators(subsystem_perm): + subsystem_perm = _np.atleast_1d(subsystem_perm) + n_qubits = subsystem_perm.size + labels = _np.array(pp_labels(2**n_qubits)) + braid_labels = _np.array([''.join([ell[i] for i in subsystem_perm]) for ell in labels]) + braid_perm = [] + for bl in braid_labels: + loc = _np.where(labels == bl)[0].item() + braid_perm.append(loc) + braid_perm = _np.array(braid_perm) + pop = PermutationOperator(braid_perm) + ipop = pop.inverse_operator() + return pop, ipop diff --git a/pygsti/models/model.py b/pygsti/models/model.py index d24ef5090..44e1b9f1c 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -991,9 +991,7 @@ def uncollect_parameters(self, param_to_uncollect): self._rebuild_paramvec() def _rebuild_paramvec(self): - """ Resizes self._paramvec and updates gpindices & parent members as needed, - and will initialize new elements of _paramvec, but does NOT change - existing elements of _paramvec (use _update_paramvec for this)""" + """ Resizes self._paramvec and updates gpindices & parent members as needed.""" w = self._model_paramvec_to_ops_paramvec(self._paramvec) Np = len(w) # NOT self.num_params since the latter calls us! 
wl = self._paramlbls From 3e3210f708c331af82614ecf4da7ab16994b4677 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 27 Jan 2025 19:30:51 -0500 Subject: [PATCH 009/141] update ProcessorSpec.compute_2Q_connectivity() so it can handle idle gates with implicit availability --- pygsti/processors/processorspec.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pygsti/processors/processorspec.py b/pygsti/processors/processorspec.py index d5c022b6f..dbe8627ec 100644 --- a/pygsti/processors/processorspec.py +++ b/pygsti/processors/processorspec.py @@ -1217,7 +1217,12 @@ def compute_2Q_connectivity(self): qubit_labels = self.qubit_labels for gn in self.gate_names: if self.gate_num_qubits(gn) == 2: - for sslbls in self.resolved_availability(gn, 'tuple'): - twoQ_connectivity[qubit_labels.index(sslbls[0]), qubit_labels.index(sslbls[1])] = True + avail = self.resolved_availability(gn, 'tuple') + if len(avail) == 1 and avail[0] is None and gn == '{idle}': + avail = [(0, 1)] + for sslbls in avail: + i = qubit_labels[sslbls[0]] + j = qubit_labels[sslbls[1]] + twoQ_connectivity[i, j] = True return _qgraph.QubitGraph(qubit_labels, twoQ_connectivity) From 1253ee59b981f03cdcf6812af971fcc165e4fe2a Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 27 Jan 2025 21:04:29 -0500 Subject: [PATCH 010/141] undo imposition of certain connectivity structure for idle gates (now just raise a more informative error). Add a copy in PermutationOperator.pp_braiding_operators.
--- pygsti/modelmembers/operations/permutationop.py | 2 +- pygsti/processors/processorspec.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pygsti/modelmembers/operations/permutationop.py b/pygsti/modelmembers/operations/permutationop.py index ee9bbbca8..5bfae0fb4 100644 --- a/pygsti/modelmembers/operations/permutationop.py +++ b/pygsti/modelmembers/operations/permutationop.py @@ -33,7 +33,7 @@ def inverse_operator(self): @staticmethod def pp_braiding_operators(subsystem_perm): - subsystem_perm = _np.atleast_1d(subsystem_perm) + subsystem_perm = _np.atleast_1d(subsystem_perm).copy() n_qubits = subsystem_perm.size labels = _np.array(pp_labels(2**n_qubits)) braid_labels = _np.array([''.join([ell[i] for i in subsystem_perm]) for ell in labels]) diff --git a/pygsti/processors/processorspec.py b/pygsti/processors/processorspec.py index dbe8627ec..30fa1b571 100644 --- a/pygsti/processors/processorspec.py +++ b/pygsti/processors/processorspec.py @@ -1177,7 +1177,7 @@ def compute_clifford_ops_on_qubits(self): return clifford_ops_on_qubits - ### TODO: do we still need this? + ### TODO: do we still need this? 
@lru_cache(maxsize=100) def compute_clifford_2Q_connectivity(self): """ @@ -1219,7 +1219,7 @@ def compute_2Q_connectivity(self): if self.gate_num_qubits(gn) == 2: avail = self.resolved_availability(gn, 'tuple') if len(avail) == 1 and avail[0] is None and gn == '{idle}': - avail = [(0, 1)] + raise ValueError('Availability of the idle gate has not been set.') for sslbls in avail: i = qubit_labels[sslbls[0]] j = qubit_labels[sslbls[1]] From 99392bfa58b074d7d791dce032c935c9aed699fb Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 29 Jan 2025 19:12:05 -0500 Subject: [PATCH 011/141] ready for 3qubit xfgst --- pygsti/baseobjs/label.py | 3 ++ pygsti/circuits/circuit.py | 2 + pygsti/processors/processorspec.py | 6 ++- .../crosstalkfreeexperimentdesign.py | 37 +++++++++++++++++-- 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pygsti/baseobjs/label.py b/pygsti/baseobjs/label.py index a9033239b..07c08798c 100644 --- a/pygsti/baseobjs/label.py +++ b/pygsti/baseobjs/label.py @@ -13,6 +13,7 @@ import itertools as _itertools import numbers as _numbers import sys as _sys +import copy as _copy class Label(object): @@ -196,6 +197,8 @@ def is_simple(self): return self.IS_SIMPLE + def copy(self): + return _copy.deepcopy(self) class LabelTup(Label, tuple): diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index e4bd19c36..57afb84d3 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -4459,6 +4459,8 @@ def done_editing(self): else _Label(layer_lbl) for layer_lbl in self._labels]) self._hashable_tup = self.tup self._hash = hash(self._hashable_tup) + self._str = None + self._str = self.str # this accessor recomputes the value of self._str class CompressedCircuit(object): """ diff --git a/pygsti/processors/processorspec.py b/pygsti/processors/processorspec.py index 30fa1b571..49916b46a 100644 --- a/pygsti/processors/processorspec.py +++ b/pygsti/processors/processorspec.py @@ -1219,7 +1219,11 @@ def 
compute_2Q_connectivity(self): if self.gate_num_qubits(gn) == 2: avail = self.resolved_availability(gn, 'tuple') if len(avail) == 1 and avail[0] is None and gn == '{idle}': - raise ValueError('Availability of the idle gate has not been set.') + avail = [qubit_labels] + # if qubit_labels.size == 2: + # avail = [qubit_labels] + # else: + # raise ValueError('Availability of the idle gate has not been set.') for sslbls in avail: i = qubit_labels[sslbls[0]] j = qubit_labels[sslbls[1]] diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 0c19af324..ae0920e44 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -2,6 +2,7 @@ from pygsti.protocols import CircuitListsDesign, HasProcessorSpec from pygsti.circuits.circuitlist import CircuitList from pygsti.circuits.circuit import Circuit +from pygsti.baseobjs.label import Label import copy @@ -53,6 +54,18 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, ''' circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] + twoq_idle_label = Label(('Gi',) + twoq_gstdesign.qubit_labels) + mapper = {twoq_idle_label: twoq_idle_label} + for cl in twoq_gstdesign.circuit_lists: + # for c in cl: + for i in range(len(cl)): + c = cl[i] + c._static = False + mapper.update({k:k for k in c._labels}) + mapper[Label(())] = twoq_idle_label + c.map_names_inplace(mapper) + c.done_editing() + assert Label(()) not in mapper.values() aux_info = {} num_lines = -1 @@ -89,8 +102,8 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, node_perms = np.array(node_perms) # Check invariants - edge_line_contributions = 2*edge_perms.shape[0] if edge_perms.size > 0 else 0 - node_line_contributions = node_perms.shape[0] if node_perms.size > 0 else 0 + edge_line_contributions = 2*edge_perms.shape[0] if edge_perms.size > 0 else 0 + node_line_contributions = node_perms.shape[0] 
if node_perms.size > 0 else 0 curr_num_lines = edge_line_contributions + node_line_contributions if num_lines < 0: num_lines = curr_num_lines @@ -107,13 +120,20 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, # Pick the initial subcircuit if len(edge_perms): c = twoq_circuits[edge_perms[0,j]] + c._static = False + c._labels = [mapper[ell].copy() for ell in c._labels] + c.done_editing() + assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} c = c.map_state_space_labels(map_dict) edge_start = 1 node_start = 0 else: - # The second component of node_perms should range from c = oneq_circuits[node_perms[0,j]] + c._static = False + c._labels = [mapper[ell].copy() for ell in c._labels] + c.done_editing() + assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} c = c.map_state_space_labels(map_dict) edge_start = 0 @@ -122,16 +142,24 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, # Tensor together the other subcircuits for i in range(edge_start, edge_perms.shape[0]): c2 = twoq_circuits[ edge_perms[i,j] ] # Fix col + c2._static = False + c2._labels = [mapper[ell].copy() for ell in c2._labels] + c2.done_editing() + assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} c2 = c2.map_state_space_labels(map_dict) c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ] # Fix col + c2._static = False + c2._labels = [mapper[ell].copy() for ell in c2._labels] + c2.done_editing() map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} c2 = c2.map_state_space_labels(map_dict) c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above - + + assert Label(()) 
not in c._labels # By this point, should have len(c._line_labels) == [some constant, number of ] circuit_lists[L].append(c) @@ -166,6 +194,7 @@ def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring TODO: Update the init function so that it handles different circuit stitchers better (i.e., by using stitcher_kwargs, etc.) ''' + # TODO: make sure idle gates are explicit. randstate = np.random.RandomState(seed) self.processor_spec = processor_spec self.oneq_gstdesign = oneq_gstdesign From 6b6b69e436f67b9f8ef81414ada3d4551cdd955c Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 4 Feb 2025 09:02:05 -0500 Subject: [PATCH 012/141] replace assertion with warning --- pygsti/tools/matrixtools.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pygsti/tools/matrixtools.py b/pygsti/tools/matrixtools.py index 94940c45c..b162b9746 100644 --- a/pygsti/tools/matrixtools.py +++ b/pygsti/tools/matrixtools.py @@ -894,8 +894,9 @@ def real_matrix_log(m, action_if_imaginary="raise", tol=1e-8): pass else: assert(False), "Invalid 'action_if_imaginary' argument: %s" % action_if_imaginary - else: - assert(imMag <= tol), "real_matrix_log failed to construct a real logarithm!" 
+    elif imMag > tol:
+        import warnings
+        warnings.warn("real_matrix_log failed to construct a real logarithm!")
     logM = _np.real(logM)
     return logM

From c54c4d07db52c97f289c1c5aa616a639efe20b32 Mon Sep 17 00:00:00 2001
From: Riley Murray
Date: Tue, 4 Feb 2025 09:14:25 -0500
Subject: [PATCH 013/141] switch to try MOSEK if available, falling back on
 Clarabel and then CVXOPT

---
 pygsti/tools/optools.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/pygsti/tools/optools.py b/pygsti/tools/optools.py
index 8d68a73ff..9fabba13f 100644
--- a/pygsti/tools/optools.py
+++ b/pygsti/tools/optools.py
@@ -325,19 +325,25 @@ def diamonddist(a, b, mx_basis='pp', return_x=False):
     J = JBstd - JAstd
 
     prob, vars = _diamond_norm_model(dim, smallDim, J)
-    try:
-        prob.solve(solver='CVXOPT')
-    except _cvxpy.error.SolverError as e:
-        _warnings.warn("CVXPY failed: %s - diamonddist returning -2!" % str(e))
-        return (-2, _np.zeros((dim, dim))) if return_x else -2
-    except:
-        _warnings.warn("CVXOPT failed (unknown err) - diamonddist returning -2!")
-        return (-2, _np.zeros((dim, dim))) if return_x else -2
-
-    if return_x:
-        return prob.value, vars[0].value
-    else:
-        return prob.value
+    solvers = ['CLARABEL', 'CVXOPT']
+    if 'MOSEK' in _cvxpy.installed_solvers():
+        solvers = ['MOSEK'] + solvers
+
+    zeros = _np.zeros((dim, dim))
+    for solver in solvers:
+        try:
+            prob.solve(solver=solver)
+            out = (prob.value, vars[0].value) if return_x else prob.value
+            return out
+        except _cvxpy.error.SolverError as e:
+            _warnings.warn(f"Calling {solver} with CVXPY failed: {str(e)}.")
+            continue
+        except:
+            _warnings.warn("CVXPY failed (unknown err) - diamonddist returning -2!")
+            return (-2, zeros) if return_x else -2
+
+    _warnings.warn("Calling all solvers with CVXPY failed - diamonddist returning -2!")
+    return (-2, zeros) if return_x else -2
 
 
 def _diamond_norm_model(dim, smallDim, J):

From a875d447cd958270311c2ffff65e494809c50892 Mon Sep 17 00:00:00
2001 From: Riley Murray Date: Tue, 4 Feb 2025 09:17:16 -0500 Subject: [PATCH 014/141] add ability to skip named sections in construct_standard_report. Always compute model violation w.r.t. logl objective function, rather than whatever might have been used as the final objective function. --- pygsti/report/factory.py | 88 +++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/pygsti/report/factory.py b/pygsti/report/factory.py index 8d2f675d7..1eb79b8f2 100644 --- a/pygsti/report/factory.py +++ b/pygsti/report/factory.py @@ -84,6 +84,21 @@ def _add_lbl(lst, lbl): return running_lbls +#def _robust_estimate_has_same_models(estimates, est_lbl): +# lbl_robust = est_lbl+ROBUST_SUFFIX +# if lbl_robust not in estimates: return False #no robust estimate +# +# for mdl_lbl in list(estimates[est_lbl].goparameters.keys()) \ +# + ['final iteration estimate']: +# if mdl_lbl not in estimates[lbl_robust].models: +# return False #robust estimate is missing mdl_lbl! +# +# mdl = estimates[lbl_robust].models[mdl_lbl] +# if estimates[est_lbl].models[mdl_lbl].frobeniusdist(mdl) > 1e-8: +# return False #model mismatch! 
+# +# return True + def _get_viewable_crf(est, est_lbl, mdl_lbl, verbosity=0): printer = _VerbosityPrinter.create_printer(verbosity) @@ -184,8 +199,7 @@ def _create_master_switchboard(ws, results_dict, confidence_level, Ls = None for results in results_dict.values(): - est_labels = _add_new_estimate_labels(est_labels, results.estimates, - combine_robust) + est_labels = _add_new_estimate_labels(est_labels, results.estimates, combine_robust) loc_Ls = results.circuit_lists['final'].xs \ if isinstance(results.circuit_lists['final'], _PlaquetteGridCircuitStructure) else [0] Ls = _add_new_labels(Ls, loc_Ls) @@ -305,10 +319,8 @@ def _create_master_switchboard(ws, results_dict, confidence_level, else: est_modvi = est - switchBd.objfn_builder[d, i] = est.parameters.get( - 'final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl')) - switchBd.objfn_builder_modvi[d, i] = est_modvi.parameters.get( - 'final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl')) + switchBd.objfn_builder[d, i] = est.parameters.get('final_objfn_builder', _objfns.ObjectiveFunctionBuilder.create_from('logl')) + switchBd.objfn_builder_modvi[d, i] = _objfns.ObjectiveFunctionBuilder.create_from('logl') switchBd.params[d, i] = est.parameters switchBd.clifford_compilation[d, i] = est.parameters.get("clifford compilation", 'auto') @@ -1172,6 +1184,32 @@ def construct_standard_report(results, title="auto", - idt_idle_oplabel : Label, optional The label identifying the idle gate (for use with idle tomography). + - skip_sections : tuple[str], optional + Contains names of standard report sections that should be skipped + in this particular report. 
Strings will be cast to lowercase, + stripped of white space, and then mapped to omitted Section classes + as follows + + { + 'summary' : SummarySection, + 'goodness' : GoodnessSection, + 'colorbox' : GoodnessColorBoxPlotSection, + 'invariantgates' : GaugeInvariantsGatesSection, + 'invariantgerms' : GaugeInvariantsGermsSection, + 'variant' : GaugeVariantSection, + 'variantraw' : GaugeVariantsRawSection, + 'variantdecomp' : GaugeVariantsDecompSection, + 'varianterrorgen' : GaugeVariantsErrorGenSection, + 'input' : InputSection, + 'meta' : MetaSection, + 'help' : HelpSection + } + + A KeyError will be raised if skip_sections contains a string + that is not in the keys of the above dict (after casting to + lower case and stripping white space). + + verbosity : int, optional How much detail to send to stdout. @@ -1240,20 +1278,30 @@ def construct_standard_report(results, title="auto", flags.add('CombineRobust') # build section list - sections = [ - _section.SummarySection(), - _section.GoodnessSection(), - _section.GoodnessColorBoxPlotSection(), - _section.GaugeInvariantsGatesSection(), - _section.GaugeInvariantsGermsSection(), - _section.GaugeVariantSection(), - _section.GaugeVariantsRawSection(), - _section.GaugeVariantsDecompSection(), - _section.GaugeVariantsErrorGenSection(), - _section.InputSection(), - _section.MetaSection(), - _section.HelpSection() - ] + possible_sections = { + 'summary' : _section.SummarySection(), + 'goodness' : _section.GoodnessSection(), + 'colorbox' : _section.GoodnessColorBoxPlotSection(), + 'invariantgates' : _section.GaugeInvariantsGatesSection(), + 'invariantgerms' : _section.GaugeInvariantsGermsSection(), + 'variant' : _section.GaugeVariantSection(), + 'variantraw' : _section.GaugeVariantsRawSection(), + 'variantdecomp' : _section.GaugeVariantsDecompSection(), + 'varianterrorgen' : _section.GaugeVariantsErrorGenSection(), + 'input' : _section.InputSection(), + 'meta' : _section.MetaSection(), + 'help' : _section.HelpSection() + } + + 
skip_sections = advanced_options.get('skip_sections', tuple()) + if skip_sections: + if isinstance(skip_sections, str): + skip_sections = [skip_sections] + skip_sections = [s.lower().replace(' ','') for s in skip_sections] + for s in skip_sections: + possible_sections.pop(s) + sections = list(possible_sections.values()) + # ^ This whole process won't affect ordering of objects in "sections". if 'ShowScaling' in flags: sections.append(_section.GoodnessScalingSection()) From eef4b443d1dd1510f9b339c0899e6e055d317fac Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 4 Feb 2025 09:18:12 -0500 Subject: [PATCH 015/141] make computing some report quantities more robust to degenerate inputs. These are hacky changes. A proper change would be to use try-catch if encountering linalg errors. --- pygsti/report/reportables.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pygsti/report/reportables.py b/pygsti/report/reportables.py index e4fa68146..068caf8fb 100644 --- a/pygsti/report/reportables.py +++ b/pygsti/report/reportables.py @@ -353,7 +353,7 @@ def rel_circuit_eigenvalues(model_a, model_b, circuit): """ A = model_a.sim.product(circuit) # "gate" B = model_b.sim.product(circuit) # "target gate" - rel_op = _np.dot(_np.linalg.inv(B), A) # "relative gate" == target^{-1} * gate + rel_op = _np.dot(_np.linalg.pinv(B), A) # "relative gate" == target^{-1} * gate return _np.linalg.eigvals(rel_op) @@ -1290,7 +1290,7 @@ def std_unitarity(a, b, mx_basis): ------- float """ - Lambda = _np.dot(a, _np.linalg.inv(b)) + Lambda = _np.dot(a, _np.linalg.pinv(b)) return _tools.unitarity(Lambda, mx_basis) @@ -1310,10 +1310,13 @@ def eigenvalue_unitarity(a, b): ------- float """ - Lambda = _np.dot(a, _np.linalg.inv(b)) - d2 = Lambda.shape[0] - lmb = _np.linalg.eigvals(Lambda) - return float(_np.real(_np.linalg.norm(lmb)**2) - 1.0) / (d2 - 1.0) + try: + Lambda = _np.dot(a, _np.linalg.pinv(b)) + d2 = Lambda.shape[0] + lmb = _np.linalg.eigvals(Lambda) + 
return float(_np.real(_np.linalg.norm(lmb)**2) - 1.0) / (d2 - 1.0) + except _np.linalg.LinAlgError: + return -1 def nonunitary_entanglement_infidelity(a, b, mx_basis): @@ -1641,7 +1644,7 @@ def rel_eigenvalues(a, b, mx_basis): ------- numpy.ndarray """ - target_op_inv = _np.linalg.inv(b) + target_op_inv = _np.linalg.pinv(b) rel_op = _np.dot(target_op_inv, a) return _np.linalg.eigvals(rel_op).astype("complex") # since they generally *can* be complex @@ -1750,7 +1753,7 @@ def rel_gate_eigenvalues(a, b, mx_basis): # DUPLICATE of rel_eigenvalues TODO ------- numpy.ndarray """ - rel_op = _np.dot(_np.linalg.inv(b), a) # "relative gate" == target^{-1} * gate + rel_op = _np.dot(_np.linalg.pinv(b), a) # "relative gate" == target^{-1} * gate return _np.linalg.eigvals(rel_op).astype("complex") # since they generally *can* be complex From 689ffa7762bcfb887bf9027f4d7ebd2a00140fae Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 4 Feb 2025 09:19:23 -0500 Subject: [PATCH 016/141] make sure that circuits produced in crosstalk free experiment design functions only use idles with explicit target qubits, rather than implicit global idles --- .../crosstalkfreeexperimentdesign.py | 54 +++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index ae0920e44..85dc72b44 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -55,20 +55,23 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] twoq_idle_label = Label(('Gi',) + twoq_gstdesign.qubit_labels) - mapper = {twoq_idle_label: twoq_idle_label} + oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels) + mapper_2q = {twoq_idle_label: twoq_idle_label} + mapper_1q = {oneq_idle_label: oneq_idle_label} for cl in twoq_gstdesign.circuit_lists: - # for c in 
cl: - for i in range(len(cl)): - c = cl[i] - c._static = False - mapper.update({k:k for k in c._labels}) - mapper[Label(())] = twoq_idle_label - c.map_names_inplace(mapper) - c.done_editing() - assert Label(()) not in mapper.values() + for c in cl: + mapper_2q.update({k:k for k in c._labels}) + mapper_2q[Label(())] = twoq_idle_label + for cl in oneq_gstdesign.circuit_lists: + for c in cl: + mapper_1q.update({k:k for k in c._labels}) + mapper_1q[Label(())] = oneq_idle_label + assert Label(()) not in mapper_2q.values() + assert Label(()) not in mapper_1q.values() aux_info = {} num_lines = -1 + global_line_order = None for patch, edge_set in color_patches.items(): # This might be broken when edge_set is empty. used_qubits = np.array(edge_set).ravel() @@ -107,6 +110,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, curr_num_lines = edge_line_contributions + node_line_contributions if num_lines < 0: num_lines = curr_num_lines + global_line_order = tuple(range(num_lines)) assert num_lines == curr_num_lines if edge_perms.size > 0 and node_perms.size > 0: @@ -121,7 +125,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, if len(edge_perms): c = twoq_circuits[edge_perms[0,j]] c._static = False - c._labels = [mapper[ell].copy() for ell in c._labels] + c._labels = [mapper_2q[ell].copy() for ell in c._labels] c.done_editing() assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} @@ -131,7 +135,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, else: c = oneq_circuits[node_perms[0,j]] c._static = False - c._labels = [mapper[ell].copy() for ell in c._labels] + c._labels = [mapper_1q[ell].copy() for ell in c._labels] c.done_editing() assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} @@ -143,7 +147,7 @@ def 
stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, for i in range(edge_start, edge_perms.shape[0]): c2 = twoq_circuits[ edge_perms[i,j] ] # Fix col c2._static = False - c2._labels = [mapper[ell].copy() for ell in c2._labels] + c2._labels = [mapper_2q[ell].copy() for ell in c2._labels] c2.done_editing() assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} @@ -153,15 +157,16 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ] # Fix col c2._static = False - c2._labels = [mapper[ell].copy() for ell in c2._labels] + c2._labels = [mapper_1q[ell].copy() for ell in c2._labels] c2.done_editing() + assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} c2 = c2.map_state_space_labels(map_dict) c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above assert Label(()) not in c._labels # By this point, should have len(c._line_labels) == [some constant, number of ] - circuit_lists[L].append(c) + circuit_lists[L].append(c.reorder_lines(global_line_order)) aux_info[c] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO @@ -383,7 +388,22 @@ def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_pa color_patches[next_color].remove(order(u, next_vertex)) return edge_colors, free_colors, color_patches - + + +def check_valid_edge_coloring(color_patches): + """ + color_patches (dict): A dictionary mapping each color to a list of edges colored with that color. 
+ Unlike with edges, the items in color_patches are NOT symmetric [i.e., it only contains (v1, v2) for v1 < v2] + """ + for c, patch in color_patches.items(): + in_patch = set() + for pair in patch: + in_patch.add(pair[0]) + in_patch.add(pair[1]) + if len(in_patch) != 2*len(patch): + raise ValueError() + return + def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) -> dict: ''' @@ -469,5 +489,7 @@ def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) - if d in free_colors[w]: free_colors[w].remove(d) + check_valid_edge_coloring(color_patches) + return color_patches \ No newline at end of file From 1d66e1c0a3958d2d4e54925abd6514891a5b3c57 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 4 Feb 2025 15:59:14 -0500 Subject: [PATCH 017/141] style --- .../crosstalkfreeexperimentdesign.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 85dc72b44..59e72649c 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -30,7 +30,7 @@ def find_neighbors(vertices: list, edges: list) -> dict: def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, oneq_gstdesign, twoq_gstdesign, randstate: int) -> tuple: - ''' + """ Generate crosstalk-free GST circuits by stitching together 1Q and 2Q GST circuits for each color patch. @@ -51,7 +51,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, tuple: A tuple containing: - circuit_lists (list): A list of crosstalk-free GST circuits for each germ power. - aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices. 
- ''' + """ circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] twoq_idle_label = Label(('Gi',) + twoq_gstdesign.qubit_labels) @@ -175,7 +175,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, class CrosstalkFreeExperimentDesign(CircuitListsDesign): - ''' + """ This class initializes a crosstalk-free GST experiment design by combining 1Q and 2Q GST designs based on a specified edge coloring. It assumes that the GST designs share the same germ powers (Ls) and utilizes a specified @@ -191,14 +191,14 @@ class CrosstalkFreeExperimentDesign(CircuitListsDesign): circuit_lists (list): The generated list of stitched circuits. aux_info (dict): Auxiliary information mapping circuits to their corresponding edges and vertices. - ''' + """ def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring, circuit_stitcher = stitch_circuits_by_germ_power_only, seed = None): - ''' + """ Assume that the GST designs have the same Ls. TODO: Update the init function so that it handles different circuit stitchers better (i.e., by using stitcher_kwargs, etc.) - ''' + """ # TODO: make sure idle gates are explicit. randstate = np.random.RandomState(seed) self.processor_spec = processor_spec @@ -219,9 +219,9 @@ def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, edge_coloring CircuitListsDesign.__init__(self, self.circuit_lists, qubit_labels=self.vertices) -''' +""" Everything below is used to find an edge coloring of a graph. -''' +""" def order(u, v): """ @@ -242,7 +242,7 @@ def order(u, v): def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, free_colors: dict) -> list: - ''' + """ Selects candidate vertices to be added to a fan. 
This function returns vertices connected to the anchor vertex `u` @@ -259,7 +259,7 @@ def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, fr Returns: list: A list of candidate vertices that can be colored with free colors available to the last vertex in the fan. - ''' + """ last_vertex = fan[-1] free_vertex_colors = free_colors[last_vertex] return [v for v in vertices if edge_colors[(u, v)] in free_vertex_colors] @@ -267,7 +267,7 @@ def find_fan_candidates(fan: list, u: int, vertices: list, edge_colors: dict, fr def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, free_colors: dict, edge_colors: dict) -> list: - ''' + """ Construct a maximal fan of vertex u starting with vertex v. A fan is a sequence of distinct neighbors of u that satisfies the following: @@ -283,7 +283,7 @@ def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, Returns: list: A list representing the maximal fan of vertex u. - ''' + """ u_neighbors = copy.deepcopy(vertex_neighbors[u]) fan = [v] u_neighbors.remove(v) @@ -297,7 +297,7 @@ def build_maximal_fan(u: int, v: int, vertex_neighbors: dict, def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge_colors: dict): - ''' + """ Finds, if it exists, the next vertex in a cd_u path. It does so by finding the neighbor of the current vertex which is attached by an edge of the right color. @@ -310,7 +310,7 @@ def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge Returns: int or None: The next vertex in the cd_u path that is connected by an edge of the specified color, or None if no such vertex exists. - ''' + """ for vertex in neighbors[current_vertex]: if edge_colors[(current_vertex, vertex)] == color: @@ -319,7 +319,7 @@ def find_next_path_vertex(current_vertex: int, color: int, neighbors: dict, edge def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors: dict) -> list: - ''' + """ Finds the cd_u path. 
The cd_u path is a path passing through u of edges whose colors alternate between c and d. @@ -336,7 +336,7 @@ def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors Returns: list: A list of tuples representing the edges in the cd_u path. - ''' + """ cdu_path = [] current_color = d current_vertex = u @@ -352,7 +352,7 @@ def find_color_path(u: int, v: int, c: int, d: int, neighbors: dict, edge_colors def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_patches: dict): - ''' + """ Rotate the colors in a fan of vertices connected to a specified vertex. This function shifts the colors in the fan over by one position, updating the @@ -370,7 +370,7 @@ def rotate_fan(fan: list, u: int, edge_colors: dict, free_colors: dict, color_pa Returns: tuple: Updated dictionaries for edge_colors, free_colors, and color_patches after rotation. - ''' + """ for i in range(len(fan) - 1): curr_vertex = fan[i] next_vertex = fan[i+1] @@ -406,7 +406,7 @@ def check_valid_edge_coloring(color_patches): def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) -> dict: - ''' + """ Implements Misra & Gries' edge coloring algorithm for a simple undirected graph. This function colors the edges of a simple undirected graph using at most @@ -423,7 +423,7 @@ def find_edge_coloring(deg: int, vertices: list, edges: list, neighbors: dict) - Returns: color_patches (dict): A dictionary mapping each color to a list of edges colored with that color. 
Unlike with edges, the items in color_patches are NOT symmetric [i.e., it only contains (v1, v2) for v1 < v2] - ''' + """ edges = copy.deepcopy(edges) free_colors = {u: [i for i in range(deg+1)] for u in vertices} # Keeps track of which colors are free on each vertex From 028e961a03261912bba0357c0dc3d9fbcde5cced Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:36:19 -0500 Subject: [PATCH 018/141] use separate 2q idle label --- pygsti/protocols/crosstalkfreeexperimentdesign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 59e72649c..504af3607 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -54,7 +54,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, """ circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] - twoq_idle_label = Label(('Gi',) + twoq_gstdesign.qubit_labels) + twoq_idle_label = Label(('Gii',) + twoq_gstdesign.qubit_labels) oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels) mapper_2q = {twoq_idle_label: twoq_idle_label} mapper_1q = {oneq_idle_label: oneq_idle_label} From 9d2890a069f7eb9c768a4f1b28fa80a8d9ff8f90 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:40:16 -0500 Subject: [PATCH 019/141] methods for (de)serialization of PermutationOperator --- .../modelmembers/operations/permutationop.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pygsti/modelmembers/operations/permutationop.py b/pygsti/modelmembers/operations/permutationop.py index 5bfae0fb4..f19d25c5b 100644 --- a/pygsti/modelmembers/operations/permutationop.py +++ b/pygsti/modelmembers/operations/permutationop.py @@ -27,9 +27,30 @@ def transform(self, S): raise NotImplementedError("PermutationOperator cannot be transformed!") def inverse_operator(self): - iperm = 
self._perm.copy() - iperm[iperm] = _np.arange(self.dim) + iperm = PermutationOperator.inv_perm(self._perm) return PermutationOperator(iperm) + + @staticmethod + def inv_perm(perm): + iperm = perm.copy() + iperm[iperm] = _np.arange(iperm.size) + return iperm + + @staticmethod + def perm_from_mx(mx): + perm = _np.array([_np.where(row == 1)[0][0] for row in mx]) + return perm + + ## We need to implement this in order to deserialize. + @classmethod + def _from_memoized_dict(cls, mm_dict, serial_memo): + mx = cls._decodemx(mm_dict['dense_matrix']) + mx = mx.squeeze() + # state_space = _statespace.StateSpace.from_nice_serialization(mm_dict['state_space']) + # basis = _Basis.from_nice_serialization(mm_dict['basis']) if (mm_dict['basis'] is not None) else None + # return cls(m, basis, mm_dict['evotype'], state_space) + perm = PermutationOperator.perm_from_mx(mx) + return PermutationOperator(perm) @staticmethod def pp_braiding_operators(subsystem_perm): From 5644b20204940ff4704356bca00f00e4794db664 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:45:01 -0500 Subject: [PATCH 020/141] tweak meaning of force=True in ModelMember.unlink_parent --- pygsti/modelmembers/modelmember.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygsti/modelmembers/modelmember.py b/pygsti/modelmembers/modelmember.py index 43dcdc94e..93878ac72 100644 --- a/pygsti/modelmembers/modelmember.py +++ b/pygsti/modelmembers/modelmember.py @@ -335,7 +335,7 @@ def unlink_parent(self, force=False): None """ for subm in self.submembers(): - subm.unlink_parent() + subm.unlink_parent(force) if (self.parent is not None) and (force or self.parent._obj_refcount(self) == 0): self._parent = None From e96df9f81d2417f04792a7bceb7ce42aaa2348a8 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:52:50 -0500 Subject: [PATCH 021/141] leave a TODO --- pygsti/optimize/simplerlm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pygsti/optimize/simplerlm.py 
b/pygsti/optimize/simplerlm.py index 44a185285..0bf3ab791 100644 --- a/pygsti/optimize/simplerlm.py +++ b/pygsti/optimize/simplerlm.py @@ -522,6 +522,11 @@ def simplish_leastsq( f = obj_fn(global_x) # 'E'-type array norm_f = ari.norm2_f(f) if not _np.isfinite(norm_f): + # TODO: this path can be hit when f contains NaNs. We should + # really have a separate error message for that. Performing + # the check will require updating our ArraysInterface API + # (which isn't hard but is beside the point as I'm writing this) + # -Riley. msg = "Infinite norm of objective function at initial point!" if len(global_x) == 0: # a model with 0 parameters - nothing to optimize From 021fbf3759bc4b285fe3af2162bac46850305803 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:54:15 -0500 Subject: [PATCH 022/141] Clean up convert_members_inplace by adding two tiny helper classes to explicitmodel.py. Update convert_members_inplace so it can handle failures of conversion when dealing with embedded ops. 
--- pygsti/models/explicitmodel.py | 84 ++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index b1f601298..2f6b04e77 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -46,6 +46,31 @@ from pygsti.tools.legacytools import deprecate as _deprecated_fn +class Roster: + def __init__(self, arg): + if isinstance(arg, str) and arg == 'all': + self.trivial = True + self.collection = None + else: + self.trivial = False + self.collection = arg + def __contains__(self, item): + return self.trivial or (item in self.collection) + + +class ModelView: + @staticmethod + def cast(arg): + return arg if arg is not None else ModelView() + def __init__(self): + self.operations = _collections.defaultdict(lambda: None) + self.preps = _collections.defaultdict(lambda: None) + self.povms = _collections.defaultdict(lambda: None) + self.instruments = _collections.defaultdict(lambda: None) + self.factories = _collections.defaultdict(lambda: None) + + + class ExplicitOpModel(_mdl.OpModel): """ Encapsulates a set of gate, state preparation, and POVM effect operations. 
@@ -329,32 +354,55 @@ def __getitem__(self, label): else: raise KeyError("Key %s has an invalid prefix" % label) - def convert_members_inplace(self, to_type, categories_to_convert='all', labels_to_convert='all', - ideal_model=None, flatten_structure=False, set_default_gauge_group=False, cptp_truncation_tol= 1e-6): + def convert_members_inplace(self, to_type, + categories_to_convert='all', labels_to_convert='all', + ideal_model=None, flatten_structure=False, set_default_gauge_group=False, + cptp_truncation_tol= 1e-6, allow_smaller_pp_basis=False + ): """ TODO: docstring -- like set_all_parameterizations but doesn't set default gauge group by default + + allow_smaller_pp_basis : bool + + Setting allow_smaller_pp_basis=True allows dimension mismatches between + this ExplicitOpModel's operations and the dimensions we'd expect for + operations based on the properties of self.basis. + + We can ONLY handle mismatches when self.basis.name indicates a Pauli product basis + or a tensor product thereof. We handle a failed conversion by trying a second time, + passing in the string literal `pp` instead of `self.basis` to _op.convert(...). 
+ """ if isinstance(categories_to_convert, str): categories_to_convert = (categories_to_convert,) + fallback_basis = '' if not allow_smaller_pp_basis else self.basis.name.replace('pp','').replace('*','') + 'pp' + ideal_model = ModelView.cast(ideal_model) + roster = Roster(labels_to_convert) if any([c in categories_to_convert for c in ('all', 'ops', 'operations')]): - for lbl, gate in self.operations.items(): - if labels_to_convert == 'all' or lbl in labels_to_convert: - ideal = ideal_model.operations.get(lbl, None) if (ideal_model is not None) else None - self.operations[lbl] = _op.convert(gate, to_type, self.basis, ideal, flatten_structure, cptp_truncation_tol) + op_items = [(k,v) for (k,v) in self.operations.items() if k in roster] + for lbl, gate in op_items: + ideal = ideal_model.operations[lbl] + try: + op = _op.convert(gate, to_type, self.basis, ideal, flatten_structure, cptp_truncation_tol) + except ValueError as e: + if not fallback_basis == 'pp': + raise e + op = _op.convert(gate, to_type, 'pp', ideal, flatten_structure, cptp_truncation_tol) + self.operations[lbl] = op if any([c in categories_to_convert for c in ('all', 'instruments')]): - for lbl, inst in self.instruments.items(): - if labels_to_convert == 'all' or lbl in labels_to_convert: - ideal = ideal_model.instruments.get(lbl, None) if (ideal_model is not None) else None - self.instruments[lbl] = _instrument.convert(inst, to_type, self.basis, ideal, flatten_structure) + inst_items = [(k,v) for (k,v) in self.instruments.items() if k in roster] + for lbl, inst in inst_items: + ideal = ideal_model.instruments[lbl] + self.instruments[lbl] = _instrument.convert(inst, to_type, self.basis, ideal, flatten_structure) if any([c in categories_to_convert for c in ('all', 'preps')]): - for lbl, prep in self.preps.items(): - if labels_to_convert == 'all' or lbl in labels_to_convert: - ideal = ideal_model.preps.get(lbl, None) if (ideal_model is not None) else None - self.preps[lbl] = _state.convert(prep, to_type, 
self.basis, ideal, flatten_structure) + prep_items = [(k,v) for (k,v) in self.preps.items() if k in roster] + for lbl, prep in prep_items: + ideal = ideal_model.preps[lbl] + self.preps[lbl] = _state.convert(prep, to_type, self.basis, ideal, flatten_structure) if any([c in categories_to_convert for c in ('all', 'povms')]): - for lbl, povm in self.povms.items(): - if labels_to_convert == 'all' or lbl in labels_to_convert: - ideal = ideal_model.povms.get(lbl, None) if (ideal_model is not None) else None - self.povms[lbl] = _povm.convert(povm, to_type, self.basis, ideal, flatten_structure) + povm_items = [(k,v) for (k,v) in self.povms.items() if k in roster] + for lbl, povm in povm_items: + ideal = ideal_model.povms[lbl] + self.povms[lbl] = _povm.convert(povm, to_type, self.basis, ideal, flatten_structure) self._clean_paramvec() # param indices were probabaly updated if set_default_gauge_group: From 182124458dda4bd53c10b308c4e567a9a2ee5477 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 5 Feb 2025 13:54:58 -0500 Subject: [PATCH 023/141] add create_explicit method to LocalNoiseModel class --- pygsti/models/localnoisemodel.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index 782148f9b..c93de5919 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -403,6 +403,35 @@ def rescale(coeffs): return op_coeffs + def create_explicit(self, reparamerize_to=None): + from pygsti.models import ExplicitOpModel + + state = self.__getstate__() + state['povms'] = state['povm_blks']['layers'] + state['preps'] = state['prep_blks']['layers'] + opdict = _OrderedMemberDict(None, reparamerize_to, None, dict()) + opdict.update(state['_opcaches']['complete-layers']) + state['operations'] = opdict + + for v in state['operations'].values(): + v.unlink_parent(force=True) + for v in state['povms'].values(): + v.unlink_parent(force=True) + for v in 
state['preps'].values(): + v.unlink_parent(force=True) + + eom = ExplicitOpModel(self.state_space) + eom.preps.update(state['preps']) + eom.povms.update(state['povms']) + eom.operations.update(state['operations']) + if reparamerize_to is None: + return eom + + assert isinstance(reparamerize_to, str) + eom.convert_members_inplace(reparamerize_to, allow_smaller_pp_basis=True) + eom._rebuild_paramvec() + return eom + class _SimpleCompLayerRules(_LayerRules): From 35d035396140ce6aa7a100e905250d321c4bda53 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 18 Feb 2025 13:05:12 -0500 Subject: [PATCH 024/141] add option to change threshold at which customsolve is used. Updated test_mpi.py --- pygsti/optimize/customsolve.py | 21 +++++++++++++++------ test/test_packages/mpi/test_mpi.py | 8 +++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pygsti/optimize/customsolve.py b/pygsti/optimize/customsolve.py index 21afd49e9..21f79c852 100644 --- a/pygsti/optimize/customsolve.py +++ b/pygsti/optimize/customsolve.py @@ -23,6 +23,9 @@ _fastcalc = None +CUSTOM_SOLVE_THRESHOLD = 10_000 + + def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100): """ Simple parallel Gaussian Elimination with pivoting. @@ -95,7 +98,7 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100): return #Just gather everything to one processor and compute there: - if comm.size < proc_threshold and a.shape[1] < 10000: + if comm.size < proc_threshold and a.shape[1] < CUSTOM_SOLVE_THRESHOLD: # We're not exactly sure where scipy is better, but until we speed up / change gaussian-elim # alg the scipy alg is much faster for small numbers of procs and so should be used unless # A is too large to be gathered to the root proc. @@ -163,9 +166,11 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100): # Step 1: find the index of the row that is the best pivot. 
# each proc looks for its best pivot (Note: it should not consider rows already pivoted on) potential_pivot_indices = all_row_indices[potential_pivot_mask] - ibest_global, ibest_local, h, k = _find_pivot(a, b, icol, potential_pivot_indices, my_row_slice, - shared_floats, shared_ints, resource_alloc, comm, host_comm, - smbuf1, smbuf2, smbuf3, host_index_buf, host_val_buf) + ibest_global, ibest_local, h, k = _find_pivot( + a, b, icol, potential_pivot_indices, + my_row_slice, shared_floats, shared_ints, resource_alloc, + comm, host_comm, smbuf1, smbuf1b, smbuf2, + smbuf3, host_index_buf, host_val_buf) # Step 2: proc that owns best row (holds that row and is root of param-fine comm) broadcasts it pivot_row, pivot_b = _broadcast_pivot_row(a, b, ibest_local, h, k, shared_rowb, local_pivot_rowb, @@ -210,8 +215,12 @@ def custom_solve(a, b, x, ari, resource_alloc, proc_threshold=100): return -def _find_pivot(a, b, icol, potential_pivot_inds, my_row_slice, shared_floats, shared_ints, - resource_alloc, comm, host_comm, buf1, buf1b, buf2, buf3, best_host_indices, best_host_vals): +def _find_pivot( + a, b, icol, potential_pivot_inds, + my_row_slice, shared_floats, shared_ints, resource_alloc, + comm, host_comm, buf1, buf1b, + buf2, buf3, best_host_indices, best_host_vals + ): #print(f'Length potential_pivot_inds {len(potential_pivot_inds)}') #print(f'potential_pivot_inds: {potential_pivot_inds}') diff --git a/test/test_packages/mpi/test_mpi.py b/test/test_packages/mpi/test_mpi.py index 995b3a92f..a04668a66 100644 --- a/test/test_packages/mpi/test_mpi.py +++ b/test/test_packages/mpi/test_mpi.py @@ -1,4 +1,4 @@ -# This file is designed to be run via: mpiexec -np 4 python -W ignore testMPI.py +# This file is designed to be run via: mpiexec -np 4 python -W ignore test_mpi.py # This does not use nosetests because I want to set verbosity differently based on rank (quiet if not rank 0) # By wrapping asserts in comm.rank == 0, only rank 0 should fail (should help with output) # Can run 
with different number of procs, but 4 is minimum to test all modes (pure MPI, pure shared mem, and mixed) @@ -11,7 +11,9 @@ import pygsti from pygsti.modelpacks import smq1Q_XYI as std +pygsti.optimize.customsolve.CUSTOM_SOLVE_THRESHOLD = 10 wcomm = MPI.COMM_WORLD +print(f'Running with CUSTOM_SOLVE_THRESHOLD = {pygsti.optimize.customsolve.CUSTOM_SOLVE_THRESHOLD}') class ParallelTest(object): @@ -226,7 +228,7 @@ def run_fills(self, sim, natoms, nparams): else: raise RuntimeError("Improper sim type passed by test_fills_generator") - serial_layout = mdl.sim.create_layout(circuits, array_types=('E','EP','EPP'), derivative_dimension=nP) + serial_layout = mdl.sim.create_layout(circuits, array_types=('E','EP','EPP'), derivative_dimensions=nP) nE = serial_layout.num_elements nC = len(circuits) @@ -246,7 +248,7 @@ def run_fills(self, sim, natoms, nparams): global_serial_layout = serial_layout.global_layout #Use a parallel layout to compute the same probabilities & their derivatives - local_layout = mdl.sim.create_layout(circuits, array_types=('E','EP','EPP'), derivative_dimension=nP, + local_layout = mdl.sim.create_layout(circuits, array_types=('E','EP','EPP'), derivative_dimensions=nP, resource_alloc=self.ralloc) vp_local = local_layout.allocate_local_array('e', 'd') From 5a90e5c597b452e5eb464e7a1b0b7b7d891a82c2 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 18 Feb 2025 16:46:02 -0500 Subject: [PATCH 025/141] comment where changes might be incorporated for faster forward sim --- pygsti/forwardsims/matrixforwardsim.py | 38 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index bad797e36..d2c13626c 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -738,6 +738,9 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc): eval_tree = layout_atom_tree cacheSize = len(eval_tree) prodCache = 
_np.zeros((cacheSize, dim, dim), 'd') + # ^ This assumes assignments prodCache[i] = <2d numpy array>. + # It would be better for this to be a dict (mapping _most likely_ + # to ndarrays) if we don't need slicing or other axis indexing. scaleCache = _np.zeros(cacheSize, 'd') for iDest, iRight, iLeft in eval_tree: @@ -751,7 +754,10 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc): else: gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense(on_space='minimal') nG = max(_nla.norm(gate), 1.0) + # ^ This indicates a need to compute norms of the operation matrices. Can't do this + # with scipy.linalg if gate is represented implicitly. prodCache[iDest] = gate / nG + # ^ Indicates a need to overload division by scalars. scaleCache[iDest] = _np.log(nG) continue @@ -764,10 +770,14 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc): scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: - nL, nR = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), - 1e-300), max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) + nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) + nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) + # ^ I want to allow L,R to be tensor product operators. That precludes + # calling _nla.norm. sL, sR = L / nL, R / nR - prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) + # ^ Again, shows the need to overload division by scalars. 
+ prodCache[iDest] = sL @ sR + scaleCache[iDest] += _np.log(nL) + _np.log(nR) nanOrInfCacheIndices = (~_np.isfinite(prodCache)).nonzero()[0] # may be duplicates (a list, not a set) # since all scaled gates start with norm <= 1, products should all have norm <= 1 @@ -838,6 +848,8 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, tSerialStart = _time.time() dProdCache = _np.zeros((cacheSize,) + deriv_shape) + # ^ I think that deriv_shape will be a tuple of length > 2. + # (Based on how swapaxes is used in the loop below ...) wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None for iDest, iRight, iLeft in eval_tree: @@ -851,6 +863,9 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices) doperation = self._doperation(opLabel, wrt_filter=wrtIndices) dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest]) + # ^ Need a way to track tensor product structure in whatever's + # being returned by self._doperation (presumably it's a tensor ...) + continue tm = _time.time() @@ -861,8 +876,18 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) L, R = prod_cache[iLeft], prod_cache[iRight] dL, dR = dProdCache[iLeft], dProdCache[iRight] - dProdCache[iDest] = _np.dot(dL, R) + \ - _np.swapaxes(_np.dot(L, dR), 0, 1) # dot(dS, T) + dot(S, dT) + term1 = _np.dot(dL, R) + term2 = _np.swapaxes(_np.dot(L, dR), 0, 1) + # ^ From the numpy docs on .dot : + # + # If a is an N-D array and b is an M-D array (where M>=2), + # it is a sum product over the last axis of a and the second-to-last axis of b: + # + # dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m]) + # + dProdCache[iDest] = term1 + term2 # dot(dS, T) + dot(S, dT) + # ^ We need addition of tensor-product-structured "doperators." 
+ profiler.add_time("compute_dproduct_cache: dots", tm) profiler.add_count("compute_dproduct_cache: dots") @@ -870,9 +895,12 @@ def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! dProdCache[iDest] /= _np.exp(scale) if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL: + # ^ Need the tensor-product-structured "doperators" to have .max() and .min() + # methods. _warnings.warn("Scaled dProd small in order to keep prod managable.") elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL): + # ^ Need to bypass the call to _np.count_nonzero(...). _warnings.warn("Would have scaled dProd but now will not alter scale_cache.") #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier() From 4f3ba4de922037c53116ef5ee26e8e32ab3a7f3f Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 21 Feb 2025 13:50:03 -0500 Subject: [PATCH 026/141] logging --- pygsti/tools/optools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pygsti/tools/optools.py b/pygsti/tools/optools.py index 9fabba13f..1cf98c931 100644 --- a/pygsti/tools/optools.py +++ b/pygsti/tools/optools.py @@ -29,6 +29,7 @@ from pygsti.tools.legacytools import deprecate as _deprecated_fn IMAG_TOL = 1e-7 # tolerance for imaginary part being considered zero +DIAMOND_NORM_SOLVE_VERBOSE = False def _flat_mut_blks(i, j, block_dims): @@ -332,7 +333,7 @@ def diamonddist(a, b, mx_basis='pp', return_x=False): zeros = _np.zeros((dim, dim)) for solver in solvers: try: - prob.solve(solver=solver) + prob.solve(solver=solver, verbose=DIAMOND_NORM_SOLVE_VERBOSE) out = (prob.value, vars[0].value) if return_x else prob.value return out except _cvxpy.error.SolverError as e: From 5f96a1616b85a4176efeb8bcbf959e1dd4f8e052 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 21 Feb 2025 16:35:42 -0500 Subject: [PATCH 027/141] argument check --- 
pygsti/models/explicitmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index 2f6b04e77..844939d60 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -374,6 +374,7 @@ def convert_members_inplace(self, to_type, """ if isinstance(categories_to_convert, str): categories_to_convert = (categories_to_convert,) + assert all(c in ['all', 'ops', 'operations', 'instruments', 'preps', 'povms'] for c in categories_to_convert) fallback_basis = '' if not allow_smaller_pp_basis else self.basis.name.replace('pp','').replace('*','') + 'pp' ideal_model = ModelView.cast(ideal_model) roster = Roster(labels_to_convert) From dd868214fca50211be07cd3879140f60f29cc0a6 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 24 Feb 2025 08:45:03 -0500 Subject: [PATCH 028/141] move LocalNoiseModel.create_explicit to ImplicitOpModel. Add comments in matrixforwardsim.py. --- pygsti/forwardsims/matrixforwardsim.py | 29 +++++++++++++++++++++++- pygsti/models/implicitmodel.py | 31 ++++++++++++++++++++++++++ pygsti/models/localnoisemodel.py | 29 ------------------------ 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index d2c13626c..29867671d 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -695,8 +695,9 @@ def _array_types_for_method(cls, method_name): return super()._array_types_for_method(method_name) def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None, - param_blk_sizes=None): + param_blk_sizes=None, cache_doperations=True): super().__init__(model, num_atoms, processor_grid, param_blk_sizes) + self._cache_dops = cache_doperations self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent" def _to_nice_serialization(self): @@ -1518,6 +1519,32 @@ def 
_bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): _np.seterr(**old_err) def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): + if not self._cache_dops: + # This call errors because it tries to compute layout_atom.as_layout(resource_alloc), + # which isn't implemented. Looking at how layout_atom is used in the other branch + # of this if-statement it isn't clear how to work around this. Can look at + # MapForwardSimulator._bulk_fill_dprobs_atom(...). + # + # Verbatim contents: + # + # resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice)) + # self.calclib.mapfill_dprobs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), dest_param_slice, + # layout_atom, param_slice, resource_alloc, self.derivative_eps) + # + # where + # + # self.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_" + evotype.name). + # + # and an implementation can be found at + # + # /Users/rjmurr/Documents/pg-xfgst/repo/pygsti/forwardsims/mapforwardsim_calc_generic.py. + # + # Specifically, in mapfill_probs_atom. But that doesn't do anything like layout_atom.as_layout(resource_alloc) .... 
:( + # + + _DistributableForwardSimulator._bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) + return + dim = self.model.evotype.minimal_dim(self.model.state_space) resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice)) prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) diff --git a/pygsti/models/implicitmodel.py b/pygsti/models/implicitmodel.py index c3501649e..ea8d6353f 100644 --- a/pygsti/models/implicitmodel.py +++ b/pygsti/models/implicitmodel.py @@ -20,6 +20,7 @@ from pygsti.modelmembers import povms as _povm from pygsti.modelmembers.modelmembergraph import ModelMemberGraph as _MMGraph from pygsti.baseobjs.label import Label as _Label +from pygsti.models.memberdict import OrderedMemberDict as _OrderedMemberDict from pygsti.baseobjs.basis import Basis as _Basis from pygsti.baseobjs.statespace import StateSpace as _StateSpace @@ -369,6 +370,36 @@ def _from_nice_serialization(cls, state): root_dicts[root_key][sub_key].update(mm_dict) # Note: sub_keys should already be created return mdl + def create_explicit(self, reparamerize_to=None): + from pygsti.models import ExplicitOpModel + + state = self.__getstate__() + state['povms'] = state['povm_blks']['layers'] + state['preps'] = state['prep_blks']['layers'] + opdict = _OrderedMemberDict(None, reparamerize_to, None, dict()) + opdict.update(state['_opcaches']['complete-layers']) + state['operations'] = opdict + + for v in state['operations'].values(): + v.unlink_parent(force=True) + for v in state['povms'].values(): + v.unlink_parent(force=True) + for v in state['preps'].values(): + v.unlink_parent(force=True) + + eom = ExplicitOpModel(self.state_space) + eom.preps.update(state['preps']) + eom.povms.update(state['povms']) + eom.operations.update(state['operations']) + if reparamerize_to is None: + return eom + + assert isinstance(reparamerize_to, str) + 
eom.convert_members_inplace( + reparamerize_to, allow_smaller_pp_basis=True + ) + eom._rebuild_paramvec() + return eom def _init_spam_layers(model, prep_layers, povm_layers): """ Helper function for initializing the .prep_blks and .povm_blks elements of an implicit model""" diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index c93de5919..782148f9b 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -403,35 +403,6 @@ def rescale(coeffs): return op_coeffs - def create_explicit(self, reparamerize_to=None): - from pygsti.models import ExplicitOpModel - - state = self.__getstate__() - state['povms'] = state['povm_blks']['layers'] - state['preps'] = state['prep_blks']['layers'] - opdict = _OrderedMemberDict(None, reparamerize_to, None, dict()) - opdict.update(state['_opcaches']['complete-layers']) - state['operations'] = opdict - - for v in state['operations'].values(): - v.unlink_parent(force=True) - for v in state['povms'].values(): - v.unlink_parent(force=True) - for v in state['preps'].values(): - v.unlink_parent(force=True) - - eom = ExplicitOpModel(self.state_space) - eom.preps.update(state['preps']) - eom.povms.update(state['povms']) - eom.operations.update(state['operations']) - if reparamerize_to is None: - return eom - - assert isinstance(reparamerize_to, str) - eom.convert_members_inplace(reparamerize_to, allow_smaller_pp_basis=True) - eom._rebuild_paramvec() - return eom - class _SimpleCompLayerRules(_LayerRules): From 7f0d9dde6331732f35a8ea09b8a7c9073e5480b3 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 24 Feb 2025 14:00:19 -0500 Subject: [PATCH 029/141] add features to create_explicit --- pygsti/models/implicitmodel.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pygsti/models/implicitmodel.py b/pygsti/models/implicitmodel.py index ea8d6353f..bace0295c 100644 --- a/pygsti/models/implicitmodel.py +++ 
b/pygsti/models/implicitmodel.py @@ -370,13 +370,18 @@ def _from_nice_serialization(cls, state): root_dicts[root_key][sub_key].update(mm_dict) # Note: sub_keys should already be created return mdl - def create_explicit(self, reparamerize_to=None): + def create_explicit(self, composedops_as_views=False, spam_type='full TP', op_type='CPTPLND'): from pygsti.models import ExplicitOpModel - + if isinstance(composedops_as_views, str): + # Old positional argument behavior. + spam_type = composedops_as_views + op_type = composedops_as_views + composedops_as_views = False + state = self.__getstate__() state['povms'] = state['povm_blks']['layers'] state['preps'] = state['prep_blks']['layers'] - opdict = _OrderedMemberDict(None, reparamerize_to, None, dict()) + opdict = _OrderedMemberDict(None, spam_type, None, dict()) opdict.update(state['_opcaches']['complete-layers']) state['operations'] = opdict @@ -391,14 +396,24 @@ def create_explicit(self, reparamerize_to=None): eom.preps.update(state['preps']) eom.povms.update(state['povms']) eom.operations.update(state['operations']) - if reparamerize_to is None: - return eom - assert isinstance(reparamerize_to, str) eom.convert_members_inplace( - reparamerize_to, allow_smaller_pp_basis=True + to_type=spam_type, categories_to_convert=('preps', 'povms'), + allow_smaller_pp_basis=True, flatten_structure=True + ) + eom.convert_members_inplace( + to_type=op_type, categories_to_convert=('operations',), + allow_smaller_pp_basis=True ) - eom._rebuild_paramvec() + if composedops_as_views: + from pygsti.modelmembers.operations import ComposedOp + allop_keys = eom.operations.keys() + for k in allop_keys: + curr_op = eom[k] + if isinstance(curr_op, ComposedOp) and all([subk in allop_keys for subk in k]): + view_op = ComposedOp([eom[subk] for subk in k]) + eom[k] = view_op + eom._rebuild_paramvec() return eom def _init_spam_layers(model, prep_layers, povm_layers): From 6c6fb53414175af614ca85249efe6fa9dd226e66 Mon Sep 17 00:00:00 2001 From: Riley 
Murray Date: Thu, 27 Feb 2025 14:39:00 -0800 Subject: [PATCH 030/141] add option to SimplerLMOptimizer and simplish_leastsq to terminate based on the chi2 test statistic --- pygsti/optimize/simplerlm.py | 37 ++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/pygsti/optimize/simplerlm.py b/pygsti/optimize/simplerlm.py index a741004c5..ac14c371e 100644 --- a/pygsti/optimize/simplerlm.py +++ b/pygsti/optimize/simplerlm.py @@ -16,6 +16,7 @@ import numpy as _np import scipy as _scipy +from scipy import stats as _stats from pygsti.optimize import arraysinterface as _ari from pygsti.optimize.customsolve import custom_solve as _custom_solve @@ -107,6 +108,7 @@ def cast(cls, obj): return cls(**obj) if obj else cls() def __init__(self): + self.tol = dict() super().__init__() @@ -191,10 +193,10 @@ def __init__(self, maxiter=100, maxfev=100, tol=1e-6, fditer=0, first_fditer=0, oob_action="reject", oob_check_mode=0, serial_solve_proc_threshold=100, lsvec_mode="normal"): super().__init__() - if isinstance(tol, float): tol = {'relx': 1e-8, 'relf': tol, 'f': 1.0, 'jac': tol, 'maxdx': 1.0} + if isinstance(tol, float): tol = {'relx': 1e-8, 'relf': tol, 'f': 1.0, 'jac': tol, 'maxdx': 1.0, 'chi2_icdf_tol': 0.0, 'critval_discount': 1.0} self.maxiter = maxiter self.maxfev = maxfev - self.tol = tol + self.tol.update(tol) self.fditer = fditer self.first_fditer = first_fditer self.init_munu = init_munu @@ -283,6 +285,13 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer) else: ari = _ari.UndistributedArraysInterface(nEls, nP) + chi2_icdf_tol = self.tol.get('chi2_icdf_tol', 1e-6) + critical_value = _stats.chi2.ppf(chi2_icdf_tol, objective.num_data_params() - objective.model.num_params) + critval_discount = self.tol.get('critval_discount', 1.0) + def chi2_stopper(norm_f): + chi2val = objective.chi2k_distributed_qty(norm_f) + return chi2val < critval_discount * critical_value + opt_x, converged, msg, mu, nu, 
norm_f, f = simplish_leastsq( objective_func, jacobian, x0, max_iter=self.maxiter, @@ -291,6 +300,7 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer) jac_norm_tol=self.tol.get('jac', 1e-6), rel_ftol=self.tol.get('relf', 1e-6), rel_xtol=self.tol.get('relx', 1e-8), + chi2_stopper=chi2_stopper, max_dx_scale=self.tol.get('maxdx', 1.0), init_munu=self.init_munu, oob_check_interval=self.oob_check_interval, @@ -300,7 +310,8 @@ def run(self, objective: TimeIndependentMDCObjectiveFunction, profiler, printer) arrays_interface=ari, serial_solve_proc_threshold=self.serial_solve_proc_threshold, x_limits=x_limits, - verbosity=printer - 1, profiler=profiler) + verbosity=printer - 1, profiler=profiler, + ) printer.log("Least squares message = %s" % msg, 2) assert(converged), "Failed to converge: %s" % msg @@ -372,7 +383,7 @@ def jac_guarded(k: int, num_fd_iters: int, obj_fn: Callable, jac_fn: Callable, f def simplish_leastsq( obj_fn, jac_fn, x0, f_norm2_tol=1e-6, jac_norm_tol=1e-6, - rel_ftol=1e-6, rel_xtol=1e-6, max_iter=100, num_fd_iters=0, max_dx_scale=1.0, + rel_ftol=1e-6, rel_xtol=1e-6, chi2_stopper=None, max_iter=100, num_fd_iters=0, max_dx_scale=1.0, init_munu="auto", oob_check_interval=0, oob_action="reject", oob_check_mode=0, resource_alloc=None, arrays_interface=None, serial_solve_proc_threshold=100, x_limits=None, verbosity=0, profiler=None @@ -414,6 +425,10 @@ def simplish_leastsq( Tolerance on the relative value of `|x|`, so that if `d(|x|)/|x| < rel_xtol` then mark converged. + chi2_stopper : Callable[float] -> float, optional + Terminate if chi2_stopper( norm( sum(obj_fn(x)**2) ) ) is True. + If None, then we set to a lambda function that always returns False. + max_iter : int, optional The maximum number of (outer) interations. 
@@ -518,6 +533,8 @@ def simplish_leastsq( max_norm_dx = (max_dx_scale**2) * len(global_x) if max_dx_scale else None # ^ don't let any component change by more than ~max_dx_scale + if chi2_stopper is None: + chi2_stopper = lambda _: False f = obj_fn(global_x) # 'E'-type array norm_f = ari.norm2_f(f) @@ -548,6 +565,18 @@ def simplish_leastsq( if len(msg) > 0: break # exit outer loop if an exit-message has been set + if chi2_stopper(norm_f): + if oob_check_interval <= 1: + msg = "Critical value for chi2 statistic acheived." + converged = True + break + else: + printer.log(("** Converged with out-of-bounds with check interval=%d, reverting to last know in-bounds point and setting interval=1 **") % oob_check_interval, 2) + oob_check_interval = 1 + x[:] = best_x[:] + mu, nu, norm_f, f[:] = best_x_state + continue + if norm_f < f_norm2_tol: if oob_check_interval <= 1: msg = "Sum of squares is at most %g" % f_norm2_tol From 284bbf9c268d3b739edc4106c4b4b9727fd4d3eb Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 17 Jun 2025 14:42:16 -0700 Subject: [PATCH 031/141] Make the matrix forward simulator respect the lanes present in the structure. 
--- pygsti/forwardsims/matrixforwardsim.py | 1574 +++++++++++++++++++++++- 1 file changed, 1572 insertions(+), 2 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index e811daf73..399c67456 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -31,6 +31,10 @@ from pygsti.tools.matrixtools import _fas from pygsti.tools import listtools as _lt from pygsti.circuits import CircuitList as _CircuitList +from pygsti.tools.internalgates import internal_gate_unitaries +from pygsti.tools.optools import unitary_to_superop +from pygsti.baseobjs.label import LabelTup, LabelTupTup + _dummy_profiler = _DummyProfiler() @@ -764,8 +768,8 @@ def _compute_product_cache(self, layout_atom_tree, resource_alloc): scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: - nL, nR = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), - 1e-300), max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) + nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) + nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) sL, sR = L / nL, R / nR prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) @@ -2103,3 +2107,1569 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota layout.resource_alloc()) return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, ds_cache) + + +class NicksMatrixForwardSimulator(_DistributableForwardSimulator, SimpleMatrixForwardSimulator): + """ + Computes circuit outcome probabilities by multiplying together circuit-layer process matrices. + + Interfaces with a model via its `circuit_layer_operator` method and extracts a dense matrix + representation of operators by calling their `to_dense` method. 
An "evaluation tree" that + composes all of the circuits using pairwise "joins" is constructed by a :class:`MatrixCOPALayout` + layout object, and this tree then directs pairwise multiplications of process matrices to compute + circuit outcome probabilities. Derivatives are computed analytically, using operators' + `deriv_wrt_params` methods. + + Parameters + ---------- + model : Model, optional + The parent model of this simulator. It's fine if this is `None` at first, + but it will need to be set (by assigning `self.model` before using this simulator. + + distribute_by_timestamp : bool, optional + When `True`, treat the data as time dependent, and distribute the computation of outcome + probabilitiesby assigning groups of processors to the distinct time stamps within the + dataset. This means of distribution be used only when the circuits themselves contain + no time delay infomation (all circuit layer durations are 0), as operators are cached + at the "start" time of each circuit, i.e., the timestamp in the data set. If `False`, + then the data is treated in a time-independent way, and the overall counts for each outcome + are used. If support for intra-circuit time dependence is needed, you must use a different + forward simulator (e.g. :class:`MapForwardSimulator`). + + num_atoms : int, optional + The number of atoms (sub-evaluation-trees) to use when creating the layout (i.e. when calling + :meth:`create_layout`). This determines how many units the element (circuit outcome + probability) dimension is divided into, and doesn't have to correclate with the number of + processors. When multiple processors are used, if `num_atoms` is less than the number of + processors then `num_atoms` should divide the number of processors evenly, so that + `num_atoms // num_procs` groups of processors can be used to divide the computation + over parameter dimensions. 
+ + processor_grid : tuple optional + Specifies how the total number of processors should be divided into a number of + atom-processors, 1st-parameter-deriv-processors, and 2nd-parameter-deriv-processors. + Each level of specification is optional, so this can be a 1-, 2-, or 3- tuple of + integers (or None). Multiplying the elements of `processor_grid` together should give + at most the total number of processors. + + param_blk_sizes : tuple, optional + The parameter block sizes along the first or first & second parameter dimensions - so + this can be a 0-, 1- or 2-tuple of integers or `None` values. A block size of `None` + means that there should be no division into blocks, and that each block processor + computes all of its parameter indices at once. + """ + + @classmethod + def _array_types_for_method(cls, method_name): + # The array types of *intermediate* or *returned* values within various class methods (for memory estimates) + if method_name == '_bulk_fill_probs_block': return cls._array_types_for_method('_compute_product_cache') + if method_name == '_bulk_fill_dprobs_block': + return cls._array_types_for_method('_compute_product_cache') \ + + cls._array_types_for_method('_compute_dproduct_cache') + if method_name == '_bulk_fill_hprobs_block': + return cls._array_types_for_method('_compute_product_cache') \ + + cls._array_types_for_method('_compute_dproduct_cache') \ + + cls._array_types_for_method('_compute_hproduct_cache') + + if method_name == '_compute_product_cache': return ('zdd', 'z', 'z') # cache of gates, scales, and scaleVals + if method_name == '_compute_dproduct_cache': return ('zddb',) # cache x dim x dim x distributed_nparams + if method_name == '_compute_hproduct_cache': return ('zddbb',) # cache x dim x dim x dist_np1 x dist_np2 + return super()._array_types_for_method(method_name) + + def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None, + param_blk_sizes=None): + super().__init__(model, 
num_atoms, processor_grid, param_blk_sizes) + self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent" + self.swap_gate_superop = unitary_to_superop(internal_gate_unitaries()["SWAP"]) + + # We are also going to set up lanes to use. + + # Fix it to 5 qubits. + self._lanes_used = {0: {0}, 1: {1}, 2: {2,3}, 3: {4}} + self._qubits_to_lanes = {0: 0, 1: 1, 2:2, 3:2, 4:3} + + + def _to_nice_serialization(self): + state = super()._to_nice_serialization() + state.update({'mode': self._mode, + # (don't serialize parent model or processor distribution info) + }) + return state + + @classmethod + def _from_nice_serialization(cls, state): + #Note: resets processor-distribution information + return cls(None, state['mode'] == "distribute_by_timestamp") + + def copy(self): + """ + Return a shallow copy of this MatrixForwardSimulator + + Returns + ------- + MatrixForwardSimulator + """ + return MatrixForwardSimulator(self.model) + + def _compute_product_cache(self, layout_atom_tree, resource_alloc): + """ + Computes an array of operation sequence products (process matrices). + + Note: will *not* parallelize computation: parallelization should be + done at a higher level. + """ + dim = self.model.evotype.minimal_dim(self.model.state_space) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. + + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + # This is the maximum size any operator can be. However, we are going to make use of the minimum size. + prodCache = _np.zeros((cacheSize, dim, dim), 'd') + prodCache = [[] for _ in range(cacheSize)] # Build the cache dynamically. 
+ scaleCache = _np.zeros(cacheSize, 'd') + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + prodCache[iDest] = _np.identity(dim) + # Note: scaleCache[i] = 0.0 from initialization + else: + small_gate = 1 + if isinstance(opLabel, LabelTup): + small_gate = self.model.operation_blks["gates"][opLabel].to_dense(on_space="minimal") + # We know that this operator is the whole lane. + + qubits = opLabel.qubits + if len(qubits) == 2: + if qubits[0] > qubits[1]: + # We need to swap. + small_gate = self.swap_gate_superop.T @ small_gate @ self.swap_gate_superop + + elif isinstance(opLabel, LabelTupTup): + # We need to iterate through this operator in order to build up the right system. + for ind in range(len(opLabel)): + next_matrix = self.model.operation_blks["gates"][opLabel[ind]].to_dense(on_space="minimal") + # Do we need to insert the swap gates? + qubits = opLabel[ind].qubits + if len(qubits) == 2: + if qubits[0] > qubits[1]: + # We need to swap. 
+ next_matrix = self.swap_gate_superop.T @ next_matrix @ self.swap_gate_superop + + small_gate = _np.kron(small_gate, next_matrix) + # gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense(on_space='minimal') + nG = max(_nla.norm(small_gate), 1.0) + prodCache[iDest] = small_gate / nG + scaleCache[iDest] = _np.log(nG) + continue + + # combine iLeft + iRight => iDest + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prodCache[iLeft], prodCache[iRight] + prodCache[iDest] = L @ R + scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] + + if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: + nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) + nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) + sL, sR = L / nL, R / nR + prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) + + + if __debug__: + # So that it can be optimized out when called with -o. + + for i in range(cacheSize): + # since all scaled gates start with norm <= 1, products should all have norm <= 1 + assert len((~_np.isfinite(prodCache[i])).nonzero()[0]) == 0 + + return prodCache, scaleCache + + def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, + resource_alloc=None, wrt_slice=None, profiler=None): + """ + Computes a tree of product derivatives in a linear cache space. Will + use derivative columns to parallelize computation. 
+ """ + + if profiler is None: profiler = _dummy_profiler + dim = self.model.evotype.minimal_dim(self.model.state_space) + nDerivCols = self.model.num_params if (wrt_slice is None) \ + else _slct.length(wrt_slice) + deriv_shape = (nDerivCols, dim, dim) + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. + + ## ------------------------------------------------------------------ + # + ##print("MPI: _compute_dproduct_cache begin: %d deriv cols" % nDerivCols) + #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: + # #print("MPI: _compute_dproduct_cache called w/comm size %d" % comm.Get_size()) + # # parallelize of deriv cols, then sub-trees (if available and necessary) + # + # if resource_alloc.comm.Get_size() > nDerivCols: + # + # #If there are more processors than deriv cols, give a + # # warning -- note that we *cannot* make use of a tree being + # # split because there's no good way to reconstruct the + # # *non-final* parent-tree elements from those of the sub-trees. 
+ # _warnings.warn("Increased speed could be obtained by giving dproduct cache computation" + # " *fewer* processors, as there are more cpus than derivative columns.") + # + # # Use comm to distribute columns + # allDerivColSlice = slice(0, nDerivCols) if (wrt_slice is None) else wrt_slice + # _, myDerivColSlice, _, sub_resource_alloc = \ + # _mpit.distribute_slice(allDerivColSlice, resource_alloc.comm) + # #print("MPI: _compute_dproduct_cache over %d cols (%s) (rank %d computing %s)" \ + # # % (nDerivCols, str(allDerivColIndices), comm.Get_rank(), str(myDerivColIndices))) + # if sub_resource_alloc is not None and sub_resource_alloc.comm is not None \ + # and sub_resource_alloc.comm.Get_size() > 1: + # _warnings.warn("Too many processors to make use of in " + # " _compute_dproduct_cache.") + # if sub_resource_alloc.comm.Get_rank() > 0: myDerivColSlice = slice(0, 0) + # #don't compute anything on "extra", i.e. rank != 0, cpus + # + # my_results = self._compute_dproduct_cache( + # layout_atom_tree, prod_cache, scale_cache, None, myDerivColSlice, profiler) + # # pass None as comm, *not* mySubComm, since we can't do any + # # further parallelization + # + # tm = _time.time() + # all_results = resource_alloc.comm.allgather(my_results) + # profiler.add_time("MPI IPC", tm) + # return _np.concatenate(all_results, axis=1) # TODO: remove this concat w/better gather? 
+ # + ## ------------------------------------------------------------------ + + tSerialStart = _time.time() + dProdCache = _np.zeros((cacheSize,) + deriv_shape) + wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + dProdCache[iDest] = _np.zeros(deriv_shape) + else: + #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices) + doperation = self._doperation(opLabel, wrt_filter=wrtIndices) + dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest]) + continue + + tm = _time.time() + + # combine iLeft + iRight => i + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prod_cache[iLeft], prod_cache[iRight] + dL, dR = dProdCache[iLeft], dProdCache[iRight] + dProdCache[iDest] = _np.dot(dL, R) + \ + _np.swapaxes(_np.dot(L, dR), 0, 1) # dot(dS, T) + dot(S, dT) + profiler.add_time("compute_dproduct_cache: dots", tm) + profiler.add_count("compute_dproduct_cache: dots") + + scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) + if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
+ dProdCache[iDest] /= _np.exp(scale) + if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL: + _warnings.warn("Scaled dProd small in order to keep prod managable.") + elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL + and dProdCache[iDest].min() > -_DSMALL): + _warnings.warn("Would have scaled dProd but now will not alter scale_cache.") + + #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier() + + profiler.add_time("compute_dproduct_cache: serial", tSerialStart) + profiler.add_count("compute_dproduct_cache: num columns", nDerivCols) + + return dProdCache + + def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, + d_prod_cache2, scale_cache, resource_alloc=None, + wrt_slice1=None, wrt_slice2=None): + """ + Computes a tree of product 2nd derivatives in a linear cache space. Will + use derivative rows and columns to parallelize computation. + """ + + dim = self.model.evotype.minimal_dim(self.model.state_space) + + # Note: dProdCache?.shape = (#circuits,#params_to_diff_wrt,dim,dim) + nDerivCols1 = d_prod_cache1.shape[1] + nDerivCols2 = d_prod_cache2.shape[1] + assert(wrt_slice1 is None or _slct.length(wrt_slice1) == nDerivCols1) + assert(wrt_slice2 is None or _slct.length(wrt_slice2) == nDerivCols2) + hessn_shape = (nDerivCols1, nDerivCols2, dim, dim) + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. 
+ + ## ------------------------------------------------------------------ + # + #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: + # # parallelize of deriv cols, then sub-trees (if available and necessary) + # + # if resource_alloc.comm.Get_size() > nDerivCols1 * nDerivCols2: + # #If there are more processors than deriv cells, give a + # # warning -- note that we *cannot* make use of a tree being + # # split because there's no good way to reconstruct the + # # *non-final* parent-tree elements from those of the sub-trees. + # _warnings.warn("Increased speed could be obtained" + # " by giving hproduct cache computation" + # " *fewer* processors and *smaller* (sub-)tree" + # " (e.g. by splitting tree beforehand), as there" + # " are more cpus than hessian elements.") # pragma: no cover + # + # # allocate final result memory + # hProdCache = _np.zeros((cacheSize,) + hessn_shape) + # + # # Use comm to distribute columns + # allDeriv1ColSlice = slice(0, nDerivCols1) + # allDeriv2ColSlice = slice(0, nDerivCols2) + # deriv1Slices, myDeriv1ColSlice, deriv1Owners, mySubComm = \ + # _mpit.distribute_slice(allDeriv1ColSlice, resource_alloc.comm) + # + # # Get slice into entire range of model params so that + # # per-gate hessians can be computed properly + # if wrt_slice1 is not None and wrt_slice1.start is not None: + # myHessianSlice1 = _slct.shift(myDeriv1ColSlice, wrt_slice1.start) + # else: myHessianSlice1 = myDeriv1ColSlice + # + # #print("MPI: _compute_hproduct_cache over %d cols (rank %d computing %s)" \ + # # % (nDerivCols2, comm.Get_rank(), str(myDerivColSlice))) + # + # if mySubComm is not None and mySubComm.Get_size() > 1: + # deriv2Slices, myDeriv2ColSlice, deriv2Owners, mySubSubComm = \ + # _mpit.distribute_slice(allDeriv2ColSlice, mySubComm) + # + # # Get slice into entire range of model params (see above) + # if wrt_slice2 is not None and wrt_slice2.start is not None: + # myHessianSlice2 = 
_slct.shift(myDeriv2ColSlice, wrt_slice2.start) + # else: myHessianSlice2 = myDeriv2ColSlice + # + # if mySubSubComm is not None and mySubSubComm.Get_size() > 1: + # _warnings.warn("Too many processors to make use of in " + # " _compute_hproduct_cache.") + # #TODO: remove: not needed now that we track owners + # #if mySubSubComm.Get_rank() > 0: myDeriv2ColSlice = slice(0,0) + # # #don't compute anything on "extra", i.e. rank != 0, cpus + # + # hProdCache[:, myDeriv1ColSlice, myDeriv2ColSlice] = self._compute_hproduct_cache( + # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], + # d_prod_cache2[:, myDeriv2ColSlice], scale_cache, None, myHessianSlice1, myHessianSlice2) + # # pass None as comm, *not* mySubSubComm, since we can't do any further parallelization + # + # #NOTE: we only need to gather to the root processor (TODO: update this) + # _mpit.gather_slices(deriv2Slices, deriv2Owners, hProdCache, [None, myDeriv1ColSlice], + # 2, mySubComm) # , gather_mem_limit) #gather over col-distribution (Deriv2) + # #note: gathering axis 2 of hProdCache[:,myDeriv1ColSlice], + # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) + # else: + # #compute "Deriv1" row-derivatives distribution only; don't use column distribution + # hProdCache[:, myDeriv1ColSlice] = self._compute_hproduct_cache( + # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], d_prod_cache2, + # scale_cache, None, myHessianSlice1, wrt_slice2) + # # pass None as comm, *not* mySubComm (this is ok, see "if" condition above) + # + # #NOTE: we only need to gather to the root processor (TODO: update this) + # _mpit.gather_slices(deriv1Slices, deriv1Owners, hProdCache, [], 1, resource_alloc.comm) + # #, gather_mem_limit) #gather over row-distribution (Deriv1) + # #note: gathering axis 1 of hProdCache, + # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) + # + # return hProdCache + # + ## ------------------------------------------------------------------ + + hProdCache = 
_np.zeros((cacheSize,) + hessn_shape) + wrtIndices1 = _slct.indices(wrt_slice1) if (wrt_slice1 is not None) else None + wrtIndices2 = _slct.indices(wrt_slice2) if (wrt_slice2 is not None) else None + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + hProdCache[iDest] = _np.zeros(hessn_shape) + elif not self.model.circuit_layer_operator(opLabel, 'op').has_nonzero_hessian(): + #all gate elements are at most linear in params, so + # all hessians for single- or zero-circuits are zero. + hProdCache[iDest] = _np.zeros(hessn_shape) + else: + hoperation = self._hoperation(opLabel, + wrt_filter1=wrtIndices1, + wrt_filter2=wrtIndices2) + hProdCache[iDest] = hoperation / _np.exp(scale_cache[iDest]) + continue + + # combine iLeft + iRight => i + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (Dest,iLeft,iRight,iFinal) = tup implies circuit[iDest] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prod_cache[iLeft], prod_cache[iRight] + dL1, dR1 = d_prod_cache1[iLeft], d_prod_cache1[iRight] + dL2, dR2 = d_prod_cache2[iLeft], d_prod_cache2[iRight] + hL, hR = hProdCache[iLeft], hProdCache[iRight] + # Note: L, R = GxG ; dL,dR = vgs x GxG ; hL,hR = vgs x vgs x GxG + + dLdRa = _np.swapaxes(_np.dot(dL1, dR2), 1, 2) + dLdRb = _np.swapaxes(_np.dot(dL2, dR1), 1, 2) + dLdR_sym = dLdRa + _np.swapaxes(dLdRb, 0, 1) + + hProdCache[iDest] = _np.dot(hL, R) + dLdR_sym + _np.transpose(_np.dot(L, hR), (1, 2, 0, 3)) + + scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) + if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
+ hProdCache[iDest] /= _np.exp(scale) + if hProdCache[iDest].max() < _HSMALL and hProdCache[iDest].min() > -_HSMALL: + _warnings.warn("Scaled hProd small in order to keep prod managable.") + elif (_np.count_nonzero(hProdCache[iDest]) and hProdCache[iDest].max() < _HSMALL + and hProdCache[iDest].min() > -_HSMALL): + _warnings.warn("hProd is small (oh well!).") + + return hProdCache + + def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), + derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): + """ + Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. + + Parameters + ---------- + circuits : list + The circuits whose outcome probabilities should be included in the layout. + + dataset : DataSet + The source of data counts that will be compared to the circuit outcome + probabilities. The computed outcome probabilities are limited to those + with counts present in `dataset`. + + resource_alloc : ResourceAllocation + A available resources and allocation information. These factors influence how + the layout (evaluation strategy) is constructed. + + array_types : tuple, optional + A tuple of string-valued array types. See :meth:`ForwardSimulator.create_layout`. + + derivative_dimensions : int or tuple[int], optional + Optionally, the parameter-space dimension used when taking first + and second derivatives with respect to the cirucit outcome probabilities. This must be + non-None when `array_types` contains `'ep'` or `'epp'` types. + If a tuple, then must be length 1. + + verbosity : int or VerbosityPrinter + Determines how much output to send to stdout. 0 means no output, higher + integers mean more output. + + layout_creation_circuit_cache : dict, optional (default None) + A precomputed dictionary serving as a cache for completed + circuits. I.e. circuits with prep labels and POVM labels appended. 
+ Along with other useful pre-computed circuit structures used in layout + creation. + + Returns + ------- + MatrixCOPALayout + """ + # There are two types of quantities we adjust to create a good layout: "group-counts" and "processor-counts" + # - group counts: natoms, nblks, nblks2 give how many indpendently computed groups/ranges of circuits, + # 1st parameters, and 2nd parameters are used. Making these larger can reduce memory + # consumption by reducing intermediate memory usage. + # - processor counts: na, np, np2 give how many "atom-processors", "param-processors" and "param2-processors" + # are used to process data along each given direction. These values essentially specify + # how the physical procesors are divided by giving the number of (roughly equal) intervals + # exist along each dimension of the physical processor "grid". Thus, thees values are set + # based on the total number of cores available and how many dimensions are being computed. + + resource_alloc = _ResourceAllocation.cast(resource_alloc) + mem_limit = resource_alloc.mem_limit - resource_alloc.allocated_memory \ + if (resource_alloc.mem_limit is not None) else None # *per-processor* memory limit + printer = _VerbosityPrinter.create_printer(verbosity, resource_alloc) + nprocs = resource_alloc.comm_size + comm = resource_alloc.comm + if isinstance(derivative_dimensions, int): + num_params = derivative_dimensions + elif isinstance(derivative_dimensions, tuple): + assert len(derivative_dimensions) == 1 + num_params = derivative_dimensions[0] + else: + num_params = self.model.num_params + C = 1.0 / (1024.0**3) + + if mem_limit is not None: + if mem_limit <= 0: + raise MemoryError("Attempted layout creation w/memory limit = %g <= 0!" 
% mem_limit) + printer.log("Layout creation w/mem limit = %.2fGB" % (mem_limit * C)) + + natoms, na, npp, param_dimensions, param_blk_sizes = self._compute_processor_distribution( + array_types, nprocs, num_params, len(circuits), default_natoms=1) + + if self._mode == "distribute_by_timestamp": + #Special case: time dependent data that gets grouped & distributed by unique timestamp + # To to this, we override above values of natoms, na, and npp: + natoms = 1 # save all processor division for within the (single) atom, for different timestamps + na, npp = 1, (1, 1) # save all processor division for within the (single) atom, for different timestamps + + printer.log("MatrixLayout: %d processors divided into %s (= %d) grid along circuit and parameter directions." % + (nprocs, ' x '.join(map(str, (na,) + npp)), _np.prod((na,) + npp))) + printer.log(" %d atoms, parameter block size limits %s" % (natoms, str(param_blk_sizes))) + assert(_np.prod((na,) + npp) <= nprocs), "Processor grid size exceeds available processors!" 
+ + layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, + na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, + layout_creation_circuit_cache=layout_creation_circuit_cache) + + if mem_limit is not None: + loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 + loc_nparams2 = num_params / npp[1] if len(npp) > 1 else 0 + blk1 = param_blk_sizes[0] if len(param_blk_sizes) > 0 else 0 + blk2 = param_blk_sizes[1] if len(param_blk_sizes) > 1 else 0 + if blk1 is None: blk1 = loc_nparams1 + if blk2 is None: blk2 = loc_nparams2 + global_layout = layout.global_layout + if comm is not None: + from mpi4py import MPI + max_local_els = comm.allreduce(layout.num_elements, op=MPI.MAX) # layout.max_atom_elements + max_atom_els = comm.allreduce(layout.max_atom_elements, op=MPI.MAX) + max_local_circuits = comm.allreduce(layout.num_circuits, op=MPI.MAX) + max_atom_cachesize = comm.allreduce(layout.max_atom_cachesize, op=MPI.MAX) + else: + max_local_els = layout.num_elements + max_atom_els = layout.max_atom_elements + max_local_circuits = layout.num_circuits + max_atom_cachesize = layout.max_atom_cachesize + mem_estimate = _bytes_for_array_types(array_types, global_layout.num_elements, max_local_els, max_atom_els, + global_layout.num_circuits, max_local_circuits, + layout._param_dimensions, (loc_nparams1, loc_nparams2), + (blk1, blk2), max_atom_cachesize, + self.model.evotype.minimal_dim(self.model.state_space)) + + GB = 1.0 / 1024.0**3 + if mem_estimate > mem_limit: + raise MemoryError("Not enough memory for desired layout! (limit=%.1fGB, required=%.1fGB)" % ( + mem_limit * GB, mem_estimate * GB)) + else: + printer.log(" Esimated memory required = %.1fGB" % (mem_estimate * GB)) + + return layout + + @staticmethod + def create_copa_layout_circuit_cache(circuits, model, dataset=None): + """ + Helper function for pre-computing/pre-processing circuits structures + used in matrix layout creation. 
+ """ + cache = dict() + completed_circuits, split_circuits = model.complete_circuits(circuits, return_split=True) + + cache['completed_circuits'] = {ckt: comp_ckt for ckt, comp_ckt in zip(circuits, completed_circuits)} + cache['split_circuits'] = {ckt: split_ckt for ckt, split_ckt in zip(circuits, split_circuits)} + + if dataset is not None: + aliases = circuits.op_label_aliases if isinstance(circuits, _CircuitList) else None + ds_circuits = _lt.apply_aliases_to_circuits(circuits, aliases) + unique_outcomes_list = [] + for ckt in ds_circuits: + ds_row = dataset[ckt] + unique_outcomes_list.append(ds_row.unique_outcomes if ds_row is not None else None) + else: + unique_outcomes_list = [None]*len(circuits) + + expanded_circuit_outcome_list = model.bulk_expand_instruments_and_separate_povm(circuits, + observed_outcomes_list = unique_outcomes_list, + split_circuits = split_circuits) + + expanded_circuit_cache = {ckt: expanded_ckt for ckt,expanded_ckt in zip(circuits, expanded_circuit_outcome_list)} + + cache['expanded_and_separated_circuits'] = expanded_circuit_cache + + expanded_subcircuits_no_spam_cache = dict() + for expc_outcomes in cache['expanded_and_separated_circuits'].values(): + for sep_povm_c, _ in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit + exp_nospam_c = sep_povm_c.circuit_without_povm[1:] + expanded_subcircuits_no_spam_cache[exp_nospam_c] = exp_nospam_c.expand_subcircuits() + + cache['expanded_subcircuits_no_spam'] = expanded_subcircuits_no_spam_cache + + return cache + + def _scale_exp(self, scale_exps): + old_err = _np.seterr(over='ignore') + scaleVals = _np.exp(scale_exps) # may overflow, but OK if infs occur here + _np.seterr(**old_err) + return scaleVals + + def _rho_e_from_spam_tuple(self, spam_tuple): + # This calculator uses the convention that rho has shape (N,1) + rholabel, elabel = spam_tuple + rho = self.model.circuit_layer_operator(rholabel, 'prep').to_dense(on_space='minimal')[:, None] + E = 
_np.conjugate(_np.transpose(self.model.circuit_layer_operator( + elabel, 'povm').to_dense(on_space='minimal')[:, None])) + return rho, E + + def _probs_from_rho_e(self, rho, e, gs, scale_vals): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + #Compute probability and save in return array + # want vp[iFinal] = float(dot(e, dot(G, rho))) + # vp[i] = sum_k,l e[0,k] gs[i,k,l] rho[l,0] * scale_vals[i] + # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] + # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] + # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals + return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals + # shape == (len(circuit_list),) ; may overflow but OK + + def _dprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs, scale_vals, wrt_slice=None): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + rholabel, elabel = spam_tuple + rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are + EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects + nCircuits = gs.shape[0] + + nDerivCols = self.model.num_params if wrt_slice is None else _slct.length(wrt_slice) + + # GATE DERIVS (assume d_gs is already sized/filtered) ------------------- + assert(d_gs.shape[1] == nDerivCols), "d_gs must be pre-filtered!" 
+ + #Compute d(probability)/dOps and save in return list (now have G,dG => product, dprod_dOps) + # prod, dprod_dOps = G,dG + # dp_dOps[i,j] = sum_k,l e[0,k] d_gs[i,j,k,l] rho[l,0] + # dp_dOps[i,j] = sum_k e[0,k] dot( d_gs, rho )[i,j,k,0] + # dp_dOps[i,j] = dot( e, dot( d_gs, rho ) )[0,i,j,0] + # dp_dOps = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,3)) + old_err2 = _np.seterr(invalid='ignore', over='ignore') + path = _np.einsum_path('hk,ijkl,lm->ij', e, d_gs, rho, optimize='optimal') + dp_dOps = _np.einsum('hk,ijkl,lm->ij', e, d_gs, rho, optimize=path[0]) * scale_vals[:, None] + _np.seterr(**old_err2) + # may overflow, but OK ; shape == (len(circuit_list), nDerivCols) + # may also give invalid value due to scale_vals being inf and dot-prod being 0. In + # this case set to zero since we can't tell whether it's + or - inf anyway... + dp_dOps[_np.isnan(dp_dOps)] = 0 + + #SPAM ------------- + + if self.model._param_interposer is not None: + #When there is an interposer, we compute derivs wrt *all* the ops params (inefficient?), + # then apply interposer, then take desired wrt_filter columns: + nOpDerivCols = self.model._param_interposer.num_op_params + + dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) + _fas(dp_drhos, [None, rhoVec.gpindices], + _np.squeeze(_np.dot(_np.dot(e, gs), rhoVec.deriv_wrt_params()), # *don't* apply wrt filter here + axis=(0,)) * scale_vals[:, None]) # may overflow, but OK + dp_drhos = _np.dot(dp_drhos, self.model._param_interposer.deriv_op_params_wrt_model_params()) + if wrt_slice is not None: dp_drhos = dp_drhos[:, wrt_slice] + + dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) + dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] + _fas(dp_dEs, [None, EVec.gpindices], _np.dot(dp_dAnyE, EVec.deriv_wrt_params())) + dp_dEs = _np.dot(dp_dEs, self.model._param_interposer.deriv_op_params_wrt_model_params()) + if wrt_slice is not None: dp_dEs = dp_dEs[:, wrt_slice] + + else: + #Simpler case of no interposer + nOpDerivCols = 
nDerivCols + + rho_wrtFilter, rho_gpindices = self._process_wrt_filter( + wrt_slice, self.model.circuit_layer_operator(rholabel, 'prep')) + E_wrtFilter, E_gpindices = self._process_wrt_filter( + wrt_slice, self.model.circuit_layer_operator(elabel, 'povm')) + + # Get: dp_drhos[i, rho_gpindices] = dot(e,gs[i],drho/drhoP) + # dp_drhos[i,J0+J] = sum_kl e[0,k] gs[i,k,l] drhoP[l,J] + # dp_drhos[i,J0+J] = dot(e, gs, drhoP)[0,i,J] + # dp_drhos[:,J0+J] = squeeze(dot(e, gs, drhoP),axis=(0,))[:,J] + dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) + _fas(dp_drhos, [None, rho_gpindices], + _np.squeeze(_np.dot(_np.dot(e, gs), + rhoVec.deriv_wrt_params(rho_wrtFilter)), + axis=(0,)) * scale_vals[:, None]) # may overflow, but OK + + # Get: dp_dEs[i, E_gpindices] = dot(transpose(dE/dEP),gs[i],rho)) + # dp_dEs[i,J0+J] = sum_lj dEPT[J,j] gs[i,j,l] rho[l,0] + # dp_dEs[i,J0+J] = sum_j dEP[j,J] dot(gs, rho)[i,j] + # dp_dEs[i,J0+J] = sum_j dot(gs, rho)[i,j,0] dEP[j,J] + # dp_dEs[i,J0+J] = dot(squeeze(dot(gs, rho),2), dEP)[i,J] + # dp_dEs[:,J0+J] = dot(squeeze(dot(gs, rho),axis=(2,)), dEP)[:,J] + dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) + # may overflow, but OK (deriv w.r.t any of self.effects - independent of which) + dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] + _fas(dp_dEs, [None, E_gpindices], + _np.dot(dp_dAnyE, EVec.deriv_wrt_params(E_wrtFilter))) + + sub_vdp = dp_drhos + dp_dEs + dp_dOps + return sub_vdp + + def _hprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs1, d_gs2, h_gs, scale_vals, + wrt_slice1=None, wrt_slice2=None): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + rholabel, elabel = spam_tuple + rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are + EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects + nCircuits = gs.shape[0] + + rho_wrtFilter1, rho_gpindices1 = 
self._process_wrt_filter( + wrt_slice1, self.model.circuit_layer_operator(rholabel, 'prep')) + rho_wrtFilter2, rho_gpindices2 = self._process_wrt_filter( + wrt_slice2, self.model.circuit_layer_operator(rholabel, 'prep')) + E_wrtFilter1, E_gpindices1 = self._process_wrt_filter( + wrt_slice1, self.model.circuit_layer_operator(elabel, 'povm')) + E_wrtFilter2, E_gpindices2 = self._process_wrt_filter( + wrt_slice2, self.model.circuit_layer_operator(elabel, 'povm')) + + nDerivCols1 = self.model.num_params if wrt_slice1 is None else _slct.length(wrt_slice1) + nDerivCols2 = self.model.num_params if wrt_slice2 is None else _slct.length(wrt_slice2) + + #flt1 = self._get_filter_info(wrtSlices1) + #flt2 = self._get_filter_info(wrtSlices2) + + # GATE DERIVS (assume h_gs is already sized/filtered) ------------------- + assert(h_gs.shape[1] == nDerivCols1), "h_gs must be pre-filtered!" + assert(h_gs.shape[2] == nDerivCols2), "h_gs must be pre-filtered!" + + #Compute d2(probability)/dGates2 and save in return list + # d2pr_dOps2[i,j,k] = sum_l,m e[0,l] h_gs[i,j,k,l,m] rho[m,0] + # d2pr_dOps2[i,j,k] = sum_l e[0,l] dot( d_gs, rho )[i,j,k,l,0] + # d2pr_dOps2[i,j,k] = dot( e, dot( d_gs, rho ) )[0,i,j,k,0] + # d2pr_dOps2 = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,4)) + old_err2 = _np.seterr(invalid='ignore', over='ignore') + d2pr_dOps2 = _np.squeeze(_np.dot(e, _np.dot(h_gs, rho)), axis=(0, 4)) * scale_vals[:, None, None] + _np.seterr(**old_err2) + + # may overflow, but OK ; shape == (len(circuit_list), nDerivCols, nDerivCols) + # may also give invalid value due to scale_vals being inf and dot-prod being 0. In + # this case set to zero since we can't tell whether it's + or - inf anyway... + d2pr_dOps2[_np.isnan(d2pr_dOps2)] = 0 + + # SPAM DERIVS (assume d_gs1 and d_gs2 are already sized/filtered) -------- + assert(d_gs1.shape[1] == nDerivCols1), "d_gs1 must be pre-filtered!" + assert(d_gs2.shape[1] == nDerivCols2), "d_gs1 must be pre-filtered!" 
+ + # Get: d2pr_drhos[i, j, rho_gpindices] = dot(e,d_gs[i,j],drho/drhoP)) + # d2pr_drhos[i,j,J0+J] = sum_kl e[0,k] d_gs[i,j,k,l] drhoP[l,J] + # d2pr_drhos[i,j,J0+J] = dot(e, d_gs, drhoP)[0,i,j,J] + # d2pr_drhos[:,:,J0+J] = squeeze(dot(e, d_gs, drhoP),axis=(0,))[:,:,J] + drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) + d2pr_drhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_drhos1, [None, None, rho_gpindices2], + _np.squeeze(_np.dot(_np.dot(e, d_gs1), drho), axis=(0,)) + * scale_vals[:, None, None]) # overflow OK + + # get d2pr_drhos where gate derivatives are wrt the 2nd set of gate parameters + if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? + assert(nDerivCols1 == nDerivCols2) + d2pr_drhos2 = _np.transpose(d2pr_drhos1, (0, 2, 1)) + else: + drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) + d2pr_drhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + _fas(d2pr_drhos2, [None, None, rho_gpindices1], + _np.squeeze(_np.dot(_np.dot(e, d_gs2), drho), axis=(0,)) + * scale_vals[:, None, None]) # overflow OK + d2pr_drhos2 = _np.transpose(d2pr_drhos2, (0, 2, 1)) + + # Get: d2pr_dEs[i, j, E_gpindices] = dot(transpose(dE/dEP),d_gs[i,j],rho) + # d2pr_dEs[i,j,J0+J] = sum_kl dEPT[J,k] d_gs[i,j,k,l] rho[l,0] + # d2pr_dEs[i,j,J0+J] = sum_k dEP[k,J] dot(d_gs, rho)[i,j,k,0] + # d2pr_dEs[i,j,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[i,j,J] + # d2pr_dEs[:,:,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[:,:,J] + d2pr_dEs1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + dp_dAnyE = _np.squeeze(_np.dot(d_gs1, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter2) + _fas(d2pr_dEs1, [None, None, E_gpindices2], + _np.dot(dp_dAnyE, devec)) + + # get d2pr_dEs where gate derivatives are wrt the 2nd set of gate parameters + if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? 
+ assert(nDerivCols1 == nDerivCols2) + d2pr_dEs2 = _np.transpose(d2pr_dEs1, (0, 2, 1)) + else: + d2pr_dEs2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + dp_dAnyE = _np.squeeze(_np.dot(d_gs2, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter1) + _fas(d2pr_dEs2, [None, None, E_gpindices1], _np.dot(dp_dAnyE, devec)) + d2pr_dEs2 = _np.transpose(d2pr_dEs2, (0, 2, 1)) + + # Get: d2pr_dErhos[i, e_offset[eIndex]:e_offset[eIndex+1], e_offset[rhoIndex]:e_offset[rhoIndex+1]] = + # dEP^T * prod[i,:,:] * drhoP + # d2pr_dErhos[i,J0+J,K0+K] = sum jk dEPT[J,j] prod[i,j,k] drhoP[k,K] + # d2pr_dErhos[i,J0+J,K0+K] = sum j dEPT[J,j] dot(prod,drhoP)[i,j,K] + # d2pr_dErhos[i,J0+J,K0+K] = dot(dEPT,prod,drhoP)[J,i,K] + # d2pr_dErhos[i,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[i,J,K] + # d2pr_dErhos[:,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[:,J,K] + d2pr_dErhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) + dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter1) + _fas(d2pr_dErhos1, (None, E_gpindices1, rho_gpindices2), + _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) + + # get d2pr_dEs where e derivatives are wrt the 2nd set of gate parameters + if wrt_slice1 == wrt_slice2: # Note: this doesn't involve gate derivatives + d2pr_dErhos2 = _np.transpose(d2pr_dErhos1, (0, 2, 1)) + else: + d2pr_dErhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) + dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter2) + _fas(d2pr_dErhos2, [None, E_gpindices2, rho_gpindices1], + _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) + d2pr_dErhos2 = _np.transpose(d2pr_dErhos2, (0, 2, 1)) + + #Note: these 2nd derivatives are non-zero when the spam vectors have + # a more than linear 
dependence on their parameters. + if self.model.circuit_layer_operator(rholabel, 'prep').has_nonzero_hessian(): + dp_dAnyRho = _np.dot(e, gs).squeeze(0) * scale_vals[:, None] # overflow OK + d2pr_d2rhos = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_d2rhos, [None, rho_gpindices1, rho_gpindices2], + _np.tensordot(dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep').hessian_wrt_params( + rho_wrtFilter1, rho_wrtFilter2), (1, 0))) + # _np.einsum('ij,jkl->ikl', dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep') \ + # .hessian_wrt_params(rho_wrtFilter1, rho_wrtFilter2)) + else: + d2pr_d2rhos = 0 + + if self.model.circuit_layer_operator(elabel, 'povm').has_nonzero_hessian(): + dp_dAnyE = _np.dot(gs, rho).squeeze(2) * scale_vals[:, None] # overflow OK + d2pr_d2Es = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_d2Es, [None, E_gpindices1, E_gpindices2], + _np.tensordot(dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( + E_wrtFilter1, E_wrtFilter2), (1, 0))) + # _np.einsum('ij,jkl->ikl', dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( + # E_wrtFilter1, E_wrtFilter2)) + else: + d2pr_d2Es = 0 + + # END SPAM DERIVS ----------------------- + + ret = d2pr_d2rhos + d2pr_dErhos2 + d2pr_drhos2 # wrt rho + ret += d2pr_dErhos1 + d2pr_d2Es + d2pr_dEs2 # wrt e + ret += d2pr_drhos1 + d2pr_dEs1 + d2pr_dOps2 # wrt gates + + return ret + + def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): + #Free memory from previous subtree iteration before computing caches + scaleVals = Gs = prodCache = scaleCache = None + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2) # prod cache + + #Fill cache info + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + + if not resource_alloc.is_host_leader: + # (same as "if resource_alloc.host_comm is 
not None and resource_alloc.host_comm.rank != 0")
+ # we cannot further utilize multiple processors when computing a single block. The required
+ # ending condition is that array_to_fill on each processor has been filled. But if memory
+ # is being shared and resource_alloc contains multiple processors on a single host, we only
+ # want *one* (the rank=0) processor to perform the computation, since array_to_fill will be
+ # shared memory that we don't want to have multiple procs using simultaneously to compute the
+ # same thing. Thus, we just do nothing on all of the non-root host_comm processors.
+ # We could also print a warning (?), or we could carefully guard any shared mem updates
+ # using "if resource_alloc.is_host_leader" conditions (if we could use multiple procs elsewhere).
+ return
+
+ #use cached data to final values
+ scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache))
+ Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0)
+ # ( n_circuits, dim, dim )
+
+ old_err = _np.seterr(over='ignore')
+ for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items():
+ # "element indices" index a circuit outcome probability in array_to_fill's first dimension
+ # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond
+ # to the element indices when `spamtuple` is used. 
+ # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices], + self._probs_from_rho_e(rho, E, Gs[tree_indices], scaleVals[tree_indices])) + _np.seterr(**old_err) + + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice)) + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + dProdCache = self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, + resource_alloc, param_slice) + if not resource_alloc.is_host_leader: + return # Non-root host processors aren't used anymore to compute the result on the root proc + + scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) + Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) + dGs = layout_atom.nonscratch_cache_view(dProdCache, axis=0) + + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices, dest_param_slice], self._dprobs_from_rho_e( + spam_tuple, rho, E, Gs[tree_indices], dGs[tree_indices], scaleVals[tree_indices], param_slice)) + + _np.seterr(**old_err) + + def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1, dest_param_slice2, layout_atom, + param_slice1, param_slice2, resource_alloc): + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2 + * _slct.length(param_slice1) * _slct.length(param_slice2)) + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + dProdCache1 = self._compute_dproduct_cache( + 
layout_atom.tree, prodCache, scaleCache, resource_alloc, param_slice1) # computed on rank=0 only + dProdCache2 = dProdCache1 if (param_slice1 == param_slice2) else \ + self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, + resource_alloc, param_slice2) # computed on rank=0 only + hProdCache = self._compute_hproduct_cache(layout_atom.tree, prodCache, dProdCache1, + dProdCache2, scaleCache, resource_alloc, + param_slice1, param_slice2) # computed on rank=0 only + + if not resource_alloc.is_host_leader: + return # Non-root host processors aren't used anymore to compute the result on the root proc + + scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) + Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) + dGs1 = layout_atom.nonscratch_cache_view(dProdCache1, axis=0) + dGs2 = layout_atom.nonscratch_cache_view(dProdCache2, axis=0) + #( n_circuits, nDerivColsX, dim, dim ) + + hGs = layout_atom.nonscratch_cache_view(hProdCache, axis=0) + #( n_circuits, len(wrt_filter1), len(wrt_filter2), dim, dim ) + + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices, dest_param_slice1, dest_param_slice2], self._hprobs_from_rho_e( + spam_tuple, rho, E, Gs[tree_indices], dGs1[tree_indices], dGs2[tree_indices], + hGs[tree_indices], scaleVals[tree_indices], param_slice1, param_slice2)) + + _np.seterr(**old_err) + + def bulk_product(self, circuits, scale=False, resource_alloc=None): + """ + Compute the products of many circuits at once. + + Parameters + ---------- + circuits : list of Circuits + The circuits to compute products for. These should *not* have any preparation or + measurement layers. + + scale : bool, optional + When True, return a scaling factor (see below). + + resource_alloc : ResourceAllocation + Available resources for this computation. 
Includes the number of processors + (MPI comm) and memory limit. + + Returns + ------- + prods : numpy array + Array of shape S x G x G, where: + - S == the number of operation sequences + - G == the linear dimension of a operation matrix (G x G operation matrices). + scaleValues : numpy array + Only returned when scale == True. A length-S array specifying + the scaling that needs to be applied to the resulting products + (final_product[i] = scaleValues[i] * prods[i]). + """ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + + # Need to break these circuits down into lanes first. + def compute_subcircuits(circuit, lanes_to_qubits_used, qubits_to_lanes): + + lanes_to_gates = [[] for _ in range(len(lanes_to_qubits_used))] + for layer in circuit: + if isinstance(layer, LabelTupTup): + group = [] + nused = 0 + for op in layer: + qubits_used = op.qubits + lane = qubits_to_lanes[qubits_used[0]] + if nused + len(qubits_used) == len(lanes_to_qubits_used[lane]): + group.append(op) + lanes_to_gates[lane].append(LabelTupTup(tuple(group))) + nused = 0 + group = [] + elif nused + len(qubits_used) < len(lanes_to_qubits_used[lane]): + nused += len(qubits_used) + group.append(op) + else: + raise ValueError("Too many indices") + elif isinstance(layer, LabelTup): + qubits_used = layer.qubits + lanes_to_gates[qubits_to_lanes[qubits_used[0]]] = layer + return lanes_to_gates + + full_list = [] + for cir in circuits: + full_list.append(compute_subcircuits(cir, self._lanes_used, self._qubits_to_lanes)) + + + nCircuits = len(circuits) + + eval_tree = _EvalTree.create(full_list) + prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) + + # Now the cache will also hold the circuit lanes. + # So 0:nCircuits*nLanes will hold all the Gs. 
+ # Tensor back up in a [(lane)*nLanes, (lane+1)*nLanes] + + sval = _np.zeros(len(circuits)) + gates = [1 for _ in circuits] + + for ind in range(): + for lane in range(len(self._lanes_used)): + gates[ind] = _np.kron(gates[ind], prodCache[lane + ind*len(self._lanes_used)]) + sval[ind] += scaleCache[lane + ind*len(self._lanes_used)] + + gates = _np.array(gates) + old_err = _np.seterr(over="ignore") + gates *= _np.exp(sval)[:, None, None] + _np.seterr(**old_err) + + + # EvalTree evaluates a "cache" which can contain additional (intermediate) elements + scaleVals = self._scale_exp(scaleCache[0:nCircuits]) + Gs = prodCache[0:nCircuits] + + if scale: + return Gs, scaleVals + else: + old_err = _np.seterr(over='ignore') + Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok + _np.seterr(**old_err) + return Gs + + def bulk_dproduct(self, circuits, flat=False, return_prods=False, + scale=False, resource_alloc=None, wrt_filter=None): + """ + Compute the derivative of a many operation sequences at once. + + Parameters + ---------- + circuits : list of Circuits + The circuits to compute products for. These should *not* have any preparation or + measurement layers. + + flat : bool, optional + Affects the shape of the returned derivative array (see below). + + return_prods : bool, optional + when set to True, additionally return the probabilities. + + scale : bool, optional + When True, return a scaling factor (see below). + + resource_alloc : ResourceAllocation + Available resources for this computation. Includes the number of processors + (MPI comm) and memory limit. + + wrt_filter : list of ints, optional + If not None, a list of integers specifying which gate parameters + to include in the derivative. Each element is an index into an + array of gate parameters ordered by concatenating each gate's + parameters (in the order specified by the model). 
This argument + is used internally for distributing derivative calculations across + multiple processors. + + Returns + ------- + derivs : numpy array + * if flat == False, an array of shape S x M x G x G, where: + - S == len(circuits) + - M == the length of the vectorized model + - G == the linear dimension of a operation matrix (G x G operation matrices) + and derivs[i,j,k,l] holds the derivative of the (k,l)-th entry + of the i-th operation sequence product with respect to the j-th model + parameter. + * if flat == True, an array of shape S*N x M where: + - N == the number of entries in a single flattened gate (ordering same as numpy.flatten), + - S,M == as above, + and deriv[i,j] holds the derivative of the (i % G^2)-th entry of + the (i / G^2)-th flattened operation sequence product with respect to + the j-th model parameter. + products : numpy array + Only returned when return_prods == True. An array of shape + S x G x G; products[i] is the i-th operation sequence product. + scaleVals : numpy array + Only returned when scale == True. An array of shape S such that + scaleVals[i] contains the multiplicative scaling needed for + the derivatives and/or products for the i-th operation sequence. + """ + nCircuits = len(circuits) + nDerivCols = self.model.num_params if (wrt_filter is None) else _slct.length(wrt_filter) + + wrtSlice = _slct.list_to_slice(wrt_filter) if (wrt_filter is not None) else None + #TODO: just allow slices as argument: wrt_filter -> wrtSlice? 
+ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + + eval_tree = _EvalTree.create(circuits) + prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) + dProdCache = self._compute_dproduct_cache(eval_tree, prodCache, scaleCache, + resource_alloc.comm, wrtSlice) + + # EvalTree evaluates a "cache" which can contain additional (intermediate) elements + scaleVals = self._scale_exp(scaleCache[0:nCircuits]) + Gs = prodCache[0:nCircuits] + dGs = dProdCache[0:nCircuits] + + if not scale: + old_err = _np.seterr(over='ignore', invalid='ignore') + if return_prods: + Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok + + # may overflow or get nans (invalid), but ok + dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 3) * scaleVals, 0, 3) + # convert nans to zero, as these occur b/c an inf scaleVal is mult by a zero deriv value, and we + dGs[_np.isnan(dGs)] = 0 + _np.seterr(**old_err) + + if flat: + # cols = deriv cols, rows = flattened everything else + dim = self.model.evotype.minimal_dim(self.model.state_space) + dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 1).reshape( + (nDerivCols, nCircuits * dim**2)), 0, 1) + + if return_prods: + return (dGs, Gs, scaleVals) if scale else (dGs, Gs) + else: + return (dGs, scaleVals) if scale else dGs + + ## --------------------------------------------------------------------------------------------- + ## TIME DEPENDENT functionality ---------------------------------------------------------------- + ## --------------------------------------------------------------------------------------------- + + def _ds_quantities(self, timestamp, ds_cache, layout, dataset, TIMETOL=1e-6): + if timestamp not in ds_cache: + if 'truncated_ds' not in ds_cache: + ds_cache['truncated_ds'] = dataset.truncate(layout.circuits) + trunc_dataset = ds_cache['truncated_ds'] + + if 'ds_for_time' not in ds_cache: + #tStart = _time.time() + ds_cache['ds_for_time'] = trunc_dataset.split_by_time() + #print("DB: Split 
dataset by time in %.1fs (%d timestamps)" % (_time.time() - tStart, + # len(ds_cache['ds_for_time']))) + + if timestamp not in ds_cache['ds_for_time']: + return (None, None, None, None, None) + + #Similar to MDC store's add_count_vectors function -- maybe consolidate in FUTURE? + counts = _np.empty(layout.num_elements, 'd') + totals = _np.empty(layout.num_elements, 'd') + dataset_at_t = ds_cache['ds_for_time'][timestamp] # trunc_dataset.time_slice(timestamp, timestamp+TIMETOL) + + firsts = []; indicesOfCircuitsWithOmittedData = [] + for (i, circuit) in enumerate(layout.circuits): # should be 'ds_circuits' really + inds = layout.indices_for_index(i) + if circuit in dataset_at_t: + cnts = dataset_at_t[circuit].counts + else: + cnts = {} # Note: this will cause 0 totals, which will need to be handled downstream + totals[inds] = sum(cnts.values()) # dataset[opStr].total + counts[inds] = [cnts.get(x, 0) for x in layout.outcomes_for_index(i)] + lklen = _slct.length(inds) # consolidate w/ `add_omitted_freqs`? 
+ if 0 < lklen < self.model.compute_num_outcomes(circuit): + firsts.append(_slct.to_array(inds)[0]) + indicesOfCircuitsWithOmittedData.append(i) + + if len(firsts) > 0: + firsts = _np.array(firsts, 'i') + indicesOfCircuitsWithOmittedData = _np.array(indicesOfCircuitsWithOmittedData, 'i') + #print("DB: SPARSE DATA: %d of %d rows have sparse data" % (len(firsts), len(layout.circuits))) + else: + firsts = indicesOfCircuitsWithOmittedData = None + + #if self.circuits.circuit_weights is not None: + # SEE add_count_vectors + + nonzero_totals = _np.where(_np.abs(totals) < 1e-10, 1e-10, totals) # avoid divide-by-zero error on nxt line + freqs = counts / nonzero_totals + ds_cache[timestamp] = (counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData) + + return ds_cache[timestamp] + + def _bulk_fill_timedep_objfn(self, raw_objective, array_to_fill, layout, ds_circuits, + num_total_outcomes, dataset, ds_cache=None): + + assert(self._mode == "distribute_by_timestamp"), \ + ("Must set `distribute_by_timestamp=True` to use a " + "time-dependent objective function with MatrixForwardSimulator!") + + resource_alloc = layout.resource_alloc() + atom_resource_alloc = layout.resource_alloc('atom-processing') + atom_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin + + #Split timestamps up between processors - maybe do this in a time-dep layout? 
+ all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} + my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ + _mpit.distribute_indices(list(range(len(all_timestamps))), atom_resource_alloc) + shared_mem_leader = timestamp_processing_ralloc.is_host_leader + + probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, + (layout.num_elements,), 'd') + # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then + # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since + #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. + + array_to_fill[:] = 0.0 + my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results + assert(my_array_to_fill.shape == (layout.num_elements,)) + + for timestamp_index in my_timestamp_inds: + timestamp = all_timestamps[timestamp_index] + + # compute objective at time timestamp + counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ + self._ds_quantities(timestamp, ds_cache, layout, dataset) + if counts is None: return # no data at this time => no contribution + + for _, obj in self.model._iter_parameterized_objs(): + obj.set_time(timestamp) + for opcache in self.model._opcaches.values(): + for obj in opcache.values(): + obj.set_time(timestamp) + + for atom in layout.atoms: # layout only holds local atoms + self._bulk_fill_probs_atom(probs_array[atom.element_slice], atom, timestamp_processing_ralloc) + + timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready + # (similar to DistributableForwardSimulator._bulk_fill_probs) + + terms = raw_objective.terms(probs_array, counts, totals, freqs) + if firsts is not None and shared_mem_leader: # consolidate with `_update_terms_for_omitted_probs` + omitted_probs = 1.0 - 
_np.array([_np.sum(probs_array[layout.indices_for_index(i)]) + for i in indicesOfCircuitsWithOmittedData]) + terms[firsts] += raw_objective.zero_freq_terms(totals[firsts], omitted_probs) + timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem + + my_array_to_fill += terms + + #collect/gather results (SUM local arrays together) + resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) + + _smt.cleanup_shared_ndarray(probs_array_shm) + + def _bulk_fill_timedep_dobjfn(self, raw_objective, array_to_fill, layout, ds_circuits, + num_total_outcomes, dataset, ds_cache=None): + + assert(self._mode == "distribute_by_timestamp"), \ + ("Must set `distribute_by_timestamp=True` to use a " + "time-dependent objective function with MatrixForwardSimulator!") + + resource_alloc = layout.resource_alloc() + param_resource_alloc = layout.resource_alloc('param-processing') + param_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin + + #Split timestamps up between processors - maybe do this in a time-dep layout? 
+ all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} + my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ + _mpit.distribute_indices(list(range(len(all_timestamps))), param_resource_alloc) + shared_mem_leader = timestamp_processing_ralloc.is_host_leader + + probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, + (layout.num_elements,), 'd') + dprobs_array, dprobs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, + (layout.num_elements, self.model.num_params), 'd') + # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then + # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since + #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. + + array_to_fill[:] = 0.0 + my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results + all_param_slice = slice(0, self.model.num_params) # All params computed at once for now + assert(my_array_to_fill.shape == (layout.num_elements, self.model.num_params)) + + for timestamp_index in my_timestamp_inds: + timestamp = all_timestamps[timestamp_index] + # compute objective at time layout_atom.time + #print("DB: Rank %d : layout atom for t=" % resource_alloc.comm.rank, layout_atom.timestamp) + + counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ + self._ds_quantities(timestamp, ds_cache, layout, dataset) + + for _, obj in self.model._iter_parameterized_objs(): + obj.set_time(timestamp) + for opcache in self.model._opcaches.values(): + for obj in opcache.values(): + obj.set_time(timestamp) + + for atom in layout.atoms: # layout only holds local atoms + self._bulk_fill_probs_atom(probs_array, atom, timestamp_processing_ralloc) + self._bulk_fill_dprobs_atom(dprobs_array, all_param_slice, atom, + all_param_slice, timestamp_processing_ralloc) + + 
timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready + # (similar to DistributableForwardSimulator._bulk_fill_probs) + + if shared_mem_leader: + if firsts is not None: # consolidate with TimeIndependentMDCObjectiveFunction.dterms? + dprobs_omitted_rowsum = _np.empty((len(firsts), self.model.num_params), 'd') + for ii, i in enumerate(indicesOfCircuitsWithOmittedData): + dprobs_omitted_rowsum[ii, :] = _np.sum(dprobs_array[layout.indices_for_index(i), :], axis=0) + + dprobs_array *= raw_objective.dterms(probs_array, counts, totals, freqs)[:, None] + + if firsts is not None: # consolidate with _update_dterms_for_omitted_probs? + omitted_probs = 1.0 - _np.array([_np.sum(probs_array[layout.indices_for_index(i)]) + for i in indicesOfCircuitsWithOmittedData]) + dprobs_array[firsts] -= raw_objective.zero_freq_dterms(totals[firsts], omitted_probs)[:, None] \ + * dprobs_omitted_rowsum + timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem + + my_array_to_fill += dprobs_array + + #collect/gather results (SUM local arrays together) + resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) + + _smt.cleanup_shared_ndarray(probs_array_shm) + _smt.cleanup_shared_ndarray(dprobs_array_shm) + + def bulk_fill_timedep_chi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip_for_weighting, prob_clip_interval, ds_cache=None): + """ + Compute the chi2 contributions for an entire tree of circuits, allowing for time dependent operations. + + Computation is performed by summing together the contributions for each time the circuit is + run, as given by the timestamps in `dataset`. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated 1D numpy array of length equal to the + total number of computed elements (i.e. 
layout.num_elements) + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the chi2 contributions. + + min_prob_clip_for_weighting : float, optional + Sets the minimum and maximum probability p allowed in the chi^2 + weights: N/(p*(1-p)) by clipping probability p values to lie within + the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function + raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, + layout.resource_alloc()) + return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) + + def bulk_fill_timedep_dchi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip_for_weighting, prob_clip_interval, chi2_array_to_fill=None, + ds_cache=None): + """ + Compute the chi2 jacobian contributions for an entire tree of circuits, allowing for time dependent operations. 
+ + Similar to :meth:`bulk_fill_timedep_chi2` but compute the *jacobian* + of the summed chi2 contributions for each circuit with respect to the + model's parameters. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated ExM numpy array where E is the total number of + computed elements (i.e. layout.num_elements) and M is the + number of model parameters. + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the chi2 contributions. + + min_prob_clip_for_weighting : float, optional + Sets the minimum and maximum probability p allowed in the chi^2 + weights: N/(p*(1-p)) by clipping probability p values to lie within + the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + chi2_array_to_fill : numpy array, optional + when not None, an already-allocated length-E numpy array that is filled + with the per-circuit chi2 contributions, just like in + bulk_fill_timedep_chi2(...). 
+ + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function + raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, + layout.resource_alloc()) + return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) + + def bulk_fill_timedep_loglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip, radius, prob_clip_interval, ds_cache=None): + """ + Compute the log-likelihood contributions (within the "poisson picture") for an entire tree of circuits. + + Computation is performed by summing together the contributions for each time the circuit is run, + as given by the timestamps in `dataset`. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated 1D numpy array of length equal to the + total number of computed elements (i.e. layout.num_elements) + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the logl contributions. + + min_prob_clip : float, optional + The minimum probability treated normally in the evaluation of the + log-likelihood. 
A penalty function replaces the true log-likelihood + for probabilities that lie below this threshold so that the + log-likelihood never becomes undefined (which improves optimizer + performance). + + radius : float, optional + Specifies the severity of rounding used to "patch" the + zero-frequency terms of the log-likelihood. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction + raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, + layout.resource_alloc()) + return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) + + def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip, radius, prob_clip_interval, logl_array_to_fill=None, ds_cache=None): + """ + Compute the ("poisson picture")log-likelihood jacobian contributions for an entire tree of circuits. + + Similar to :meth:`bulk_fill_timedep_loglpp` but compute the *jacobian* + of the summed logl (in posison picture) contributions for each circuit + with respect to the model's parameters. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated ExM numpy array where E is the total number of + computed elements (i.e. layout.num_elements) and M is the + number of model parameters. + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). 
This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the logl contributions. + + min_prob_clip : float + a regularization parameter for the log-likelihood objective function. + + radius : float + a regularization parameter for the log-likelihood objective function. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + logl_array_to_fill : numpy array, optional + when not None, an already-allocated length-E numpy array that is filled + with the per-circuit logl contributions, just like in + bulk_fill_timedep_loglpp(...). 
+ + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction + raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, + layout.resource_alloc()) + return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) From 2abdfa6fd4c1b882bb888d0f77a48026c0ef3186 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 20 Jun 2025 16:30:57 -0700 Subject: [PATCH 032/141] save the lane information if the circuit is built from tensor products --- pygsti/circuits/circuit.py | 44 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index f6c2e2043..02fba6c61 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -10,7 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** -import collections as _collections +from __future__ import annotations import itertools as _itertools import warnings as _warnings @@ -527,11 +527,13 @@ def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occ self._name = name # can be None #self._times = None # for FUTURE expansion self.auxinfo = {} # for FUTURE expansion / user metadata + self.saved_auxinfo = {} + self.saved_auxinfo["lanes"] = {tuple(line_labels): self._labels} #Note: If editing _copy_init one should also check _bare_init in case changes must be propagated. 
#specialized codepath for copying def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None, - compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None): + compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None, saved_aux: dict[str, dict]={}): self._labels = labels self._line_labels = line_labels self._occurrence_id = occurrence @@ -547,6 +549,9 @@ def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occ #self._times = None # for FUTURE expansion self.auxinfo = {} # for FUTURE expansion / user metadata + + self.saved_auxinfo = saved_aux + return self #pickle management functions @@ -1047,13 +1052,13 @@ def copy(self, editable='auto'): editable_labels =[[lbl] if lbl.IS_SIMPLE else list(lbl.components) for lbl in self._labels] return ret._copy_init(editable_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, - self._compilable_layer_indices_tup) + self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo) else: #copy the editable labels (avoiding shallow copy issues) editable_labels = [sublist.copy() for sublist in self._labels] return ret._copy_init(editable_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, - self._compilable_layer_indices_tup) + self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo) else: #create static copy if self._static: #if presently static leverage precomputed hashable_tup and hash. 
@@ -1062,7 +1067,7 @@ def copy(self, editable='auto'): return ret._copy_init(self._labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, self._compilable_layer_indices_tup, - self._hashable_tup, self._hash) + self._hashable_tup, self._hash, saved_aux=self.saved_auxinfo) else: static_labels = tuple([layer_lbl if isinstance(layer_lbl, _Label) else _Label(layer_lbl) for layer_lbl in self._labels]) @@ -1070,7 +1075,7 @@ def copy(self, editable='auto'): return ret._copy_init(static_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, self._compilable_layer_indices_tup, - hashable_tup, hash(hashable_tup)) + hashable_tup, hash(hashable_tup), saved_aux=self.saved_auxinfo) def clear(self): """ @@ -1219,7 +1224,7 @@ def extract_labels(self, layers=None, lines=None, strict=True): return self._labels[layers] if isinstance(layers, slice) and strict is True: # if strict=False, then need to recompute line labels #can speed this up a measurably by manually computing the new hashable tuple value and hash - if not self._line_labels in (('*',), ()): + if self._line_labels not in (('*',), ()): new_hashable_tup = self._labels[layers] + ('@',) + self._line_labels else: new_hashable_tup = self._labels[layers] @@ -2231,7 +2236,7 @@ def insert_layer_inplace(self, circuit_layer, j): self.insert_labels_into_layers_inplace([circuit_layer], j) - def insert_circuit(self, circuit, j): + def insert_circuit(self, circuit: Circuit, j): """ Inserts a circuit into this circuit, returning a copy. @@ -2259,7 +2264,7 @@ def insert_circuit(self, circuit, j): if self._static: cpy.done_editing() return cpy - def insert_circuit_inplace(self, circuit, j): + def insert_circuit_inplace(self, circuit: Circuit, j): """ Inserts a circuit into this circuit. 
@@ -2294,7 +2299,7 @@ def insert_circuit_inplace(self, circuit, j): labels_to_insert = circuit.extract_labels(layers=None, lines=lines_to_insert) self.insert_labels_into_layers_inplace(labels_to_insert, j) - def append_circuit(self, circuit): + def append_circuit(self, circuit: Circuit): """ Append a circuit to the end of this circuit, returning a copy. @@ -2312,7 +2317,7 @@ def append_circuit(self, circuit): """ return self.insert_circuit(circuit, self.num_layers) - def append_circuit_inplace(self, circuit): + def append_circuit_inplace(self, circuit: Circuit): """ Append a circuit to the end of this circuit. @@ -2331,7 +2336,7 @@ def append_circuit_inplace(self, circuit): assert(not self._static), "Cannot edit a read-only circuit!" self.insert_circuit_inplace(circuit, self.num_layers) - def prefix_circuit(self, circuit): + def prefix_circuit(self, circuit: Circuit): """ Prefix a circuit to the beginning of this circuit, returning a copy. @@ -2349,7 +2354,7 @@ def prefix_circuit(self, circuit): """ return self.insert_circuit(circuit, 0) - def prefix_circuit_inplace(self, circuit): + def prefix_circuit_inplace(self, circuit: Circuit): """ Prefix a circuit to the beginning of this circuit. @@ -2368,7 +2373,7 @@ def prefix_circuit_inplace(self, circuit): assert(not self._static), "Cannot edit a read-only circuit!" self.insert_circuit_inplace(circuit, 0) - def tensor_circuit_inplace(self, circuit, line_order=None): + def tensor_circuit_inplace(self, circuit: Circuit, line_order=None): """ The tensor product of this circuit and `circuit`. 
@@ -2421,8 +2426,9 @@ def tensor_circuit_inplace(self, circuit, line_order=None): #Add circuit's labels into this circuit self.insert_labels_as_lines_inplace(circuit._labels, line_labels=circuit.line_labels) self._line_labels = new_line_labels # essentially just reorders labels if needed + self.saved_auxinfo["lanes"].update(circuit.saved_auxinfo["lanes"]) - def tensor_circuit(self, circuit, line_order=None): + def tensor_circuit(self, circuit: Circuit, line_order=None): """ The tensor product of this circuit and `circuit`, returning a copy. @@ -2450,7 +2456,7 @@ def tensor_circuit(self, circuit, line_order=None): if self._static: cpy.done_editing() return cpy - def replace_layer_with_circuit_inplace(self, circuit, j): + def replace_layer_with_circuit_inplace(self, circuit: Circuit, j): """ Replaces the `j`-th layer of this circuit with `circuit`. @@ -2470,7 +2476,7 @@ def replace_layer_with_circuit_inplace(self, circuit, j): del self[j] self.insert_labels_into_layers_inplace(circuit, j) - def replace_layer_with_circuit(self, circuit, j): + def replace_layer_with_circuit(self, circuit: Circuit, j): """ Replaces the `j`-th layer of this circuit with `circuit`, returning a copy. @@ -3823,7 +3829,7 @@ def convert_to_cirq(self, return cirq.Circuit(moments) @classmethod - def from_cirq(cls, circuit, qubit_conversion=None, cirq_gate_conversion= None, + def from_cirq(cls, circuit: Circuit, qubit_conversion=None, cirq_gate_conversion= None, remove_implied_idles = True, global_idle_replacement_label = 'auto'): """ Converts and instantiates a pyGSTi Circuit object from a Cirq Circuit object. @@ -4489,7 +4495,7 @@ class CompressedCircuit(object): takes more time but could result in better compressing. 
""" - def __init__(self, circuit, min_len_to_compress=20, max_period_to_look_for=20): + def __init__(self, circuit: Circuit, min_len_to_compress=20, max_period_to_look_for=20): """ Create a new CompressedCircuit object From a0187149b54e2c7485542723822d9648987efe79 Mon Sep 17 00:00:00 2001 From: nkoskelo Date: Tue, 1 Jul 2025 13:11:05 -0700 Subject: [PATCH 033/141] Preliminary spatially homogeneous qubits. --- pygsti/circuits/circuit.py | 40 ++++++++++++++++++++++++++++++++ pygsti/models/localnoisemodel.py | 28 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 02fba6c61..28d6397fe 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -2498,6 +2498,46 @@ def replace_layer_with_circuit(self, circuit: Circuit, j): if self._static: cpy.done_editing() return cpy + def replace_spatially_equivalent_qubits(self, old_single_qubit, equiv_qubit_in_model): + """ + Changes the *name* of a gate throughout this Circuit. + + Note that the name is only a part of the label identifying each + gate, and doesn't include the lines (qubits) a gate acts upon. For + example, the "Gx:0" and "Gx:1" labels both have the same name but + act on different qubits. + + Parameters + ---------- + old_single_qubit : int + The qubit to replace. + + equiv_qubit_in_model : int + The qubit to replace `equiv_qubit_in_model` with. + + Returns + ------- + None + """ + assert(not self._static), "Cannot edit a read-only circuit!" 
+
+
+        def replace(obj):  # obj is either a simple label or a list
+            if isinstance(obj, _Label):
+                if len(obj.qubits) == 1:
+                    if obj.qubits[0] == old_single_qubit:
+                        newobj = _Label(obj.name,
+                                        (equiv_qubit_in_model,))
+                    else:
+                        newobj = obj
+                else:
+                    newobj = obj
+            else:
+                newobj = [replace(sub) for sub in obj]
+            return newobj
+
+        self._labels = replace(self._labels)
+
+
     def replace_gatename_inplace(self, old_gatename, new_gatename):
         """
         Changes the *name* of a gate throughout this Circuit.
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py
index 739fb8f7d..cc46bb770 100644
--- a/pygsti/models/localnoisemodel.py
+++ b/pygsti/models/localnoisemodel.py
@@ -612,3 +612,31 @@ def _layer_component_operation(self, model, complbl, cache):
         else:
             ret = _opfactory.op_from_factories(model.factories['layers'], complbl)
         return ret
+
+
+
+
+class LocalNoiseModelWithEquivalentClassesForSingleQubits(LocalNoiseModel):
+
+    def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default",
+                 simulator="auto", on_construction_error='raise',
+                 independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none", equiv_qubits_classes=None):
+
+        super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator,
+                         on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode)
+
+        self.equiv_qubit_classes = equiv_qubits_classes
+
+        for key in self.operation_blks:
+            for labels in self.operation_blks[key]:
+                qubits_used = labels.qubits
+                if len(qubits_used) == 1:
+                    # We may be able to replace this.
+                    new_qubit = self.equiv_qubit_classes[int(qubits_used[0])]
+                    if new_qubit not in qubits_used:
+                        # Need to replace.
+                        new_label = labels[0] + (new_qubit,)
+                        self.operation_blks[key][labels] = self.operation_blks[key][new_label]
+                        # This assumes no circular updates.
+ + \ No newline at end of file From 646c5d117d0f92da4c9c3bf6538a86a2978014cd Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Mon, 7 Jul 2025 17:56:06 -0700 Subject: [PATCH 034/141] Save --- pygsti/baseobjs/label.py | 3 +- pygsti/forwardsims/matrixforwardsim.py | 93 +++ pygsti/layouts/evaltree.py | 757 ++++++++++++++++++- pygsti/layouts/matrixlayout.py | 229 +++++- pygsti/models/modelconstruction.py | 55 +- pygsti/models/singlequbitequivalencerules.py | 115 +++ 6 files changed, 1246 insertions(+), 6 deletions(-) create mode 100644 pygsti/models/singlequbitequivalencerules.py diff --git a/pygsti/baseobjs/label.py b/pygsti/baseobjs/label.py index db1d92745..08ce3fe70 100644 --- a/pygsti/baseobjs/label.py +++ b/pygsti/baseobjs/label.py @@ -138,7 +138,8 @@ def __new__(cls, name, state_space_labels=None, time=None, args=None): return LabelStr.init(name, time) else: - if args is not None: return LabelTupWithArgs.init(name, state_space_labels, time, args) + if args is not None: + return LabelTupWithArgs.init(name, state_space_labels, time, args) else: if time == 0.0: return LabelTup.init(name, state_space_labels) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 399c67456..d8dccbc58 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -21,7 +21,9 @@ from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator from pygsti.forwardsims.forwardsim import _bytes_for_array_types from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS from pygsti.layouts.matrixlayout import MatrixCOPALayout as _MatrixCOPALayout +from pygsti.layouts.matrixlayout import _MatrixCOPALayoutAtomWithLCS from pygsti.baseobjs.profiler import DummyProfiler as _DummyProfiler from pygsti.baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation from 
pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter @@ -3673,3 +3675,94 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota layout.resource_alloc()) return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, ds_cache) + + +class LCSEvalTreeMatrixForwardSimulator(MatrixForwardSimulator): + + def bulk_product(self, circuits, scale=False, resource_alloc=None): + """ + Compute the products of many circuits at once. + + Parameters + ---------- + circuits : list of Circuits + The circuits to compute products for. These should *not* have any preparation or + measurement layers. + + scale : bool, optional + When True, return a scaling factor (see below). + + resource_alloc : ResourceAllocation + Available resources for this computation. Includes the number of processors + (MPI comm) and memory limit. + + Returns + ------- + prods : numpy array + Array of shape S x G x G, where: + - S == the number of operation sequences + - G == the linear dimension of a operation matrix (G x G operation matrices). + scaleValues : numpy array + Only returned when scale == True. A length-S array specifying + the scaling that needs to be applied to the resulting products + (final_product[i] = scaleValues[i] * prods[i]). + """ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + nCircuits = len(circuits) + + eval_tree = _EvalTreeLCS(circuits) + prodCache = eval_tree.fill_out_circuit_cache(self.model) + Gs = prodCache[0:nCircuits] + + + return Gs + + def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, resource_alloc): + + # Overestimate the amount of cache usage by assuming everything is the same size. 
+ dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache + + prodCache = layout_atom.tree.collapse_circuits_to_process_matrices(self.model) + Gs = layout_atom.tree.reconstruct_full_matrices(prodCache) + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + # "element indices" index a circuit outcome probability in array_to_fill's first dimension + # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond + # to the the element indices when `spamtuple` is used. + # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices], + self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) + _np.seterr(**old_err) + + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + + eps = 1e-7 # hardcoded? 
+ if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + for i in range(self.model.num_params): + if i in iParamToFinal: + iFinal = iParamToFinal[i] + vec = orig_vec.copy(); vec[i] += eps + self.model.from_vector(vec, close=True) + self._bulk_fill_probs_atom(probs2, layout_atom) + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): + return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache) \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index d8aa24f85..9b07ffa76 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -18,6 +18,12 @@ from pygsti.circuits.circuit import Circuit as _Circuit from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter +from pygsti.baseobjs.label import LabelTupTup, Label +from pygsti.modelmembers.operations import create_from_superop_mx +from pygsti.modelmembers.operations import LinearOperator as _LinearOperator +import itertools +from typing import Sequence + def _walk_subtree(treedict, indx, running_inds): @@ -340,7 +346,7 @@ def _get_start_indices(max_intersect): (_time.time() - tm)); tm = _time.time() #merge_method = "fast" - #Another possible algorith (but slower) + #Another possible 
algorithm (but slower) #if merge_method == "best": # while len(indicesLeft) > 0: # iToMergeInto,_ = min(enumerate(map(len,subTreeSetList)), @@ -444,3 +450,752 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists + + + +def _best_matching_only(A: Sequence, B: Sequence) -> int: + """ + Returns: + ----- + int - the length of the longest matching prefix between A and B. + """ + i = 0 + n = len(A) + m = len(B) + while i < n and i < m: + if A[i] != B[i]: + return len(A[:i]) + i += 1 + return len(A[:i]) + + + +def _lcs_dp_version(A, B): + """ + Compute the longest common substring between A and B using + dynamic programming. + + + This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. + + """ + + table = setup_lcs_dynamic_programming_table(A, B) + n, m = table.shape + for i in range(n-2, -1, -1): + for j in range(m-2, -1, -1): + opt1 = 0 + if A[i] == B[j]: + opt1 = _best_matching_only(A[i:], B[j:]) + opt2 = table[i, j+1] + opt3 = table[i+1, j] + table[i,j] = max(opt1, opt2, opt3) + return table + +def setup_lcs_dynamic_programming_table(A, B): + """ + Create the table used for LCS dynamic programming. 
+ """ + return _np.zeros((len(A) + 1, len(B) + 1)) + +def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): + if table_data_and_sequences: + table, sequences = table_data_and_sequences + else: + table, sequences = _compute_lcs_for_every_pair_of_circuits(circuits) + + if internal_tables_and_sequences: + internal_subtable, internal_subsequences = internal_tables_and_sequences + else: + internal_subtable, internal_subsequences = build_internal_tables(circuits) + + best_index = _np.where(table == _np.max(table)) + best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) + updated_circuits = circuits + cache_num = starting_cache_num + + # Build sequence dict + all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} + + if _np.max(internal_subtable) >= _np.max(table): + # We are only going to replace if this was the longest substring. + for cir_ind in best_internal_index[0]: + for seq in internal_subsequences[cir_ind]: + key = tuple(seq) + if key in all_subsequences_to_replace: + all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] + else: + all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} + + if _np.max(table) >= _np.max(internal_subtable): + for ii in range(len(best_index[0])): + starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] + cir_index = best_index[0][ii] + cir_index2 = best_index[1][ii] + seq = updated_circuits[cir_index][starting_point: int(starting_point + length+1)] + + key = tuple(seq) + if key in all_subsequences_to_replace: + if cir_index not in all_subsequences_to_replace[key]: + # We did not already handle this with internal subsequences. 
+ all_subsequences_to_replace[key][cir_index] = [starting_point] + if cir_index2 not in all_subsequences_to_replace[key]: + all_subsequences_to_replace[key][cir_index2] = [starting_point_2] + + else: + all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} + + + # Handle the updates. + old_cache_num = cache_num + for seq, cdict in all_subsequences_to_replace.items(): + w = len(seq) + if w > 1 or (not isinstance(seq[0], int)): + # We have reached an item which we can just compute. + for cir_ind in cdict: + my_cir = updated_circuits[cir_ind] + sp = 0 + while sp+w <= len(my_cir): + if list(my_cir[sp: sp+w]) == list(seq): + my_cir[sp: sp + w] = [cache_num] + + sp += 1 + updated_circuits[cir_ind] = my_cir + + cache_struct[cir_ind] = updated_circuits[cir_ind] + + updated_circuits.append(list(seq)) + cache_struct[cache_num] = updated_circuits[cache_num] + + cache_num += 1 + + sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + + return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round + +def locate_sequences_in_AB(A, B, dp_table) -> tuple[int, int, int]: + """ + Finds the indices of the starting points of the sequences in A and B. + + Returns: + --------- + int - starting index in A of LCS(A,B) + int - starting index in B of LCS(A,B) + int - length of LCS(A,B) + """ + n, m = dp_table.shape + i = 0 + j = 0 + while i < n-1 and j < m -1: + curr = dp_table[i,j] + opt1 = dp_table[i+1, j+1] + opt2 = dp_table[i+1, j] + opt3 = dp_table[i, j+1] + options = [opt1, opt2, opt3] + if _np.all(curr == options): + i += 1 + j += 1 + elif opt2 > opt1 and opt2 > opt3: + i += 1 + elif opt3 > opt2 and opt3 > opt1: + j += 1 + else: + # All three options are equal. So we should march the diagonal. 
+ i += 1 + j += 1 + return i-1, j-1, dp_table[i,j] + return None, None, None + +def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): + """ + Computes the LCS for every pair of circuits A,B in circuit_list + """ + best_subsequences = {} + best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) + curr_best = 0 + for i, cir0 in enumerate(circuit_list): + if len(cir0) >= curr_best: + # Could be the best. + for j in range(i+1, len(circuit_list)): + cir1 = circuit_list[j] + if len(cir1) >= curr_best: + table = _lcs_dp_version(cir0, cir1) + best_lengths[i,j] = table[0,0] + best_subsequences[(i,j)] = locate_sequences_in_AB(cir0, cir1, table) + curr_best = max(best_lengths[i,j], curr_best) + else: + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + else: + # Skipped because cannot be the best yet. + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + return best_lengths, best_subsequences + + +def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, list[int]]]: + """ + Compute the longest common subsequence within a single circuit A. + + Returns: + --------- + int - length of longest common subsequences within A + dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. 
+ """ + n = len(A) + best = 0 + best_ind = {} + changed = False + for w in range(1, int(_np.floor(n / 2) + 1)): + for sp in range(n - w): + window = A[sp: sp + w] + for match in range(sp+ w, n-w + 1): + if A[match: match + w] == window: + if best == w: + if tuple(window) in best_ind: + best_ind[tuple(window)].add(match) + else: + best_ind[tuple(window)] = {sp, match} + else: + best_ind = {tuple(window): {sp, match}} + changed = True + best = w + if not changed: + return best, best_ind + return best, best_ind + +def build_internal_tables(circuit_list): + """ + Compute all the longest common internal sequences for each circuit A in circuit_list + """ + + C = len(circuit_list) + the_table = _np.zeros(C) + seq_table = [[] for _ in range(C)] + + curr_best = 1 + for i in range(C): + if len(circuit_list[i]) >= curr_best: + the_table[i], seq_table[i] = _longest_common_internal_subsequence(circuit_list[i]) + curr_best = max(curr_best, the_table[i]) + return the_table, seq_table + +def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: + """ + Add in explicit idles to the labels for each layer. + """ + + tmp = circuit.copy(editable=True) + num_layers = circuit.num_layers + + for i in range(num_layers): + tmp[i] = Label(tmp.layer_label_with_idles(i, idle_gate_name)) + + if tmp._static: + tmp.done_editing() + return tmp + + +def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: + """ + Returns + -------- + Dictionary mapping qubit number to lane number in the circuit. 
+ """ + + qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(num_qubits)} + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + for op in layer: + qubits_used = op.qubits + for qb in qubits_used: + qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) + + lanes = {} + lan_num = 0 + visited: dict[int, int] = {} + def reachable_nodes(starting_point: int, graph_qubits_to_neighbors: dict[int, set[int]], visited: dict[int, set[int]]): + """ + Find which nodes are reachable from this starting point. + """ + if starting_point in visited: + return visited[starting_point] + else: + assert starting_point in graph_qubits_to_neighbors + visited[starting_point] = graph_qubits_to_neighbors[starting_point] + output = set(visited[starting_point]) + for child in graph_qubits_to_neighbors[starting_point]: + if child != starting_point: + output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) + visited[starting_point] = output + return output + + available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) + while available_starting_points: + sp = available_starting_points[0] + nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) + for node in nodes: + available_starting_points.remove(node) + lanes[lan_num] = nodes + lan_num += 1 + + def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: + """ + Determine a mapping from qubit to the lane it is in for this specific circuit. + """ + out = {} + for key, val in lanes_to_qubits.items(): + for qb in val: + out[qb] = key + return out + + return compute_qubits_to_lanes(lanes), lanes + + + +def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: + """ + Split a circuit into multiple subcircuits which do not talk across lanes. 
+ """ + + lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] + + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + group = [] + group_lane = None + sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) + + for op in sorted_layer: + # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. + qubits_used = op.qubits # This will be a list of qubits used. + # I am assuming that the qubits are indexed numerically and not by strings. + lane = qubits_to_lanes[qubits_used[0]] + + if group_lane is None: + group_lane = lane + group.append(op) + elif group_lane == lane: + group.append(op) + else: + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + group_lane = lane + group = [op] + + if len(group) > 0: + # We have a left over group. + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + + return lanes_to_gates + +def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], + implicit_idle_gate_name: str = "I") -> tuple[list[tuple],list[int], list[dict[int, int]]]: + """ + Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. + + Also return a sequence detailing the number of lanes in each circuit. + Then, a sequence detailing the number of qubits in each lane for a circuit. 
+ """ + + output = [] + qubits_used_in_each_lane = [] + + for cir in circuit_list: + + if implicit_idle_gate_name: + cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) + + qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) + sub_cirs = _compute_subcircuits(cir, qubits_to_lane) + + output.extend(sub_cirs) + qubits_used_in_each_lane.append(lanes_to_qubits) + return output, qubits_used_in_each_lane + + +class EvalTreeBasedUponLongestCommonSubstring(): + + def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: list[dict[int, tuple[int, ...]]]): + """ + Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. + """ + + assert len(qubits_used_in_each_lane) <= len(circuit_list) + external_matches = _compute_lcs_for_every_pair_of_circuits(circuit_list) + internal_matches = build_internal_tables(circuit_list) + + max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + + C = len(circuit_list) + num_full_circuits = len(qubits_used_in_each_lane) + sequence_intro = {0: _np.arange(C)} + + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + cache_pos = C + new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. 
+ + i = 0 + while max_rounds > 1: + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + i += 1 + external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) + internal_matches = build_internal_tables(new_circuit_list) + + max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + + self.circuit_list = new_circuit_list + self.cache = cache + self.num_circuits = C + self.qubits_used_in_each_lane = qubits_used_in_each_lane + + self.sequence_intro = sequence_intro + + swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], + + [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 
0.00000000e+00,0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, + 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, + 0.00000000e+00,0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00], + + [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, + 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00], + + [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 
0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, + 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) + + self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") + + + + + self.cache_ind_to_num_qubits_needed = {} + offset = 0 + for i in range(num_full_circuits): + for j in qubits_used_in_each_lane[i]: + cache_ind = offset + j + self.set_cache_sizes(cache_ind, len(qubits_used_in_each_lane[i][j])) + + offset += len(qubits_used_in_each_lane[i]) + + self.tensor_contraction_orders_by_circuit = {} + self.tensor_contraction_order_cache = {} + self.qubit_list_cache = {} + + for i in range(num_full_circuits): + self.qubit_list_cache[i] = [qubits_used_in_each_lane[i][k] for k in sorted(qubits_used_in_each_lane[i])] + self.tensor_contraction_orders_by_circuit[i] = self.best_order_for_tensor_contraction(tuple(self.qubit_list_cache[i])) + + + # for val in cache.values(): + # num_terms = len(val) + # self._best_order_for_first_derivative(num_terms) + + def set_cache_sizes(self, cache_ind: int, num_qubits: int): + """ + Set the size to use the number of qubits specified. + """ + if cache_ind in self.cache_ind_to_num_qubits_needed: + return # we have already set them all. 
+ self.cache_ind_to_num_qubits_needed[cache_ind] = num_qubits + for child in self.cache: + if isinstance(child, int): + self.set_cache_sizes(child, num_qubits) + + def collapse_circuits_to_process_matrices(self, model): + """ + Compute the total product cache. Note that this may still have a tensor product + structure that the operator needs to combine again if they want to have the full 'dense' matrix. + """ + + + round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + saved: dict[int, _LinearOperator] = {} + + def look_up_operations(model, opTuple) -> _LinearOperator: + + if hasattr(model, "operations"): + return model.operations[opTuple].to_dense() + elif hasattr(model, "operation_blks"): + return model.operation_blks[opTuple].to_dense() + else: + raise ValueError("Missing attribute") + + for key in round_keys: + for cind in self.sequence_intro[key]: + cumulative_term = None + for term in self.cache[cind]: + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ (cumulative_term) + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = look_up_operations(model, op) + # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) + op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = look_up_operations(model, op) + else: + op_term = look_up_operations(model, op) + + val = _np.kron(val, op_term) + #val = model.operation_blks["gates"][term[0]].to_dense() + if cumulative_term is None: + cumulative_term = val + else: + cumulative_term = val @ (cumulative_term) + if cumulative_term is None: + saved[cind] = _np.eye(4**self.cache_ind_to_num_qubits_needed[cind]) # identity of the appropriate size. + else: + saved[cind] = cumulative_term + if __debug__: + # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. + for key in self.cache: + assert key in saved + + return saved + + def reconstruct_full_matrices(self, process_matrices_cache): + + output = [] + start_pos = 0 + for cir_ind in range(len(self.qubits_used_in_each_lane)): + lane_circuits = [] + for i in range(self.qubits_used_in_each_lane[cir_ind]): + lane_circuits.append(process_matrices_cache[start_pos + i]) + output.append(lane_circuits) + start_pos += self.inds_needed_to_reconstruct[cir_ind] + + # Now we will do the contraction. + for cir_ind in range(len(self.inds_needed_to_reconstruct)): + + order = self.tensor_contraction_orders_by_circuit[cir_ind] + + while order: + sp = order[0] + if len(output[cir_ind][sp]) == 0: + breakpoint() + output[cir_ind][sp] = _np.kron(output[cir_ind][sp], output[cir_ind][sp+1]) + output[cir_ind][sp+1:] = output[cir_ind][sp+2:] + + # Adjust future indices + tmp = [] + for new_val in order[1:]: + tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) + order = tmp + + output[cir_ind] = output[cir_ind][0] + assert output[cir_ind].shape == (256, 256) + return output + # def compute_derivatives_using_cache(self, model, productCache): + # """ + # We are interested in computing the derivative of the probabilities specified by a model + # and the cached circuit list against the model parameters. 
We will assume that the model can take a + # derivative with respect to a single gate operation. However, we need to handle the product rule. + # """ + + # productCache = self.fill_out_circuit_cache(model) + + # round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + # saved = {} + + # product_rule_cache: dict[int, list[int]] = {} + # for key in round_keys: + # for cind in self.sequence_intro[key]: + + + + # cumulative_term = None + # for term in self.cache[cind]: + # if isinstance(term, int) and cumulative_term is None: + # # look up result. + # cumulative_term = saved[term] + # elif isinstance(term, int) and not (cumulative_term is None): + # cumulative_term = saved[term] @ cumulative_term + # elif isinstance(term, LabelTupTup): + # val = 1 + # for op in term: + # op_term = 1 + # if op.num_qubits == 2: + # # We may need to do swaps. + # if op in saved: + # op_term = saved[op] + # elif op.qubits[1] < op.qubits[0]: + # # This is in the wrong order. + # swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + # op_term = model.operation_blks["gates"][op].to_dense() + # op_term = swap_term @ op_term @ swap_term.T + # saved[op] = op_term # Save so we only need to this operation once. 
+ # else: + # op_term = model.operation_blks["gates"][op].to_dense() + # else: + # op_term = model.operation_blks["gates"][op].to_dense() + # val = np.kron(val, op_term) + # #val = model.operation_blks["gates"][term[0]].to_dense() + # if cumulative_term is None: + # cumulative_term = val + # else: + # cumulative_term = val @ cumulative_term + # saved[cind] = cumulative_term + # return saved + + def cache_num_to_matrix_size(self, ind, output_cache): + if ind in output_cache: + return output_cache[ind] + else: + if ind not in self.cache: + assert ind in self.cache + children = self.cache[ind] + answer = 0 + for child in children: + if isinstance(child, Label): + lbls = child.num_qubits + sub_probanswer = lbls + else: + sub_probanswer = self.cache_num_to_matrix_size(child, output_cache) + answer = max(answer, sub_probanswer) + output_cache[ind] = answer + return answer + + + def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): + + + if qubit_list in self.tensor_contraction_order_cache: + return self.tensor_contraction_order_cache[qubit_list] + + best_cost = _np.inf + best_order = [] + + for order in itertools.permutations(range(len(qubit_list)-1), len(qubit_list)-1): + + my_list = [qb for qb in qubit_list] # force deep copy. + my_starting_points = [sp for sp in order] + cost = 0 + early_exit = False + while my_starting_points and not early_exit: + sp = my_starting_points.pop(0) + + cost += self._tensor_cost_model(my_list[sp], my_list[sp+1]) + if cost <= best_cost: + # modify sp for future. + tmp = [] + for new_val in my_starting_points: + tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) + my_starting_points = tmp + + q2 = my_list.pop(sp+1) + my_list[sp] += q2 + else: + early_exit = True # This round is done because the partial sum was too big. + + if cost < best_cost and not early_exit: + best_cost = cost + best_order = list(order) + + # Store off the information. 
+ self.tensor_contraction_order_cache[qubit_list] = best_order + + return best_order + + def _tensor_cost_model(self, num_qubits1, num_qubits2): + """ + Assumes kronecker product of 2 square matrices. + """ + + return (4**num_qubits1)**2 * (4**num_qubits2)**2 + + """ + def _evaluate_product_rule(self, cind: int, rn: int): + + sequence = self.cache[cind] + num_terms = len(sequence) + sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] + + for sub_r in sorted(sub_rounds.keys())[::-1]: + sub_sequence = None + for sub_cind in sub_rounds[sub_r]: + + for term in sub_tree_cache[sub_cind]: + if isinstance(term, tuple): + # Then, this may be a partial derivative or an character in original sequence. + if len(term) == 2: + # Then this is taking a partial derivative. + natural_term = term[1][0] + if natural_term in self.derivative_cache: + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + else: + # This should be a natural derivative. + self.derivative_cache[natural_term] = term.deriv_wrt_params(None) + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + + # It is just an index to sequence for where to look in the cache. + next_ind = term[0] + sequence_val = sequence[next_ind] + + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ cumulative_term + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + op_term = model.operation_blks["gates"][op].to_dense() + op_term = swap_term @ op_term @ swap_term.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = model.operation_blks["gates"][op].to_dense() + else: + op_term = model.operation_blks["gates"][op].to_dense() + val = _np.kron(val, op_term) + #val = model.operation_blks["gates"][term[0]].to_dense() + if cumulative_term is None: + cumulative_term = val + else: + cumulative_term = val @ cumulative_term + """ + diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 7642978e5..f3c2c8e85 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -17,10 +17,13 @@ from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout from pygsti.layouts.distlayout import _DistributableAtom from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations from pygsti.circuits.circuitlist import CircuitList as _CircuitList from pygsti.tools import listtools as _lt from pygsti.tools import slicetools as _slct +NICK_USE_OLD_EVAL_TREE = False class _MatrixCOPALayoutAtom(_DistributableAtom): """ @@ -138,6 +141,7 @@ def add_expanded_circuits(indices, add_to_this_dict): if double_expanded_ckt is None: #Fall back to standard behavior and do expansion. 
double_expanded_ckt = cir.expand_subcircuits() double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt + self.tree = _EvalTree.create(double_expanded_nospam_circuits_plus_scratch) #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) @@ -151,7 +155,213 @@ def add_expanded_circuits(indices, add_to_this_dict): tree_indices_by_spamtuple = dict() # "tree" indices index expanded_nospam_circuits for i, c in expanded_nospam_circuits.items(): for spam_tuple in expanded_nospam_circuit_outcomes[c].keys(): - if spam_tuple not in tree_indices_by_spamtuple: tree_indices_by_spamtuple[spam_tuple] = [] + if spam_tuple not in tree_indices_by_spamtuple: + tree_indices_by_spamtuple[spam_tuple] = [] + tree_indices_by_spamtuple[spam_tuple].append(i) + + #Assign element indices, starting at `offset` + # now that we know how many of each spamtuple there are, assign final element indices. + local_offset = 0 + self.indices_by_spamtuple = dict() # values are (element_indices, tree_indices) tuples. + for spam_tuple, tree_indices in tree_indices_by_spamtuple.items(): + self.indices_by_spamtuple[spam_tuple] = (slice(local_offset, local_offset + len(tree_indices)), + _slct.list_to_slice(tree_indices, array_ok=True)) + local_offset += len(tree_indices) + #TODO: allow tree_indices to be None or a slice? 
+ + element_slice = None # slice(offset, offset + local_offset) # *global* (of parent layout) element-index slice + num_elements = local_offset + + elindex_outcome_tuples = {unique_i: list() for unique_i in range(len(unique_complete_circuits))} + + for spam_tuple, (element_indices, tree_indices) in self.indices_by_spamtuple.items(): + for elindex, tree_index in zip(_slct.indices(element_indices), _slct.to_array(tree_indices)): + outcome_by_spamtuple = expanded_nospam_circuit_outcomes[expanded_nospam_circuits[tree_index]] + outcome, unique_is = outcome_by_spamtuple[spam_tuple] + for unique_i in unique_is: + elindex_outcome_tuples[unique_i].append((elindex, outcome)) # *local* element indices + self.elindex_outcome_tuples = elindex_outcome_tuples + + super().__init__(element_slice, num_elements) + + def nonscratch_cache_view(self, a, axis=None): + """ + Create a view of array `a` restricting it to only the *final* results computed by this tree. + + This need not be the entire array because there could be intermediate results + (e.g. "scratch space") that are excluded. + + Parameters + ---------- + a : ndarray + An array of results computed using this EvalTree, + such that the `axis`-th dimension equals the full + length of the tree. The other dimensions of `a` are + unrestricted. + + axis : int, optional + Specified the axis along which the selection of the + final elements is performed. If None, than this + selection if performed on flattened `a`. + + Returns + ------- + ndarray + Of the same shape as `a`, except for along the + specified axis, whose dimension has been reduced + to filter out the intermediate (non-final) results. 
+ """ + if axis is None: + return a[0:self._num_nonscratch_tree_items] + else: + sl = [slice(None)] * a.ndim + sl[axis] = slice(0, self._num_nonscratch_tree_items) + ret = a[tuple(sl)] + assert(ret.base is a or ret.base is a.base) # check that what is returned is a view + assert(ret.size == 0 or _np.may_share_memory(ret, a)) + return ret + + @property + def cache_size(self): + """The cache size of this atom.""" + return len(self.tree) + + +class _MatrixCOPALayoutAtomWithLCS(_DistributableAtom): + """ + The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`. + + Parameters + ---------- + unique_complete_circuits : list + A list that contains *all* the "complete" circuits for the parent layout. This + atom only owns a subset of these, as given by `group` below. + + unique_nospam_circuits : list + A list that contains the unique circuits within `unique_complete_circuits` once + their state preparations and measurements are removed. A subset of these circuits + (see `group` below) are what fundamentally define the circuit outcomes that this atom + includes: it includes *all* the circuit outcomes of those circuits. + + circuits_by_unique_nospam_circuits : dict + A dictionary with keys equal to the elements of `unique_nospam_circuits` and values + that are lists of indices into `unique_complete_circuits`. Thus, this dictionary + maps each distinct circuit-without-SPAM circuit to the list of complete circuits + within `unique_complete_circuits` that correspond to it. + + ds_circuits : list + A list of circuits parallel to `unique_complete_circuits` of these circuits + as they should be accessed from `dataset`. This applies any aliases and + removes implied SPAM elements relative to `unique_complete_circuits`. + + group : set + The set of indices into `unique_nospam_circuits` that define the circuit + outcomes owned by this atom. 
+ + helpful_scratch : set + A set of indices into `unique_nospam_circuits` that specify circuits that + aren't owned by this atom but are helpful in building up an efficient evaluation + tree. + + model : Model + The model being used to construct this layout. Used for expanding instruments + within the circuits. + + unique_circuits : list of Circuits + A list of the unique :class:`Circuit` objects representing the circuits this layout will include. + + dataset : DataSet + The dataset, used to include only observed circuit outcomes in this atom + and therefore the parent layout. + """ + + def __init__(self, unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, + ds_circuits, group, helpful_scratch, model, unique_circuits, dataset=None, expanded_and_separated_circuit_cache=None, + double_expanded_nospam_circuits_cache = None, implicit_idle_gate = None): + + if expanded_and_separated_circuit_cache is None: + expanded_and_separated_circuit_cache = dict() + + #Note: group gives unique_nospam_circuits indices, which circuits_by_unique_nospam_circuits + # turns into "unique complete circuit" indices, which the layout via it's to_unique can map + # to original circuit indices. + def add_expanded_circuits(indices, add_to_this_dict): + _expanded_nospam_circuit_outcomes = add_to_this_dict + for i in indices: + nospam_c = unique_nospam_circuits[i] + for unique_i in circuits_by_unique_nospam_circuits[nospam_c]: # "unique" circuits: add SPAM to nospam_c + #the cache is indexed into using the (potentially) incomplete circuits + expc_outcomes = expanded_and_separated_circuit_cache.get(unique_circuits[unique_i], None) + if expc_outcomes is None: #fall back on original non-cache behavior. + observed_outcomes = None if (dataset is None) else dataset[ds_circuits[unique_i]].unique_outcomes + expc_outcomes = model.expand_instruments_and_separate_povm(unique_complete_circuits[unique_i], observed_outcomes) + #and add this new value to the cache. 
+ expanded_and_separated_circuit_cache[unique_circuits[unique_i]] = expc_outcomes + for sep_povm_c, outcomes in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit + prep_lbl = sep_povm_c.circuit_without_povm[0] + exp_nospam_c = sep_povm_c.circuit_without_povm[1:] # sep_povm_c *always* has prep lbl + spam_tuples = [(prep_lbl, elabel) for elabel in sep_povm_c.full_effect_labels] + outcome_by_spamtuple = {st:outcome for st, outcome in zip(spam_tuples, outcomes)} + + #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s + # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to + # keep track of a list of unique_i indices for each circut and spam tuple below. + if exp_nospam_c not in _expanded_nospam_circuit_outcomes: + _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)} + else: + for st, outcome in outcome_by_spamtuple.items(): + if st in _expanded_nospam_circuit_outcomes[exp_nospam_c]: + existing_outcome, existing_unique_is = \ + _expanded_nospam_circuit_outcomes[exp_nospam_c][st] + assert(existing_outcome == outcome), "Outcome should be same when spam tuples are!" + assert(unique_i not in existing_unique_is) # SLOW - remove? + existing_unique_is.append(unique_i) + else: + _expanded_nospam_circuit_outcomes[exp_nospam_c][st] = (outcome, [unique_i]) + + # keys = expanded circuits w/out SPAM layers; values = spamtuple => (outcome, unique_is) dictionary that + # keeps track of which "unique" circuit indices having each spamtuple / outcome. + expanded_nospam_circuit_outcomes = dict() + add_expanded_circuits(group, expanded_nospam_circuit_outcomes) + expanded_nospam_circuits = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes.keys())} + + # add suggested scratch to the "final" elements as far as the tree creation is concerned + # - this allows these scratch element to help balance the tree. 
+ if helpful_scratch: + expanded_nospam_circuit_outcomes_plus_scratch = expanded_nospam_circuit_outcomes.copy() + add_expanded_circuits(helpful_scratch, expanded_nospam_circuit_outcomes_plus_scratch) + expanded_nospam_circuits_plus_scratch = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes_plus_scratch.keys())} + else: + expanded_nospam_circuits_plus_scratch = expanded_nospam_circuits.copy() + + if double_expanded_nospam_circuits_cache is None: + double_expanded_nospam_circuits_cache = dict() + double_expanded_nospam_circuits_plus_scratch = dict() + for i, cir in expanded_nospam_circuits_plus_scratch.items(): + # expand sub-circuits for a more efficient tree + double_expanded_ckt = double_expanded_nospam_circuits_cache.get(cir, None) + if double_expanded_ckt is None: #Fall back to standard behavior and do expansion. + double_expanded_ckt = cir.expand_subcircuits() + double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt + + vals = list(double_expanded_nospam_circuits_plus_scratch.values()) + + circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) + self.tree = _EvalTreeLCS(circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree) + #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) + + self._num_nonscratch_tree_items = len(expanded_nospam_circuits) # put this in EvalTree? + + # self.tree's elements give instructions for evaluating ("caching") no-spam quantities (e.g. products). + # Now we assign final element indices to the circuit outcomes corresponding to a given no-spam ("tree") + # quantity plus a spam-tuple. We order the final indices so that all the outcomes corresponding to a + # given spam-tuple are contiguous. 
+ + tree_indices_by_spamtuple = dict() # "tree" indices index expanded_nospam_circuits + for i, c in expanded_nospam_circuits.items(): + for spam_tuple in expanded_nospam_circuit_outcomes[c].keys(): + if spam_tuple not in tree_indices_by_spamtuple: + tree_indices_by_spamtuple[spam_tuple] = [] tree_indices_by_spamtuple[spam_tuple].append(i) #Assign element indices, starting at `offset` @@ -286,7 +496,7 @@ class MatrixCOPALayout(_DistributableCOPALayout): circuits. I.e. circuits with prep labels and POVM labels appended. """ - def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=1, + def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=2, num_param_dimension_processors=(), param_dimensions=(), param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, layout_creation_circuit_cache = None): @@ -368,13 +578,26 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p def _create_atom(args): group, helpful_scratch_group = args - return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, + if NICK_USE_OLD_EVAL_TREE: + return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, unique_circuits, dataset, self.expanded_and_separated_circuits_cache, self.expanded_subcircuits_no_spam_cache) + gatename = None + if hasattr(model._layer_rules, "_singleq_idle_layer_labels"): + keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) + if model._layer_rules.implicit_idle_mode == "pad_1Q": + gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name + return _MatrixCOPALayoutAtomWithLCS(unique_complete_circuits, unique_nospam_circuits, + circuits_by_unique_nospam_circuits, ds_circuits, + group, helpful_scratch_group, model, + unique_circuits, dataset, + self.expanded_and_separated_circuits_cache, + 
self.expanded_subcircuits_no_spam_cache, implicit_idle_gate=gatename) + super().__init__(circuits, unique_circuits, to_unique, unique_complete_circuits, _create_atom, list(zip(groups, helpful_scratch)), num_tree_processors, num_param_dimension_processors, param_dimensions, diff --git a/pygsti/models/modelconstruction.py b/pygsti/models/modelconstruction.py index df4cfc879..57f6334f1 100644 --- a/pygsti/models/modelconstruction.py +++ b/pygsti/models/modelconstruction.py @@ -37,6 +37,7 @@ from pygsti.models import gaugegroup as _gg from pygsti.models.localnoisemodel import LocalNoiseModel as _LocalNoiseModel from pygsti.models.cloudnoisemodel import CloudNoiseModel as _CloudNoiseModel +from pygsti.models.singlequbitequivalencerules import EquivalentClassesLocalNoiseModel as _EquivalentClassesLocalNoiseModel from pygsti.baseobjs import label as _label from pygsti.baseobjs import statespace as _statespace from pygsti.baseobjs.basis import Basis as _Basis @@ -1524,6 +1525,52 @@ def _setup_local_gates(processor_spec, evotype, modelnoise=None, custom_gates=No if (noiseop is not None) else ideal_factory return gatedict +def _create_crosstalk_free_model_with_equivalent_clases(qudit_to_spatially_equivalent_qudit ,processor_spec, modelnoise, custom_gates=None, evotype="default", simulator="auto", + on_construction_error='raise', independent_gates=False, independent_spam=True, + ensure_composed_gates=False, ideal_gate_type='auto', ideal_prep_type='auto', + ideal_povm_type='auto', implicit_idle_mode='none', basis='pp') -> _EquivalentClassesLocalNoiseModel: + """ + Create a n-qudit "crosstalk-free" model while assuming that certain qudits are spatially equivalent for 1 qudit gates. + + Similar to :meth:`create_crosstalk_free_model` but the noise is input more generally, + as a :class:`ModelNoise` object. Arguments are the same as this function except that + `modelnoise` is given instead of several more specific noise-describing arguments. 
+ + Returns + ------- + EquivalentClassesLocalNoiseModel + """ + + qudit_labels = processor_spec.qudit_labels + state_space = _statespace.QubitSpace(qudit_labels) if all([udim == 2 for udim in processor_spec.qudit_udims]) \ + else _statespace.QuditSpace(qudit_labels, processor_spec.qudit_udims) + evotype = _Evotype.cast(evotype, state_space=state_space) + modelnoise = _OpModelNoise.cast(modelnoise) + modelnoise.reset_access_counters() + + if ideal_gate_type == "auto": + ideal_gate_type = ('static standard', 'static clifford', 'static unitary') + if ideal_prep_type == "auto": + ideal_prep_type = _state.state_type_from_op_type(ideal_gate_type) + if ideal_povm_type == "auto": + ideal_povm_type = _povm.povm_type_from_op_type(ideal_gate_type) + + gatedict = _setup_local_gates(processor_spec, evotype, modelnoise, custom_gates, ideal_gate_type, basis) + + # (Note: global idle is now handled through processor-spec processing) + + # SPAM: + local_noise = True + prep_layers, povm_layers = _create_spam_layers(processor_spec, modelnoise, local_noise, + ideal_prep_type, ideal_povm_type, evotype, + state_space, independent_spam, basis) + + modelnoise.warn_about_zero_counters() + return _EquivalentClassesLocalNoiseModel(qudit_to_spatially_equivalent_qudit, processor_spec, gatedict, prep_layers, povm_layers, + evotype, simulator, on_construction_error, + independent_gates, ensure_composed_gates, + implicit_idle_mode) + def create_crosstalk_free_model(processor_spec, custom_gates=None, depolarization_strengths=None, stochastic_error_probs=None, lindblad_error_coeffs=None, @@ -1532,7 +1579,7 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None, evotype="default", simulator="auto", on_construction_error='raise', independent_gates=False, independent_spam=True, ensure_composed_gates=False, ideal_gate_type='auto', ideal_spam_type='computational', implicit_idle_mode='none', - basis='pp'): + basis='pp', qudit_to_equivalent_qudit:dict[int, int] = None): """ Create a n-qudit 
"crosstalk-free" model. @@ -1678,6 +1725,12 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None, depolarization_parameterization, stochastic_parameterization, lindblad_parameterization, allow_nonlocal=False) + if qudit_to_equivalent_qudit: + return _create_crosstalk_free_model_with_equivalent_clases(qudit_to_equivalent_qudit, processor_spec, modelnoise, custom_gates, evotype, + simulator, on_construction_error, independent_gates, independent_spam, + ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type, implicit_idle_mode, basis) + + return _create_crosstalk_free_model(processor_spec, modelnoise, custom_gates, evotype, simulator, on_construction_error, independent_gates, independent_spam, ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type, diff --git a/pygsti/models/singlequbitequivalencerules.py b/pygsti/models/singlequbitequivalencerules.py new file mode 100644 index 000000000..065b60942 --- /dev/null +++ b/pygsti/models/singlequbitequivalencerules.py @@ -0,0 +1,115 @@ +from pygsti.models.localnoisemodel import _SimpleCompLayerRules, LocalNoiseModel as _LocalNoiseModel +from pygsti.baseobjs.label import Label, LabelTup, LabelTupTup +from pygsti.modelmembers.operations import opfactory as _opfactory + + + +class SingleQuditGateEquivalenceClassesLayerRules(_SimpleCompLayerRules): + """ + Submodel which assumes that you have a set of qubits for which you trust the action of a single + qubit gate equally for all qubits within the set. + """ + + def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label): + + super().__init__(qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label) + + def operation_layer_operator(self, model, layerlbl: Label, caches): + """ + Create the operator corresponding to `layerlbl`. + + Parameters + ---------- + layerlbl : Label + A circuit layer label. 
+ + Returns + ------- + LinearOperator + """ + + if layerlbl in caches['complete-layers']: + return caches['complete-layers'][layerlbl] + + if isinstance(layerlbl, LabelTupTup): + # This could be a multiple qubit gate or multiple single qubit gates. + + group = [] + changed = False + + for op in layerlbl: + assert isinstance(op, LabelTup) + qubits_used = op.qubits + if op.num_qubits == 1: + if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]: + new_label = Label(op.name, model._qubits_to_equiv_qubit[qubits_used[0]], op.time, *op.args) + + changed = True + group.append(new_label) + else: + group.append(op) + else: + group.append(op) + + if changed: + new_args = None if layerlbl.args == () else layerlbl.args + new_time = 0.0 if layerlbl.time == None else layerlbl.time + new_label = Label(group) + else: + new_label = layerlbl + + # Get the operator + if new_label in caches['complete-layers']: + caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label] + return caches['complete-layers'][new_label] + else: + + answer = super().operation_layer_operator(model, new_label, caches) + caches['complete-layers'][new_label] = answer + caches['complete-layers'][layerlbl] = answer + return answer + + + elif isinstance(layerlbl, LabelTup): + + qubits_used = layerlbl.qubits + if layerlbl.num_qubits == 1: + if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]: + new_args = None if layerlbl.args == () else layerlbl.args + new_time = 0.0 if layerlbl.time == None else layerlbl.time + new_label = Label(layerlbl.name, model._qubits_to_equiv_qubit[qubits_used[0]], new_time, new_args) + + # Get the operator + if new_label in caches['complete-layers']: + caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label] + return caches['complete-layers'][new_label] + else: + + answer = super().operation_layer_operator(model, new_label, caches) + caches['complete-layer'][new_label] = answer + caches['complete-layer'][layerlbl] = answer + 
return answer + + return super().operation_layer_operator(model, layerlbl, caches) + + +class EquivalentClassesLocalNoiseModel(_LocalNoiseModel): + + def __init__(self, qubit_to_equivalent_qubit_for_single_qgates: dict, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default", + simulator="auto", on_construction_error='raise', + independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none"): + + + super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator, + on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode) + + # Now we need to reset the layer rules to use the Equivalent class rules. + + old_rules = self._layer_rules + + new_rules = SingleQuditGateEquivalenceClassesLayerRules( old_rules.qubit_labels, old_rules.implicit_idle_mode, + old_rules.single_qubit_idle_layer_labels, old_rules.global_idle_layer_label) + + self._layer_rules = new_rules + self._qubits_to_equiv_qubit = qubit_to_equivalent_qubit_for_single_qgates + self._reinit_opcaches() # Clear the caches for using the new rules. From ced6d46a17a031e181fa5033ae3295967e75df5c Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 11:39:35 -0700 Subject: [PATCH 035/141] Lanes collapsed by looking through the qubits used and building up a layer, then combining the layers. 
--- pygsti/forwardsims/mapforwardsim.py | 4 + pygsti/forwardsims/matrixforwardsim.py | 20 +- pygsti/layouts/evaltree.py | 563 +++++++++++++++---------- pygsti/layouts/matrixlayout.py | 14 +- 4 files changed, 375 insertions(+), 226 deletions(-) diff --git a/pygsti/forwardsims/mapforwardsim.py b/pygsti/forwardsims/mapforwardsim.py index 81ccf917c..7d59fb9ee 100644 --- a/pygsti/forwardsims/mapforwardsim.py +++ b/pygsti/forwardsims/mapforwardsim.py @@ -15,6 +15,7 @@ import numpy as _np from numpy import linalg as _nla +import time from pygsti.forwardsims.distforwardsim import DistributableForwardSimulator as _DistributableForwardSimulator from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator @@ -360,8 +361,11 @@ def create_copa_layout_circuit_cache(circuits, model, dataset=None): def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim) + start_time = time.time() self.calclib.mapfill_probs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), # all indices layout_atom, resource_alloc) + end_time = time.time() + print("Time to compute forward probs with map Forward after fixed layout (s): ", end_time - start_time) def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index d8dccbc58..83fe41ceb 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -10,6 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. 
#*************************************************************************************************** +import time import collections as _collections import time as _time import warnings as _warnings @@ -3721,10 +3722,19 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto # Overestimate the amount of cache usage by assuming everything is the same size. dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache + # resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache - prodCache = layout_atom.tree.collapse_circuits_to_process_matrices(self.model) - Gs = layout_atom.tree.reconstruct_full_matrices(prodCache) + starttime =time.time() + layout_atom.tree.collapse_circuits_to_process_matrices(self.model) + endtime = time.time() + + print("Time to collapse the process matrices (s): ", endtime - starttime) + starttime = time.time() + Gs = layout_atom.tree.reconstruct_full_matrices() + endtime = time.time() + print("Time to reconstruct the whole matrices (s): ", endtime - starttime) + + starttime = time.time() old_err = _np.seterr(over='ignore') for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): # "element indices" index a circuit outcome probability in array_to_fill's first dimension @@ -3735,6 +3745,8 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto _fas(array_to_fill, [element_indices], self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) _np.seterr(**old_err) + endtime = time.time() + print("Time to complete the spam operations (s): ", endtime - starttime) def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): @@ -3761,7 +3773,7 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ iFinal = iParamToFinal[i] vec = 
orig_vec.copy(); vec[i] += eps self.model.from_vector(vec, close=True) - self._bulk_fill_probs_atom(probs2, layout_atom) + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) array_to_fill[:, iFinal] = (probs2 - probs) / eps def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 9b07ffa76..5f6f5fff9 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -10,6 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** +from __future__ import annotations import bisect as _bisect import time as _time # DEBUG TIMERS import warnings as _warnings @@ -23,6 +24,7 @@ from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools from typing import Sequence +import time @@ -613,10 +615,11 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): best_subsequences = {} best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) curr_best = 0 - for i, cir0 in enumerate(circuit_list): + for i in range(len(circuit_list)-1, -1, -1): # Lets do this in reverse order + cir0 = circuit_list[i] if len(cir0) >= curr_best: # Could be the best. 
- for j in range(i+1, len(circuit_list)): + for j in range(i-1, -1, -1): cir1 = circuit_list[j] if len(cir1) >= curr_best: table = _lcs_dp_version(cir0, cir1) @@ -790,8 +793,36 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates -def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], - implicit_idle_gate_name: str = "I") -> tuple[list[tuple],list[int], list[dict[int, int]]]: + +def _split_circuits_by_lanes(circuit_list): + # First eliminate the duplicate circuits. + + unique_circuits = [] + matching_inds: dict[int, set[int]] = {} + C = len(circuit_list) + seen_circs: dict[tuple[LabelTupTup, int]] = {} + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + for i in range(C): + my_cir = circuit_list[i] + if tuple(my_cir) in seen_circs: + cache[i] = seen_circs[tuple(my_cir)] + else: + seen_circs[tuple(my_cir)] = i + + labels_to_circuits = {} + for my_cir in seen_circs: + line_labels = _Circuit(my_cir)._line_labels + if line_labels in labels_to_circuits: + labels_to_circuits[line_labels].append(my_cir) + else: + labels_to_circuits[line_labels] = [my_cir] + + +def setup_circuit_list_for_LCS_computations( + circuit_list: list[_Circuit], + implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], + dict[tuple[_Circuit], list[tuple[int, int]]], + dict[tuple[int, ...], set[_Circuit]]]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. @@ -799,10 +830,17 @@ def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], Then, a sequence detailing the number of qubits in each lane for a circuit. """ - output = [] - qubits_used_in_each_lane = [] + # output = [] + # cir_id_to_lanes = [] + + # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. + # I need a mapping from subcircuit to actual circuit. 
This is uniquely defined by circuit_id and then lane id. - for cir in circuit_list: + sub_cir_to_cir_id_and_lane_id: dict[tuple[_Circuit], list[tuple[int, int]]] = {} + line_labels_to_circuit_list: dict[tuple[int, ...], set[_Circuit]] = {} + cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} + + for i, cir in enumerate(circuit_list): if implicit_idle_gate_name: cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) @@ -810,30 +848,54 @@ def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) sub_cirs = _compute_subcircuits(cir, qubits_to_lane) - output.extend(sub_cirs) - qubits_used_in_each_lane.append(lanes_to_qubits) - return output, qubits_used_in_each_lane + assert len(sub_cirs) == len(lanes_to_qubits) + for j in range(len(sub_cirs)): + sc = _Circuit(sub_cirs[j]) + lbls = sc._line_labels + if lbls in line_labels_to_circuit_list: + line_labels_to_circuit_list[lbls].append(sc) + else: + line_labels_to_circuit_list[lbls] = [sc] + if sc in sub_cir_to_cir_id_and_lane_id: + sub_cir_to_cir_id_and_lane_id[sc].append((i,j)) + else: + sub_cir_to_cir_id_and_lane_id[sc] = [(i,j)] + if i in cir_ind_and_lane_id_to_sub_cir: + cir_ind_and_lane_id_to_sub_cir[i][j] = sc + else: + cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} + + # output.extend(sub_cirs) + # cir_id_to_lanes.append(lanes_to_qubits) + return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list class EvalTreeBasedUponLongestCommonSubstring(): - def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: list[dict[int, tuple[int, ...]]]): + def __init__(self, circuit_list: list[LabelTupTup]): """ Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. 
""" - assert len(qubits_used_in_each_lane) <= len(circuit_list) + self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} + external_matches = _compute_lcs_for_every_pair_of_circuits(circuit_list) + + best_external_match = _np.max(external_matches[0]) + self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} + + internal_matches = build_internal_tables(circuit_list) + best_internal_match = _np.max(internal_matches[0]) - max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + max_rounds = int(max(best_external_match,best_internal_match)) C = len(circuit_list) - num_full_circuits = len(qubits_used_in_each_lane) sequence_intro = {0: _np.arange(C)} - cache = {i: circuit_list[i] for i in range(len(circuit_list))} cache_pos = C + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. i = 0 @@ -841,18 +903,26 @@ def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: li new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) i += 1 external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) - internal_matches = build_internal_tables(new_circuit_list) - max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: + # We are not going to get a better internal match. 
+ pass + else: + internal_matches = build_internal_tables(new_circuit_list) + + best_external_match = _np.max(external_matches[0]) + best_internal_match = _np.max(internal_matches[0]) + + max_rounds = int(max(best_external_match,best_internal_match)) self.circuit_list = new_circuit_list self.cache = cache self.num_circuits = C - self.qubits_used_in_each_lane = qubits_used_in_each_lane + self.from_other = False self.sequence_intro = sequence_intro - swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + self.swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], @@ -888,45 +958,56 @@ def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: li [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) - self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - + # Assumes a perfect swap gate! + # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): + """ + Construct a tree from another tree. 
+ """ + + self.cache = other.cache + self.num_circuits = other.num_circuits + self.sequence_intro = other.sequence_intro + self.swap_gate = other.swap_gate + self.circuit_list = other.circuit_list + self.orig_circuit_list = other.orig_circuit_list + self.circuit_to_save_location = other.circuit_to_save_location + self.from_other = other + + for ind in self.cache: + for i, term in enumerate(self.cache[ind]): + if isinstance(term, int): + pass # The tree will stay the same. + elif isinstance(term, LabelTupTup): + new_term = () + for op in term: + new_qu = (qubit_label_exchange[qu] for qu in op.qubits) + new_op = (op.name, *new_qu) + new_term = (*new_term, new_op) + self.cache[ind][i] = Label(new_term) - self.cache_ind_to_num_qubits_needed = {} - offset = 0 - for i in range(num_full_circuits): - for j in qubits_used_in_each_lane[i]: - cache_ind = offset + j - self.set_cache_sizes(cache_ind, len(qubits_used_in_each_lane[i][j])) - - offset += len(qubits_used_in_each_lane[i]) - self.tensor_contraction_orders_by_circuit = {} - self.tensor_contraction_order_cache = {} - self.qubit_list_cache = {} + for icir in range(len(self.orig_circuit_list)): + self.orig_circuit_list[icir] = self.trace_through_cache_to_build_circuit(icir) - for i in range(num_full_circuits): - self.qubit_list_cache[i] = [qubits_used_in_each_lane[i][k] for k in sorted(qubits_used_in_each_lane[i])] - self.tensor_contraction_orders_by_circuit[i] = self.best_order_for_tensor_contraction(tuple(self.qubit_list_cache[i])) - + updated = {} + for cir, loc in self.circuit_to_save_location.items(): + new_cir = () + for layer in cir: + new_layer = () + for op in layer: + new_op = (op[0], *(qubit_label_exchange[qu] for qu in op[1:])) + new_layer = (*new_layer, new_op) + new_cir = (*new_cir, new_layer) + updated[new_cir] = loc + self.circuit_to_save_location = updated - # for val in cache.values(): - # num_terms = len(val) - # self._best_order_for_first_derivative(num_terms) - def set_cache_sizes(self, cache_ind: 
int, num_qubits: int): - """ - Set the size to use the number of qubits specified. - """ - if cache_ind in self.cache_ind_to_num_qubits_needed: - return # we have already set them all. - self.cache_ind_to_num_qubits_needed[cache_ind] = num_qubits - for child in self.cache: - if isinstance(child, int): - self.set_cache_sizes(child, num_qubits) - def collapse_circuits_to_process_matrices(self, model): + + def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int): """ Compute the total product cache. Note that this may still have a tensor product structure that the operator needs to combine again if they want to have the full 'dense' matrix. @@ -936,15 +1017,36 @@ def collapse_circuits_to_process_matrices(self, model): round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] saved: dict[int, _LinearOperator] = {} - def look_up_operations(model, opTuple) -> _LinearOperator: + def look_up_operations(model, opTuple) -> _np.ndarray: if hasattr(model, "operations"): return model.operations[opTuple].to_dense() elif hasattr(model, "operation_blks"): - return model.operation_blks[opTuple].to_dense() + if opTuple[0] not in model.operation_blks["gates"]: + breakpoint() + return model.operation_blks["gates"][opTuple[0]].to_dense() else: raise ValueError("Missing attribute") + def get_appropriate_gate(op, saved): + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = look_up_operations(model, op) + # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) + op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = look_up_operations(model, op) + else: + op_term = look_up_operations(model, op) + return op_term + + expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None @@ -952,11 +1054,100 @@ def look_up_operations(model, opTuple) -> _LinearOperator: if isinstance(term, int) and cumulative_term is None: # look up result. cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): + elif isinstance(term, int) and cumulative_term is not None: cumulative_term = saved[term] @ (cumulative_term) elif isinstance(term, LabelTupTup): val = 1 - for op in term: + qubits_used = [i for i in range(num_qubits_in_default)] # Qubits are assuming to be integer markers. + while qubits_used: + qu = qubits_used[0] + gate_matrix = _np.eye(4) + found = False + op_ind = 0 + while not found and op_ind < len(term): + op = term[op_ind] + if qu in op.qubits: + gate_matrix = get_appropriate_gate(op, saved) + found = True + qubits_used = qubits_used[len(op.qubits):] # We assume that the qubits need to overlap for a specific gate. i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). + op_ind += 1 + val = _np.kron(val, gate_matrix) + if not found: + # Remove that qubit from list to check. + qubits_used = qubits_used[1:] + + if val.shape != expected_shape: + breakpoint() + if cumulative_term is None: + cumulative_term = val + else: + if val.shape[1] != cumulative_term.shape[0]: + breakpoint() + cumulative_term = val @ (cumulative_term) + if cumulative_term is None: + saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. + else: + saved[cind] = cumulative_term + if __debug__: + # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
+ for key in self.cache: + assert key in saved + + # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} + + return saved, self.circuit_to_save_location + + def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: + + output = () + for term in self.cache[cache_ind]: + + if isinstance(term, Label): + output = (*output, term) + elif isinstance(term, int): + # Recurse down. + next_term = self.trace_through_cache_to_build_circuit(term) + output = (*output, *next_term) + + return list(output) + + + """ + def _evaluate_product_rule(self, cind: int, rn: int): + + sequence = self.cache[cind] + num_terms = len(sequence) + sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] + + for sub_r in sorted(sub_rounds.keys())[::-1]: + sub_sequence = None + for sub_cind in sub_rounds[sub_r]: + + for term in sub_tree_cache[sub_cind]: + if isinstance(term, tuple): + # Then, this may be a partial derivative or an character in original sequence. + if len(term) == 2: + # Then this is taking a partial derivative. + natural_term = term[1][0] + if natural_term in self.derivative_cache: + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + else: + # This should be a natural derivative. + self.derivative_cache[natural_term] = term.deriv_wrt_params(None) + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + + # It is just an index to sequence for where to look in the cache. + next_ind = term[0] + sequence_val = sequence[next_ind] + + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ cumulative_term + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: op_term = 1 if op.num_qubits == 2: # We may need to do swaps. 
@@ -964,54 +1155,109 @@ def look_up_operations(model, opTuple) -> _LinearOperator: op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # This is in the wrong order. - op_term = look_up_operations(model, op) - # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) - op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + op_term = model.operation_blks["gates"][op].to_dense() + op_term = swap_term @ op_term @ swap_term.T saved[op] = op_term # Save so we only need to this operation once. else: - op_term = look_up_operations(model, op) + op_term = model.operation_blks["gates"][op].to_dense() else: - op_term = look_up_operations(model, op) - + op_term = model.operation_blks["gates"][op].to_dense() val = _np.kron(val, op_term) #val = model.operation_blks["gates"][term[0]].to_dense() if cumulative_term is None: cumulative_term = val else: - cumulative_term = val @ (cumulative_term) - if cumulative_term is None: - saved[cind] = _np.eye(4**self.cache_ind_to_num_qubits_needed[cind]) # identity of the appropriate size. 
+ cumulative_term = val @ cumulative_term + """ + + +class CollectionOfLCSEvalTrees(): + + def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id, cir_id_and_lane_id_to_sub_cir): + + self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} + + ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE = False + + size_to_tree: dict[int, tuple[int, ...]] = {} + + self.line_lbls_to_cir_list = line_lbls_to_circuit_list + + starttime = time.time() + for key, vals in line_lbls_to_circuit_list.items(): + sub_cirs = [list(cir) for cir in vals] + if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: + if len(key) not in size_to_tree: + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + size_to_tree[len(key)] = key else: - saved[cind] = cumulative_term - if __debug__: - # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. - for key in self.cache: - assert key in saved + sample = EvalTreeBasedUponLongestCommonSubstring(sub_cirs[:2]) # Build a small version to be corrected later. + other_key = size_to_tree[len(key)] + sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))}) + self.trees[key] = sample + else: + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + + endtime = time.time() + + print(" Time to compute all the evaluation orders (s): ", endtime - starttime) + + + self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id + self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir + + self.cir_id_to_tensor_order = {} + self.compute_tensor_orders() + + self.saved_results = {} + self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} + + def collapse_circuits_to_process_matrices(self, model): + # Just collapse all of them. 
- return saved - - def reconstruct_full_matrices(self, process_matrices_cache): + self.saved_results = {} + for key in self.trees: + self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + + def reconstruct_full_matrices(self): + + if len(self.saved_results) == 0: + return + + # Now we can do the combination. + + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) output = [] - start_pos = 0 - for cir_ind in range(len(self.qubits_used_in_each_lane)): + for icir in range(num_cirs): lane_circuits = [] - for i in range(self.qubits_used_in_each_lane[cir_ind]): - lane_circuits.append(process_matrices_cache[start_pos + i]) - output.append(lane_circuits) - start_pos += self.inds_needed_to_reconstruct[cir_ind] + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels - # Now we will do the contraction. - for cir_ind in range(len(self.inds_needed_to_reconstruct)): + if len(cir.layertup) == 0: - order = self.tensor_contraction_orders_by_circuit[cir_ind] + lane_circuits.append(_np.eye(4**(len(lblkey)))) + else: + if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: + print(lblkey) + print(cir) + breakpoint() + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + output.append(lane_circuits) + + # Need a map from lane id to computed location. 
+ for icir in range(num_cirs): + order = self.cir_id_to_tensor_order[icir] + + while order: sp = order[0] - if len(output[cir_ind][sp]) == 0: - breakpoint() - output[cir_ind][sp] = _np.kron(output[cir_ind][sp], output[cir_ind][sp+1]) - output[cir_ind][sp+1:] = output[cir_ind][sp+2:] + output[icir][sp] = _np.kron(output[icir][sp], output[icir][sp+1]) + output[icir][sp+1:] = output[icir][sp+2:] # Adjust future indices tmp = [] @@ -1019,85 +1265,30 @@ def reconstruct_full_matrices(self, process_matrices_cache): tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) order = tmp - output[cir_ind] = output[cir_ind][0] - assert output[cir_ind].shape == (256, 256) + output[icir] = output[icir][0] return output - # def compute_derivatives_using_cache(self, model, productCache): - # """ - # We are interested in computing the derivative of the probabilities specified by a model - # and the cached circuit list against the model parameters. We will assume that the model can take a - # derivative with respect to a single gate operation. However, we need to handle the product rule. - # """ - - # productCache = self.fill_out_circuit_cache(model) - - # round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - # saved = {} + + def compute_tensor_orders(self): - # product_rule_cache: dict[int, list[int]] = {} - # for key in round_keys: - # for cind in self.sequence_intro[key]: + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cache_struct = {} - - # cumulative_term = None - # for term in self.cache[cind]: - # if isinstance(term, int) and cumulative_term is None: - # # look up result. - # cumulative_term = saved[term] - # elif isinstance(term, int) and not (cumulative_term is None): - # cumulative_term = saved[term] @ cumulative_term - # elif isinstance(term, LabelTupTup): - # val = 1 - # for op in term: - # op_term = 1 - # if op.num_qubits == 2: - # # We may need to do swaps. 
- # if op in saved: - # op_term = saved[op] - # elif op.qubits[1] < op.qubits[0]: - # # This is in the wrong order. - # swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. - # op_term = model.operation_blks["gates"][op].to_dense() - # op_term = swap_term @ op_term @ swap_term.T - # saved[op] = op_term # Save so we only need to this operation once. - # else: - # op_term = model.operation_blks["gates"][op].to_dense() - # else: - # op_term = model.operation_blks["gates"][op].to_dense() - # val = np.kron(val, op_term) - # #val = model.operation_blks["gates"][term[0]].to_dense() - # if cumulative_term is None: - # cumulative_term = val - # else: - # cumulative_term = val @ cumulative_term - # saved[cind] = cumulative_term - # return saved - - def cache_num_to_matrix_size(self, ind, output_cache): - if ind in output_cache: - return output_cache[ind] - else: - if ind not in self.cache: - assert ind in self.cache - children = self.cache[ind] - answer = 0 - for child in children: - if isinstance(child, Label): - lbls = child.num_qubits - sub_probanswer = lbls - else: - sub_probanswer = self.cache_num_to_matrix_size(child, output_cache) - answer = max(answer, sub_probanswer) - output_cache[ind] = answer - return answer + for cir_id in range(num_cirs): + qubit_list = () + for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): + subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] + qubit_list = (*qubit_list, len(subcir._line_labels)) + self.cir_id_to_tensor_order[cir_id] = self.best_order_for_tensor_contraction(qubit_list, cache_struct) + return + - def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): + def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): - if qubit_list in self.tensor_contraction_order_cache: - return self.tensor_contraction_order_cache[qubit_list] + if qubit_list in cache: + return cache[qubit_list] best_cost = _np.inf best_order = [] @@ 
-1129,7 +1320,7 @@ def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): best_order = list(order) # Store off the information. - self.tensor_contraction_order_cache[qubit_list] = best_order + cache[qubit_list] = best_order return best_order @@ -1139,63 +1330,3 @@ def _tensor_cost_model(self, num_qubits1, num_qubits2): """ return (4**num_qubits1)**2 * (4**num_qubits2)**2 - - """ - def _evaluate_product_rule(self, cind: int, rn: int): - - sequence = self.cache[cind] - num_terms = len(sequence) - sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] - - for sub_r in sorted(sub_rounds.keys())[::-1]: - sub_sequence = None - for sub_cind in sub_rounds[sub_r]: - - for term in sub_tree_cache[sub_cind]: - if isinstance(term, tuple): - # Then, this may be a partial derivative or an character in original sequence. - if len(term) == 2: - # Then this is taking a partial derivative. - natural_term = term[1][0] - if natural_term in self.derivative_cache: - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - else: - # This should be a natural derivative. - self.derivative_cache[natural_term] = term.deriv_wrt_params(None) - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - - # It is just an index to sequence for where to look in the cache. - next_ind = term[0] - sequence_val = sequence[next_ind] - - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): - cumulative_term = saved[term] @ cumulative_term - elif isinstance(term, LabelTupTup): - val = 1 - for op in term: - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. - swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. 
- op_term = model.operation_blks["gates"][op].to_dense() - op_term = swap_term @ op_term @ swap_term.T - saved[op] = op_term # Save so we only need to this operation once. - else: - op_term = model.operation_blks["gates"][op].to_dense() - else: - op_term = model.operation_blks["gates"][op].to_dense() - val = _np.kron(val, op_term) - #val = model.operation_blks["gates"][term[0]].to_dense() - if cumulative_term is None: - cumulative_term = val - else: - cumulative_term = val @ cumulative_term - """ - diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index f3c2c8e85..8e86b702b 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -16,8 +16,9 @@ from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout from pygsti.layouts.distlayout import _DistributableAtom -from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import CollectionOfLCSEvalTrees as _CollectionOfLCSEvalTrees from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import EvalTree as _EvalTree from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations from pygsti.circuits.circuitlist import CircuitList as _CircuitList from pygsti.tools import listtools as _lt @@ -346,8 +347,8 @@ def add_expanded_circuits(indices, add_to_this_dict): vals = list(double_expanded_nospam_circuits_plus_scratch.values()) - circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) - self.tree = _EvalTreeLCS(circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree) + cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) + self.tree = 
_CollectionOfLCSEvalTrees(line_labels_to_circuit_list, sub_cir_to_cir_id_and_lane_id, cir_ind_and_lane_id_to_sub_cir) #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) self._num_nonscratch_tree_items = len(expanded_nospam_circuits) # put this in EvalTree? @@ -588,9 +589,10 @@ def _create_atom(args): gatename = None if hasattr(model._layer_rules, "_singleq_idle_layer_labels"): - keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) - if model._layer_rules.implicit_idle_mode == "pad_1Q": - gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name + if model._layer_rules._singleq_idle_layer_labels: + keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) + if model._layer_rules.implicit_idle_mode == "pad_1Q": + gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name return _MatrixCOPALayoutAtomWithLCS(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, From e7bb3c749752331da40d6966471d108c70c064a6 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 12:43:07 -0700 Subject: [PATCH 036/141] revert me, probably --- pygsti/baseobjs/statespace.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pygsti/baseobjs/statespace.py b/pygsti/baseobjs/statespace.py index c083024e7..c64a0d523 100644 --- a/pygsti/baseobjs/statespace.py +++ b/pygsti/baseobjs/statespace.py @@ -683,6 +683,12 @@ def __getstate__(self): def __setstate__(self, state_dict): for k, v in state_dict.items(): self.__dict__[k] = v + try: + _ = self.__getattribute__(k) + except AttributeError: + _ = self.__dict__.pop(k) + self.__dict__['_' + k] = v + _ = self.__getattribute__(k) #reinitialize the hash self._hash = hash((self.tensor_product_blocks_labels, self.tensor_product_blocks_dimensions, From 70843b4e66d1a70cbffde2530d33a88c8a1687a5 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 
15:40:11 -0700 Subject: [PATCH 037/141] Compute dense process matrix not with function in EvalTreeLCS. --- pygsti/forwardsims/matrixforwardsim.py | 16 +-- pygsti/layouts/evaltree.py | 148 +++++++++++++++---------- pygsti/layouts/matrixlayout.py | 6 +- test/unit/objects/test_forwardsim.py | 7 +- 4 files changed, 105 insertions(+), 72 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 83fe41ceb..b7a96ab5b 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -23,6 +23,7 @@ from pygsti.forwardsims.forwardsim import _bytes_for_array_types from pygsti.layouts.evaltree import EvalTree as _EvalTree from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations, CollectionOfLCSEvalTrees from pygsti.layouts.matrixlayout import MatrixCOPALayout as _MatrixCOPALayout from pygsti.layouts.matrixlayout import _MatrixCOPALayoutAtomWithLCS from pygsti.baseobjs.profiler import DummyProfiler as _DummyProfiler @@ -1040,7 +1041,7 @@ def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, return hProdCache def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), - derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): + derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None, use_old_tree_style: bool = True): """ Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. 
@@ -1127,7 +1128,7 @@ def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, - layout_creation_circuit_cache=layout_creation_circuit_cache) + layout_creation_circuit_cache=layout_creation_circuit_cache, use_old_tree_style=use_old_tree_style) if mem_limit is not None: loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 @@ -3709,12 +3710,13 @@ def bulk_product(self, circuits, scale=False, resource_alloc=None): (final_product[i] = scaleValues[i] * prods[i]). """ resource_alloc = _ResourceAllocation.cast(resource_alloc) - nCircuits = len(circuits) - eval_tree = _EvalTreeLCS(circuits) - prodCache = eval_tree.fill_out_circuit_cache(self.model) - Gs = prodCache[0:nCircuits] + my_data = setup_circuit_list_for_LCS_computations(circuits, None) + + full_tree = CollectionOfLCSEvalTrees(my_data[2], my_data[1], my_data[0]) + full_tree.collapse_circuits_to_process_matrices(self.model) + Gs = full_tree.reconstruct_full_matrices() return Gs @@ -3777,4 +3779,4 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ array_to_fill[:, iFinal] = (probs2 - probs) / eps def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): - return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache) \ No newline at end of file + return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False) \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 5f6f5fff9..44c82fec9 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -869,10 +869,52 @@ def 
setup_circuit_list_for_LCS_computations( # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list +def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: + """ + Look up the dense representation of a gate in the model. + """ + + + if hasattr(model, "operations"): + return model.operations[opTuple].to_dense() + elif hasattr(model, "operation_blks"): + if opTuple[0] not in model.operation_blks["gates"]: + breakpoint() + return model.operation_blks["gates"][opTuple[0]].to_dense() + else: + raise ValueError("Missing attribute") + +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = model_and_gate_to_dense_rep(model, op) + op_term = swap_dense @ (op_term) @ swap_dense + saved[op] = op_term # Save so we only need to this operation once. + else: + op_term = model_and_gate_to_dense_rep(model, op) + else: + op_term = model_and_gate_to_dense_rep(model, op) + return op_term + +def combine_two_gates(cumulative_term, next_dense_matrix): + """ + Note that the visual representation was + + State Prep | CumulativeTerm | NextDense | Measure + + which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. + """ + return next_dense_matrix @ cumulative_term + class EvalTreeBasedUponLongestCommonSubstring(): - def __init__(self, circuit_list: list[LabelTupTup]): + def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): """ Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. 
""" @@ -883,6 +925,7 @@ def __init__(self, circuit_list: list[LabelTupTup]): best_external_match = _np.max(external_matches[0]) self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} + self.qubit_start_point = qubit_starting_loc internal_matches = build_internal_tables(circuit_list) @@ -1017,73 +1060,58 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] saved: dict[int, _LinearOperator] = {} - def look_up_operations(model, opTuple) -> _np.ndarray: - if hasattr(model, "operations"): - return model.operations[opTuple].to_dense() - elif hasattr(model, "operation_blks"): - if opTuple[0] not in model.operation_blks["gates"]: - breakpoint() - return model.operation_blks["gates"][opTuple[0]].to_dense() + + def cache_lookup_and_product(cumulative_term, term_to_extend_with: int): + if cumulative_term is None: + # look up result. + return saved[term] + elif isinstance(term, int) and cumulative_term is not None: + return combine_two_gates(cumulative_term, saved[term_to_extend_with]) + + + + def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup): + + if isinstance(term_to_extend_with, int): + return cache_lookup_and_product(cumulative_term, term_to_extend_with) + else: - raise ValueError("Missing attribute") - - def get_appropriate_gate(op, saved): - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. - op_term = look_up_operations(model, op) - # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) - op_term = self.swap_gate @ (op_term) @ self.swap_gate.T - saved[op] = op_term # Save so we only need to this operation once. 
+ val = 1 + qubits_used = [i for i in range(num_qubits_in_default)] + while qubits_used: + qu = qubits_used[0] + gate_matrix = _np.eye(4) + found = False + op_ind = self.qubit_start_point # Handle circuits with only qubits (i, i+k) where k is number of qubits in the subsystem. + while not found and op_ind < len(term): + op = term[op_ind] + if qu in op.qubits: + gate_matrix = get_dense_representation_of_gate_with_perfect_swap_gates(model, op, saved, self.swap_gate) + found = True + # We assume that the qubits need to overlap for a specific gate. + # i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). + qubits_used = qubits_used[len(op.qubits):] + op_ind += 1 + val = _np.kron(val, gate_matrix) + if not found: + # Remove that qubit from list to check. + qubits_used = qubits_used[1:] + + if val.shape != expected_shape: + breakpoint() + if cumulative_term is None: + return val else: - op_term = look_up_operations(model, op) - else: - op_term = look_up_operations(model, op) - return op_term + return combine_two_gates(cumulative_term, val) expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cind]: - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and cumulative_term is not None: - cumulative_term = saved[term] @ (cumulative_term) - elif isinstance(term, LabelTupTup): - val = 1 - qubits_used = [i for i in range(num_qubits_in_default)] # Qubits are assuming to be integer markers. - while qubits_used: - qu = qubits_used[0] - gate_matrix = _np.eye(4) - found = False - op_ind = 0 - while not found and op_ind < len(term): - op = term[op_ind] - if qu in op.qubits: - gate_matrix = get_appropriate_gate(op, saved) - found = True - qubits_used = qubits_used[len(op.qubits):] # We assume that the qubits need to overlap for a specific gate. i.e. 
One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). - op_ind += 1 - val = _np.kron(val, gate_matrix) - if not found: - # Remove that qubit from list to check. - qubits_used = qubits_used[1:] - - if val.shape != expected_shape: - breakpoint() - if cumulative_term is None: - cumulative_term = val - else: - if val.shape[1] != cumulative_term.shape[0]: - breakpoint() - cumulative_term = val @ (cumulative_term) + cumulative_term = collapse_cache_line(cumulative_term, term) + if cumulative_term is None: saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. else: @@ -1197,7 +1225,7 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))}) self.trees[key] = sample else: - self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0]) endtime = time.time() diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 8e86b702b..e2137e355 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -24,8 +24,6 @@ from pygsti.tools import listtools as _lt from pygsti.tools import slicetools as _slct -NICK_USE_OLD_EVAL_TREE = False - class _MatrixCOPALayoutAtom(_DistributableAtom): """ The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`. @@ -500,7 +498,7 @@ class MatrixCOPALayout(_DistributableCOPALayout): def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=2, num_param_dimension_processors=(), param_dimensions=(), param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, - layout_creation_circuit_cache = None): + layout_creation_circuit_cache = None, use_old_tree_style: bool = True): #OUTDATED: TODO - revise this: # 1. 
pre-process => get complete circuits => spam-tuples list for each no-spam circuit (no expanding yet) @@ -579,7 +577,7 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p def _create_atom(args): group, helpful_scratch_group = args - if NICK_USE_OLD_EVAL_TREE: + if use_old_tree_style: return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 5c608baee..2c742f533 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -9,6 +9,7 @@ MapForwardSimulator, SimpleMapForwardSimulator, \ MatrixForwardSimulator, SimpleMatrixForwardSimulator, \ TorchForwardSimulator +from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator from pygsti.models import ExplicitOpModel from pygsti.circuits import Circuit, create_lsgst_circuit_lists from pygsti.baseobjs import Label as L @@ -280,7 +281,8 @@ def setUp(self): SimpleMapForwardSimulator(), SimpleMatrixForwardSimulator(), MapForwardSimulator(), - MatrixForwardSimulator() + MatrixForwardSimulator(), + # LCSEvalTreeMatrixForwardSimulator() ] if TorchForwardSimulator.ENABLED: sims.append(TorchForwardSimulator()) @@ -359,3 +361,6 @@ def test_map_fwdsim(self): def test_matrix_fwdsim(self): self._run(MatrixForwardSimulator) + def test_lcs_matrix_fwdsim(self): + self._run(LCSEvalTreeMatrixForwardSimulator) + From 3581769a3a1fc35534d74ae83991e29187d7c326 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 15:44:13 -0700 Subject: [PATCH 038/141] deactivate test --- ...propagation.py => tempdeactivated_test_errorgenpropagation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/objects/{test_errorgenpropagation.py => tempdeactivated_test_errorgenpropagation.py} (100%) diff --git 
a/test/unit/objects/test_errorgenpropagation.py b/test/unit/objects/tempdeactivated_test_errorgenpropagation.py similarity index 100% rename from test/unit/objects/test_errorgenpropagation.py rename to test/unit/objects/tempdeactivated_test_errorgenpropagation.py From 12723be495a32028dbc5484b7d7a1d2075549d47 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:03:03 -0700 Subject: [PATCH 039/141] rename another test file --- ...errgenproptools.py => tempdeactivated_test_errgenproptools.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/tools/{test_errgenproptools.py => tempdeactivated_test_errgenproptools.py} (100%) diff --git a/test/unit/tools/test_errgenproptools.py b/test/unit/tools/tempdeactivated_test_errgenproptools.py similarity index 100% rename from test/unit/tools/test_errgenproptools.py rename to test/unit/tools/tempdeactivated_test_errgenproptools.py From 56b80044dad92bc261a22f98f6dbaf734a044a76 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:45:26 -0700 Subject: [PATCH 040/141] remove unused class --- pygsti/forwardsims/matrixforwardsim.py | 1566 ------------------------ 1 file changed, 1566 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index b7a96ab5b..7822dd75d 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2113,1572 +2113,6 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota dataset, ds_cache) -class NicksMatrixForwardSimulator(_DistributableForwardSimulator, SimpleMatrixForwardSimulator): - """ - Computes circuit outcome probabilities by multiplying together circuit-layer process matrices. - - Interfaces with a model via its `circuit_layer_operator` method and extracts a dense matrix - representation of operators by calling their `to_dense` method. 
An "evaluation tree" that - composes all of the circuits using pairwise "joins" is constructed by a :class:`MatrixCOPALayout` - layout object, and this tree then directs pairwise multiplications of process matrices to compute - circuit outcome probabilities. Derivatives are computed analytically, using operators' - `deriv_wrt_params` methods. - - Parameters - ---------- - model : Model, optional - The parent model of this simulator. It's fine if this is `None` at first, - but it will need to be set (by assigning `self.model` before using this simulator. - - distribute_by_timestamp : bool, optional - When `True`, treat the data as time dependent, and distribute the computation of outcome - probabilitiesby assigning groups of processors to the distinct time stamps within the - dataset. This means of distribution be used only when the circuits themselves contain - no time delay infomation (all circuit layer durations are 0), as operators are cached - at the "start" time of each circuit, i.e., the timestamp in the data set. If `False`, - then the data is treated in a time-independent way, and the overall counts for each outcome - are used. If support for intra-circuit time dependence is needed, you must use a different - forward simulator (e.g. :class:`MapForwardSimulator`). - - num_atoms : int, optional - The number of atoms (sub-evaluation-trees) to use when creating the layout (i.e. when calling - :meth:`create_layout`). This determines how many units the element (circuit outcome - probability) dimension is divided into, and doesn't have to correclate with the number of - processors. When multiple processors are used, if `num_atoms` is less than the number of - processors then `num_atoms` should divide the number of processors evenly, so that - `num_atoms // num_procs` groups of processors can be used to divide the computation - over parameter dimensions. 
- - processor_grid : tuple optional - Specifies how the total number of processors should be divided into a number of - atom-processors, 1st-parameter-deriv-processors, and 2nd-parameter-deriv-processors. - Each level of specification is optional, so this can be a 1-, 2-, or 3- tuple of - integers (or None). Multiplying the elements of `processor_grid` together should give - at most the total number of processors. - - param_blk_sizes : tuple, optional - The parameter block sizes along the first or first & second parameter dimensions - so - this can be a 0-, 1- or 2-tuple of integers or `None` values. A block size of `None` - means that there should be no division into blocks, and that each block processor - computes all of its parameter indices at once. - """ - - @classmethod - def _array_types_for_method(cls, method_name): - # The array types of *intermediate* or *returned* values within various class methods (for memory estimates) - if method_name == '_bulk_fill_probs_block': return cls._array_types_for_method('_compute_product_cache') - if method_name == '_bulk_fill_dprobs_block': - return cls._array_types_for_method('_compute_product_cache') \ - + cls._array_types_for_method('_compute_dproduct_cache') - if method_name == '_bulk_fill_hprobs_block': - return cls._array_types_for_method('_compute_product_cache') \ - + cls._array_types_for_method('_compute_dproduct_cache') \ - + cls._array_types_for_method('_compute_hproduct_cache') - - if method_name == '_compute_product_cache': return ('zdd', 'z', 'z') # cache of gates, scales, and scaleVals - if method_name == '_compute_dproduct_cache': return ('zddb',) # cache x dim x dim x distributed_nparams - if method_name == '_compute_hproduct_cache': return ('zddbb',) # cache x dim x dim x dist_np1 x dist_np2 - return super()._array_types_for_method(method_name) - - def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None, - param_blk_sizes=None): - super().__init__(model, 
num_atoms, processor_grid, param_blk_sizes) - self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent" - self.swap_gate_superop = unitary_to_superop(internal_gate_unitaries()["SWAP"]) - - # We are also going to set up lanes to use. - - # Fix it to 5 qubits. - self._lanes_used = {0: {0}, 1: {1}, 2: {2,3}, 3: {4}} - self._qubits_to_lanes = {0: 0, 1: 1, 2:2, 3:2, 4:3} - - - def _to_nice_serialization(self): - state = super()._to_nice_serialization() - state.update({'mode': self._mode, - # (don't serialize parent model or processor distribution info) - }) - return state - - @classmethod - def _from_nice_serialization(cls, state): - #Note: resets processor-distribution information - return cls(None, state['mode'] == "distribute_by_timestamp") - - def copy(self): - """ - Return a shallow copy of this MatrixForwardSimulator - - Returns - ------- - MatrixForwardSimulator - """ - return MatrixForwardSimulator(self.model) - - def _compute_product_cache(self, layout_atom_tree, resource_alloc): - """ - Computes an array of operation sequence products (process matrices). - - Note: will *not* parallelize computation: parallelization should be - done at a higher level. - """ - dim = self.model.evotype.minimal_dim(self.model.state_space) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. - - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - # This is the maximum size any operator can be. However, we are going to make use of the minimum size. - prodCache = _np.zeros((cacheSize, dim, dim), 'd') - prodCache = [[] for _ in range(cacheSize)] # Build the cache dynamically. 
- scaleCache = _np.zeros(cacheSize, 'd') - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - prodCache[iDest] = _np.identity(dim) - # Note: scaleCache[i] = 0.0 from initialization - else: - small_gate = 1 - if isinstance(opLabel, LabelTup): - small_gate = self.model.operation_blks["gates"][opLabel].to_dense(on_space="minimal") - # We know that this operator is the whole lane. - - qubits = opLabel.qubits - if len(qubits) == 2: - if qubits[0] > qubits[1]: - # We need to swap. - small_gate = self.swap_gate_superop.T @ small_gate @ self.swap_gate_superop - - elif isinstance(opLabel, LabelTupTup): - # We need to iterate through this operator in order to build up the right system. - for ind in range(len(opLabel)): - next_matrix = self.model.operation_blks["gates"][opLabel[ind]].to_dense(on_space="minimal") - # Do we need to insert the swap gates? - qubits = opLabel[ind].qubits - if len(qubits) == 2: - if qubits[0] > qubits[1]: - # We need to swap. 
- next_matrix = self.swap_gate_superop.T @ next_matrix @ self.swap_gate_superop - - small_gate = _np.kron(small_gate, next_matrix) - # gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense(on_space='minimal') - nG = max(_nla.norm(small_gate), 1.0) - prodCache[iDest] = small_gate / nG - scaleCache[iDest] = _np.log(nG) - continue - - # combine iLeft + iRight => iDest - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prodCache[iLeft], prodCache[iRight] - prodCache[iDest] = L @ R - scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] - - if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: - nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) - nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) - sL, sR = L / nL, R / nR - prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) - - - if __debug__: - # So that it can be optimized out when called with -o. - - for i in range(cacheSize): - # since all scaled gates start with norm <= 1, products should all have norm <= 1 - assert len((~_np.isfinite(prodCache[i])).nonzero()[0]) == 0 - - return prodCache, scaleCache - - def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, - resource_alloc=None, wrt_slice=None, profiler=None): - """ - Computes a tree of product derivatives in a linear cache space. Will - use derivative columns to parallelize computation. 
- """ - - if profiler is None: profiler = _dummy_profiler - dim = self.model.evotype.minimal_dim(self.model.state_space) - nDerivCols = self.model.num_params if (wrt_slice is None) \ - else _slct.length(wrt_slice) - deriv_shape = (nDerivCols, dim, dim) - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. - - ## ------------------------------------------------------------------ - # - ##print("MPI: _compute_dproduct_cache begin: %d deriv cols" % nDerivCols) - #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: - # #print("MPI: _compute_dproduct_cache called w/comm size %d" % comm.Get_size()) - # # parallelize of deriv cols, then sub-trees (if available and necessary) - # - # if resource_alloc.comm.Get_size() > nDerivCols: - # - # #If there are more processors than deriv cols, give a - # # warning -- note that we *cannot* make use of a tree being - # # split because there's no good way to reconstruct the - # # *non-final* parent-tree elements from those of the sub-trees. 
- # _warnings.warn("Increased speed could be obtained by giving dproduct cache computation" - # " *fewer* processors, as there are more cpus than derivative columns.") - # - # # Use comm to distribute columns - # allDerivColSlice = slice(0, nDerivCols) if (wrt_slice is None) else wrt_slice - # _, myDerivColSlice, _, sub_resource_alloc = \ - # _mpit.distribute_slice(allDerivColSlice, resource_alloc.comm) - # #print("MPI: _compute_dproduct_cache over %d cols (%s) (rank %d computing %s)" \ - # # % (nDerivCols, str(allDerivColIndices), comm.Get_rank(), str(myDerivColIndices))) - # if sub_resource_alloc is not None and sub_resource_alloc.comm is not None \ - # and sub_resource_alloc.comm.Get_size() > 1: - # _warnings.warn("Too many processors to make use of in " - # " _compute_dproduct_cache.") - # if sub_resource_alloc.comm.Get_rank() > 0: myDerivColSlice = slice(0, 0) - # #don't compute anything on "extra", i.e. rank != 0, cpus - # - # my_results = self._compute_dproduct_cache( - # layout_atom_tree, prod_cache, scale_cache, None, myDerivColSlice, profiler) - # # pass None as comm, *not* mySubComm, since we can't do any - # # further parallelization - # - # tm = _time.time() - # all_results = resource_alloc.comm.allgather(my_results) - # profiler.add_time("MPI IPC", tm) - # return _np.concatenate(all_results, axis=1) # TODO: remove this concat w/better gather? 
- # - ## ------------------------------------------------------------------ - - tSerialStart = _time.time() - dProdCache = _np.zeros((cacheSize,) + deriv_shape) - wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - dProdCache[iDest] = _np.zeros(deriv_shape) - else: - #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices) - doperation = self._doperation(opLabel, wrt_filter=wrtIndices) - dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest]) - continue - - tm = _time.time() - - # combine iLeft + iRight => i - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prod_cache[iLeft], prod_cache[iRight] - dL, dR = dProdCache[iLeft], dProdCache[iRight] - dProdCache[iDest] = _np.dot(dL, R) + \ - _np.swapaxes(_np.dot(L, dR), 0, 1) # dot(dS, T) + dot(S, dT) - profiler.add_time("compute_dproduct_cache: dots", tm) - profiler.add_count("compute_dproduct_cache: dots") - - scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) - if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
- dProdCache[iDest] /= _np.exp(scale) - if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL: - _warnings.warn("Scaled dProd small in order to keep prod managable.") - elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL - and dProdCache[iDest].min() > -_DSMALL): - _warnings.warn("Would have scaled dProd but now will not alter scale_cache.") - - #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier() - - profiler.add_time("compute_dproduct_cache: serial", tSerialStart) - profiler.add_count("compute_dproduct_cache: num columns", nDerivCols) - - return dProdCache - - def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, - d_prod_cache2, scale_cache, resource_alloc=None, - wrt_slice1=None, wrt_slice2=None): - """ - Computes a tree of product 2nd derivatives in a linear cache space. Will - use derivative rows and columns to parallelize computation. - """ - - dim = self.model.evotype.minimal_dim(self.model.state_space) - - # Note: dProdCache?.shape = (#circuits,#params_to_diff_wrt,dim,dim) - nDerivCols1 = d_prod_cache1.shape[1] - nDerivCols2 = d_prod_cache2.shape[1] - assert(wrt_slice1 is None or _slct.length(wrt_slice1) == nDerivCols1) - assert(wrt_slice2 is None or _slct.length(wrt_slice2) == nDerivCols2) - hessn_shape = (nDerivCols1, nDerivCols2, dim, dim) - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. 
- - ## ------------------------------------------------------------------ - # - #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: - # # parallelize of deriv cols, then sub-trees (if available and necessary) - # - # if resource_alloc.comm.Get_size() > nDerivCols1 * nDerivCols2: - # #If there are more processors than deriv cells, give a - # # warning -- note that we *cannot* make use of a tree being - # # split because there's no good way to reconstruct the - # # *non-final* parent-tree elements from those of the sub-trees. - # _warnings.warn("Increased speed could be obtained" - # " by giving hproduct cache computation" - # " *fewer* processors and *smaller* (sub-)tree" - # " (e.g. by splitting tree beforehand), as there" - # " are more cpus than hessian elements.") # pragma: no cover - # - # # allocate final result memory - # hProdCache = _np.zeros((cacheSize,) + hessn_shape) - # - # # Use comm to distribute columns - # allDeriv1ColSlice = slice(0, nDerivCols1) - # allDeriv2ColSlice = slice(0, nDerivCols2) - # deriv1Slices, myDeriv1ColSlice, deriv1Owners, mySubComm = \ - # _mpit.distribute_slice(allDeriv1ColSlice, resource_alloc.comm) - # - # # Get slice into entire range of model params so that - # # per-gate hessians can be computed properly - # if wrt_slice1 is not None and wrt_slice1.start is not None: - # myHessianSlice1 = _slct.shift(myDeriv1ColSlice, wrt_slice1.start) - # else: myHessianSlice1 = myDeriv1ColSlice - # - # #print("MPI: _compute_hproduct_cache over %d cols (rank %d computing %s)" \ - # # % (nDerivCols2, comm.Get_rank(), str(myDerivColSlice))) - # - # if mySubComm is not None and mySubComm.Get_size() > 1: - # deriv2Slices, myDeriv2ColSlice, deriv2Owners, mySubSubComm = \ - # _mpit.distribute_slice(allDeriv2ColSlice, mySubComm) - # - # # Get slice into entire range of model params (see above) - # if wrt_slice2 is not None and wrt_slice2.start is not None: - # myHessianSlice2 = 
_slct.shift(myDeriv2ColSlice, wrt_slice2.start) - # else: myHessianSlice2 = myDeriv2ColSlice - # - # if mySubSubComm is not None and mySubSubComm.Get_size() > 1: - # _warnings.warn("Too many processors to make use of in " - # " _compute_hproduct_cache.") - # #TODO: remove: not needed now that we track owners - # #if mySubSubComm.Get_rank() > 0: myDeriv2ColSlice = slice(0,0) - # # #don't compute anything on "extra", i.e. rank != 0, cpus - # - # hProdCache[:, myDeriv1ColSlice, myDeriv2ColSlice] = self._compute_hproduct_cache( - # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], - # d_prod_cache2[:, myDeriv2ColSlice], scale_cache, None, myHessianSlice1, myHessianSlice2) - # # pass None as comm, *not* mySubSubComm, since we can't do any further parallelization - # - # #NOTE: we only need to gather to the root processor (TODO: update this) - # _mpit.gather_slices(deriv2Slices, deriv2Owners, hProdCache, [None, myDeriv1ColSlice], - # 2, mySubComm) # , gather_mem_limit) #gather over col-distribution (Deriv2) - # #note: gathering axis 2 of hProdCache[:,myDeriv1ColSlice], - # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) - # else: - # #compute "Deriv1" row-derivatives distribution only; don't use column distribution - # hProdCache[:, myDeriv1ColSlice] = self._compute_hproduct_cache( - # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], d_prod_cache2, - # scale_cache, None, myHessianSlice1, wrt_slice2) - # # pass None as comm, *not* mySubComm (this is ok, see "if" condition above) - # - # #NOTE: we only need to gather to the root processor (TODO: update this) - # _mpit.gather_slices(deriv1Slices, deriv1Owners, hProdCache, [], 1, resource_alloc.comm) - # #, gather_mem_limit) #gather over row-distribution (Deriv1) - # #note: gathering axis 1 of hProdCache, - # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) - # - # return hProdCache - # - ## ------------------------------------------------------------------ - - hProdCache = 
_np.zeros((cacheSize,) + hessn_shape) - wrtIndices1 = _slct.indices(wrt_slice1) if (wrt_slice1 is not None) else None - wrtIndices2 = _slct.indices(wrt_slice2) if (wrt_slice2 is not None) else None - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - hProdCache[iDest] = _np.zeros(hessn_shape) - elif not self.model.circuit_layer_operator(opLabel, 'op').has_nonzero_hessian(): - #all gate elements are at most linear in params, so - # all hessians for single- or zero-circuits are zero. - hProdCache[iDest] = _np.zeros(hessn_shape) - else: - hoperation = self._hoperation(opLabel, - wrt_filter1=wrtIndices1, - wrt_filter2=wrtIndices2) - hProdCache[iDest] = hoperation / _np.exp(scale_cache[iDest]) - continue - - # combine iLeft + iRight => i - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (Dest,iLeft,iRight,iFinal) = tup implies circuit[iDest] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prod_cache[iLeft], prod_cache[iRight] - dL1, dR1 = d_prod_cache1[iLeft], d_prod_cache1[iRight] - dL2, dR2 = d_prod_cache2[iLeft], d_prod_cache2[iRight] - hL, hR = hProdCache[iLeft], hProdCache[iRight] - # Note: L, R = GxG ; dL,dR = vgs x GxG ; hL,hR = vgs x vgs x GxG - - dLdRa = _np.swapaxes(_np.dot(dL1, dR2), 1, 2) - dLdRb = _np.swapaxes(_np.dot(dL2, dR1), 1, 2) - dLdR_sym = dLdRa + _np.swapaxes(dLdRb, 0, 1) - - hProdCache[iDest] = _np.dot(hL, R) + dLdR_sym + _np.transpose(_np.dot(L, hR), (1, 2, 0, 3)) - - scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) - if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
- hProdCache[iDest] /= _np.exp(scale) - if hProdCache[iDest].max() < _HSMALL and hProdCache[iDest].min() > -_HSMALL: - _warnings.warn("Scaled hProd small in order to keep prod managable.") - elif (_np.count_nonzero(hProdCache[iDest]) and hProdCache[iDest].max() < _HSMALL - and hProdCache[iDest].min() > -_HSMALL): - _warnings.warn("hProd is small (oh well!).") - - return hProdCache - - def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), - derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): - """ - Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. - - Parameters - ---------- - circuits : list - The circuits whose outcome probabilities should be included in the layout. - - dataset : DataSet - The source of data counts that will be compared to the circuit outcome - probabilities. The computed outcome probabilities are limited to those - with counts present in `dataset`. - - resource_alloc : ResourceAllocation - A available resources and allocation information. These factors influence how - the layout (evaluation strategy) is constructed. - - array_types : tuple, optional - A tuple of string-valued array types. See :meth:`ForwardSimulator.create_layout`. - - derivative_dimensions : int or tuple[int], optional - Optionally, the parameter-space dimension used when taking first - and second derivatives with respect to the cirucit outcome probabilities. This must be - non-None when `array_types` contains `'ep'` or `'epp'` types. - If a tuple, then must be length 1. - - verbosity : int or VerbosityPrinter - Determines how much output to send to stdout. 0 means no output, higher - integers mean more output. - - layout_creation_circuit_cache : dict, optional (default None) - A precomputed dictionary serving as a cache for completed - circuits. I.e. circuits with prep labels and POVM labels appended. 
- Along with other useful pre-computed circuit structures used in layout - creation. - - Returns - ------- - MatrixCOPALayout - """ - # There are two types of quantities we adjust to create a good layout: "group-counts" and "processor-counts" - # - group counts: natoms, nblks, nblks2 give how many indpendently computed groups/ranges of circuits, - # 1st parameters, and 2nd parameters are used. Making these larger can reduce memory - # consumption by reducing intermediate memory usage. - # - processor counts: na, np, np2 give how many "atom-processors", "param-processors" and "param2-processors" - # are used to process data along each given direction. These values essentially specify - # how the physical procesors are divided by giving the number of (roughly equal) intervals - # exist along each dimension of the physical processor "grid". Thus, thees values are set - # based on the total number of cores available and how many dimensions are being computed. - - resource_alloc = _ResourceAllocation.cast(resource_alloc) - mem_limit = resource_alloc.mem_limit - resource_alloc.allocated_memory \ - if (resource_alloc.mem_limit is not None) else None # *per-processor* memory limit - printer = _VerbosityPrinter.create_printer(verbosity, resource_alloc) - nprocs = resource_alloc.comm_size - comm = resource_alloc.comm - if isinstance(derivative_dimensions, int): - num_params = derivative_dimensions - elif isinstance(derivative_dimensions, tuple): - assert len(derivative_dimensions) == 1 - num_params = derivative_dimensions[0] - else: - num_params = self.model.num_params - C = 1.0 / (1024.0**3) - - if mem_limit is not None: - if mem_limit <= 0: - raise MemoryError("Attempted layout creation w/memory limit = %g <= 0!" 
% mem_limit) - printer.log("Layout creation w/mem limit = %.2fGB" % (mem_limit * C)) - - natoms, na, npp, param_dimensions, param_blk_sizes = self._compute_processor_distribution( - array_types, nprocs, num_params, len(circuits), default_natoms=1) - - if self._mode == "distribute_by_timestamp": - #Special case: time dependent data that gets grouped & distributed by unique timestamp - # To to this, we override above values of natoms, na, and npp: - natoms = 1 # save all processor division for within the (single) atom, for different timestamps - na, npp = 1, (1, 1) # save all processor division for within the (single) atom, for different timestamps - - printer.log("MatrixLayout: %d processors divided into %s (= %d) grid along circuit and parameter directions." % - (nprocs, ' x '.join(map(str, (na,) + npp)), _np.prod((na,) + npp))) - printer.log(" %d atoms, parameter block size limits %s" % (natoms, str(param_blk_sizes))) - assert(_np.prod((na,) + npp) <= nprocs), "Processor grid size exceeds available processors!" 
- - layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, - na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, - layout_creation_circuit_cache=layout_creation_circuit_cache) - - if mem_limit is not None: - loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 - loc_nparams2 = num_params / npp[1] if len(npp) > 1 else 0 - blk1 = param_blk_sizes[0] if len(param_blk_sizes) > 0 else 0 - blk2 = param_blk_sizes[1] if len(param_blk_sizes) > 1 else 0 - if blk1 is None: blk1 = loc_nparams1 - if blk2 is None: blk2 = loc_nparams2 - global_layout = layout.global_layout - if comm is not None: - from mpi4py import MPI - max_local_els = comm.allreduce(layout.num_elements, op=MPI.MAX) # layout.max_atom_elements - max_atom_els = comm.allreduce(layout.max_atom_elements, op=MPI.MAX) - max_local_circuits = comm.allreduce(layout.num_circuits, op=MPI.MAX) - max_atom_cachesize = comm.allreduce(layout.max_atom_cachesize, op=MPI.MAX) - else: - max_local_els = layout.num_elements - max_atom_els = layout.max_atom_elements - max_local_circuits = layout.num_circuits - max_atom_cachesize = layout.max_atom_cachesize - mem_estimate = _bytes_for_array_types(array_types, global_layout.num_elements, max_local_els, max_atom_els, - global_layout.num_circuits, max_local_circuits, - layout._param_dimensions, (loc_nparams1, loc_nparams2), - (blk1, blk2), max_atom_cachesize, - self.model.evotype.minimal_dim(self.model.state_space)) - - GB = 1.0 / 1024.0**3 - if mem_estimate > mem_limit: - raise MemoryError("Not enough memory for desired layout! (limit=%.1fGB, required=%.1fGB)" % ( - mem_limit * GB, mem_estimate * GB)) - else: - printer.log(" Esimated memory required = %.1fGB" % (mem_estimate * GB)) - - return layout - - @staticmethod - def create_copa_layout_circuit_cache(circuits, model, dataset=None): - """ - Helper function for pre-computing/pre-processing circuits structures - used in matrix layout creation. 
- """ - cache = dict() - completed_circuits, split_circuits = model.complete_circuits(circuits, return_split=True) - - cache['completed_circuits'] = {ckt: comp_ckt for ckt, comp_ckt in zip(circuits, completed_circuits)} - cache['split_circuits'] = {ckt: split_ckt for ckt, split_ckt in zip(circuits, split_circuits)} - - if dataset is not None: - aliases = circuits.op_label_aliases if isinstance(circuits, _CircuitList) else None - ds_circuits = _lt.apply_aliases_to_circuits(circuits, aliases) - unique_outcomes_list = [] - for ckt in ds_circuits: - ds_row = dataset[ckt] - unique_outcomes_list.append(ds_row.unique_outcomes if ds_row is not None else None) - else: - unique_outcomes_list = [None]*len(circuits) - - expanded_circuit_outcome_list = model.bulk_expand_instruments_and_separate_povm(circuits, - observed_outcomes_list = unique_outcomes_list, - split_circuits = split_circuits) - - expanded_circuit_cache = {ckt: expanded_ckt for ckt,expanded_ckt in zip(circuits, expanded_circuit_outcome_list)} - - cache['expanded_and_separated_circuits'] = expanded_circuit_cache - - expanded_subcircuits_no_spam_cache = dict() - for expc_outcomes in cache['expanded_and_separated_circuits'].values(): - for sep_povm_c, _ in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit - exp_nospam_c = sep_povm_c.circuit_without_povm[1:] - expanded_subcircuits_no_spam_cache[exp_nospam_c] = exp_nospam_c.expand_subcircuits() - - cache['expanded_subcircuits_no_spam'] = expanded_subcircuits_no_spam_cache - - return cache - - def _scale_exp(self, scale_exps): - old_err = _np.seterr(over='ignore') - scaleVals = _np.exp(scale_exps) # may overflow, but OK if infs occur here - _np.seterr(**old_err) - return scaleVals - - def _rho_e_from_spam_tuple(self, spam_tuple): - # This calculator uses the convention that rho has shape (N,1) - rholabel, elabel = spam_tuple - rho = self.model.circuit_layer_operator(rholabel, 'prep').to_dense(on_space='minimal')[:, None] - E = 
_np.conjugate(_np.transpose(self.model.circuit_layer_operator( - elabel, 'povm').to_dense(on_space='minimal')[:, None])) - return rho, E - - def _probs_from_rho_e(self, rho, e, gs, scale_vals): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - #Compute probability and save in return array - # want vp[iFinal] = float(dot(e, dot(G, rho))) - # vp[i] = sum_k,l e[0,k] gs[i,k,l] rho[l,0] * scale_vals[i] - # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] - # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] - # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals - return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals - # shape == (len(circuit_list),) ; may overflow but OK - - def _dprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs, scale_vals, wrt_slice=None): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - rholabel, elabel = spam_tuple - rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are - EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects - nCircuits = gs.shape[0] - - nDerivCols = self.model.num_params if wrt_slice is None else _slct.length(wrt_slice) - - # GATE DERIVS (assume d_gs is already sized/filtered) ------------------- - assert(d_gs.shape[1] == nDerivCols), "d_gs must be pre-filtered!" 
- - #Compute d(probability)/dOps and save in return list (now have G,dG => product, dprod_dOps) - # prod, dprod_dOps = G,dG - # dp_dOps[i,j] = sum_k,l e[0,k] d_gs[i,j,k,l] rho[l,0] - # dp_dOps[i,j] = sum_k e[0,k] dot( d_gs, rho )[i,j,k,0] - # dp_dOps[i,j] = dot( e, dot( d_gs, rho ) )[0,i,j,0] - # dp_dOps = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,3)) - old_err2 = _np.seterr(invalid='ignore', over='ignore') - path = _np.einsum_path('hk,ijkl,lm->ij', e, d_gs, rho, optimize='optimal') - dp_dOps = _np.einsum('hk,ijkl,lm->ij', e, d_gs, rho, optimize=path[0]) * scale_vals[:, None] - _np.seterr(**old_err2) - # may overflow, but OK ; shape == (len(circuit_list), nDerivCols) - # may also give invalid value due to scale_vals being inf and dot-prod being 0. In - # this case set to zero since we can't tell whether it's + or - inf anyway... - dp_dOps[_np.isnan(dp_dOps)] = 0 - - #SPAM ------------- - - if self.model._param_interposer is not None: - #When there is an interposer, we compute derivs wrt *all* the ops params (inefficient?), - # then apply interposer, then take desired wrt_filter columns: - nOpDerivCols = self.model._param_interposer.num_op_params - - dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) - _fas(dp_drhos, [None, rhoVec.gpindices], - _np.squeeze(_np.dot(_np.dot(e, gs), rhoVec.deriv_wrt_params()), # *don't* apply wrt filter here - axis=(0,)) * scale_vals[:, None]) # may overflow, but OK - dp_drhos = _np.dot(dp_drhos, self.model._param_interposer.deriv_op_params_wrt_model_params()) - if wrt_slice is not None: dp_drhos = dp_drhos[:, wrt_slice] - - dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) - dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] - _fas(dp_dEs, [None, EVec.gpindices], _np.dot(dp_dAnyE, EVec.deriv_wrt_params())) - dp_dEs = _np.dot(dp_dEs, self.model._param_interposer.deriv_op_params_wrt_model_params()) - if wrt_slice is not None: dp_dEs = dp_dEs[:, wrt_slice] - - else: - #Simpler case of no interposer - nOpDerivCols = 
nDerivCols - - rho_wrtFilter, rho_gpindices = self._process_wrt_filter( - wrt_slice, self.model.circuit_layer_operator(rholabel, 'prep')) - E_wrtFilter, E_gpindices = self._process_wrt_filter( - wrt_slice, self.model.circuit_layer_operator(elabel, 'povm')) - - # Get: dp_drhos[i, rho_gpindices] = dot(e,gs[i],drho/drhoP) - # dp_drhos[i,J0+J] = sum_kl e[0,k] gs[i,k,l] drhoP[l,J] - # dp_drhos[i,J0+J] = dot(e, gs, drhoP)[0,i,J] - # dp_drhos[:,J0+J] = squeeze(dot(e, gs, drhoP),axis=(0,))[:,J] - dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) - _fas(dp_drhos, [None, rho_gpindices], - _np.squeeze(_np.dot(_np.dot(e, gs), - rhoVec.deriv_wrt_params(rho_wrtFilter)), - axis=(0,)) * scale_vals[:, None]) # may overflow, but OK - - # Get: dp_dEs[i, E_gpindices] = dot(transpose(dE/dEP),gs[i],rho)) - # dp_dEs[i,J0+J] = sum_lj dEPT[J,j] gs[i,j,l] rho[l,0] - # dp_dEs[i,J0+J] = sum_j dEP[j,J] dot(gs, rho)[i,j] - # dp_dEs[i,J0+J] = sum_j dot(gs, rho)[i,j,0] dEP[j,J] - # dp_dEs[i,J0+J] = dot(squeeze(dot(gs, rho),2), dEP)[i,J] - # dp_dEs[:,J0+J] = dot(squeeze(dot(gs, rho),axis=(2,)), dEP)[:,J] - dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) - # may overflow, but OK (deriv w.r.t any of self.effects - independent of which) - dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] - _fas(dp_dEs, [None, E_gpindices], - _np.dot(dp_dAnyE, EVec.deriv_wrt_params(E_wrtFilter))) - - sub_vdp = dp_drhos + dp_dEs + dp_dOps - return sub_vdp - - def _hprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs1, d_gs2, h_gs, scale_vals, - wrt_slice1=None, wrt_slice2=None): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - rholabel, elabel = spam_tuple - rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are - EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects - nCircuits = gs.shape[0] - - rho_wrtFilter1, rho_gpindices1 = 
self._process_wrt_filter( - wrt_slice1, self.model.circuit_layer_operator(rholabel, 'prep')) - rho_wrtFilter2, rho_gpindices2 = self._process_wrt_filter( - wrt_slice2, self.model.circuit_layer_operator(rholabel, 'prep')) - E_wrtFilter1, E_gpindices1 = self._process_wrt_filter( - wrt_slice1, self.model.circuit_layer_operator(elabel, 'povm')) - E_wrtFilter2, E_gpindices2 = self._process_wrt_filter( - wrt_slice2, self.model.circuit_layer_operator(elabel, 'povm')) - - nDerivCols1 = self.model.num_params if wrt_slice1 is None else _slct.length(wrt_slice1) - nDerivCols2 = self.model.num_params if wrt_slice2 is None else _slct.length(wrt_slice2) - - #flt1 = self._get_filter_info(wrtSlices1) - #flt2 = self._get_filter_info(wrtSlices2) - - # GATE DERIVS (assume h_gs is already sized/filtered) ------------------- - assert(h_gs.shape[1] == nDerivCols1), "h_gs must be pre-filtered!" - assert(h_gs.shape[2] == nDerivCols2), "h_gs must be pre-filtered!" - - #Compute d2(probability)/dGates2 and save in return list - # d2pr_dOps2[i,j,k] = sum_l,m e[0,l] h_gs[i,j,k,l,m] rho[m,0] - # d2pr_dOps2[i,j,k] = sum_l e[0,l] dot( d_gs, rho )[i,j,k,l,0] - # d2pr_dOps2[i,j,k] = dot( e, dot( d_gs, rho ) )[0,i,j,k,0] - # d2pr_dOps2 = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,4)) - old_err2 = _np.seterr(invalid='ignore', over='ignore') - d2pr_dOps2 = _np.squeeze(_np.dot(e, _np.dot(h_gs, rho)), axis=(0, 4)) * scale_vals[:, None, None] - _np.seterr(**old_err2) - - # may overflow, but OK ; shape == (len(circuit_list), nDerivCols, nDerivCols) - # may also give invalid value due to scale_vals being inf and dot-prod being 0. In - # this case set to zero since we can't tell whether it's + or - inf anyway... - d2pr_dOps2[_np.isnan(d2pr_dOps2)] = 0 - - # SPAM DERIVS (assume d_gs1 and d_gs2 are already sized/filtered) -------- - assert(d_gs1.shape[1] == nDerivCols1), "d_gs1 must be pre-filtered!" - assert(d_gs2.shape[1] == nDerivCols2), "d_gs1 must be pre-filtered!" 
- - # Get: d2pr_drhos[i, j, rho_gpindices] = dot(e,d_gs[i,j],drho/drhoP)) - # d2pr_drhos[i,j,J0+J] = sum_kl e[0,k] d_gs[i,j,k,l] drhoP[l,J] - # d2pr_drhos[i,j,J0+J] = dot(e, d_gs, drhoP)[0,i,j,J] - # d2pr_drhos[:,:,J0+J] = squeeze(dot(e, d_gs, drhoP),axis=(0,))[:,:,J] - drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) - d2pr_drhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_drhos1, [None, None, rho_gpindices2], - _np.squeeze(_np.dot(_np.dot(e, d_gs1), drho), axis=(0,)) - * scale_vals[:, None, None]) # overflow OK - - # get d2pr_drhos where gate derivatives are wrt the 2nd set of gate parameters - if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? - assert(nDerivCols1 == nDerivCols2) - d2pr_drhos2 = _np.transpose(d2pr_drhos1, (0, 2, 1)) - else: - drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) - d2pr_drhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - _fas(d2pr_drhos2, [None, None, rho_gpindices1], - _np.squeeze(_np.dot(_np.dot(e, d_gs2), drho), axis=(0,)) - * scale_vals[:, None, None]) # overflow OK - d2pr_drhos2 = _np.transpose(d2pr_drhos2, (0, 2, 1)) - - # Get: d2pr_dEs[i, j, E_gpindices] = dot(transpose(dE/dEP),d_gs[i,j],rho) - # d2pr_dEs[i,j,J0+J] = sum_kl dEPT[J,k] d_gs[i,j,k,l] rho[l,0] - # d2pr_dEs[i,j,J0+J] = sum_k dEP[k,J] dot(d_gs, rho)[i,j,k,0] - # d2pr_dEs[i,j,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[i,j,J] - # d2pr_dEs[:,:,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[:,:,J] - d2pr_dEs1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - dp_dAnyE = _np.squeeze(_np.dot(d_gs1, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter2) - _fas(d2pr_dEs1, [None, None, E_gpindices2], - _np.dot(dp_dAnyE, devec)) - - # get d2pr_dEs where gate derivatives are wrt the 2nd set of gate parameters - if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? 
- assert(nDerivCols1 == nDerivCols2) - d2pr_dEs2 = _np.transpose(d2pr_dEs1, (0, 2, 1)) - else: - d2pr_dEs2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - dp_dAnyE = _np.squeeze(_np.dot(d_gs2, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter1) - _fas(d2pr_dEs2, [None, None, E_gpindices1], _np.dot(dp_dAnyE, devec)) - d2pr_dEs2 = _np.transpose(d2pr_dEs2, (0, 2, 1)) - - # Get: d2pr_dErhos[i, e_offset[eIndex]:e_offset[eIndex+1], e_offset[rhoIndex]:e_offset[rhoIndex+1]] = - # dEP^T * prod[i,:,:] * drhoP - # d2pr_dErhos[i,J0+J,K0+K] = sum jk dEPT[J,j] prod[i,j,k] drhoP[k,K] - # d2pr_dErhos[i,J0+J,K0+K] = sum j dEPT[J,j] dot(prod,drhoP)[i,j,K] - # d2pr_dErhos[i,J0+J,K0+K] = dot(dEPT,prod,drhoP)[J,i,K] - # d2pr_dErhos[i,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[i,J,K] - # d2pr_dErhos[:,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[:,J,K] - d2pr_dErhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) - dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter1) - _fas(d2pr_dErhos1, (None, E_gpindices1, rho_gpindices2), - _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) - - # get d2pr_dEs where e derivatives are wrt the 2nd set of gate parameters - if wrt_slice1 == wrt_slice2: # Note: this doesn't involve gate derivatives - d2pr_dErhos2 = _np.transpose(d2pr_dErhos1, (0, 2, 1)) - else: - d2pr_dErhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) - dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter2) - _fas(d2pr_dErhos2, [None, E_gpindices2, rho_gpindices1], - _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) - d2pr_dErhos2 = _np.transpose(d2pr_dErhos2, (0, 2, 1)) - - #Note: these 2nd derivatives are non-zero when the spam vectors have - # a more than linear 
dependence on their parameters. - if self.model.circuit_layer_operator(rholabel, 'prep').has_nonzero_hessian(): - dp_dAnyRho = _np.dot(e, gs).squeeze(0) * scale_vals[:, None] # overflow OK - d2pr_d2rhos = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_d2rhos, [None, rho_gpindices1, rho_gpindices2], - _np.tensordot(dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep').hessian_wrt_params( - rho_wrtFilter1, rho_wrtFilter2), (1, 0))) - # _np.einsum('ij,jkl->ikl', dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep') \ - # .hessian_wrt_params(rho_wrtFilter1, rho_wrtFilter2)) - else: - d2pr_d2rhos = 0 - - if self.model.circuit_layer_operator(elabel, 'povm').has_nonzero_hessian(): - dp_dAnyE = _np.dot(gs, rho).squeeze(2) * scale_vals[:, None] # overflow OK - d2pr_d2Es = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_d2Es, [None, E_gpindices1, E_gpindices2], - _np.tensordot(dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( - E_wrtFilter1, E_wrtFilter2), (1, 0))) - # _np.einsum('ij,jkl->ikl', dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( - # E_wrtFilter1, E_wrtFilter2)) - else: - d2pr_d2Es = 0 - - # END SPAM DERIVS ----------------------- - - ret = d2pr_d2rhos + d2pr_dErhos2 + d2pr_drhos2 # wrt rho - ret += d2pr_dErhos1 + d2pr_d2Es + d2pr_dEs2 # wrt e - ret += d2pr_drhos1 + d2pr_dEs1 + d2pr_dOps2 # wrt gates - - return ret - - def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): - #Free memory from previous subtree iteration before computing caches - scaleVals = Gs = prodCache = scaleCache = None - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2) # prod cache - - #Fill cache info - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - - if not resource_alloc.is_host_leader: - # (same as "if resource_alloc.host_comm is 
not None and resource_alloc.host_comm.rank != 0") - # we cannot further utilize multiplie processors when computing a single block. The required - # ending condition is that array_to_fill on each processor has been filled. But if memory - # is being shared and resource_alloc contains multiple processors on a single host, we only - # want *one* (the rank=0) processor to perform the computation, since array_to_fill will be - # shared memory that we don't want to have muliple procs using simultaneously to compute the - # same thing. Thus, we just do nothing on all of the non-root host_comm processors. - # We could also print a warning (?), or we could carefully guard any shared mem updates - # using "if resource_alloc.is_host_leader" conditions (if we could use multiple procs elsewhere). - return - - #use cached data to final values - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - # ( n_circuits, dim, dim ) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - # "element indices" index a circuit outcome probability in array_to_fill's first dimension - # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond - # to the the element indices when `spamtuple` is used. 
- # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices], - self._probs_from_rho_e(rho, E, Gs[tree_indices], scaleVals[tree_indices])) - _np.seterr(**old_err) - - def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice)) - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - dProdCache = self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, - resource_alloc, param_slice) - if not resource_alloc.is_host_leader: - return # Non-root host processors aren't used anymore to compute the result on the root proc - - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - dGs = layout_atom.nonscratch_cache_view(dProdCache, axis=0) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices, dest_param_slice], self._dprobs_from_rho_e( - spam_tuple, rho, E, Gs[tree_indices], dGs[tree_indices], scaleVals[tree_indices], param_slice)) - - _np.seterr(**old_err) - - def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1, dest_param_slice2, layout_atom, - param_slice1, param_slice2, resource_alloc): - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2 - * _slct.length(param_slice1) * _slct.length(param_slice2)) - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - dProdCache1 = self._compute_dproduct_cache( - 
layout_atom.tree, prodCache, scaleCache, resource_alloc, param_slice1) # computed on rank=0 only - dProdCache2 = dProdCache1 if (param_slice1 == param_slice2) else \ - self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, - resource_alloc, param_slice2) # computed on rank=0 only - hProdCache = self._compute_hproduct_cache(layout_atom.tree, prodCache, dProdCache1, - dProdCache2, scaleCache, resource_alloc, - param_slice1, param_slice2) # computed on rank=0 only - - if not resource_alloc.is_host_leader: - return # Non-root host processors aren't used anymore to compute the result on the root proc - - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - dGs1 = layout_atom.nonscratch_cache_view(dProdCache1, axis=0) - dGs2 = layout_atom.nonscratch_cache_view(dProdCache2, axis=0) - #( n_circuits, nDerivColsX, dim, dim ) - - hGs = layout_atom.nonscratch_cache_view(hProdCache, axis=0) - #( n_circuits, len(wrt_filter1), len(wrt_filter2), dim, dim ) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices, dest_param_slice1, dest_param_slice2], self._hprobs_from_rho_e( - spam_tuple, rho, E, Gs[tree_indices], dGs1[tree_indices], dGs2[tree_indices], - hGs[tree_indices], scaleVals[tree_indices], param_slice1, param_slice2)) - - _np.seterr(**old_err) - - def bulk_product(self, circuits, scale=False, resource_alloc=None): - """ - Compute the products of many circuits at once. - - Parameters - ---------- - circuits : list of Circuits - The circuits to compute products for. These should *not* have any preparation or - measurement layers. - - scale : bool, optional - When True, return a scaling factor (see below). - - resource_alloc : ResourceAllocation - Available resources for this computation. 
Includes the number of processors - (MPI comm) and memory limit. - - Returns - ------- - prods : numpy array - Array of shape S x G x G, where: - - S == the number of operation sequences - - G == the linear dimension of a operation matrix (G x G operation matrices). - scaleValues : numpy array - Only returned when scale == True. A length-S array specifying - the scaling that needs to be applied to the resulting products - (final_product[i] = scaleValues[i] * prods[i]). - """ - resource_alloc = _ResourceAllocation.cast(resource_alloc) - - # Need to break these circuits down into lanes first. - def compute_subcircuits(circuit, lanes_to_qubits_used, qubits_to_lanes): - - lanes_to_gates = [[] for _ in range(len(lanes_to_qubits_used))] - for layer in circuit: - if isinstance(layer, LabelTupTup): - group = [] - nused = 0 - for op in layer: - qubits_used = op.qubits - lane = qubits_to_lanes[qubits_used[0]] - if nused + len(qubits_used) == len(lanes_to_qubits_used[lane]): - group.append(op) - lanes_to_gates[lane].append(LabelTupTup(tuple(group))) - nused = 0 - group = [] - elif nused + len(qubits_used) < len(lanes_to_qubits_used[lane]): - nused += len(qubits_used) - group.append(op) - else: - raise ValueError("Too many indices") - elif isinstance(layer, LabelTup): - qubits_used = layer.qubits - lanes_to_gates[qubits_to_lanes[qubits_used[0]]] = layer - return lanes_to_gates - - full_list = [] - for cir in circuits: - full_list.append(compute_subcircuits(cir, self._lanes_used, self._qubits_to_lanes)) - - - nCircuits = len(circuits) - - eval_tree = _EvalTree.create(full_list) - prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) - - # Now the cache will also hold the circuit lanes. - # So 0:nCircuits*nLanes will hold all the Gs. 
- # Tensor back up in a [(lane)*nLanes, (lane+1)*nLanes] - - sval = _np.zeros(len(circuits)) - gates = [1 for _ in circuits] - - for ind in range(): - for lane in range(len(self._lanes_used)): - gates[ind] = _np.kron(gates[ind], prodCache[lane + ind*len(self._lanes_used)]) - sval[ind] += scaleCache[lane + ind*len(self._lanes_used)] - - gates = _np.array(gates) - old_err = _np.seterr(over="ignore") - gates *= _np.exp(sval)[:, None, None] - _np.seterr(**old_err) - - - # EvalTree evaluates a "cache" which can contain additional (intermediate) elements - scaleVals = self._scale_exp(scaleCache[0:nCircuits]) - Gs = prodCache[0:nCircuits] - - if scale: - return Gs, scaleVals - else: - old_err = _np.seterr(over='ignore') - Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok - _np.seterr(**old_err) - return Gs - - def bulk_dproduct(self, circuits, flat=False, return_prods=False, - scale=False, resource_alloc=None, wrt_filter=None): - """ - Compute the derivative of a many operation sequences at once. - - Parameters - ---------- - circuits : list of Circuits - The circuits to compute products for. These should *not* have any preparation or - measurement layers. - - flat : bool, optional - Affects the shape of the returned derivative array (see below). - - return_prods : bool, optional - when set to True, additionally return the probabilities. - - scale : bool, optional - When True, return a scaling factor (see below). - - resource_alloc : ResourceAllocation - Available resources for this computation. Includes the number of processors - (MPI comm) and memory limit. - - wrt_filter : list of ints, optional - If not None, a list of integers specifying which gate parameters - to include in the derivative. Each element is an index into an - array of gate parameters ordered by concatenating each gate's - parameters (in the order specified by the model). 
This argument - is used internally for distributing derivative calculations across - multiple processors. - - Returns - ------- - derivs : numpy array - * if flat == False, an array of shape S x M x G x G, where: - - S == len(circuits) - - M == the length of the vectorized model - - G == the linear dimension of a operation matrix (G x G operation matrices) - and derivs[i,j,k,l] holds the derivative of the (k,l)-th entry - of the i-th operation sequence product with respect to the j-th model - parameter. - * if flat == True, an array of shape S*N x M where: - - N == the number of entries in a single flattened gate (ordering same as numpy.flatten), - - S,M == as above, - and deriv[i,j] holds the derivative of the (i % G^2)-th entry of - the (i / G^2)-th flattened operation sequence product with respect to - the j-th model parameter. - products : numpy array - Only returned when return_prods == True. An array of shape - S x G x G; products[i] is the i-th operation sequence product. - scaleVals : numpy array - Only returned when scale == True. An array of shape S such that - scaleVals[i] contains the multiplicative scaling needed for - the derivatives and/or products for the i-th operation sequence. - """ - nCircuits = len(circuits) - nDerivCols = self.model.num_params if (wrt_filter is None) else _slct.length(wrt_filter) - - wrtSlice = _slct.list_to_slice(wrt_filter) if (wrt_filter is not None) else None - #TODO: just allow slices as argument: wrt_filter -> wrtSlice? 
- - resource_alloc = _ResourceAllocation.cast(resource_alloc) - - eval_tree = _EvalTree.create(circuits) - prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) - dProdCache = self._compute_dproduct_cache(eval_tree, prodCache, scaleCache, - resource_alloc.comm, wrtSlice) - - # EvalTree evaluates a "cache" which can contain additional (intermediate) elements - scaleVals = self._scale_exp(scaleCache[0:nCircuits]) - Gs = prodCache[0:nCircuits] - dGs = dProdCache[0:nCircuits] - - if not scale: - old_err = _np.seterr(over='ignore', invalid='ignore') - if return_prods: - Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok - - # may overflow or get nans (invalid), but ok - dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 3) * scaleVals, 0, 3) - # convert nans to zero, as these occur b/c an inf scaleVal is mult by a zero deriv value, and we - dGs[_np.isnan(dGs)] = 0 - _np.seterr(**old_err) - - if flat: - # cols = deriv cols, rows = flattened everything else - dim = self.model.evotype.minimal_dim(self.model.state_space) - dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 1).reshape( - (nDerivCols, nCircuits * dim**2)), 0, 1) - - if return_prods: - return (dGs, Gs, scaleVals) if scale else (dGs, Gs) - else: - return (dGs, scaleVals) if scale else dGs - - ## --------------------------------------------------------------------------------------------- - ## TIME DEPENDENT functionality ---------------------------------------------------------------- - ## --------------------------------------------------------------------------------------------- - - def _ds_quantities(self, timestamp, ds_cache, layout, dataset, TIMETOL=1e-6): - if timestamp not in ds_cache: - if 'truncated_ds' not in ds_cache: - ds_cache['truncated_ds'] = dataset.truncate(layout.circuits) - trunc_dataset = ds_cache['truncated_ds'] - - if 'ds_for_time' not in ds_cache: - #tStart = _time.time() - ds_cache['ds_for_time'] = trunc_dataset.split_by_time() - #print("DB: Split 
dataset by time in %.1fs (%d timestamps)" % (_time.time() - tStart, - # len(ds_cache['ds_for_time']))) - - if timestamp not in ds_cache['ds_for_time']: - return (None, None, None, None, None) - - #Similar to MDC store's add_count_vectors function -- maybe consolidate in FUTURE? - counts = _np.empty(layout.num_elements, 'd') - totals = _np.empty(layout.num_elements, 'd') - dataset_at_t = ds_cache['ds_for_time'][timestamp] # trunc_dataset.time_slice(timestamp, timestamp+TIMETOL) - - firsts = []; indicesOfCircuitsWithOmittedData = [] - for (i, circuit) in enumerate(layout.circuits): # should be 'ds_circuits' really - inds = layout.indices_for_index(i) - if circuit in dataset_at_t: - cnts = dataset_at_t[circuit].counts - else: - cnts = {} # Note: this will cause 0 totals, which will need to be handled downstream - totals[inds] = sum(cnts.values()) # dataset[opStr].total - counts[inds] = [cnts.get(x, 0) for x in layout.outcomes_for_index(i)] - lklen = _slct.length(inds) # consolidate w/ `add_omitted_freqs`? 
- if 0 < lklen < self.model.compute_num_outcomes(circuit): - firsts.append(_slct.to_array(inds)[0]) - indicesOfCircuitsWithOmittedData.append(i) - - if len(firsts) > 0: - firsts = _np.array(firsts, 'i') - indicesOfCircuitsWithOmittedData = _np.array(indicesOfCircuitsWithOmittedData, 'i') - #print("DB: SPARSE DATA: %d of %d rows have sparse data" % (len(firsts), len(layout.circuits))) - else: - firsts = indicesOfCircuitsWithOmittedData = None - - #if self.circuits.circuit_weights is not None: - # SEE add_count_vectors - - nonzero_totals = _np.where(_np.abs(totals) < 1e-10, 1e-10, totals) # avoid divide-by-zero error on nxt line - freqs = counts / nonzero_totals - ds_cache[timestamp] = (counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData) - - return ds_cache[timestamp] - - def _bulk_fill_timedep_objfn(self, raw_objective, array_to_fill, layout, ds_circuits, - num_total_outcomes, dataset, ds_cache=None): - - assert(self._mode == "distribute_by_timestamp"), \ - ("Must set `distribute_by_timestamp=True` to use a " - "time-dependent objective function with MatrixForwardSimulator!") - - resource_alloc = layout.resource_alloc() - atom_resource_alloc = layout.resource_alloc('atom-processing') - atom_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin - - #Split timestamps up between processors - maybe do this in a time-dep layout? 
- all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} - my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ - _mpit.distribute_indices(list(range(len(all_timestamps))), atom_resource_alloc) - shared_mem_leader = timestamp_processing_ralloc.is_host_leader - - probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements,), 'd') - # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then - # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since - #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. - - array_to_fill[:] = 0.0 - my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results - assert(my_array_to_fill.shape == (layout.num_elements,)) - - for timestamp_index in my_timestamp_inds: - timestamp = all_timestamps[timestamp_index] - - # compute objective at time timestamp - counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ - self._ds_quantities(timestamp, ds_cache, layout, dataset) - if counts is None: return # no data at this time => no contribution - - for _, obj in self.model._iter_parameterized_objs(): - obj.set_time(timestamp) - for opcache in self.model._opcaches.values(): - for obj in opcache.values(): - obj.set_time(timestamp) - - for atom in layout.atoms: # layout only holds local atoms - self._bulk_fill_probs_atom(probs_array[atom.element_slice], atom, timestamp_processing_ralloc) - - timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready - # (similar to DistributableForwardSimulator._bulk_fill_probs) - - terms = raw_objective.terms(probs_array, counts, totals, freqs) - if firsts is not None and shared_mem_leader: # consolidate with `_update_terms_for_omitted_probs` - omitted_probs = 1.0 - 
_np.array([_np.sum(probs_array[layout.indices_for_index(i)]) - for i in indicesOfCircuitsWithOmittedData]) - terms[firsts] += raw_objective.zero_freq_terms(totals[firsts], omitted_probs) - timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem - - my_array_to_fill += terms - - #collect/gather results (SUM local arrays together) - resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) - - _smt.cleanup_shared_ndarray(probs_array_shm) - - def _bulk_fill_timedep_dobjfn(self, raw_objective, array_to_fill, layout, ds_circuits, - num_total_outcomes, dataset, ds_cache=None): - - assert(self._mode == "distribute_by_timestamp"), \ - ("Must set `distribute_by_timestamp=True` to use a " - "time-dependent objective function with MatrixForwardSimulator!") - - resource_alloc = layout.resource_alloc() - param_resource_alloc = layout.resource_alloc('param-processing') - param_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin - - #Split timestamps up between processors - maybe do this in a time-dep layout? 
- all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} - my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ - _mpit.distribute_indices(list(range(len(all_timestamps))), param_resource_alloc) - shared_mem_leader = timestamp_processing_ralloc.is_host_leader - - probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements,), 'd') - dprobs_array, dprobs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements, self.model.num_params), 'd') - # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then - # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since - #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. - - array_to_fill[:] = 0.0 - my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results - all_param_slice = slice(0, self.model.num_params) # All params computed at once for now - assert(my_array_to_fill.shape == (layout.num_elements, self.model.num_params)) - - for timestamp_index in my_timestamp_inds: - timestamp = all_timestamps[timestamp_index] - # compute objective at time layout_atom.time - #print("DB: Rank %d : layout atom for t=" % resource_alloc.comm.rank, layout_atom.timestamp) - - counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ - self._ds_quantities(timestamp, ds_cache, layout, dataset) - - for _, obj in self.model._iter_parameterized_objs(): - obj.set_time(timestamp) - for opcache in self.model._opcaches.values(): - for obj in opcache.values(): - obj.set_time(timestamp) - - for atom in layout.atoms: # layout only holds local atoms - self._bulk_fill_probs_atom(probs_array, atom, timestamp_processing_ralloc) - self._bulk_fill_dprobs_atom(dprobs_array, all_param_slice, atom, - all_param_slice, timestamp_processing_ralloc) - - 
timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready - # (similar to DistributableForwardSimulator._bulk_fill_probs) - - if shared_mem_leader: - if firsts is not None: # consolidate with TimeIndependentMDCObjectiveFunction.dterms? - dprobs_omitted_rowsum = _np.empty((len(firsts), self.model.num_params), 'd') - for ii, i in enumerate(indicesOfCircuitsWithOmittedData): - dprobs_omitted_rowsum[ii, :] = _np.sum(dprobs_array[layout.indices_for_index(i), :], axis=0) - - dprobs_array *= raw_objective.dterms(probs_array, counts, totals, freqs)[:, None] - - if firsts is not None: # consolidate with _update_dterms_for_omitted_probs? - omitted_probs = 1.0 - _np.array([_np.sum(probs_array[layout.indices_for_index(i)]) - for i in indicesOfCircuitsWithOmittedData]) - dprobs_array[firsts] -= raw_objective.zero_freq_dterms(totals[firsts], omitted_probs)[:, None] \ - * dprobs_omitted_rowsum - timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem - - my_array_to_fill += dprobs_array - - #collect/gather results (SUM local arrays together) - resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) - - _smt.cleanup_shared_ndarray(probs_array_shm) - _smt.cleanup_shared_ndarray(dprobs_array_shm) - - def bulk_fill_timedep_chi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip_for_weighting, prob_clip_interval, ds_cache=None): - """ - Compute the chi2 contributions for an entire tree of circuits, allowing for time dependent operations. - - Computation is performed by summing together the contributions for each time the circuit is - run, as given by the timestamps in `dataset`. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated 1D numpy array of length equal to the - total number of computed elements (i.e. 
layout.num_elements) - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the chi2 contributions. - - min_prob_clip_for_weighting : float, optional - Sets the minimum and maximum probability p allowed in the chi^2 - weights: N/(p*(1-p)) by clipping probability p values to lie within - the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function - raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, - layout.resource_alloc()) - return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_dchi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip_for_weighting, prob_clip_interval, chi2_array_to_fill=None, - ds_cache=None): - """ - Compute the chi2 jacobian contributions for an entire tree of circuits, allowing for time dependent operations. 
- - Similar to :meth:`bulk_fill_timedep_chi2` but compute the *jacobian* - of the summed chi2 contributions for each circuit with respect to the - model's parameters. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated ExM numpy array where E is the total number of - computed elements (i.e. layout.num_elements) and M is the - number of model parameters. - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the chi2 contributions. - - min_prob_clip_for_weighting : float, optional - Sets the minimum and maximum probability p allowed in the chi^2 - weights: N/(p*(1-p)) by clipping probability p values to lie within - the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - chi2_array_to_fill : numpy array, optional - when not None, an already-allocated length-E numpy array that is filled - with the per-circuit chi2 contributions, just like in - bulk_fill_timedep_chi2(...). 
- - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function - raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, - layout.resource_alloc()) - return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_loglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip, radius, prob_clip_interval, ds_cache=None): - """ - Compute the log-likelihood contributions (within the "poisson picture") for an entire tree of circuits. - - Computation is performed by summing together the contributions for each time the circuit is run, - as given by the timestamps in `dataset`. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated 1D numpy array of length equal to the - total number of computed elements (i.e. layout.num_elements) - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the logl contributions. - - min_prob_clip : float, optional - The minimum probability treated normally in the evaluation of the - log-likelihood. 
A penalty function replaces the true log-likelihood - for probabilities that lie below this threshold so that the - log-likelihood never becomes undefined (which improves optimizer - performance). - - radius : float, optional - Specifies the severity of rounding used to "patch" the - zero-frequency terms of the log-likelihood. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction - raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, - layout.resource_alloc()) - return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip, radius, prob_clip_interval, logl_array_to_fill=None, ds_cache=None): - """ - Compute the ("poisson picture")log-likelihood jacobian contributions for an entire tree of circuits. - - Similar to :meth:`bulk_fill_timedep_loglpp` but compute the *jacobian* - of the summed logl (in posison picture) contributions for each circuit - with respect to the model's parameters. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated ExM numpy array where E is the total number of - computed elements (i.e. layout.num_elements) and M is the - number of model parameters. - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). 
This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the logl contributions. - - min_prob_clip : float - a regularization parameter for the log-likelihood objective function. - - radius : float - a regularization parameter for the log-likelihood objective function. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - logl_array_to_fill : numpy array, optional - when not None, an already-allocated length-E numpy array that is filled - with the per-circuit logl contributions, just like in - bulk_fill_timedep_loglpp(...). - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction - raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, - layout.resource_alloc()) - return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - class LCSEvalTreeMatrixForwardSimulator(MatrixForwardSimulator): def bulk_product(self, circuits, scale=False, resource_alloc=None): From 6003f75fda76c0ff0968c9af1f4b128a1fd75314 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 16:47:02 -0700 Subject: [PATCH 041/141] Regions in evaltree. 
--- pygsti/layouts/evaltree.py | 54 +++++++++++++++----------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 44c82fec9..8886078e3 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -453,7 +453,7 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists - +#region Longest Common Subsequence def _best_matching_only(A: Sequence, B: Sequence) -> int: """ @@ -472,7 +472,7 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: -def _lcs_dp_version(A, B): +def _lcs_dp_version(A: Sequence, B: Sequence): """ Compute the longest common substring between A and B using dynamic programming. @@ -500,7 +500,7 @@ def setup_lcs_dynamic_programming_table(A, B): """ return _np.zeros((len(A) + 1, len(B) + 1)) -def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): +def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): if table_data_and_sequences: table, sequences = table_data_and_sequences else: @@ -575,7 +575,7 @@ def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_ta return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round -def locate_sequences_in_AB(A, B, dp_table) -> tuple[int, int, int]: +def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: """ Finds the indices of the starting points of the sequences in A and B. 
@@ -624,7 +624,7 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): if len(cir1) >= curr_best: table = _lcs_dp_version(cir0, cir1) best_lengths[i,j] = table[0,0] - best_subsequences[(i,j)] = locate_sequences_in_AB(cir0, cir1, table) + best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) curr_best = max(best_lengths[i,j], curr_best) else: best_lengths[i,j] = -1 @@ -636,10 +636,12 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): return best_lengths, best_subsequences -def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, list[int]]]: +def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: """ Compute the longest common subsequence within a single circuit A. + Cost ~ O(L^3 / 8) where L is the length of A + Returns: --------- int - length of longest common subsequences within A @@ -670,6 +672,8 @@ def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, def build_internal_tables(circuit_list): """ Compute all the longest common internal sequences for each circuit A in circuit_list + + Total cost is O(C L^3). """ C = len(circuit_list) @@ -683,6 +687,10 @@ def build_internal_tables(circuit_list): curr_best = max(curr_best, the_table[i]) return the_table, seq_table + +#endregion Longest Common Subsequence + +#region Split circuit list into lists of subcircuits def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: """ Add in explicit idles to the labels for each layer. @@ -793,31 +801,6 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates - -def _split_circuits_by_lanes(circuit_list): - # First eliminate the duplicate circuits. 
- - unique_circuits = [] - matching_inds: dict[int, set[int]] = {} - C = len(circuit_list) - seen_circs: dict[tuple[LabelTupTup, int]] = {} - cache = {i: circuit_list[i] for i in range(len(circuit_list))} - for i in range(C): - my_cir = circuit_list[i] - if tuple(my_cir) in seen_circs: - cache[i] = seen_circs[tuple(my_cir)] - else: - seen_circs[tuple(my_cir)] = i - - labels_to_circuits = {} - for my_cir in seen_circs: - line_labels = _Circuit(my_cir)._line_labels - if line_labels in labels_to_circuits: - labels_to_circuits[line_labels].append(my_cir) - else: - labels_to_circuits[line_labels] = [my_cir] - - def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], @@ -869,6 +852,11 @@ def setup_circuit_list_for_LCS_computations( # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list +#endregion Split Circuits by lanes helpers + + +#region Lane Collapsing Helpers + def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: """ Look up the dense representation of a gate in the model. @@ -910,7 +898,7 @@ def combine_two_gates(cumulative_term, next_dense_matrix): which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. 
""" return next_dense_matrix @ cumulative_term - +#endregion Lane Collapsing Helpers class EvalTreeBasedUponLongestCommonSubstring(): @@ -943,7 +931,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) i = 0 while max_rounds > 1: - new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = _conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) i += 1 external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) From 1ca3b2db5dfc78f530ae1f2578d0b172edb6c502 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:52:23 -0700 Subject: [PATCH 042/141] inline a function --- pygsti/layouts/evaltree.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 8886078e3..812ea56e5 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -477,12 +477,9 @@ def _lcs_dp_version(A: Sequence, B: Sequence): Compute the longest common substring between A and B using dynamic programming. - This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. - """ - - table = setup_lcs_dynamic_programming_table(A, B) + table = _np.zeros((len(A) + 1, len(B) + 1)) n, m = table.shape for i in range(n-2, -1, -1): for j in range(m-2, -1, -1): @@ -494,11 +491,6 @@ def _lcs_dp_version(A: Sequence, B: Sequence): table[i,j] = max(opt1, opt2, opt3) return table -def setup_lcs_dynamic_programming_table(A, B): - """ - Create the table used for LCS dynamic programming. 
- """ - return _np.zeros((len(A) + 1, len(B) + 1)) def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): if table_data_and_sequences: From 6f43b2adf2aadd47d45b1fcdc9478ea696b5cd58 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 17:08:08 -0700 Subject: [PATCH 043/141] whitespace --- pygsti/layouts/evaltree.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 812ea56e5..3cbb3b144 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -453,6 +453,7 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists + #region Longest Common Subsequence def _best_matching_only(A: Sequence, B: Sequence) -> int: @@ -471,7 +472,6 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: return len(A[:i]) - def _lcs_dp_version(A: Sequence, B: Sequence): """ Compute the longest common substring between A and B using @@ -567,6 +567,7 @@ def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round + def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: """ Finds the indices of the starting points of the sequences in A and B. 
@@ -600,6 +601,7 @@ def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: return i-1, j-1, dp_table[i,j] return None, None, None + def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): """ Computes the LCS for every pair of circuits A,B in circuit_list @@ -661,6 +663,7 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, return best, best_ind return best, best_ind + def build_internal_tables(circuit_list): """ Compute all the longest common internal sequences for each circuit A in circuit_list @@ -679,10 +682,11 @@ def build_internal_tables(circuit_list): curr_best = max(curr_best, the_table[i]) return the_table, seq_table - #endregion Longest Common Subsequence + #region Split circuit list into lists of subcircuits + def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: """ Add in explicit idles to the labels for each layer. @@ -756,7 +760,6 @@ def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, i return compute_qubits_to_lanes(lanes), lanes - def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: """ Split a circuit into multiple subcircuits which do not talk across lanes. 
@@ -793,6 +796,7 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates + def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], @@ -864,6 +868,7 @@ def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: else: raise ValueError("Missing attribute") + def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 if op.num_qubits == 2: @@ -881,6 +886,7 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s op_term = model_and_gate_to_dense_rep(model, op) return op_term + def combine_two_gates(cumulative_term, next_dense_matrix): """ Note that the visual representation was @@ -890,8 +896,10 @@ def combine_two_gates(cumulative_term, next_dense_matrix): which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. """ return next_dense_matrix @ cumulative_term + #endregion Lane Collapsing Helpers + class EvalTreeBasedUponLongestCommonSubstring(): def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): @@ -984,7 +992,6 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) # Assumes a perfect swap gate! # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ Construct a tree from another tree. @@ -1027,9 +1034,6 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q updated[new_cir] = loc self.circuit_to_save_location = updated - - - def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int): """ Compute the total product cache. 
Note that this may still have a tensor product @@ -1119,7 +1123,6 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: return list(output) - """ def _evaluate_product_rule(self, cind: int, rn: int): @@ -1178,7 +1181,7 @@ def _evaluate_product_rule(self, cind: int, rn: int): else: cumulative_term = val @ cumulative_term """ - + class CollectionOfLCSEvalTrees(): @@ -1291,7 +1294,6 @@ def compute_tensor_orders(self): return - def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): From 613379a161a01559a98a6756c002f69e952e8be2 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 17:13:45 -0700 Subject: [PATCH 044/141] Get the dense operator in the minimal space required for a gate operation. --- pygsti/models/explicitmodel.py | 28 +++++++++++++++++++++++++++- pygsti/models/layerrules.py | 19 ++++++++++++++++++- pygsti/models/localnoisemodel.py | 23 +++++++++++++++++++++-- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index 01eed989b..b9f1bad67 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -34,7 +34,7 @@ from pygsti.modelmembers.operations import opfactory as _opfactory from pygsti.baseobjs.basis import Basis as _Basis from pygsti.baseobjs.basis import BuiltinBasis as _BuiltinBasis, DirectSumBasis as _DirectSumBasis -from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel +from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTup as _LabelTup from pygsti.baseobjs import statespace as _statespace from pygsti.tools import basistools as _bt from pygsti.tools import jamiolkowski as _jt @@ -45,6 +45,8 @@ from pygsti.tools import listtools as _lt from pygsti.tools.legacytools import deprecate as _deprecated_fn +from pygsti.modelmembers.operations import EmbeddedOp as _EmbeddedOp, ComposedOp as _ComposedOp + class 
ExplicitOpModel(_mdl.OpModel): """ @@ -1739,3 +1741,27 @@ def operation_layer_operator(self, model, layerlbl, caches): return model.operations[layerlbl] else: return _opfactory.op_from_factories(model.factories, layerlbl) + + def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. + + Returns + ---------- + _np.ndarray + """ + + operation = model.operations["gates"][lbl] + + if isinstance(operation, _EmbeddedOp): + return operation.embedded_op.to_dense() + elif isinstance(operation, _ComposedOp): + breakpoint() + return operation.to_dense('minimal') + diff --git a/pygsti/models/layerrules.py b/pygsti/models/layerrules.py index d1fcd9357..5060d3087 100644 --- a/pygsti/models/layerrules.py +++ b/pygsti/models/layerrules.py @@ -13,7 +13,8 @@ from pygsti.modelmembers import operations as _op from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable - +from pygsti.baseobjs.label import LabelTup as _LabelTup +from pygsti.models.model import OpModel as _OpModel class LayerRules(_NicelySerializable): """ @@ -110,3 +111,19 @@ def operation_layer_operator(self, model, layerlbl, cache): """ #raise KeyError(f"Cannot create operator for non-primitive layer: {layerlbl}") raise KeyError("Cannot create operator for non-primitive layer: %s" % str(layerlbl)) + + def get_dense_process_matrix_represention_for_gate(self, model: _OpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. 
+ + Returns + ---------- + _np.ndarray + """ + raise KeyError("Cannot find a dense operator for layer: %s" % str(lbl)) diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index cc46bb770..d9e21ceee 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -37,6 +37,7 @@ from pygsti.tools import optools as _ot from pygsti.tools import listtools as _lt from pygsti.processors.processorspec import ProcessorSpec as _ProcessorSpec, QubitProcessorSpec as _QubitProcessorSpec +from pygsti.baseobjs.label import LabelTup as _LabelTup class LocalNoiseModel(_ImplicitOpModel): @@ -171,7 +172,7 @@ def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None, idle_names = processor_spec.idle_gate_names global_idle_layer_label = processor_spec.global_idle_layer_label - layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label) + layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label, independent_gates=independent_gates) super(LocalNoiseModel, self).__init__(state_space, layer_rules, 'pp', simulator=simulator, evotype=evotype) @@ -406,7 +407,7 @@ def rescale(coeffs): class _SimpleCompLayerRules(_LayerRules): - def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label): + def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label, independent_gates): super().__init__() self.implicit_idle_mode = implicit_idle_mode # how to handle implied idles ("blanks") in circuits self.qubit_labels = qubit_labels @@ -414,6 +415,7 @@ def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, self._add_global_idle_to_all_layers = False self._add_padded_idle = False self.use_op_caching = True # expert functionality - can be turned off if needed + self._spacial_homogeneity_assumed = independent_gates if implicit_idle_mode not in 
('none', 'add_global', 'only_global', 'pad_1Q'): raise ValueError("Invalid `implicit_idle_mode`: '%s'" % str(implicit_idle_mode)) @@ -613,6 +615,23 @@ def _layer_component_operation(self, model, complbl, cache): ret = _opfactory.op_from_factories(model.factories['layers'], complbl) return ret + def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. + + Returns + ---------- + _np.ndarray + """ + + key = lbl.name if self._spacial_homogeneity_assumed else lbl + return model.operation_blks["gates"][key].to_dense() From 5de4bff9f463116afccd77a58c667851dad8915d Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 17:24:12 -0700 Subject: [PATCH 045/141] Extract LCS work. --- pygsti/layouts/evaltree.py | 237 +-------------------------------- pygsti/tools/sequencetools.py | 241 ++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+), 233 deletions(-) create mode 100644 pygsti/tools/sequencetools.py diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 3cbb3b144..f493c7cc3 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -23,7 +23,7 @@ from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools -from typing import Sequence +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, build_internal_tables import time @@ -454,235 +454,6 @@ def _get_start_indices(max_intersect): return disjointLists, helpfulScratchLists -#region Longest Common Subsequence - -def _best_matching_only(A: Sequence, B: Sequence) -> int: - """ - Returns: - ----- - 
int - the length of the longest matching prefix between A and B. - """ - i = 0 - n = len(A) - m = len(B) - while i < n and i < m: - if A[i] != B[i]: - return len(A[:i]) - i += 1 - return len(A[:i]) - - -def _lcs_dp_version(A: Sequence, B: Sequence): - """ - Compute the longest common substring between A and B using - dynamic programming. - - This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. - """ - table = _np.zeros((len(A) + 1, len(B) + 1)) - n, m = table.shape - for i in range(n-2, -1, -1): - for j in range(m-2, -1, -1): - opt1 = 0 - if A[i] == B[j]: - opt1 = _best_matching_only(A[i:], B[j:]) - opt2 = table[i, j+1] - opt3 = table[i+1, j] - table[i,j] = max(opt1, opt2, opt3) - return table - - -def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): - if table_data_and_sequences: - table, sequences = table_data_and_sequences - else: - table, sequences = _compute_lcs_for_every_pair_of_circuits(circuits) - - if internal_tables_and_sequences: - internal_subtable, internal_subsequences = internal_tables_and_sequences - else: - internal_subtable, internal_subsequences = build_internal_tables(circuits) - - best_index = _np.where(table == _np.max(table)) - best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) - updated_circuits = circuits - cache_num = starting_cache_num - - # Build sequence dict - all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} - - if _np.max(internal_subtable) >= _np.max(table): - # We are only going to replace if this was the longest substring. 
- for cir_ind in best_internal_index[0]: - for seq in internal_subsequences[cir_ind]: - key = tuple(seq) - if key in all_subsequences_to_replace: - all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] - else: - all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} - - if _np.max(table) >= _np.max(internal_subtable): - for ii in range(len(best_index[0])): - starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] - cir_index = best_index[0][ii] - cir_index2 = best_index[1][ii] - seq = updated_circuits[cir_index][starting_point: int(starting_point + length+1)] - - key = tuple(seq) - if key in all_subsequences_to_replace: - if cir_index not in all_subsequences_to_replace[key]: - # We did not already handle this with internal subsequences. - all_subsequences_to_replace[key][cir_index] = [starting_point] - if cir_index2 not in all_subsequences_to_replace[key]: - all_subsequences_to_replace[key][cir_index2] = [starting_point_2] - - else: - all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} - - - # Handle the updates. - old_cache_num = cache_num - for seq, cdict in all_subsequences_to_replace.items(): - w = len(seq) - if w > 1 or (not isinstance(seq[0], int)): - # We have reached an item which we can just compute. 
- for cir_ind in cdict: - my_cir = updated_circuits[cir_ind] - sp = 0 - while sp+w <= len(my_cir): - if list(my_cir[sp: sp+w]) == list(seq): - my_cir[sp: sp + w] = [cache_num] - - sp += 1 - updated_circuits[cir_ind] = my_cir - - cache_struct[cir_ind] = updated_circuits[cir_ind] - - updated_circuits.append(list(seq)) - cache_struct[cache_num] = updated_circuits[cache_num] - - cache_num += 1 - - sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num - - return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round - - -def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: - """ - Finds the indices of the starting points of the sequences in A and B. - - Returns: - --------- - int - starting index in A of LCS(A,B) - int - starting index in B of LCS(A,B) - int - length of LCS(A,B) - """ - n, m = dp_table.shape - i = 0 - j = 0 - while i < n-1 and j < m -1: - curr = dp_table[i,j] - opt1 = dp_table[i+1, j+1] - opt2 = dp_table[i+1, j] - opt3 = dp_table[i, j+1] - options = [opt1, opt2, opt3] - if _np.all(curr == options): - i += 1 - j += 1 - elif opt2 > opt1 and opt2 > opt3: - i += 1 - elif opt3 > opt2 and opt3 > opt1: - j += 1 - else: - # All three options are equal. So we should march the diagonal. - i += 1 - j += 1 - return i-1, j-1, dp_table[i,j] - return None, None, None - - -def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): - """ - Computes the LCS for every pair of circuits A,B in circuit_list - """ - best_subsequences = {} - best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) - curr_best = 0 - for i in range(len(circuit_list)-1, -1, -1): # Lets do this in reverse order - cir0 = circuit_list[i] - if len(cir0) >= curr_best: - # Could be the best. 
- for j in range(i-1, -1, -1): - cir1 = circuit_list[j] - if len(cir1) >= curr_best: - table = _lcs_dp_version(cir0, cir1) - best_lengths[i,j] = table[0,0] - best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) - curr_best = max(best_lengths[i,j], curr_best) - else: - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) - else: - # Skipped because cannot be the best yet. - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) - return best_lengths, best_subsequences - - -def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: - """ - Compute the longest common subsequence within a single circuit A. - - Cost ~ O(L^3 / 8) where L is the length of A - - Returns: - --------- - int - length of longest common subsequences within A - dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. - """ - n = len(A) - best = 0 - best_ind = {} - changed = False - for w in range(1, int(_np.floor(n / 2) + 1)): - for sp in range(n - w): - window = A[sp: sp + w] - for match in range(sp+ w, n-w + 1): - if A[match: match + w] == window: - if best == w: - if tuple(window) in best_ind: - best_ind[tuple(window)].add(match) - else: - best_ind[tuple(window)] = {sp, match} - else: - best_ind = {tuple(window): {sp, match}} - changed = True - best = w - if not changed: - return best, best_ind - return best, best_ind - - -def build_internal_tables(circuit_list): - """ - Compute all the longest common internal sequences for each circuit A in circuit_list - - Total cost is O(C L^3). 
- """ - - C = len(circuit_list) - the_table = _np.zeros(C) - seq_table = [[] for _ in range(C)] - - curr_best = 1 - for i in range(C): - if len(circuit_list[i]) >= curr_best: - the_table[i], seq_table[i] = _longest_common_internal_subsequence(circuit_list[i]) - curr_best = max(curr_best, the_table[i]) - return the_table, seq_table - -#endregion Longest Common Subsequence #region Split circuit list into lists of subcircuits @@ -909,7 +680,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} - external_matches = _compute_lcs_for_every_pair_of_circuits(circuit_list) + external_matches = _compute_lcs_for_every_pair_of_sequences(circuit_list) best_external_match = _np.max(external_matches[0]) self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} @@ -931,9 +702,9 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) i = 0 while max_rounds > 1: - new_circuit_list, cache_pos, cache, sequence_intro[i+1] = _conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache) i += 1 - external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list) if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: # We are not going to get a better internal match. 
diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py new file mode 100644 index 000000000..69c2f505d --- /dev/null +++ b/pygsti/tools/sequencetools.py @@ -0,0 +1,241 @@ +from typing import Sequence +import numpy as _np + + +#region Longest Common Subsequence + +def _best_matching_only(A: Sequence, B: Sequence) -> int: + """ + Returns: + ----- + int - the length of the longest matching prefix between A and B. + """ + i = 0 + n = len(A) + m = len(B) + while i < n and i < m: + if A[i] != B[i]: + return len(A[:i]) + i += 1 + return len(A[:i]) + + +def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: + """ + Compute the longest common substring between A and B using + dynamic programming. + + This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. + """ + table = _np.zeros((len(A) + 1, len(B) + 1)) + n, m = table.shape + for i in range(n-2, -1, -1): + for j in range(m-2, -1, -1): + opt1 = 0 + if A[i] == B[j]: + opt1 = _best_matching_only(A[i:], B[j:]) + opt2 = table[i, j+1] + opt3 = table[i+1, j] + table[i,j] = max(opt1, opt2, opt3) + return table + + +def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_data_and_sequences, + internal_tables_and_sequences, + starting_cache_num, + cache_struct): + """ + Simplify the set of sequences by contracting the set of longest common subsequences. + + Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. 
+ """ + if table_data_and_sequences: + table, sequences = table_data_and_sequences + else: + table, sequences = _compute_lcs_for_every_pair_of_sequences(sequences) + + if internal_tables_and_sequences: + internal_subtable, internal_subsequences = internal_tables_and_sequences + else: + internal_subtable, internal_subsequences = build_internal_tables(sequences) + + best_index = _np.where(table == _np.max(table)) + best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) + updated_sequences = sequences + cache_num = starting_cache_num + + # Build sequence dict + all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} + + if _np.max(internal_subtable) >= _np.max(table): + # We are only going to replace if this was the longest substring. + for cir_ind in best_internal_index[0]: + for seq in internal_subsequences[cir_ind]: + key = tuple(seq) + if key in all_subsequences_to_replace: + all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] + else: + all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} + + if _np.max(table) >= _np.max(internal_subtable): + for ii in range(len(best_index[0])): + starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] + cir_index = best_index[0][ii] + cir_index2 = best_index[1][ii] + seq = updated_sequences[cir_index][starting_point: int(starting_point + length+1)] + + key = tuple(seq) + if key in all_subsequences_to_replace: + if cir_index not in all_subsequences_to_replace[key]: + # We did not already handle this with internal subsequences. + all_subsequences_to_replace[key][cir_index] = [starting_point] + if cir_index2 not in all_subsequences_to_replace[key]: + all_subsequences_to_replace[key][cir_index2] = [starting_point_2] + + else: + all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} + + + # Handle the updates. 
+ old_cache_num = cache_num + for seq, cdict in all_subsequences_to_replace.items(): + w = len(seq) + if w > 1 or (not isinstance(seq[0], int)): + # We have reached an item which we can just compute. + for cir_ind in cdict: + my_cir = updated_sequences[cir_ind] + sp = 0 + while sp+w <= len(my_cir): + if list(my_cir[sp: sp+w]) == list(seq): + my_cir[sp: sp + w] = [cache_num] + + sp += 1 + updated_sequences[cir_ind] = my_cir + + cache_struct[cir_ind] = updated_sequences[cir_ind] + + updated_sequences.append(list(seq)) + cache_struct[cache_num] = updated_sequences[cache_num] + + cache_num += 1 + + sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + + return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round + + +def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, int, int]: + """ + Finds the starting positions for the longest common subsequence. + + Returns: + --------- + int - starting index in A of LCS(A,B) + int - starting index in B of LCS(A,B) + int - length of LCS(A,B) + """ + n, m = dp_table.shape + i = 0 + j = 0 + while i < n-1 and j < m -1: + curr = dp_table[i,j] + opt1 = dp_table[i+1, j+1] + opt2 = dp_table[i+1, j] + opt3 = dp_table[i, j+1] + options = [opt1, opt2, opt3] + if _np.all(curr == options): + i += 1 + j += 1 + elif opt2 > opt1 and opt2 > opt3: + i += 1 + elif opt3 > opt2 and opt3 > opt1: + j += 1 + else: + # All three options are equal. So we should march the diagonal. + i += 1 + j += 1 + return i-1, j-1, dp_table[i,j] + return None, None, None + + +def _compute_lcs_for_every_pair_of_sequences(sequences: list): + """ + Computes the LCS for every pair of sequences A,B in sequences + """ + best_subsequences = {} + best_lengths = _np.zeros((len(sequences), len(sequences))) + curr_best = 0 + for i in range(len(sequences)-1, -1, -1): # Lets do this in reverse order + cir0 = sequences[i] + if len(cir0) >= curr_best: + # Could be the best. 
+ for j in range(i-1, -1, -1): + cir1 = sequences[j] + if len(cir1) >= curr_best: + table = _lcs_dp_version(cir0, cir1) + best_lengths[i,j] = table[0,0] + best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) + curr_best = max(best_lengths[i,j], curr_best) + else: + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + else: + # Skipped because cannot be the best yet. + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + return best_lengths, best_subsequences + + +def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: + """ + Compute the longest common subsequence within a single circuit A. + + Cost ~ O(L^3 / 8) where L is the length of A + + Returns: + --------- + int - length of longest common subsequences within A + dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. + """ + n = len(A) + best = 0 + best_ind = {} + changed = False + for w in range(1, int(_np.floor(n / 2) + 1)): + for sp in range(n - w): + window = A[sp: sp + w] + for match in range(sp+ w, n-w + 1): + if A[match: match + w] == window: + if best == w: + if tuple(window) in best_ind: + best_ind[tuple(window)].add(match) + else: + best_ind[tuple(window)] = {sp, match} + else: + best_ind = {tuple(window): {sp, match}} + changed = True + best = w + if not changed: + return best, best_ind + return best, best_ind + + +def build_internal_tables(sequences): + """ + Compute all the longest common internal sequences for each circuit A in sequences + + Total cost is O(C L^3). 
+ """ + + C = len(sequences) + the_table = _np.zeros(C) + seq_table = [[] for _ in range(C)] + + curr_best = 1 + for i in range(C): + if len(sequences[i]) >= curr_best: + the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i]) + curr_best = max(curr_best, the_table[i]) + return the_table, seq_table + +#endregion Longest Common Subsequence From abb94ad7cfdaa8648fbb328a7783a81ea3c42106 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 11:37:43 -0700 Subject: [PATCH 046/141] Add in test cases for sequencetools --- pygsti/tools/sequencetools.py | 27 +++++---- test/unit/tools/test_sequencetools.py | 81 +++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 test/unit/tools/test_sequencetools.py diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 69c2f505d..fe7e10b05 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -50,18 +50,18 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. 
""" if table_data_and_sequences: - table, sequences = table_data_and_sequences + table, external_sequences = table_data_and_sequences else: - table, sequences = _compute_lcs_for_every_pair_of_sequences(sequences) + table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences) if internal_tables_and_sequences: internal_subtable, internal_subsequences = internal_tables_and_sequences else: - internal_subtable, internal_subsequences = build_internal_tables(sequences) + internal_subtable, internal_subsequences = create_tables_for_internal_LCS(sequences) best_index = _np.where(table == _np.max(table)) best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) - updated_sequences = sequences + updated_sequences = [seq for seq in sequences] cache_num = starting_cache_num # Build sequence dict @@ -79,10 +79,10 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat if _np.max(table) >= _np.max(internal_subtable): for ii in range(len(best_index[0])): - starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] cir_index = best_index[0][ii] cir_index2 = best_index[1][ii] - seq = updated_sequences[cir_index][starting_point: int(starting_point + length+1)] + starting_point, starting_point_2, length = external_sequences[(cir_index, cir_index2)] + seq = updated_sequences[cir_index][starting_point: int(starting_point + length)] key = tuple(seq) if key in all_subsequences_to_replace: @@ -139,9 +139,9 @@ def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, j = 0 while i < n-1 and j < m -1: curr = dp_table[i,j] - opt1 = dp_table[i+1, j+1] - opt2 = dp_table[i+1, j] - opt3 = dp_table[i, j+1] + opt1 = dp_table[i+1, j+1] # Use + opt2 = dp_table[i+1, j] # Eliminate A prefix + opt3 = dp_table[i, j+1] # Eliminate B prefix options = [opt1, opt2, opt3] if _np.all(curr == options): i += 1 @@ -152,9 +152,7 @@ def _find_starting_positions_using_dp_table(dp_table: 
_np.ndarray) -> tuple[int, j += 1 else: # All three options are equal. So we should march the diagonal. - i += 1 - j += 1 - return i-1, j-1, dp_table[i,j] + return i, j, dp_table[0,0] return None, None, None @@ -220,7 +218,8 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, return best, best_ind -def build_internal_tables(sequences): +def create_tables_for_internal_LCS(sequences: list[Sequence]) -> tuple[_np.ndarray, + list[dict[tuple, list[int]]]]: """ Compute all the longest common internal sequences for each circuit A in sequences @@ -233,7 +232,7 @@ def build_internal_tables(sequences): curr_best = 1 for i in range(C): - if len(sequences[i]) >= curr_best: + if len(sequences[i]) >= 2*curr_best: the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i]) curr_best = max(curr_best, the_table[i]) return the_table, seq_table diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py new file mode 100644 index 000000000..a11e5a1c0 --- /dev/null +++ b/test/unit/tools/test_sequencetools.py @@ -0,0 +1,81 @@ +import numpy as np +from pygsti.tools.sequencetools import _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification + +def test_external_matches(): + + my_strings = ["ABAARCR12LIO", "QWERTYASDFGH", "QWEELLKJAT"] + + tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings) + + assert np.max(tables) == 3 + + assert len(np.where(np.max(tables) == tables)[0]) == 1 # There is only one sequence present in this case. 
+ + + if (1,2) in sequences: + assert sequences[(1,2)] == (0, 0, 3) + else: + assert (2,1) in sequences + assert sequences[(2,1)] == (0, 0, 3) + + +def test_internal_matches(): + + my_strings = ["RACECAR", "AAAAQAAAA", "QWERTYQWEQWEQWE"] + + tables, sequences = create_tables_for_internal_LCS(my_strings) + + assert np.max(tables) == 4 + + + assert sequences[1][tuple("AAAA")] == {0, 5} + + + my_strings = [my_strings[0]] + [my_strings[2]] + + tables, sequences = create_tables_for_internal_LCS(my_strings) + + assert np.max(tables) == 3 + assert sequences[1][tuple("QWE")] == {0, 6, 9, 12} + + +def test_one_round_update_collecting_tables_first(): + + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + internal = create_tables_for_internal_LCS(example) + external = _compute_lcs_for_every_pair_of_sequences(example) + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, external, internal, len(example), cache) + + assert len(updated) == 4 + assert "".join(updated[3]) == "AAAA" + + assert cache[1] == [3,"Q",3] + assert np.allclose(seq_intro, np.array(3)) + + assert num == len(updated) + + +def test_one_round_update_without_collecting_tables_first(): + + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + + assert len(updated) == 4 + assert "".join(updated[3]) == "AAAA" + + assert cache[1] == [3,"Q",3] + assert np.allclose(seq_intro, np.array(3)) + + assert num == len(updated) \ No newline at end of file From 
81e805afe5e2d742e8a52309e87d9b9e3a0ec130 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 11 Jul 2025 11:40:22 -0700 Subject: [PATCH 047/141] tiny simplification --- pygsti/tools/sequencetools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index fe7e10b05..69c5b770b 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -4,7 +4,7 @@ #region Longest Common Subsequence -def _best_matching_only(A: Sequence, B: Sequence) -> int: +def len_lcp(A: Sequence, B: Sequence) -> int: """ Returns: ----- @@ -15,9 +15,9 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: m = len(B) while i < n and i < m: if A[i] != B[i]: - return len(A[:i]) + return i i += 1 - return len(A[:i]) + return i def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: @@ -33,7 +33,7 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: for j in range(m-2, -1, -1): opt1 = 0 if A[i] == B[j]: - opt1 = _best_matching_only(A[i:], B[j:]) + opt1 = len_lcp(A[i:], B[j:]) opt2 = table[i, j+1] opt3 = table[i+1, j] table[i,j] = max(opt1, opt2, opt3) From 23c6a3a38f827835bfe15e0ac03e097e03d2a9a4 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 12:58:20 -0700 Subject: [PATCH 048/141] add more test cases --- pygsti/layouts/evaltree.py | 30 ++++++++++++--------------- pygsti/tools/sequencetools.py | 11 +++++++--- test/unit/tools/test_sequencetools.py | 24 ++++++++++++++++++++- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index f493c7cc3..3f7048015 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -23,7 +23,7 @@ from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools -from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, 
_compute_lcs_for_every_pair_of_sequences, build_internal_tables +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time @@ -476,6 +476,12 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: """ + Parameters: + ------------ + circuit: _Circuit - the circuit to compute qubit to lanes mapping for + + num_qubits: int - The total number of qubits expected in the circuit. + Returns -------- Dictionary mapping qubit number to lane number in the circuit. @@ -630,15 +636,7 @@ def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: """ - if hasattr(model, "operations"): - return model.operations[opTuple].to_dense() - elif hasattr(model, "operation_blks"): - if opTuple[0] not in model.operation_blks["gates"]: - breakpoint() - return model.operation_blks["gates"][opTuple[0]].to_dense() - else: - raise ValueError("Missing attribute") - + return def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 @@ -648,13 +646,13 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # This is in the wrong order. - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) op_term = swap_dense @ (op_term) @ swap_dense saved[op] = op_term # Save so we only need to this operation once. 
else: - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) else: - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) return op_term @@ -687,7 +685,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.qubit_start_point = qubit_starting_loc - internal_matches = build_internal_tables(circuit_list) + internal_matches = create_tables_for_internal_LCS(circuit_list) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) @@ -710,14 +708,13 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) # We are not going to get a better internal match. pass else: - internal_matches = build_internal_tables(new_circuit_list) + internal_matches = create_tables_for_internal_LCS(new_circuit_list) best_external_match = _np.max(external_matches[0]) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) - self.circuit_list = new_circuit_list self.cache = cache self.num_circuits = C self.from_other = False @@ -772,7 +769,6 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q self.num_circuits = other.num_circuits self.sequence_intro = other.sequence_intro self.swap_gate = other.swap_gate - self.circuit_list = other.circuit_list self.orig_circuit_list = other.orig_circuit_list self.circuit_to_save_location = other.circuit_to_save_location self.from_other = other diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 69c5b770b..ff0e8c00f 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -100,6 +100,7 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat old_cache_num = cache_num for seq, cdict in 
all_subsequences_to_replace.items(): w = len(seq) + update_made = 0 if w > 1 or (not isinstance(seq[0], int)): # We have reached an item which we can just compute. for cir_ind in cdict: @@ -108,16 +109,20 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat while sp+w <= len(my_cir): if list(my_cir[sp: sp+w]) == list(seq): my_cir[sp: sp + w] = [cache_num] + update_made = 1 sp += 1 updated_sequences[cir_ind] = my_cir cache_struct[cir_ind] = updated_sequences[cir_ind] - updated_sequences.append(list(seq)) - cache_struct[cache_num] = updated_sequences[cache_num] + if update_made: + # There may have been multiple overlapping subsequences in the same sequence. + # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.) + updated_sequences.append(list(seq)) + cache_struct[cache_num] = updated_sequences[cache_num] - cache_num += 1 + cache_num += 1 sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py index a11e5a1c0..d51ae5e17 100644 --- a/test/unit/tools/test_sequencetools.py +++ b/test/unit/tools/test_sequencetools.py @@ -78,4 +78,26 @@ def test_one_round_update_without_collecting_tables_first(): assert cache[1] == [3,"Q",3] assert np.allclose(seq_intro, np.array(3)) - assert num == len(updated) \ No newline at end of file + assert num == len(updated) + + +def test_update_only_adds_those_strings_which_are_actually_used(): + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + + r2, num, c2, s2 = conduct_one_round_of_lcs_simplification(updated, None, None, 
num, cache) + + assert len(r2) == num + + assert len(s2) == 1 + + assert 4 in c2[2] + + assert len(c2[4]) == 3 + From 2ea2a6dfc9d15b3dfc9de3f379a12e098e1feb12 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 12:58:56 -0700 Subject: [PATCH 049/141] Simplify --- pygsti/layouts/evaltree.py | 9 ++------- pygsti/models/layerrules.py | 3 +-- pygsti/models/localnoisemodel.py | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 3f7048015..353145466 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -630,15 +630,10 @@ def setup_circuit_list_for_LCS_computations( #region Lane Collapsing Helpers -def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: """ - Look up the dense representation of a gate in the model. + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). """ - - - return - -def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 if op.num_qubits == 2: # We may need to do swaps. 
diff --git a/pygsti/models/layerrules.py b/pygsti/models/layerrules.py index 5060d3087..d47cd21ab 100644 --- a/pygsti/models/layerrules.py +++ b/pygsti/models/layerrules.py @@ -14,7 +14,6 @@ from pygsti.modelmembers import operations as _op from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable from pygsti.baseobjs.label import LabelTup as _LabelTup -from pygsti.models.model import OpModel as _OpModel class LayerRules(_NicelySerializable): """ @@ -112,7 +111,7 @@ def operation_layer_operator(self, model, layerlbl, cache): #raise KeyError(f"Cannot create operator for non-primitive layer: {layerlbl}") raise KeyError("Cannot create operator for non-primitive layer: %s" % str(layerlbl)) - def get_dense_process_matrix_represention_for_gate(self, model: _OpModel, lbl: _LabelTup): + def get_dense_process_matrix_represention_for_gate(self, model, lbl: _LabelTup): """ Get the dense process matrix corresponding to the lbl. Note this should be the minimal size required to represent the dense operator. 
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index d9e21ceee..2a7b24fea 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -415,7 +415,7 @@ def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, self._add_global_idle_to_all_layers = False self._add_padded_idle = False self.use_op_caching = True # expert functionality - can be turned off if needed - self._spacial_homogeneity_assumed = independent_gates + self._spatial_homogeneity_assumed = not independent_gates if implicit_idle_mode not in ('none', 'add_global', 'only_global', 'pad_1Q'): raise ValueError("Invalid `implicit_idle_mode`: '%s'" % str(implicit_idle_mode)) @@ -630,7 +630,7 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel _np.ndarray """ - key = lbl.name if self._spacial_homogeneity_assumed else lbl + key = lbl.name if self._spatial_homogeneity_assumed else lbl return model.operation_blks["gates"][key].to_dense() From eb2ce641e78c9539a8964793aef7d25c43bf6f25 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Mon, 14 Jul 2025 13:59:01 -0700 Subject: [PATCH 050/141] Add padded idles into the circuit as necessary. 
--- pygsti/layouts/evaltree.py | 228 +++++++++++++------------------ pygsti/models/localnoisemodel.py | 16 ++- 2 files changed, 108 insertions(+), 136 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 353145466..844787f87 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -22,10 +22,11 @@ from pygsti.baseobjs.label import LabelTupTup, Label from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator +from pygsti.baseobjs.basis import get_num_qubits_in_basis import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time - +from typing import Iterable def _walk_subtree(treedict, indx, running_inds): @@ -586,9 +587,6 @@ def setup_circuit_list_for_LCS_computations( Then, a sequence detailing the number of qubits in each lane for a circuit. """ - # output = [] - # cir_id_to_lanes = [] - # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id. 
@@ -606,7 +604,7 @@ def setup_circuit_list_for_LCS_computations( assert len(sub_cirs) == len(lanes_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j]) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j])) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: line_labels_to_circuit_list[lbls].append(sc) @@ -621,8 +619,6 @@ def setup_circuit_list_for_LCS_computations( else: cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} - # output.extend(sub_cirs) - # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list #endregion Split Circuits by lanes helpers @@ -634,9 +630,9 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s """ Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). """ - op_term = 1 if op.num_qubits == 2: # We may need to do swaps. + op_term = 1 if op in saved: op_term = saved[op] elif op.qubits[1] < op.qubits[0]: @@ -646,9 +642,8 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s saved[op] = op_term # Save so we only need to this operation once. else: op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - else: - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - return op_term + return op_term + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) def combine_two_gates(cumulative_term, next_dense_matrix): @@ -752,9 +747,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) - # Assumes a perfect swap gate! 
- # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ Construct a tree from another tree. @@ -804,62 +797,18 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - saved: dict[int, _LinearOperator] = {} + saved: dict[int | LabelTupTup, _np.ndarray] = {} - - - def cache_lookup_and_product(cumulative_term, term_to_extend_with: int): - if cumulative_term is None: - # look up result. - return saved[term] - elif isinstance(term, int) and cumulative_term is not None: - return combine_two_gates(cumulative_term, saved[term_to_extend_with]) - - - - def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup): - - if isinstance(term_to_extend_with, int): - return cache_lookup_and_product(cumulative_term, term_to_extend_with) - - else: - val = 1 - qubits_used = [i for i in range(num_qubits_in_default)] - while qubits_used: - qu = qubits_used[0] - gate_matrix = _np.eye(4) - found = False - op_ind = self.qubit_start_point # Handle circuits with only qubits (i, i+k) where k is number of qubits in the subsystem. - while not found and op_ind < len(term): - op = term[op_ind] - if qu in op.qubits: - gate_matrix = get_dense_representation_of_gate_with_perfect_swap_gates(model, op, saved, self.swap_gate) - found = True - # We assume that the qubits need to overlap for a specific gate. - # i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). - qubits_used = qubits_used[len(op.qubits):] - op_ind += 1 - val = _np.kron(val, gate_matrix) - if not found: - # Remove that qubit from list to check. 
- qubits_used = qubits_used[1:] - - if val.shape != expected_shape: - breakpoint() - if cumulative_term is None: - return val - else: - return combine_two_gates(cumulative_term, val) - - expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cind]: - cumulative_term = collapse_cache_line(cumulative_term, term) + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) if cumulative_term is None: - saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. + saved[cind] = get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", *(qu for qu in range(num_qubits_in_default))), saved, self.swap_gate) + # This will return an identity gate of the appropriate size. + # But it may also be a Noisy idle gate. else: saved[cind] = cumulative_term if __debug__: @@ -871,6 +820,66 @@ def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup) return saved, self.circuit_to_save_location + def handle_results_cache_lookup_and_product(self, + cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: + + if cumulative_term is None: + # look up result. + return results_cache[term_to_extend_with] + return combine_two_gates(cumulative_term, results_cache[term_to_extend_with]) + + + def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + results_cache: dict[int | LabelTupTup, _np.ndarray], + num_qubits_in_default: int) -> _np.ndarray: + """ + Reduce a cache line to a single process matrix. + + This should really only be called from collapse_circuits_to_process_matrices. 
+ + """ + + + if isinstance(term_to_extend_with, int): + return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) + + else: + val = 1 + qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] + matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, + results_cache, self.swap_gate) for op in term_to_extend_with} + qubit_used = [] + for key in matrix_reps.keys(): + qubit_used.extend(key) + + assert len(qubit_used) == len(set(qubit_used)) + unused_qubits = set(qubits_available) - set(qubit_used) + + implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, + Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. + results_cache, self.swap_gate) for qu in unused_qubits} + + while qubits_available: + + qu = qubits_available[0] + if qu in unused_qubits: + val = _np.kron(val, implicit_idle_reps[(qu,)]) + qubits_available = qubits_available[1:] + else: + # It must be a part of a non-trivial gate. 
+ gatekey = [key for key in matrix_reps if qu in key][0] + val = _np.kron(val, matrix_reps[gatekey]) + + qubits_available = qubits_available[len(gatekey):] + + if cumulative_term is None: + return val + return combine_two_gates(cumulative_term, val) + + def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: output = () @@ -885,65 +894,6 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: return list(output) - """ - def _evaluate_product_rule(self, cind: int, rn: int): - - sequence = self.cache[cind] - num_terms = len(sequence) - sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] - - for sub_r in sorted(sub_rounds.keys())[::-1]: - sub_sequence = None - for sub_cind in sub_rounds[sub_r]: - - for term in sub_tree_cache[sub_cind]: - if isinstance(term, tuple): - # Then, this may be a partial derivative or an character in original sequence. - if len(term) == 2: - # Then this is taking a partial derivative. - natural_term = term[1][0] - if natural_term in self.derivative_cache: - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - else: - # This should be a natural derivative. - self.derivative_cache[natural_term] = term.deriv_wrt_params(None) - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - - # It is just an index to sequence for where to look in the cache. - next_ind = term[0] - sequence_val = sequence[next_ind] - - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): - cumulative_term = saved[term] @ cumulative_term - elif isinstance(term, LabelTupTup): - val = 1 - for op in term: - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. 
- swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. - op_term = model.operation_blks["gates"][op].to_dense() - op_term = swap_term @ op_term @ swap_term.T - saved[op] = op_term # Save so we only need to this operation once. - else: - op_term = model.operation_blks["gates"][op].to_dense() - else: - op_term = model.operation_blks["gates"][op].to_dense() - val = _np.kron(val, op_term) - #val = model.operation_blks["gates"][term[0]].to_dense() - if cumulative_term is None: - cumulative_term = val - else: - cumulative_term = val @ cumulative_term - """ - class CollectionOfLCSEvalTrees(): @@ -988,16 +938,22 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id def collapse_circuits_to_process_matrices(self, model): # Just collapse all of them. - + + self.saved_results = {} for key in self.trees: - self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. + tree = self.trees[key] + out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits) + # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + self.saved_results[key] = out1 + self.sub_cir_to_ind_in_results[key] = out2 def reconstruct_full_matrices(self): if len(self.saved_results) == 0: return - + # Now we can do the combination. 
 num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) @@ -1008,17 +964,19 @@ def reconstruct_full_matrices(self): for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] lblkey = cir._line_labels - - if len(cir.layertup) == 0: - - lane_circuits.append(_np.eye(4**(len(lblkey)))) - else: - if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: - print(lblkey) - print(cir) - breakpoint() + if lblkey == ("*",): + # We are getting a noisy idle line and so need to check the size we are expecting here. ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + print(cir.num_lines) + # lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + + # + if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: + print(lblkey) + print(cir) + breakpoint() + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) output.append(lane_circuits) # Need a map from lane id to computed location. diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index 2a7b24fea..0ca844214 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -631,7 +631,21 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel """ key = lbl.name if self._spatial_homogeneity_assumed else lbl - return model.operation_blks["gates"][key].to_dense() + if key in model.operation_blks["gates"]: + return model.operation_blks["gates"][key].to_dense() + + elif self._add_padded_idle: + # We have idle gates that we can include. + absent_sslbls = lbl[1:] + new_key = self.single_qubit_idle_layer_labels[absent_sslbls] + if self._spatial_homogeneity_assumed: + new_key = new_key.name + return model.operation_blks["gates"][new_key].to_dense() + + else: + # Assume a perfect idle q-qubit gate. 
+ return _np.eye(4**len(lbl.qubits)) + From 27167a36d07ba362725e50557aabc1b490325171 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 14 Jul 2025 14:52:23 -0700 Subject: [PATCH 051/141] bugfix for ExplicitLayerRules.get_dense_process_matrix_represention_for_gate. Enable LCSEvalTreeForwardSimulator in tests. Change EvalTreeBasedUponLongestCommonSubstring.collapse_circuits_to_process_matrices to not use the `fake` gate label. --- pygsti/layouts/evaltree.py | 6 ++---- pygsti/models/explicitmodel.py | 2 +- test/unit/objects/test_forwardsim.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 844787f87..2d20bacd8 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -22,7 +22,6 @@ from pygsti.baseobjs.label import LabelTupTup, Label from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator -from pygsti.baseobjs.basis import get_num_qubits_in_basis import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time @@ -806,9 +805,8 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) if cumulative_term is None: - saved[cind] = get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", *(qu for qu in range(num_qubits_in_default))), saved, self.swap_gate) - # This will return an identity gate of the appropriate size. - # But it may also be a Noisy idle gate. + saved[cind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. 
else: saved[cind] = cumulative_term if __debug__: diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index b9f1bad67..1331d13f3 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -1757,7 +1757,7 @@ def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, _np.ndarray """ - operation = model.operations["gates"][lbl] + operation = model.operations[lbl] if isinstance(operation, _EmbeddedOp): return operation.embedded_op.to_dense() diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 2c742f533..17509b93f 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -13,7 +13,10 @@ from pygsti.models import ExplicitOpModel from pygsti.circuits import Circuit, create_lsgst_circuit_lists from pygsti.baseobjs import Label as L -from ..util import BaseCase +try: + from ..util import BaseCase +except ImportError: + BaseCase = object from pygsti.data import simulate_data from pygsti.modelpacks import smq1Q_XYI @@ -282,7 +285,7 @@ def setUp(self): SimpleMatrixForwardSimulator(), MapForwardSimulator(), MatrixForwardSimulator(), - # LCSEvalTreeMatrixForwardSimulator() + LCSEvalTreeMatrixForwardSimulator() ] if TorchForwardSimulator.ENABLED: sims.append(TorchForwardSimulator()) @@ -364,3 +367,8 @@ def test_matrix_fwdsim(self): def test_lcs_matrix_fwdsim(self): self._run(LCSEvalTreeMatrixForwardSimulator) + +if __name__ == '__main__': + tester = ForwardSimConsistencyTester() + tester.test_consistent_probs() + print() From 7d9f9b2cee796390d7d04a1e6d29de67b7082cad Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 11:17:01 -0700 Subject: [PATCH 052/141] Error out when there is an implicit idle. 
--- pygsti/layouts/evaltree.py | 10 ++++++++-- pygsti/models/explicitmodel.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 2d20bacd8..1f1c01236 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -601,9 +601,13 @@ def setup_circuit_list_for_LCS_computations( qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) sub_cirs = _compute_subcircuits(cir, qubits_to_lane) + if not implicit_idle_gate_name: + if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): + raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.") + assert len(sub_cirs) == len(lanes_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j])) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j]),) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: line_labels_to_circuit_list[lbls].append(sc) @@ -907,7 +911,9 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id starttime = time.time() for key, vals in line_lbls_to_circuit_list.items(): - sub_cirs = [list(cir) for cir in vals] + sub_cirs = [] + for cir in vals: + sub_cirs.append(list(cir)) if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index 1331d13f3..e1464ad00 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -1757,6 +1757,9 @@ def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, _np.ndarray """ + if lbl not in model.operations: + return _np.empty(1) + operation = model.operations[lbl] if isinstance(operation, _EmbeddedOp): From 
17294bcf002e197d21884878b9db911bddcd6004 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 13:46:41 -0700 Subject: [PATCH 053/141] Make tests easier. --- pygsti/circuits/split_circuits_into_lanes.py | 109 ++++++++++++++++ pygsti/layouts/evaltree.py | 127 +++---------------- test/unit/objects/test_circuit_splitting.py | 101 +++++++++++++++ test/unit/objects/test_forwardsim.py | 22 ++-- 4 files changed, 239 insertions(+), 120 deletions(-) create mode 100644 pygsti/circuits/split_circuits_into_lanes.py create mode 100644 test/unit/objects/test_circuit_splitting.py diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py new file mode 100644 index 000000000..633565e2a --- /dev/null +++ b/pygsti/circuits/split_circuits_into_lanes.py @@ -0,0 +1,109 @@ +import numpy as _np + +from pygsti.circuits import Circuit as _Circuit +from pygsti.baseobjs.label import Label, LabelTupTup + +def compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit: _Circuit) -> tuple[dict[int, int], + dict[int, tuple[int]]]: + """ + Parameters: + ------------ + circuit: _Circuit - the circuit to compute qubit to lanes mapping for + + num_qubits: int - The total number of qubits expected in the circuit. + + Returns + -------- + Dictionary mapping qubit number to lane number in the circuit. + """ + + qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(circuit.num_lines)} + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + for op in layer: + qubits_used = op.qubits + for qb in qubits_used: + qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) + + lanes = {} + lan_num = 0 + visited: dict[int, int] = {} + def reachable_nodes(starting_point: int, + graph_qubits_to_neighbors: dict[int, set[int]], + visited: dict[int, set[int]]): + """ + Find which nodes are reachable from this starting point. 
+ """ + if starting_point in visited: + return visited[starting_point] + else: + assert starting_point in graph_qubits_to_neighbors + visited[starting_point] = graph_qubits_to_neighbors[starting_point] + output = set(visited[starting_point]) + for child in graph_qubits_to_neighbors[starting_point]: + if child != starting_point: + output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) + visited[starting_point] = output + return output + + available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) + while available_starting_points: + sp = available_starting_points[0] + nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) + for node in nodes: + available_starting_points.remove(node) + lanes[lan_num] = nodes + lan_num += 1 + + def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: + """ + Determine a mapping from qubit to the lane it is in for this specific circuit. + """ + out = {} + for key, val in lanes_to_qubits.items(): + for qb in val: + out[qb] = key + return out + + return compute_qubits_to_lanes(lanes), lanes + + +def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: + """ + Split a circuit into multiple subcircuits which do not talk across lanes. + """ + + lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] + + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + group = [] + group_lane = None + sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) + + for op in sorted_layer: + # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. + qubits_used = op.qubits # This will be a list of qubits used. + # I am assuming that the qubits are indexed numerically and not by strings. 
+ lane = qubits_to_lanes[qubits_used[0]] + + if group_lane is None: + group_lane = lane + group.append(op) + elif group_lane == lane: + group.append(op) + else: + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + group_lane = lane + group = [op] + + if len(group) > 0: + # We have a left over group. + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + + if num_layers == 0: + return [] + + return lanes_to_gates \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 1f1c01236..df4e63533 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -24,6 +24,8 @@ from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS + +from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time from typing import Iterable @@ -474,111 +476,14 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") return tmp -def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: - """ - Parameters: - ------------ - circuit: _Circuit - the circuit to compute qubit to lanes mapping for - - num_qubits: int - The total number of qubits expected in the circuit. - - Returns - -------- - Dictionary mapping qubit number to lane number in the circuit. 
- """ - - qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(num_qubits)} - num_layers = circuit.num_layers - for layer_ind in range(num_layers): - layer = circuit.layer(layer_ind) - for op in layer: - qubits_used = op.qubits - for qb in qubits_used: - qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) - - lanes = {} - lan_num = 0 - visited: dict[int, int] = {} - def reachable_nodes(starting_point: int, graph_qubits_to_neighbors: dict[int, set[int]], visited: dict[int, set[int]]): - """ - Find which nodes are reachable from this starting point. - """ - if starting_point in visited: - return visited[starting_point] - else: - assert starting_point in graph_qubits_to_neighbors - visited[starting_point] = graph_qubits_to_neighbors[starting_point] - output = set(visited[starting_point]) - for child in graph_qubits_to_neighbors[starting_point]: - if child != starting_point: - output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) - visited[starting_point] = output - return output - - available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) - while available_starting_points: - sp = available_starting_points[0] - nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) - for node in nodes: - available_starting_points.remove(node) - lanes[lan_num] = nodes - lan_num += 1 - - def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: - """ - Determine a mapping from qubit to the lane it is in for this specific circuit. - """ - out = {} - for key, val in lanes_to_qubits.items(): - for qb in val: - out[qb] = key - return out - - return compute_qubits_to_lanes(lanes), lanes - - -def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: - """ - Split a circuit into multiple subcircuits which do not talk across lanes. 
- """ - - lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] - - num_layers = circuit.num_layers - for layer_ind in range(num_layers): - layer = circuit.layer(layer_ind) - group = [] - group_lane = None - sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) - - for op in sorted_layer: - # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. - qubits_used = op.qubits # This will be a list of qubits used. - # I am assuming that the qubits are indexed numerically and not by strings. - lane = qubits_to_lanes[qubits_used[0]] - - if group_lane is None: - group_lane = lane - group.append(op) - elif group_lane == lane: - group.append(op) - else: - lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) - group_lane = lane - group = [op] - - if len(group) > 0: - # We have a left over group. - lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) - return lanes_to_gates def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], - dict[tuple[_Circuit], list[tuple[int, int]]], - dict[tuple[int, ...], set[_Circuit]]]: + dict[tuple[LabelTupTup], list[tuple[int, int]]], + dict[tuple[int, ...], list[LabelTupTup]]]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. 
@@ -598,25 +503,25 @@ def setup_circuit_list_for_LCS_computations( if implicit_idle_gate_name: cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) - qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) - sub_cirs = _compute_subcircuits(cir, qubits_to_lane) + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir, cir.num_lines) + sub_cirs = compute_subcircuits(cir, qubit_to_lane) if not implicit_idle_gate_name: if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.") - assert len(sub_cirs) == len(lanes_to_qubits) + assert len(sub_cirs) == len(lane_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j]),) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: - line_labels_to_circuit_list[lbls].append(sc) + line_labels_to_circuit_list[lbls].append(sc.layertup) else: - line_labels_to_circuit_list[lbls] = [sc] - if sc in sub_cir_to_cir_id_and_lane_id: - sub_cir_to_cir_id_and_lane_id[sc].append((i,j)) + line_labels_to_circuit_list[lbls] = [sc.layertup] + if sc.layertup in sub_cir_to_cir_id_and_lane_id: + sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j)) else: - sub_cir_to_cir_id_and_lane_id[sc] = [(i,j)] + sub_cir_to_cir_id_and_lane_id[sc.layertup] = [(i,j)] if i in cir_ind_and_lane_id_to_sub_cir: cir_ind_and_lane_id_to_sub_cir[i][j] = sc else: @@ -899,7 +804,9 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: class CollectionOfLCSEvalTrees(): - def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id, cir_id_and_lane_id_to_sub_cir): + def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], 
list[LabelTupTup]], + sub_cir_to_full_cir_id_and_lane_id, + cir_id_and_lane_id_to_sub_cir): self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} @@ -913,7 +820,7 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id for key, vals in line_lbls_to_circuit_list.items(): sub_cirs = [] for cir in vals: - sub_cirs.append(list(cir)) + sub_cirs.append(cir.layertup) if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) diff --git a/test/unit/objects/test_circuit_splitting.py b/test/unit/objects/test_circuit_splitting.py new file mode 100644 index 000000000..ea5bdf998 --- /dev/null +++ b/test/unit/objects/test_circuit_splitting.py @@ -0,0 +1,101 @@ +from pygsti.circuits.circuit import Circuit as _Circuit +from pygsti.baseobjs.label import Label +from pygsti.circuits.split_circuits_into_lanes import compute_subcircuits, compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit +import numpy as np + + +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): + my_circuit = [] + for lnum in range(depth_L): + layer = [] + for qnum in range(num_qubits): + gate = str(np.random.choice(allowed_gates)) + layer.append((gate, qnum)) + my_circuit.append(layer) + return _Circuit(my_circuit) + + +def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): + + assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. + assert lane_end_points[0] > 0 + assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order. 
+ + if lane_end_points[-1] < num_qubits: + lane_end_points.append(num_qubits) + + my_circuit = [] + n_qs_to_gates_avail = {} + for key, val in gates_to_qubits_used.items(): + if val in n_qs_to_gates_avail: + n_qs_to_gates_avail[val].append(key) + else: + n_qs_to_gates_avail[val] = [key] + + for lnum in range(depth_L): + layer = [] + start_point = 0 + + for lane_ep in lane_end_points: + num_used: int = 0 + while num_used < (lane_ep - start_point): + navail = (lane_ep - start_point) - num_used + nchosen = 0 + if navail >= max(n_qs_to_gates_avail): + # we can use any gate + nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1) + else: + # we need to first choose how many to use. + nchosen = np.random.randint(1, navail + 1) + gate = str(np.random.choice(n_qs_to_gates_avail[nchosen])) + tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset. + perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))] + if gate == "Gcustom": + layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi))) + else: + layer.append((gate, *perm_of_qubits_used)) + num_used += nchosen + + if num_used > (lane_ep - start_point) + 1: + print(num_used, f"lane ({start_point}, {lane_ep})") + raise AssertionError("lane barrier is broken") + + start_point = lane_ep + my_circuit.append(layer) + return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)]) + + +def test_subcircuits_splits_can_create_empty_sub_circuit(): + + + original = _Circuit([], line_labels=[0]) + + qubits_to_lanes = {0: 0} + + attempt = compute_subcircuits(original, qubits_to_lanes) + + assert original == _Circuit(attempt, line_labels=[0]) + + +def test_find_qubit_to_lane_splitting(): + + gates_to_num_used = {"X": 1, "Y": 1, "Z": 1, "CNOT": 2, "CZ": 2} + + depth = 10 + num_qubits = 6 + + lane_eps = [1, 2, 4, 5] + # So expected lane dist is (0, ), (1), (2,3), (4,), (5,) + + circuit = 
build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used) + + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit) + + + assert len(qubit_to_lane) == num_qubits + + assert len(lane_to_qubits) <= num_qubits + + circuit = _Circuit([[]]) + +test_subcircuits_splits_can_create_empty_sub_circuit() \ No newline at end of file diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 17509b93f..e2236e845 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -19,12 +19,14 @@ BaseCase = object from pygsti.data import simulate_data -from pygsti.modelpacks import smq1Q_XYI +from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY from pygsti.protocols import gst from pygsti.protocols.protocol import ProtocolData from pygsti.tools import two_delta_logl + +GLOBAL_MODEL_IDLE = smq1Q_XYI def Ls(*args): """ Convert args to a tuple to Labels """ return tuple([L(x) for x in args]) @@ -153,8 +155,8 @@ class BaseProtocolData: @classmethod def setUpClass(cls): - cls.gst_design = smq1Q_XYI.create_gst_experiment_design(max_max_length=16) - cls.mdl_target = smq1Q_XYI.target_model() + cls.gst_design = GLOBAL_MODEL_IDLE.create_gst_experiment_design(max_max_length=16) + cls.mdl_target = GLOBAL_MODEL_IDLE.target_model() cls.mdl_datagen = cls.mdl_target.depolarize(op_noise=0.05, spam_noise=0.025) ds = simulate_data(cls.mdl_datagen, cls.gst_design.all_circuits_needing_data, 20000, sample_error='none') @@ -259,23 +261,23 @@ def jac_colinearities(self): colinearities *= -1 return colinearities - class ForwardSimConsistencyTester(TestCase): PROBS_TOL = 1e-14 JACS_TOL = 1e-10 + def setUp(self): - self.model_ideal = smq1Q_XYI.target_model() + self.model_ideal = GLOBAL_MODEL_IDLE.target_model() if TorchForwardSimulator.ENABLED: # TorchFowardSimulator can only work with TP modelmembers. 
self.model_ideal.convert_members_inplace(to_type='full TP') self.model_noisy = self.model_ideal.depolarize(op_noise=0.05, spam_noise=0.025) - prep_fiducials = smq1Q_XYI.prep_fiducials() - meas_fiducials = smq1Q_XYI.meas_fiducials() - germs = smq1Q_XYI.germs() + prep_fiducials = GLOBAL_MODEL_IDLE.prep_fiducials() + meas_fiducials = GLOBAL_MODEL_IDLE.meas_fiducials() + germs = GLOBAL_MODEL_IDLE.germs() max_lengths = [4] circuits = create_lsgst_circuit_lists( self.model_noisy, prep_fiducials, meas_fiducials, germs, max_lengths @@ -339,9 +341,9 @@ class ForwardSimIntegrationTester(BaseProtocolData): def _run(self, obj : ForwardSimulator.Castable): self.setUpClass() - proto = gst.GateSetTomography(smq1Q_XYI.target_model("full TP"), 'stdgaugeopt', name="testGST") + proto = gst.GateSetTomography(GLOBAL_MODEL_IDLE.target_model("full TP"), name="testGST") results = proto.run(self.gst_data, simulator=obj) - mdl_result = results.estimates["testGST"].models['stdgaugeopt'] + mdl_result = results.estimates["testGST"].models["final iteration estimate"] twoDLogL = two_delta_logl(mdl_result, self.gst_data.dataset) assert twoDLogL <= 0.05 # should be near 0 for perfect data pass From 06dbc64372e688bff3b6e57271006d2f987870ec Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 13:58:28 -0700 Subject: [PATCH 054/141] Improve the circuit splitting test. 
--- test/unit/objects/test_circuit_splitting.py | 32 +++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/test/unit/objects/test_circuit_splitting.py b/test/unit/objects/test_circuit_splitting.py index ea5bdf998..397392670 100644 --- a/test/unit/objects/test_circuit_splitting.py +++ b/test/unit/objects/test_circuit_splitting.py @@ -4,7 +4,11 @@ import numpy as np -def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]) -> _Circuit: + """ + Build a random circuit of depth L which operates on num_qubits and has the allowed + single qubit gates specified in allowed gates. + """ my_circuit = [] for lnum in range(depth_L): layer = [] @@ -12,10 +16,20 @@ def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): gate = str(np.random.choice(allowed_gates)) layer.append((gate, qnum)) my_circuit.append(layer) - return _Circuit(my_circuit) + return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)]) + +def build_circuit_with_multiple_qubit_gates_with_designated_lanes( + num_qubits: int, + depth_L: int, + lane_end_points: list[int], + gates_to_qubits_used: dict[str, int]) -> _Circuit: + """ + Builds a circuit with a known lane structure. + Any two + qubit lanes can be split into smaller lanes if none of the gates + chosen for that lane actually operate on two or more qubits. + """ -def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. assert lane_end_points[0] > 0 @@ -86,7 +100,9 @@ def test_find_qubit_to_lane_splitting(): lane_eps = [1, 2, 4, 5] # So expected lane dist is (0, ), (1), (2,3), (4,), (5,) + minimum_num_lanes = 5 + # This is a random circuit so the lanes may not be perfect. 
circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used) qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit) @@ -94,8 +110,14 @@ def test_find_qubit_to_lane_splitting(): assert len(qubit_to_lane) == num_qubits + assert len(lane_to_qubits) >= minimum_num_lanes assert len(lane_to_qubits) <= num_qubits - circuit = _Circuit([[]]) + for qubit in qubit_to_lane: + assert qubit_to_lane[qubit] in lane_to_qubits + -test_subcircuits_splits_can_create_empty_sub_circuit() \ No newline at end of file + for lane in lane_to_qubits: + for qu in lane_to_qubits[lane]: + assert qu in qubit_to_lane + assert lane == qubit_to_lane[qu] From 7ad7b6dcffbc9ef56112c2d0a72f5c36d78f5a45 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 15 Jul 2025 14:22:45 -0700 Subject: [PATCH 055/141] change Circuit.replace_gatename_inplace to handle the common situation when old_gatename is not a Label object, but can match old_gatename == obj for some obj encountered when iterating over elements of Circuit._labels. 
--- pygsti/circuits/circuit.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 7674b9bb5..bb3915cfe 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -2563,9 +2563,15 @@ def replace_gatename_inplace(self, old_gatename, new_gatename): def replace(obj): # obj is either a simple label or a list if isinstance(obj, _Label): - if obj.name == old_gatename: - newobj = _Label(new_gatename, obj.sslbls) - else: newobj = obj + newobj = _Label(new_gatename, obj.sslbls) if (obj.name == old_gatename) else obj + elif obj == old_gatename: + if len(obj) == 0: + sslbls = self.line_labels + else: + import warnings + warnings.warn(f'Cannot infer target of gate(s) of {obj}.') + sslbls = tuple() + newobj = _Label((new_gatename,) + sslbls) else: newobj = [replace(sub) for sub in obj] return newobj From f158ebfa57bd94c133708e9c86c75bbbf93bb648 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 17 Jun 2025 14:42:16 -0700 Subject: [PATCH 056/141] Make the matrix forward simulator respect the lanes present in the structure. 
--- pygsti/forwardsims/matrixforwardsim.py | 1570 ++++++++++++++++++++++++ 1 file changed, 1570 insertions(+) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 0ce8ba860..0cf5190c4 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -31,6 +31,10 @@ from pygsti.tools.matrixtools import _fas from pygsti.tools import listtools as _lt from pygsti.circuits import CircuitList as _CircuitList +from pygsti.tools.internalgates import internal_gate_unitaries +from pygsti.tools.optools import unitary_to_superop +from pygsti.baseobjs.label import LabelTup, LabelTupTup + _dummy_profiler = _DummyProfiler() @@ -2158,3 +2162,1569 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota layout.resource_alloc()) return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, ds_cache) + + +class NicksMatrixForwardSimulator(_DistributableForwardSimulator, SimpleMatrixForwardSimulator): + """ + Computes circuit outcome probabilities by multiplying together circuit-layer process matrices. + + Interfaces with a model via its `circuit_layer_operator` method and extracts a dense matrix + representation of operators by calling their `to_dense` method. An "evaluation tree" that + composes all of the circuits using pairwise "joins" is constructed by a :class:`MatrixCOPALayout` + layout object, and this tree then directs pairwise multiplications of process matrices to compute + circuit outcome probabilities. Derivatives are computed analytically, using operators' + `deriv_wrt_params` methods. + + Parameters + ---------- + model : Model, optional + The parent model of this simulator. It's fine if this is `None` at first, + but it will need to be set (by assigning `self.model` before using this simulator. 
+ + distribute_by_timestamp : bool, optional + When `True`, treat the data as time dependent, and distribute the computation of outcome + probabilitiesby assigning groups of processors to the distinct time stamps within the + dataset. This means of distribution be used only when the circuits themselves contain + no time delay infomation (all circuit layer durations are 0), as operators are cached + at the "start" time of each circuit, i.e., the timestamp in the data set. If `False`, + then the data is treated in a time-independent way, and the overall counts for each outcome + are used. If support for intra-circuit time dependence is needed, you must use a different + forward simulator (e.g. :class:`MapForwardSimulator`). + + num_atoms : int, optional + The number of atoms (sub-evaluation-trees) to use when creating the layout (i.e. when calling + :meth:`create_layout`). This determines how many units the element (circuit outcome + probability) dimension is divided into, and doesn't have to correclate with the number of + processors. When multiple processors are used, if `num_atoms` is less than the number of + processors then `num_atoms` should divide the number of processors evenly, so that + `num_atoms // num_procs` groups of processors can be used to divide the computation + over parameter dimensions. + + processor_grid : tuple optional + Specifies how the total number of processors should be divided into a number of + atom-processors, 1st-parameter-deriv-processors, and 2nd-parameter-deriv-processors. + Each level of specification is optional, so this can be a 1-, 2-, or 3- tuple of + integers (or None). Multiplying the elements of `processor_grid` together should give + at most the total number of processors. + + param_blk_sizes : tuple, optional + The parameter block sizes along the first or first & second parameter dimensions - so + this can be a 0-, 1- or 2-tuple of integers or `None` values. 
A block size of `None` + means that there should be no division into blocks, and that each block processor + computes all of its parameter indices at once. + """ + + @classmethod + def _array_types_for_method(cls, method_name): + # The array types of *intermediate* or *returned* values within various class methods (for memory estimates) + if method_name == '_bulk_fill_probs_block': return cls._array_types_for_method('_compute_product_cache') + if method_name == '_bulk_fill_dprobs_block': + return cls._array_types_for_method('_compute_product_cache') \ + + cls._array_types_for_method('_compute_dproduct_cache') + if method_name == '_bulk_fill_hprobs_block': + return cls._array_types_for_method('_compute_product_cache') \ + + cls._array_types_for_method('_compute_dproduct_cache') \ + + cls._array_types_for_method('_compute_hproduct_cache') + + if method_name == '_compute_product_cache': return ('zdd', 'z', 'z') # cache of gates, scales, and scaleVals + if method_name == '_compute_dproduct_cache': return ('zddb',) # cache x dim x dim x distributed_nparams + if method_name == '_compute_hproduct_cache': return ('zddbb',) # cache x dim x dim x dist_np1 x dist_np2 + return super()._array_types_for_method(method_name) + + def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None, + param_blk_sizes=None): + super().__init__(model, num_atoms, processor_grid, param_blk_sizes) + self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent" + self.swap_gate_superop = unitary_to_superop(internal_gate_unitaries()["SWAP"]) + + # We are also going to set up lanes to use. + + # Fix it to 5 qubits. 
+ self._lanes_used = {0: {0}, 1: {1}, 2: {2,3}, 3: {4}} + self._qubits_to_lanes = {0: 0, 1: 1, 2:2, 3:2, 4:3} + + + def _to_nice_serialization(self): + state = super()._to_nice_serialization() + state.update({'mode': self._mode, + # (don't serialize parent model or processor distribution info) + }) + return state + + @classmethod + def _from_nice_serialization(cls, state): + #Note: resets processor-distribution information + return cls(None, state['mode'] == "distribute_by_timestamp") + + def copy(self): + """ + Return a shallow copy of this MatrixForwardSimulator + + Returns + ------- + MatrixForwardSimulator + """ + return MatrixForwardSimulator(self.model) + + def _compute_product_cache(self, layout_atom_tree, resource_alloc): + """ + Computes an array of operation sequence products (process matrices). + + Note: will *not* parallelize computation: parallelization should be + done at a higher level. + """ + dim = self.model.evotype.minimal_dim(self.model.state_space) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. + + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + # This is the maximum size any operator can be. However, we are going to make use of the minimum size. + prodCache = _np.zeros((cacheSize, dim, dim), 'd') + prodCache = [[] for _ in range(cacheSize)] # Build the cache dynamically. 
+ scaleCache = _np.zeros(cacheSize, 'd') + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + prodCache[iDest] = _np.identity(dim) + # Note: scaleCache[i] = 0.0 from initialization + else: + small_gate = 1 + if isinstance(opLabel, LabelTup): + small_gate = self.model.operation_blks["gates"][opLabel].to_dense(on_space="minimal") + # We know that this operator is the whole lane. + + qubits = opLabel.qubits + if len(qubits) == 2: + if qubits[0] > qubits[1]: + # We need to swap. + small_gate = self.swap_gate_superop.T @ small_gate @ self.swap_gate_superop + + elif isinstance(opLabel, LabelTupTup): + # We need to iterate through this operator in order to build up the right system. + for ind in range(len(opLabel)): + next_matrix = self.model.operation_blks["gates"][opLabel[ind]].to_dense(on_space="minimal") + # Do we need to insert the swap gates? + qubits = opLabel[ind].qubits + if len(qubits) == 2: + if qubits[0] > qubits[1]: + # We need to swap. 
+ next_matrix = self.swap_gate_superop.T @ next_matrix @ self.swap_gate_superop + + small_gate = _np.kron(small_gate, next_matrix) + # gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense(on_space='minimal') + nG = max(_nla.norm(small_gate), 1.0) + prodCache[iDest] = small_gate / nG + scaleCache[iDest] = _np.log(nG) + continue + + # combine iLeft + iRight => iDest + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prodCache[iLeft], prodCache[iRight] + prodCache[iDest] = L @ R + scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] + + if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: + nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) + nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) + sL, sR = L / nL, R / nR + prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) + + + if __debug__: + # So that it can be optimized out when called with -o. + + for i in range(cacheSize): + # since all scaled gates start with norm <= 1, products should all have norm <= 1 + assert len((~_np.isfinite(prodCache[i])).nonzero()[0]) == 0 + + return prodCache, scaleCache + + def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, + resource_alloc=None, wrt_slice=None, profiler=None): + """ + Computes a tree of product derivatives in a linear cache space. Will + use derivative columns to parallelize computation. 
+ """ + + if profiler is None: profiler = _dummy_profiler + dim = self.model.evotype.minimal_dim(self.model.state_space) + nDerivCols = self.model.num_params if (wrt_slice is None) \ + else _slct.length(wrt_slice) + deriv_shape = (nDerivCols, dim, dim) + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. + + ## ------------------------------------------------------------------ + # + ##print("MPI: _compute_dproduct_cache begin: %d deriv cols" % nDerivCols) + #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: + # #print("MPI: _compute_dproduct_cache called w/comm size %d" % comm.Get_size()) + # # parallelize of deriv cols, then sub-trees (if available and necessary) + # + # if resource_alloc.comm.Get_size() > nDerivCols: + # + # #If there are more processors than deriv cols, give a + # # warning -- note that we *cannot* make use of a tree being + # # split because there's no good way to reconstruct the + # # *non-final* parent-tree elements from those of the sub-trees. 
+ # _warnings.warn("Increased speed could be obtained by giving dproduct cache computation" + # " *fewer* processors, as there are more cpus than derivative columns.") + # + # # Use comm to distribute columns + # allDerivColSlice = slice(0, nDerivCols) if (wrt_slice is None) else wrt_slice + # _, myDerivColSlice, _, sub_resource_alloc = \ + # _mpit.distribute_slice(allDerivColSlice, resource_alloc.comm) + # #print("MPI: _compute_dproduct_cache over %d cols (%s) (rank %d computing %s)" \ + # # % (nDerivCols, str(allDerivColIndices), comm.Get_rank(), str(myDerivColIndices))) + # if sub_resource_alloc is not None and sub_resource_alloc.comm is not None \ + # and sub_resource_alloc.comm.Get_size() > 1: + # _warnings.warn("Too many processors to make use of in " + # " _compute_dproduct_cache.") + # if sub_resource_alloc.comm.Get_rank() > 0: myDerivColSlice = slice(0, 0) + # #don't compute anything on "extra", i.e. rank != 0, cpus + # + # my_results = self._compute_dproduct_cache( + # layout_atom_tree, prod_cache, scale_cache, None, myDerivColSlice, profiler) + # # pass None as comm, *not* mySubComm, since we can't do any + # # further parallelization + # + # tm = _time.time() + # all_results = resource_alloc.comm.allgather(my_results) + # profiler.add_time("MPI IPC", tm) + # return _np.concatenate(all_results, axis=1) # TODO: remove this concat w/better gather? 
+ # + ## ------------------------------------------------------------------ + + tSerialStart = _time.time() + dProdCache = _np.zeros((cacheSize,) + deriv_shape) + wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + dProdCache[iDest] = _np.zeros(deriv_shape) + else: + #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices) + doperation = self._doperation(opLabel, wrt_filter=wrtIndices) + dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest]) + continue + + tm = _time.time() + + # combine iLeft + iRight => i + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prod_cache[iLeft], prod_cache[iRight] + dL, dR = dProdCache[iLeft], dProdCache[iRight] + dProdCache[iDest] = _np.dot(dL, R) + \ + _np.swapaxes(_np.dot(L, dR), 0, 1) # dot(dS, T) + dot(S, dT) + profiler.add_time("compute_dproduct_cache: dots", tm) + profiler.add_count("compute_dproduct_cache: dots") + + scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) + if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
+ dProdCache[iDest] /= _np.exp(scale) + if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL: + _warnings.warn("Scaled dProd small in order to keep prod managable.") + elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL + and dProdCache[iDest].min() > -_DSMALL): + _warnings.warn("Would have scaled dProd but now will not alter scale_cache.") + + #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier() + + profiler.add_time("compute_dproduct_cache: serial", tSerialStart) + profiler.add_count("compute_dproduct_cache: num columns", nDerivCols) + + return dProdCache + + def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, + d_prod_cache2, scale_cache, resource_alloc=None, + wrt_slice1=None, wrt_slice2=None): + """ + Computes a tree of product 2nd derivatives in a linear cache space. Will + use derivative rows and columns to parallelize computation. + """ + + dim = self.model.evotype.minimal_dim(self.model.state_space) + + # Note: dProdCache?.shape = (#circuits,#params_to_diff_wrt,dim,dim) + nDerivCols1 = d_prod_cache1.shape[1] + nDerivCols2 = d_prod_cache2.shape[1] + assert(wrt_slice1 is None or _slct.length(wrt_slice1) == nDerivCols1) + assert(wrt_slice2 is None or _slct.length(wrt_slice2) == nDerivCols2) + hessn_shape = (nDerivCols1, nDerivCols2, dim, dim) + eval_tree = layout_atom_tree + cacheSize = len(eval_tree) + + #Note: resource_alloc gives procs that could work together to perform + # computation, e.g. paralllel dot products but NOT to just partition + # futher (e.g. among the wrt_slices) as this is done in the layout. + # This function doesn't make use of resource_alloc - all procs compute the same thing. 
+ + ## ------------------------------------------------------------------ + # + #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: + # # parallelize of deriv cols, then sub-trees (if available and necessary) + # + # if resource_alloc.comm.Get_size() > nDerivCols1 * nDerivCols2: + # #If there are more processors than deriv cells, give a + # # warning -- note that we *cannot* make use of a tree being + # # split because there's no good way to reconstruct the + # # *non-final* parent-tree elements from those of the sub-trees. + # _warnings.warn("Increased speed could be obtained" + # " by giving hproduct cache computation" + # " *fewer* processors and *smaller* (sub-)tree" + # " (e.g. by splitting tree beforehand), as there" + # " are more cpus than hessian elements.") # pragma: no cover + # + # # allocate final result memory + # hProdCache = _np.zeros((cacheSize,) + hessn_shape) + # + # # Use comm to distribute columns + # allDeriv1ColSlice = slice(0, nDerivCols1) + # allDeriv2ColSlice = slice(0, nDerivCols2) + # deriv1Slices, myDeriv1ColSlice, deriv1Owners, mySubComm = \ + # _mpit.distribute_slice(allDeriv1ColSlice, resource_alloc.comm) + # + # # Get slice into entire range of model params so that + # # per-gate hessians can be computed properly + # if wrt_slice1 is not None and wrt_slice1.start is not None: + # myHessianSlice1 = _slct.shift(myDeriv1ColSlice, wrt_slice1.start) + # else: myHessianSlice1 = myDeriv1ColSlice + # + # #print("MPI: _compute_hproduct_cache over %d cols (rank %d computing %s)" \ + # # % (nDerivCols2, comm.Get_rank(), str(myDerivColSlice))) + # + # if mySubComm is not None and mySubComm.Get_size() > 1: + # deriv2Slices, myDeriv2ColSlice, deriv2Owners, mySubSubComm = \ + # _mpit.distribute_slice(allDeriv2ColSlice, mySubComm) + # + # # Get slice into entire range of model params (see above) + # if wrt_slice2 is not None and wrt_slice2.start is not None: + # myHessianSlice2 = 
_slct.shift(myDeriv2ColSlice, wrt_slice2.start) + # else: myHessianSlice2 = myDeriv2ColSlice + # + # if mySubSubComm is not None and mySubSubComm.Get_size() > 1: + # _warnings.warn("Too many processors to make use of in " + # " _compute_hproduct_cache.") + # #TODO: remove: not needed now that we track owners + # #if mySubSubComm.Get_rank() > 0: myDeriv2ColSlice = slice(0,0) + # # #don't compute anything on "extra", i.e. rank != 0, cpus + # + # hProdCache[:, myDeriv1ColSlice, myDeriv2ColSlice] = self._compute_hproduct_cache( + # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], + # d_prod_cache2[:, myDeriv2ColSlice], scale_cache, None, myHessianSlice1, myHessianSlice2) + # # pass None as comm, *not* mySubSubComm, since we can't do any further parallelization + # + # #NOTE: we only need to gather to the root processor (TODO: update this) + # _mpit.gather_slices(deriv2Slices, deriv2Owners, hProdCache, [None, myDeriv1ColSlice], + # 2, mySubComm) # , gather_mem_limit) #gather over col-distribution (Deriv2) + # #note: gathering axis 2 of hProdCache[:,myDeriv1ColSlice], + # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) + # else: + # #compute "Deriv1" row-derivatives distribution only; don't use column distribution + # hProdCache[:, myDeriv1ColSlice] = self._compute_hproduct_cache( + # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], d_prod_cache2, + # scale_cache, None, myHessianSlice1, wrt_slice2) + # # pass None as comm, *not* mySubComm (this is ok, see "if" condition above) + # + # #NOTE: we only need to gather to the root processor (TODO: update this) + # _mpit.gather_slices(deriv1Slices, deriv1Owners, hProdCache, [], 1, resource_alloc.comm) + # #, gather_mem_limit) #gather over row-distribution (Deriv1) + # #note: gathering axis 1 of hProdCache, + # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) + # + # return hProdCache + # + ## ------------------------------------------------------------------ + + hProdCache = 
_np.zeros((cacheSize,) + hessn_shape) + wrtIndices1 = _slct.indices(wrt_slice1) if (wrt_slice1 is not None) else None + wrtIndices2 = _slct.indices(wrt_slice2) if (wrt_slice2 is not None) else None + + for iDest, iRight, iLeft in eval_tree: + + #Special case of an "initial operation" that can be filled directly + if iRight is None: # then iLeft gives operation: + opLabel = iLeft + if opLabel is None: + hProdCache[iDest] = _np.zeros(hessn_shape) + elif not self.model.circuit_layer_operator(opLabel, 'op').has_nonzero_hessian(): + #all gate elements are at most linear in params, so + # all hessians for single- or zero-circuits are zero. + hProdCache[iDest] = _np.zeros(hessn_shape) + else: + hoperation = self._hoperation(opLabel, + wrt_filter1=wrtIndices1, + wrt_filter2=wrtIndices2) + hProdCache[iDest] = hoperation / _np.exp(scale_cache[iDest]) + continue + + # combine iLeft + iRight => i + # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because + # (Dest,iLeft,iRight,iFinal) = tup implies circuit[iDest] = circuit[iLeft] + circuit[iRight], but we want: + # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) + L, R = prod_cache[iLeft], prod_cache[iRight] + dL1, dR1 = d_prod_cache1[iLeft], d_prod_cache1[iRight] + dL2, dR2 = d_prod_cache2[iLeft], d_prod_cache2[iRight] + hL, hR = hProdCache[iLeft], hProdCache[iRight] + # Note: L, R = GxG ; dL,dR = vgs x GxG ; hL,hR = vgs x vgs x GxG + + dLdRa = _np.swapaxes(_np.dot(dL1, dR2), 1, 2) + dLdRb = _np.swapaxes(_np.dot(dL2, dR1), 1, 2) + dLdR_sym = dLdRa + _np.swapaxes(dLdRb, 0, 1) + + hProdCache[iDest] = _np.dot(hL, R) + dLdR_sym + _np.transpose(_np.dot(L, hR), (1, 2, 0, 3)) + + scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) + if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
+ hProdCache[iDest] /= _np.exp(scale) + if hProdCache[iDest].max() < _HSMALL and hProdCache[iDest].min() > -_HSMALL: + _warnings.warn("Scaled hProd small in order to keep prod managable.") + elif (_np.count_nonzero(hProdCache[iDest]) and hProdCache[iDest].max() < _HSMALL + and hProdCache[iDest].min() > -_HSMALL): + _warnings.warn("hProd is small (oh well!).") + + return hProdCache + + def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), + derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): + """ + Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. + + Parameters + ---------- + circuits : list + The circuits whose outcome probabilities should be included in the layout. + + dataset : DataSet + The source of data counts that will be compared to the circuit outcome + probabilities. The computed outcome probabilities are limited to those + with counts present in `dataset`. + + resource_alloc : ResourceAllocation + A available resources and allocation information. These factors influence how + the layout (evaluation strategy) is constructed. + + array_types : tuple, optional + A tuple of string-valued array types. See :meth:`ForwardSimulator.create_layout`. + + derivative_dimensions : int or tuple[int], optional + Optionally, the parameter-space dimension used when taking first + and second derivatives with respect to the cirucit outcome probabilities. This must be + non-None when `array_types` contains `'ep'` or `'epp'` types. + If a tuple, then must be length 1. + + verbosity : int or VerbosityPrinter + Determines how much output to send to stdout. 0 means no output, higher + integers mean more output. + + layout_creation_circuit_cache : dict, optional (default None) + A precomputed dictionary serving as a cache for completed + circuits. I.e. circuits with prep labels and POVM labels appended. 
+ Along with other useful pre-computed circuit structures used in layout + creation. + + Returns + ------- + MatrixCOPALayout + """ + # There are two types of quantities we adjust to create a good layout: "group-counts" and "processor-counts" + # - group counts: natoms, nblks, nblks2 give how many indpendently computed groups/ranges of circuits, + # 1st parameters, and 2nd parameters are used. Making these larger can reduce memory + # consumption by reducing intermediate memory usage. + # - processor counts: na, np, np2 give how many "atom-processors", "param-processors" and "param2-processors" + # are used to process data along each given direction. These values essentially specify + # how the physical procesors are divided by giving the number of (roughly equal) intervals + # exist along each dimension of the physical processor "grid". Thus, thees values are set + # based on the total number of cores available and how many dimensions are being computed. + + resource_alloc = _ResourceAllocation.cast(resource_alloc) + mem_limit = resource_alloc.mem_limit - resource_alloc.allocated_memory \ + if (resource_alloc.mem_limit is not None) else None # *per-processor* memory limit + printer = _VerbosityPrinter.create_printer(verbosity, resource_alloc) + nprocs = resource_alloc.comm_size + comm = resource_alloc.comm + if isinstance(derivative_dimensions, int): + num_params = derivative_dimensions + elif isinstance(derivative_dimensions, tuple): + assert len(derivative_dimensions) == 1 + num_params = derivative_dimensions[0] + else: + num_params = self.model.num_params + C = 1.0 / (1024.0**3) + + if mem_limit is not None: + if mem_limit <= 0: + raise MemoryError("Attempted layout creation w/memory limit = %g <= 0!" 
% mem_limit) + printer.log("Layout creation w/mem limit = %.2fGB" % (mem_limit * C)) + + natoms, na, npp, param_dimensions, param_blk_sizes = self._compute_processor_distribution( + array_types, nprocs, num_params, len(circuits), default_natoms=1) + + if self._mode == "distribute_by_timestamp": + #Special case: time dependent data that gets grouped & distributed by unique timestamp + # To to this, we override above values of natoms, na, and npp: + natoms = 1 # save all processor division for within the (single) atom, for different timestamps + na, npp = 1, (1, 1) # save all processor division for within the (single) atom, for different timestamps + + printer.log("MatrixLayout: %d processors divided into %s (= %d) grid along circuit and parameter directions." % + (nprocs, ' x '.join(map(str, (na,) + npp)), _np.prod((na,) + npp))) + printer.log(" %d atoms, parameter block size limits %s" % (natoms, str(param_blk_sizes))) + assert(_np.prod((na,) + npp) <= nprocs), "Processor grid size exceeds available processors!" 
+ + layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, + na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, + layout_creation_circuit_cache=layout_creation_circuit_cache) + + if mem_limit is not None: + loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 + loc_nparams2 = num_params / npp[1] if len(npp) > 1 else 0 + blk1 = param_blk_sizes[0] if len(param_blk_sizes) > 0 else 0 + blk2 = param_blk_sizes[1] if len(param_blk_sizes) > 1 else 0 + if blk1 is None: blk1 = loc_nparams1 + if blk2 is None: blk2 = loc_nparams2 + global_layout = layout.global_layout + if comm is not None: + from mpi4py import MPI + max_local_els = comm.allreduce(layout.num_elements, op=MPI.MAX) # layout.max_atom_elements + max_atom_els = comm.allreduce(layout.max_atom_elements, op=MPI.MAX) + max_local_circuits = comm.allreduce(layout.num_circuits, op=MPI.MAX) + max_atom_cachesize = comm.allreduce(layout.max_atom_cachesize, op=MPI.MAX) + else: + max_local_els = layout.num_elements + max_atom_els = layout.max_atom_elements + max_local_circuits = layout.num_circuits + max_atom_cachesize = layout.max_atom_cachesize + mem_estimate = _bytes_for_array_types(array_types, global_layout.num_elements, max_local_els, max_atom_els, + global_layout.num_circuits, max_local_circuits, + layout._param_dimensions, (loc_nparams1, loc_nparams2), + (blk1, blk2), max_atom_cachesize, + self.model.evotype.minimal_dim(self.model.state_space)) + + GB = 1.0 / 1024.0**3 + if mem_estimate > mem_limit: + raise MemoryError("Not enough memory for desired layout! (limit=%.1fGB, required=%.1fGB)" % ( + mem_limit * GB, mem_estimate * GB)) + else: + printer.log(" Esimated memory required = %.1fGB" % (mem_estimate * GB)) + + return layout + + @staticmethod + def create_copa_layout_circuit_cache(circuits, model, dataset=None): + """ + Helper function for pre-computing/pre-processing circuits structures + used in matrix layout creation. 
+ """ + cache = dict() + completed_circuits, split_circuits = model.complete_circuits(circuits, return_split=True) + + cache['completed_circuits'] = {ckt: comp_ckt for ckt, comp_ckt in zip(circuits, completed_circuits)} + cache['split_circuits'] = {ckt: split_ckt for ckt, split_ckt in zip(circuits, split_circuits)} + + if dataset is not None: + aliases = circuits.op_label_aliases if isinstance(circuits, _CircuitList) else None + ds_circuits = _lt.apply_aliases_to_circuits(circuits, aliases) + unique_outcomes_list = [] + for ckt in ds_circuits: + ds_row = dataset[ckt] + unique_outcomes_list.append(ds_row.unique_outcomes if ds_row is not None else None) + else: + unique_outcomes_list = [None]*len(circuits) + + expanded_circuit_outcome_list = model.bulk_expand_instruments_and_separate_povm(circuits, + observed_outcomes_list = unique_outcomes_list, + split_circuits = split_circuits) + + expanded_circuit_cache = {ckt: expanded_ckt for ckt,expanded_ckt in zip(circuits, expanded_circuit_outcome_list)} + + cache['expanded_and_separated_circuits'] = expanded_circuit_cache + + expanded_subcircuits_no_spam_cache = dict() + for expc_outcomes in cache['expanded_and_separated_circuits'].values(): + for sep_povm_c, _ in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit + exp_nospam_c = sep_povm_c.circuit_without_povm[1:] + expanded_subcircuits_no_spam_cache[exp_nospam_c] = exp_nospam_c.expand_subcircuits() + + cache['expanded_subcircuits_no_spam'] = expanded_subcircuits_no_spam_cache + + return cache + + def _scale_exp(self, scale_exps): + old_err = _np.seterr(over='ignore') + scaleVals = _np.exp(scale_exps) # may overflow, but OK if infs occur here + _np.seterr(**old_err) + return scaleVals + + def _rho_e_from_spam_tuple(self, spam_tuple): + # This calculator uses the convention that rho has shape (N,1) + rholabel, elabel = spam_tuple + rho = self.model.circuit_layer_operator(rholabel, 'prep').to_dense(on_space='minimal')[:, None] + E = 
_np.conjugate(_np.transpose(self.model.circuit_layer_operator( + elabel, 'povm').to_dense(on_space='minimal')[:, None])) + return rho, E + + def _probs_from_rho_e(self, rho, e, gs, scale_vals): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + #Compute probability and save in return array + # want vp[iFinal] = float(dot(e, dot(G, rho))) + # vp[i] = sum_k,l e[0,k] gs[i,k,l] rho[l,0] * scale_vals[i] + # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] + # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] + # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals + return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals + # shape == (len(circuit_list),) ; may overflow but OK + + def _dprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs, scale_vals, wrt_slice=None): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + rholabel, elabel = spam_tuple + rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are + EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects + nCircuits = gs.shape[0] + + nDerivCols = self.model.num_params if wrt_slice is None else _slct.length(wrt_slice) + + # GATE DERIVS (assume d_gs is already sized/filtered) ------------------- + assert(d_gs.shape[1] == nDerivCols), "d_gs must be pre-filtered!" 
+ + #Compute d(probability)/dOps and save in return list (now have G,dG => product, dprod_dOps) + # prod, dprod_dOps = G,dG + # dp_dOps[i,j] = sum_k,l e[0,k] d_gs[i,j,k,l] rho[l,0] + # dp_dOps[i,j] = sum_k e[0,k] dot( d_gs, rho )[i,j,k,0] + # dp_dOps[i,j] = dot( e, dot( d_gs, rho ) )[0,i,j,0] + # dp_dOps = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,3)) + old_err2 = _np.seterr(invalid='ignore', over='ignore') + path = _np.einsum_path('hk,ijkl,lm->ij', e, d_gs, rho, optimize='optimal') + dp_dOps = _np.einsum('hk,ijkl,lm->ij', e, d_gs, rho, optimize=path[0]) * scale_vals[:, None] + _np.seterr(**old_err2) + # may overflow, but OK ; shape == (len(circuit_list), nDerivCols) + # may also give invalid value due to scale_vals being inf and dot-prod being 0. In + # this case set to zero since we can't tell whether it's + or - inf anyway... + dp_dOps[_np.isnan(dp_dOps)] = 0 + + #SPAM ------------- + + if self.model._param_interposer is not None: + #When there is an interposer, we compute derivs wrt *all* the ops params (inefficient?), + # then apply interposer, then take desired wrt_filter columns: + nOpDerivCols = self.model._param_interposer.num_op_params + + dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) + _fas(dp_drhos, [None, rhoVec.gpindices], + _np.squeeze(_np.dot(_np.dot(e, gs), rhoVec.deriv_wrt_params()), # *don't* apply wrt filter here + axis=(0,)) * scale_vals[:, None]) # may overflow, but OK + dp_drhos = _np.dot(dp_drhos, self.model._param_interposer.deriv_op_params_wrt_model_params()) + if wrt_slice is not None: dp_drhos = dp_drhos[:, wrt_slice] + + dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) + dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] + _fas(dp_dEs, [None, EVec.gpindices], _np.dot(dp_dAnyE, EVec.deriv_wrt_params())) + dp_dEs = _np.dot(dp_dEs, self.model._param_interposer.deriv_op_params_wrt_model_params()) + if wrt_slice is not None: dp_dEs = dp_dEs[:, wrt_slice] + + else: + #Simpler case of no interposer + nOpDerivCols = 
nDerivCols + + rho_wrtFilter, rho_gpindices = self._process_wrt_filter( + wrt_slice, self.model.circuit_layer_operator(rholabel, 'prep')) + E_wrtFilter, E_gpindices = self._process_wrt_filter( + wrt_slice, self.model.circuit_layer_operator(elabel, 'povm')) + + # Get: dp_drhos[i, rho_gpindices] = dot(e,gs[i],drho/drhoP) + # dp_drhos[i,J0+J] = sum_kl e[0,k] gs[i,k,l] drhoP[l,J] + # dp_drhos[i,J0+J] = dot(e, gs, drhoP)[0,i,J] + # dp_drhos[:,J0+J] = squeeze(dot(e, gs, drhoP),axis=(0,))[:,J] + dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) + _fas(dp_drhos, [None, rho_gpindices], + _np.squeeze(_np.dot(_np.dot(e, gs), + rhoVec.deriv_wrt_params(rho_wrtFilter)), + axis=(0,)) * scale_vals[:, None]) # may overflow, but OK + + # Get: dp_dEs[i, E_gpindices] = dot(transpose(dE/dEP),gs[i],rho)) + # dp_dEs[i,J0+J] = sum_lj dEPT[J,j] gs[i,j,l] rho[l,0] + # dp_dEs[i,J0+J] = sum_j dEP[j,J] dot(gs, rho)[i,j] + # dp_dEs[i,J0+J] = sum_j dot(gs, rho)[i,j,0] dEP[j,J] + # dp_dEs[i,J0+J] = dot(squeeze(dot(gs, rho),2), dEP)[i,J] + # dp_dEs[:,J0+J] = dot(squeeze(dot(gs, rho),axis=(2,)), dEP)[:,J] + dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) + # may overflow, but OK (deriv w.r.t any of self.effects - independent of which) + dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] + _fas(dp_dEs, [None, E_gpindices], + _np.dot(dp_dAnyE, EVec.deriv_wrt_params(E_wrtFilter))) + + sub_vdp = dp_drhos + dp_dEs + dp_dOps + return sub_vdp + + def _hprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs1, d_gs2, h_gs, scale_vals, + wrt_slice1=None, wrt_slice2=None): + if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") + + rholabel, elabel = spam_tuple + rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are + EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects + nCircuits = gs.shape[0] + + rho_wrtFilter1, rho_gpindices1 = 
self._process_wrt_filter(
+ wrt_slice1, self.model.circuit_layer_operator(rholabel, 'prep'))
+ rho_wrtFilter2, rho_gpindices2 = self._process_wrt_filter(
+ wrt_slice2, self.model.circuit_layer_operator(rholabel, 'prep'))
+ E_wrtFilter1, E_gpindices1 = self._process_wrt_filter(
+ wrt_slice1, self.model.circuit_layer_operator(elabel, 'povm'))
+ E_wrtFilter2, E_gpindices2 = self._process_wrt_filter(
+ wrt_slice2, self.model.circuit_layer_operator(elabel, 'povm'))
+
+ nDerivCols1 = self.model.num_params if wrt_slice1 is None else _slct.length(wrt_slice1)
+ nDerivCols2 = self.model.num_params if wrt_slice2 is None else _slct.length(wrt_slice2)
+
+ #flt1 = self._get_filter_info(wrtSlices1)
+ #flt2 = self._get_filter_info(wrtSlices2)
+
+ # GATE DERIVS (assume h_gs is already sized/filtered) -------------------
+ assert(h_gs.shape[1] == nDerivCols1), "h_gs must be pre-filtered!"
+ assert(h_gs.shape[2] == nDerivCols2), "h_gs must be pre-filtered!"
+
+ #Compute d2(probability)/dGates2 and save in return list
+ # d2pr_dOps2[i,j,k] = sum_l,m e[0,l] h_gs[i,j,k,l,m] rho[m,0]
+ # d2pr_dOps2[i,j,k] = sum_l e[0,l] dot( d_gs, rho )[i,j,k,l,0]
+ # d2pr_dOps2[i,j,k] = dot( e, dot( d_gs, rho ) )[0,i,j,k,0]
+ # d2pr_dOps2 = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,4))
+ old_err2 = _np.seterr(invalid='ignore', over='ignore')
+ d2pr_dOps2 = _np.squeeze(_np.dot(e, _np.dot(h_gs, rho)), axis=(0, 4)) * scale_vals[:, None, None]
+ _np.seterr(**old_err2)
+
+ # may overflow, but OK ; shape == (len(circuit_list), nDerivCols, nDerivCols)
+ # may also give invalid value due to scale_vals being inf and dot-prod being 0. In
+ # this case set to zero since we can't tell whether it's + or - inf anyway...
+ d2pr_dOps2[_np.isnan(d2pr_dOps2)] = 0
+
+ # SPAM DERIVS (assume d_gs1 and d_gs2 are already sized/filtered) --------
+ assert(d_gs1.shape[1] == nDerivCols1), "d_gs1 must be pre-filtered!"
+ assert(d_gs2.shape[1] == nDerivCols2), "d_gs2 must be pre-filtered!" 
+ + # Get: d2pr_drhos[i, j, rho_gpindices] = dot(e,d_gs[i,j],drho/drhoP)) + # d2pr_drhos[i,j,J0+J] = sum_kl e[0,k] d_gs[i,j,k,l] drhoP[l,J] + # d2pr_drhos[i,j,J0+J] = dot(e, d_gs, drhoP)[0,i,j,J] + # d2pr_drhos[:,:,J0+J] = squeeze(dot(e, d_gs, drhoP),axis=(0,))[:,:,J] + drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) + d2pr_drhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_drhos1, [None, None, rho_gpindices2], + _np.squeeze(_np.dot(_np.dot(e, d_gs1), drho), axis=(0,)) + * scale_vals[:, None, None]) # overflow OK + + # get d2pr_drhos where gate derivatives are wrt the 2nd set of gate parameters + if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? + assert(nDerivCols1 == nDerivCols2) + d2pr_drhos2 = _np.transpose(d2pr_drhos1, (0, 2, 1)) + else: + drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) + d2pr_drhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + _fas(d2pr_drhos2, [None, None, rho_gpindices1], + _np.squeeze(_np.dot(_np.dot(e, d_gs2), drho), axis=(0,)) + * scale_vals[:, None, None]) # overflow OK + d2pr_drhos2 = _np.transpose(d2pr_drhos2, (0, 2, 1)) + + # Get: d2pr_dEs[i, j, E_gpindices] = dot(transpose(dE/dEP),d_gs[i,j],rho) + # d2pr_dEs[i,j,J0+J] = sum_kl dEPT[J,k] d_gs[i,j,k,l] rho[l,0] + # d2pr_dEs[i,j,J0+J] = sum_k dEP[k,J] dot(d_gs, rho)[i,j,k,0] + # d2pr_dEs[i,j,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[i,j,J] + # d2pr_dEs[:,:,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[:,:,J] + d2pr_dEs1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + dp_dAnyE = _np.squeeze(_np.dot(d_gs1, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter2) + _fas(d2pr_dEs1, [None, None, E_gpindices2], + _np.dot(dp_dAnyE, devec)) + + # get d2pr_dEs where gate derivatives are wrt the 2nd set of gate parameters + if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? 
+ assert(nDerivCols1 == nDerivCols2) + d2pr_dEs2 = _np.transpose(d2pr_dEs1, (0, 2, 1)) + else: + d2pr_dEs2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + dp_dAnyE = _np.squeeze(_np.dot(d_gs2, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter1) + _fas(d2pr_dEs2, [None, None, E_gpindices1], _np.dot(dp_dAnyE, devec)) + d2pr_dEs2 = _np.transpose(d2pr_dEs2, (0, 2, 1)) + + # Get: d2pr_dErhos[i, e_offset[eIndex]:e_offset[eIndex+1], e_offset[rhoIndex]:e_offset[rhoIndex+1]] = + # dEP^T * prod[i,:,:] * drhoP + # d2pr_dErhos[i,J0+J,K0+K] = sum jk dEPT[J,j] prod[i,j,k] drhoP[k,K] + # d2pr_dErhos[i,J0+J,K0+K] = sum j dEPT[J,j] dot(prod,drhoP)[i,j,K] + # d2pr_dErhos[i,J0+J,K0+K] = dot(dEPT,prod,drhoP)[J,i,K] + # d2pr_dErhos[i,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[i,J,K] + # d2pr_dErhos[:,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[:,J,K] + d2pr_dErhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) + dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter1) + _fas(d2pr_dErhos1, (None, E_gpindices1, rho_gpindices2), + _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) + + # get d2pr_dEs where e derivatives are wrt the 2nd set of gate parameters + if wrt_slice1 == wrt_slice2: # Note: this doesn't involve gate derivatives + d2pr_dErhos2 = _np.transpose(d2pr_dErhos1, (0, 2, 1)) + else: + d2pr_dErhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) + drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) + dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK + devec = EVec.deriv_wrt_params(E_wrtFilter2) + _fas(d2pr_dErhos2, [None, E_gpindices2, rho_gpindices1], + _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) + d2pr_dErhos2 = _np.transpose(d2pr_dErhos2, (0, 2, 1)) + + #Note: these 2nd derivatives are non-zero when the spam vectors have + # a more than linear 
dependence on their parameters. + if self.model.circuit_layer_operator(rholabel, 'prep').has_nonzero_hessian(): + dp_dAnyRho = _np.dot(e, gs).squeeze(0) * scale_vals[:, None] # overflow OK + d2pr_d2rhos = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_d2rhos, [None, rho_gpindices1, rho_gpindices2], + _np.tensordot(dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep').hessian_wrt_params( + rho_wrtFilter1, rho_wrtFilter2), (1, 0))) + # _np.einsum('ij,jkl->ikl', dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep') \ + # .hessian_wrt_params(rho_wrtFilter1, rho_wrtFilter2)) + else: + d2pr_d2rhos = 0 + + if self.model.circuit_layer_operator(elabel, 'povm').has_nonzero_hessian(): + dp_dAnyE = _np.dot(gs, rho).squeeze(2) * scale_vals[:, None] # overflow OK + d2pr_d2Es = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) + _fas(d2pr_d2Es, [None, E_gpindices1, E_gpindices2], + _np.tensordot(dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( + E_wrtFilter1, E_wrtFilter2), (1, 0))) + # _np.einsum('ij,jkl->ikl', dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( + # E_wrtFilter1, E_wrtFilter2)) + else: + d2pr_d2Es = 0 + + # END SPAM DERIVS ----------------------- + + ret = d2pr_d2rhos + d2pr_dErhos2 + d2pr_drhos2 # wrt rho + ret += d2pr_dErhos1 + d2pr_d2Es + d2pr_dEs2 # wrt e + ret += d2pr_drhos1 + d2pr_dEs1 + d2pr_dOps2 # wrt gates + + return ret + + def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): + #Free memory from previous subtree iteration before computing caches + scaleVals = Gs = prodCache = scaleCache = None + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2) # prod cache + + #Fill cache info + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + + if not resource_alloc.is_host_leader: + # (same as "if resource_alloc.host_comm is 
not None and resource_alloc.host_comm.rank != 0")
+ # we cannot further utilize multiple processors when computing a single block. The required
+ # ending condition is that array_to_fill on each processor has been filled. But if memory
+ # is being shared and resource_alloc contains multiple processors on a single host, we only
+ # want *one* (the rank=0) processor to perform the computation, since array_to_fill will be
+ # shared memory that we don't want to have multiple procs using simultaneously to compute the
+ # same thing. Thus, we just do nothing on all of the non-root host_comm processors.
+ # We could also print a warning (?), or we could carefully guard any shared mem updates
+ # using "if resource_alloc.is_host_leader" conditions (if we could use multiple procs elsewhere).
+ return
+
+ #use cached data to final values
+ scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache))
+ Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0)
+ # ( n_circuits, dim, dim )
+
+ old_err = _np.seterr(over='ignore')
+ for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items():
+ # "element indices" index a circuit outcome probability in array_to_fill's first dimension
+ # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond
+ # to the element indices when `spamtuple` is used. 
+ # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices], + self._probs_from_rho_e(rho, E, Gs[tree_indices], scaleVals[tree_indices])) + _np.seterr(**old_err) + + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice)) + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + dProdCache = self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, + resource_alloc, param_slice) + if not resource_alloc.is_host_leader: + return # Non-root host processors aren't used anymore to compute the result on the root proc + + scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) + Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) + dGs = layout_atom.nonscratch_cache_view(dProdCache, axis=0) + + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices, dest_param_slice], self._dprobs_from_rho_e( + spam_tuple, rho, E, Gs[tree_indices], dGs[tree_indices], scaleVals[tree_indices], param_slice)) + + _np.seterr(**old_err) + + def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1, dest_param_slice2, layout_atom, + param_slice1, param_slice2, resource_alloc): + dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2 + * _slct.length(param_slice1) * _slct.length(param_slice2)) + prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) + dProdCache1 = self._compute_dproduct_cache( + 
layout_atom.tree, prodCache, scaleCache, resource_alloc, param_slice1) # computed on rank=0 only + dProdCache2 = dProdCache1 if (param_slice1 == param_slice2) else \ + self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, + resource_alloc, param_slice2) # computed on rank=0 only + hProdCache = self._compute_hproduct_cache(layout_atom.tree, prodCache, dProdCache1, + dProdCache2, scaleCache, resource_alloc, + param_slice1, param_slice2) # computed on rank=0 only + + if not resource_alloc.is_host_leader: + return # Non-root host processors aren't used anymore to compute the result on the root proc + + scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) + Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) + dGs1 = layout_atom.nonscratch_cache_view(dProdCache1, axis=0) + dGs2 = layout_atom.nonscratch_cache_view(dProdCache2, axis=0) + #( n_circuits, nDerivColsX, dim, dim ) + + hGs = layout_atom.nonscratch_cache_view(hProdCache, axis=0) + #( n_circuits, len(wrt_filter1), len(wrt_filter2), dim, dim ) + + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices, dest_param_slice1, dest_param_slice2], self._hprobs_from_rho_e( + spam_tuple, rho, E, Gs[tree_indices], dGs1[tree_indices], dGs2[tree_indices], + hGs[tree_indices], scaleVals[tree_indices], param_slice1, param_slice2)) + + _np.seterr(**old_err) + + def bulk_product(self, circuits, scale=False, resource_alloc=None): + """ + Compute the products of many circuits at once. + + Parameters + ---------- + circuits : list of Circuits + The circuits to compute products for. These should *not* have any preparation or + measurement layers. + + scale : bool, optional + When True, return a scaling factor (see below). + + resource_alloc : ResourceAllocation + Available resources for this computation. 
Includes the number of processors + (MPI comm) and memory limit. + + Returns + ------- + prods : numpy array + Array of shape S x G x G, where: + - S == the number of operation sequences + - G == the linear dimension of a operation matrix (G x G operation matrices). + scaleValues : numpy array + Only returned when scale == True. A length-S array specifying + the scaling that needs to be applied to the resulting products + (final_product[i] = scaleValues[i] * prods[i]). + """ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + + # Need to break these circuits down into lanes first. + def compute_subcircuits(circuit, lanes_to_qubits_used, qubits_to_lanes): + + lanes_to_gates = [[] for _ in range(len(lanes_to_qubits_used))] + for layer in circuit: + if isinstance(layer, LabelTupTup): + group = [] + nused = 0 + for op in layer: + qubits_used = op.qubits + lane = qubits_to_lanes[qubits_used[0]] + if nused + len(qubits_used) == len(lanes_to_qubits_used[lane]): + group.append(op) + lanes_to_gates[lane].append(LabelTupTup(tuple(group))) + nused = 0 + group = [] + elif nused + len(qubits_used) < len(lanes_to_qubits_used[lane]): + nused += len(qubits_used) + group.append(op) + else: + raise ValueError("Too many indices") + elif isinstance(layer, LabelTup): + qubits_used = layer.qubits + lanes_to_gates[qubits_to_lanes[qubits_used[0]]] = layer + return lanes_to_gates + + full_list = [] + for cir in circuits: + full_list.append(compute_subcircuits(cir, self._lanes_used, self._qubits_to_lanes)) + + + nCircuits = len(circuits) + + eval_tree = _EvalTree.create(full_list) + prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) + + # Now the cache will also hold the circuit lanes. + # So 0:nCircuits*nLanes will hold all the Gs. 
+
+ # Tensor back up in a [(lane)*nLanes, (lane+1)*nLanes]
+
+ sval = _np.zeros(len(circuits))
+ gates = [1 for _ in circuits]
+
+ for ind in range(nCircuits):
+ for lane in range(len(self._lanes_used)):
+ gates[ind] = _np.kron(gates[ind], prodCache[lane + ind*len(self._lanes_used)])
+ sval[ind] += scaleCache[lane + ind*len(self._lanes_used)]
+
+ gates = _np.array(gates)
+ old_err = _np.seterr(over="ignore")
+ gates *= _np.exp(sval)[:, None, None]
+ _np.seterr(**old_err)
+
+
+ # EvalTree evaluates a "cache" which can contain additional (intermediate) elements
+ scaleVals = self._scale_exp(scaleCache[0:nCircuits])
+ Gs = prodCache[0:nCircuits]
+
+ if scale:
+ return Gs, scaleVals
+ else:
+ old_err = _np.seterr(over='ignore')
+ Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok
+ _np.seterr(**old_err)
+ return Gs
+
+ def bulk_dproduct(self, circuits, flat=False, return_prods=False,
+ scale=False, resource_alloc=None, wrt_filter=None):
+ """
+ Compute the derivative of a many operation sequences at once.
+
+ Parameters
+ ----------
+ circuits : list of Circuits
+ The circuits to compute products for. These should *not* have any preparation or
+ measurement layers.
+
+ flat : bool, optional
+ Affects the shape of the returned derivative array (see below).
+
+ return_prods : bool, optional
+ when set to True, additionally return the probabilities.
+
+ scale : bool, optional
+ When True, return a scaling factor (see below).
+
+ resource_alloc : ResourceAllocation
+ Available resources for this computation. Includes the number of processors
+ (MPI comm) and memory limit.
+
+ wrt_filter : list of ints, optional
+ If not None, a list of integers specifying which gate parameters
+ to include in the derivative. Each element is an index into an
+ array of gate parameters ordered by concatenating each gate's
+ parameters (in the order specified by the model). 
This argument + is used internally for distributing derivative calculations across + multiple processors. + + Returns + ------- + derivs : numpy array + * if flat == False, an array of shape S x M x G x G, where: + - S == len(circuits) + - M == the length of the vectorized model + - G == the linear dimension of a operation matrix (G x G operation matrices) + and derivs[i,j,k,l] holds the derivative of the (k,l)-th entry + of the i-th operation sequence product with respect to the j-th model + parameter. + * if flat == True, an array of shape S*N x M where: + - N == the number of entries in a single flattened gate (ordering same as numpy.flatten), + - S,M == as above, + and deriv[i,j] holds the derivative of the (i % G^2)-th entry of + the (i / G^2)-th flattened operation sequence product with respect to + the j-th model parameter. + products : numpy array + Only returned when return_prods == True. An array of shape + S x G x G; products[i] is the i-th operation sequence product. + scaleVals : numpy array + Only returned when scale == True. An array of shape S such that + scaleVals[i] contains the multiplicative scaling needed for + the derivatives and/or products for the i-th operation sequence. + """ + nCircuits = len(circuits) + nDerivCols = self.model.num_params if (wrt_filter is None) else _slct.length(wrt_filter) + + wrtSlice = _slct.list_to_slice(wrt_filter) if (wrt_filter is not None) else None + #TODO: just allow slices as argument: wrt_filter -> wrtSlice? 
+ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + + eval_tree = _EvalTree.create(circuits) + prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) + dProdCache = self._compute_dproduct_cache(eval_tree, prodCache, scaleCache, + resource_alloc.comm, wrtSlice) + + # EvalTree evaluates a "cache" which can contain additional (intermediate) elements + scaleVals = self._scale_exp(scaleCache[0:nCircuits]) + Gs = prodCache[0:nCircuits] + dGs = dProdCache[0:nCircuits] + + if not scale: + old_err = _np.seterr(over='ignore', invalid='ignore') + if return_prods: + Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok + + # may overflow or get nans (invalid), but ok + dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 3) * scaleVals, 0, 3) + # convert nans to zero, as these occur b/c an inf scaleVal is mult by a zero deriv value, and we + dGs[_np.isnan(dGs)] = 0 + _np.seterr(**old_err) + + if flat: + # cols = deriv cols, rows = flattened everything else + dim = self.model.evotype.minimal_dim(self.model.state_space) + dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 1).reshape( + (nDerivCols, nCircuits * dim**2)), 0, 1) + + if return_prods: + return (dGs, Gs, scaleVals) if scale else (dGs, Gs) + else: + return (dGs, scaleVals) if scale else dGs + + ## --------------------------------------------------------------------------------------------- + ## TIME DEPENDENT functionality ---------------------------------------------------------------- + ## --------------------------------------------------------------------------------------------- + + def _ds_quantities(self, timestamp, ds_cache, layout, dataset, TIMETOL=1e-6): + if timestamp not in ds_cache: + if 'truncated_ds' not in ds_cache: + ds_cache['truncated_ds'] = dataset.truncate(layout.circuits) + trunc_dataset = ds_cache['truncated_ds'] + + if 'ds_for_time' not in ds_cache: + #tStart = _time.time() + ds_cache['ds_for_time'] = trunc_dataset.split_by_time() + #print("DB: Split 
dataset by time in %.1fs (%d timestamps)" % (_time.time() - tStart, + # len(ds_cache['ds_for_time']))) + + if timestamp not in ds_cache['ds_for_time']: + return (None, None, None, None, None) + + #Similar to MDC store's add_count_vectors function -- maybe consolidate in FUTURE? + counts = _np.empty(layout.num_elements, 'd') + totals = _np.empty(layout.num_elements, 'd') + dataset_at_t = ds_cache['ds_for_time'][timestamp] # trunc_dataset.time_slice(timestamp, timestamp+TIMETOL) + + firsts = []; indicesOfCircuitsWithOmittedData = [] + for (i, circuit) in enumerate(layout.circuits): # should be 'ds_circuits' really + inds = layout.indices_for_index(i) + if circuit in dataset_at_t: + cnts = dataset_at_t[circuit].counts + else: + cnts = {} # Note: this will cause 0 totals, which will need to be handled downstream + totals[inds] = sum(cnts.values()) # dataset[opStr].total + counts[inds] = [cnts.get(x, 0) for x in layout.outcomes_for_index(i)] + lklen = _slct.length(inds) # consolidate w/ `add_omitted_freqs`? 
+ if 0 < lklen < self.model.compute_num_outcomes(circuit): + firsts.append(_slct.to_array(inds)[0]) + indicesOfCircuitsWithOmittedData.append(i) + + if len(firsts) > 0: + firsts = _np.array(firsts, 'i') + indicesOfCircuitsWithOmittedData = _np.array(indicesOfCircuitsWithOmittedData, 'i') + #print("DB: SPARSE DATA: %d of %d rows have sparse data" % (len(firsts), len(layout.circuits))) + else: + firsts = indicesOfCircuitsWithOmittedData = None + + #if self.circuits.circuit_weights is not None: + # SEE add_count_vectors + + nonzero_totals = _np.where(_np.abs(totals) < 1e-10, 1e-10, totals) # avoid divide-by-zero error on nxt line + freqs = counts / nonzero_totals + ds_cache[timestamp] = (counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData) + + return ds_cache[timestamp] + + def _bulk_fill_timedep_objfn(self, raw_objective, array_to_fill, layout, ds_circuits, + num_total_outcomes, dataset, ds_cache=None): + + assert(self._mode == "distribute_by_timestamp"), \ + ("Must set `distribute_by_timestamp=True` to use a " + "time-dependent objective function with MatrixForwardSimulator!") + + resource_alloc = layout.resource_alloc() + atom_resource_alloc = layout.resource_alloc('atom-processing') + atom_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin + + #Split timestamps up between processors - maybe do this in a time-dep layout? 
+ all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)}
+ my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \
+ _mpit.distribute_indices(list(range(len(all_timestamps))), atom_resource_alloc)
+ shared_mem_leader = timestamp_processing_ralloc.is_host_leader
+
+ probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc,
+ (layout.num_elements,), 'd')
+ # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then
+ # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since
+ #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given.
+
+ array_to_fill[:] = 0.0
+ my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results
+ assert(my_array_to_fill.shape == (layout.num_elements,))
+
+ for timestamp_index in my_timestamp_inds:
+ timestamp = all_timestamps[timestamp_index]
+
+ # compute objective at time timestamp
+ counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \
+ self._ds_quantities(timestamp, ds_cache, layout, dataset)
+ if counts is None: continue # no data at this time => no contribution
+
+ for _, obj in self.model._iter_parameterized_objs():
+ obj.set_time(timestamp)
+ for opcache in self.model._opcaches.values():
+ for obj in opcache.values():
+ obj.set_time(timestamp)
+
+ for atom in layout.atoms: # layout only holds local atoms
+ self._bulk_fill_probs_atom(probs_array[atom.element_slice], atom, timestamp_processing_ralloc)
+
+ timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready
+ # (similar to DistributableForwardSimulator._bulk_fill_probs)
+
+ terms = raw_objective.terms(probs_array, counts, totals, freqs)
+ if firsts is not None and shared_mem_leader: # consolidate with `_update_terms_for_omitted_probs`
+ omitted_probs = 1.0 - 
_np.array([_np.sum(probs_array[layout.indices_for_index(i)]) + for i in indicesOfCircuitsWithOmittedData]) + terms[firsts] += raw_objective.zero_freq_terms(totals[firsts], omitted_probs) + timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem + + my_array_to_fill += terms + + #collect/gather results (SUM local arrays together) + resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) + + _smt.cleanup_shared_ndarray(probs_array_shm) + + def _bulk_fill_timedep_dobjfn(self, raw_objective, array_to_fill, layout, ds_circuits, + num_total_outcomes, dataset, ds_cache=None): + + assert(self._mode == "distribute_by_timestamp"), \ + ("Must set `distribute_by_timestamp=True` to use a " + "time-dependent objective function with MatrixForwardSimulator!") + + resource_alloc = layout.resource_alloc() + param_resource_alloc = layout.resource_alloc('param-processing') + param_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin + + #Split timestamps up between processors - maybe do this in a time-dep layout? 
+ all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} + my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ + _mpit.distribute_indices(list(range(len(all_timestamps))), param_resource_alloc) + shared_mem_leader = timestamp_processing_ralloc.is_host_leader + + probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, + (layout.num_elements,), 'd') + dprobs_array, dprobs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, + (layout.num_elements, self.model.num_params), 'd') + # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then + # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since + #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. + + array_to_fill[:] = 0.0 + my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results + all_param_slice = slice(0, self.model.num_params) # All params computed at once for now + assert(my_array_to_fill.shape == (layout.num_elements, self.model.num_params)) + + for timestamp_index in my_timestamp_inds: + timestamp = all_timestamps[timestamp_index] + # compute objective at time layout_atom.time + #print("DB: Rank %d : layout atom for t=" % resource_alloc.comm.rank, layout_atom.timestamp) + + counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ + self._ds_quantities(timestamp, ds_cache, layout, dataset) + + for _, obj in self.model._iter_parameterized_objs(): + obj.set_time(timestamp) + for opcache in self.model._opcaches.values(): + for obj in opcache.values(): + obj.set_time(timestamp) + + for atom in layout.atoms: # layout only holds local atoms + self._bulk_fill_probs_atom(probs_array, atom, timestamp_processing_ralloc) + self._bulk_fill_dprobs_atom(dprobs_array, all_param_slice, atom, + all_param_slice, timestamp_processing_ralloc) + + 
timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready + # (similar to DistributableForwardSimulator._bulk_fill_probs) + + if shared_mem_leader: + if firsts is not None: # consolidate with TimeIndependentMDCObjectiveFunction.dterms? + dprobs_omitted_rowsum = _np.empty((len(firsts), self.model.num_params), 'd') + for ii, i in enumerate(indicesOfCircuitsWithOmittedData): + dprobs_omitted_rowsum[ii, :] = _np.sum(dprobs_array[layout.indices_for_index(i), :], axis=0) + + dprobs_array *= raw_objective.dterms(probs_array, counts, totals, freqs)[:, None] + + if firsts is not None: # consolidate with _update_dterms_for_omitted_probs? + omitted_probs = 1.0 - _np.array([_np.sum(probs_array[layout.indices_for_index(i)]) + for i in indicesOfCircuitsWithOmittedData]) + dprobs_array[firsts] -= raw_objective.zero_freq_dterms(totals[firsts], omitted_probs)[:, None] \ + * dprobs_omitted_rowsum + timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem + + my_array_to_fill += dprobs_array + + #collect/gather results (SUM local arrays together) + resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) + + _smt.cleanup_shared_ndarray(probs_array_shm) + _smt.cleanup_shared_ndarray(dprobs_array_shm) + + def bulk_fill_timedep_chi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip_for_weighting, prob_clip_interval, ds_cache=None): + """ + Compute the chi2 contributions for an entire tree of circuits, allowing for time dependent operations. + + Computation is performed by summing together the contributions for each time the circuit is + run, as given by the timestamps in `dataset`. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated 1D numpy array of length equal to the + total number of computed elements (i.e. 
layout.num_elements) + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the chi2 contributions. + + min_prob_clip_for_weighting : float, optional + Sets the minimum and maximum probability p allowed in the chi^2 + weights: N/(p*(1-p)) by clipping probability p values to lie within + the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function + raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, + layout.resource_alloc()) + return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) + + def bulk_fill_timedep_dchi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip_for_weighting, prob_clip_interval, chi2_array_to_fill=None, + ds_cache=None): + """ + Compute the chi2 jacobian contributions for an entire tree of circuits, allowing for time dependent operations. 
+ + Similar to :meth:`bulk_fill_timedep_chi2` but compute the *jacobian* + of the summed chi2 contributions for each circuit with respect to the + model's parameters. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated ExM numpy array where E is the total number of + computed elements (i.e. layout.num_elements) and M is the + number of model parameters. + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the chi2 contributions. + + min_prob_clip_for_weighting : float, optional + Sets the minimum and maximum probability p allowed in the chi^2 + weights: N/(p*(1-p)) by clipping probability p values to lie within + the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + chi2_array_to_fill : numpy array, optional + when not None, an already-allocated length-E numpy array that is filled + with the per-circuit chi2 contributions, just like in + bulk_fill_timedep_chi2(...). 
+ + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function + raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, + layout.resource_alloc()) + return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) + + def bulk_fill_timedep_loglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, + min_prob_clip, radius, prob_clip_interval, ds_cache=None): + """ + Compute the log-likelihood contributions (within the "poisson picture") for an entire tree of circuits. + + Computation is performed by summing together the contributions for each time the circuit is run, + as given by the timestamps in `dataset`. + + Parameters + ---------- + array_to_fill : numpy ndarray + an already-allocated 1D numpy array of length equal to the + total number of computed elements (i.e. layout.num_elements) + + layout : CircuitOutcomeProbabilityArrayLayout + A layout for `array_to_fill`, describing what circuit outcome each + element corresponds to. Usually given by a prior call to :meth:`create_layout`. + + ds_circuits : list of Circuits + the circuits to use as they should be queried from `dataset` (see + below). This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the logl contributions. + + min_prob_clip : float, optional + The minimum probability treated normally in the evaluation of the + log-likelihood. 
A penalty function replaces the true log-likelihood
+ for probabilities that lie below this threshold so that the
+ log-likelihood never becomes undefined (which improves optimizer
+ performance).
+
+ radius : float, optional
+ Specifies the severity of rounding used to "patch" the
+ zero-frequency terms of the log-likelihood.
+
+ prob_clip_interval : 2-tuple or None, optional
+ (min,max) values used to clip the predicted probabilities to.
+ If None, no clipping is performed.
+
+ Returns
+ -------
+ None
+ """
+ from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction
+ raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius},
+ layout.resource_alloc())
+ return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes,
+ dataset, ds_cache)
+
+ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset,
+ min_prob_clip, radius, prob_clip_interval, logl_array_to_fill=None, ds_cache=None):
+ """
+ Compute the ("poisson picture") log-likelihood jacobian contributions for an entire tree of circuits.
+
+ Similar to :meth:`bulk_fill_timedep_loglpp` but compute the *jacobian*
+ of the summed logl (in poisson picture) contributions for each circuit
+ with respect to the model's parameters.
+
+ Parameters
+ ----------
+ array_to_fill : numpy ndarray
+ an already-allocated ExM numpy array where E is the total number of
+ computed elements (i.e. layout.num_elements) and M is the
+ number of model parameters.
+
+ layout : CircuitOutcomeProbabilityArrayLayout
+ A layout for `array_to_fill`, describing what circuit outcome each
+ element corresponds to. Usually given by a prior call to :meth:`create_layout`.
+
+ ds_circuits : list of Circuits
+ the circuits to use as they should be queried from `dataset` (see
+ below). 
This is typically the same list of circuits used to + construct `layout` potentially with some aliases applied. + + num_total_outcomes : list or array + a list of the total number of *possible* outcomes for each circuit + (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is + needed for handling sparse data, where `dataset` may not contain + counts for all the possible outcomes of each circuit. + + dataset : DataSet + the data set used to compute the logl contributions. + + min_prob_clip : float + a regularization parameter for the log-likelihood objective function. + + radius : float + a regularization parameter for the log-likelihood objective function. + + prob_clip_interval : 2-tuple or None, optional + (min,max) values used to clip the predicted probabilities to. + If None, no clipping is performed. + + logl_array_to_fill : numpy array, optional + when not None, an already-allocated length-E numpy array that is filled + with the per-circuit logl contributions, just like in + bulk_fill_timedep_loglpp(...). 
+ + Returns + ------- + None + """ + from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction + raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, + layout.resource_alloc()) + return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, + dataset, ds_cache) From 31154997588b61a56358f78f4990712f98082fc6 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 20 Jun 2025 16:30:57 -0700 Subject: [PATCH 057/141] save the lane information if the circuit is built from tensor products --- pygsti/circuits/circuit.py | 44 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 93c8c4e00..e8287543f 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -10,7 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** -import collections as _collections +from __future__ import annotations import itertools as _itertools import warnings as _warnings @@ -527,11 +527,13 @@ def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occ self._name = name # can be None #self._times = None # for FUTURE expansion self.auxinfo = {} # for FUTURE expansion / user metadata + self.saved_auxinfo = {} + self.saved_auxinfo["lanes"] = {tuple(line_labels): self._labels} #Note: If editing _copy_init one should also check _bare_init in case changes must be propagated. 
#specialized codepath for copying def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None, - compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None): + compilable_layer_indices_tup=(), hashable_tup=None, precomp_hash=None, saved_aux: dict[str, dict]={}): self._labels = labels self._line_labels = line_labels self._occurrence_id = occurrence @@ -547,6 +549,9 @@ def _copy_init(self, labels, line_labels, editable, name='', stringrep=None, occ #self._times = None # for FUTURE expansion self.auxinfo = {} # for FUTURE expansion / user metadata + + self.saved_auxinfo = saved_aux + return self #pickle management functions @@ -1047,13 +1052,13 @@ def copy(self, editable='auto'): editable_labels =[[lbl] if lbl.IS_SIMPLE else list(lbl.components) for lbl in self._labels] return ret._copy_init(editable_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, - self._compilable_layer_indices_tup) + self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo) else: #copy the editable labels (avoiding shallow copy issues) editable_labels = [sublist.copy() for sublist in self._labels] return ret._copy_init(editable_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, - self._compilable_layer_indices_tup) + self._compilable_layer_indices_tup, saved_aux=self.saved_auxinfo) else: #create static copy if self._static: #if presently static leverage precomputed hashable_tup and hash. 
@@ -1062,7 +1067,7 @@ def copy(self, editable='auto'): return ret._copy_init(self._labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, self._compilable_layer_indices_tup, - self._hashable_tup, self._hash) + self._hashable_tup, self._hash, saved_aux=self.saved_auxinfo) else: static_labels = tuple([layer_lbl if isinstance(layer_lbl, _Label) else _Label(layer_lbl) for layer_lbl in self._labels]) @@ -1070,7 +1075,7 @@ def copy(self, editable='auto'): return ret._copy_init(static_labels, self._line_labels, editable, self._name, self._str, self._occurrence_id, self._compilable_layer_indices_tup, - hashable_tup, hash(hashable_tup)) + hashable_tup, hash(hashable_tup), saved_aux=self.saved_auxinfo) def clear(self): """ @@ -1219,7 +1224,7 @@ def extract_labels(self, layers=None, lines=None, strict=True): return self._labels[layers] if isinstance(layers, slice) and strict is True: # if strict=False, then need to recompute line labels #can speed this up a measurably by manually computing the new hashable tuple value and hash - if not self._line_labels in (('*',), ()): + if self._line_labels not in (('*',), ()): new_hashable_tup = self._labels[layers] + ('@',) + self._line_labels else: new_hashable_tup = self._labels[layers] @@ -2231,7 +2236,7 @@ def insert_layer_inplace(self, circuit_layer, j): self.insert_labels_into_layers_inplace([circuit_layer], j) - def insert_circuit(self, circuit, j): + def insert_circuit(self, circuit: Circuit, j): """ Inserts a circuit into this circuit, returning a copy. @@ -2259,7 +2264,7 @@ def insert_circuit(self, circuit, j): if self._static: cpy.done_editing() return cpy - def insert_circuit_inplace(self, circuit, j): + def insert_circuit_inplace(self, circuit: Circuit, j): """ Inserts a circuit into this circuit. 
@@ -2294,7 +2299,7 @@ def insert_circuit_inplace(self, circuit, j): labels_to_insert = circuit.extract_labels(layers=None, lines=lines_to_insert) self.insert_labels_into_layers_inplace(labels_to_insert, j) - def append_circuit(self, circuit): + def append_circuit(self, circuit: Circuit): """ Append a circuit to the end of this circuit, returning a copy. @@ -2312,7 +2317,7 @@ def append_circuit(self, circuit): """ return self.insert_circuit(circuit, self.num_layers) - def append_circuit_inplace(self, circuit): + def append_circuit_inplace(self, circuit: Circuit): """ Append a circuit to the end of this circuit. @@ -2331,7 +2336,7 @@ def append_circuit_inplace(self, circuit): assert(not self._static), "Cannot edit a read-only circuit!" self.insert_circuit_inplace(circuit, self.num_layers) - def prefix_circuit(self, circuit): + def prefix_circuit(self, circuit: Circuit): """ Prefix a circuit to the beginning of this circuit, returning a copy. @@ -2349,7 +2354,7 @@ def prefix_circuit(self, circuit): """ return self.insert_circuit(circuit, 0) - def prefix_circuit_inplace(self, circuit): + def prefix_circuit_inplace(self, circuit: Circuit): """ Prefix a circuit to the beginning of this circuit. @@ -2368,7 +2373,7 @@ def prefix_circuit_inplace(self, circuit): assert(not self._static), "Cannot edit a read-only circuit!" self.insert_circuit_inplace(circuit, 0) - def tensor_circuit_inplace(self, circuit, line_order=None): + def tensor_circuit_inplace(self, circuit: Circuit, line_order=None): """ The tensor product of this circuit and `circuit`. 
@@ -2421,8 +2426,9 @@ def tensor_circuit_inplace(self, circuit, line_order=None): #Add circuit's labels into this circuit self.insert_labels_as_lines_inplace(circuit._labels, line_labels=circuit.line_labels) self._line_labels = new_line_labels # essentially just reorders labels if needed + self.saved_auxinfo["lanes"].update(circuit.saved_auxinfo["lanes"]) - def tensor_circuit(self, circuit, line_order=None): + def tensor_circuit(self, circuit: Circuit, line_order=None): """ The tensor product of this circuit and `circuit`, returning a copy. @@ -2450,7 +2456,7 @@ def tensor_circuit(self, circuit, line_order=None): if self._static: cpy.done_editing() return cpy - def replace_layer_with_circuit_inplace(self, circuit, j): + def replace_layer_with_circuit_inplace(self, circuit: Circuit, j): """ Replaces the `j`-th layer of this circuit with `circuit`. @@ -2470,7 +2476,7 @@ def replace_layer_with_circuit_inplace(self, circuit, j): del self[j] self.insert_labels_into_layers_inplace(circuit, j) - def replace_layer_with_circuit(self, circuit, j): + def replace_layer_with_circuit(self, circuit: Circuit, j): """ Replaces the `j`-th layer of this circuit with `circuit`, returning a copy. @@ -3889,7 +3895,7 @@ def convert_to_cirq(self, return cirq.Circuit(moments) @classmethod - def from_cirq(cls, circuit, qubit_conversion=None, cirq_gate_conversion= None, + def from_cirq(cls, circuit: Circuit, qubit_conversion=None, cirq_gate_conversion= None, remove_implied_idles = True, global_idle_replacement_label = 'auto'): """ Converts and instantiates a pyGSTi Circuit object from a Cirq Circuit object. @@ -4557,7 +4563,7 @@ class CompressedCircuit(object): takes more time but could result in better compressing. 
""" - def __init__(self, circuit, min_len_to_compress=20, max_period_to_look_for=20): + def __init__(self, circuit: Circuit, min_len_to_compress=20, max_period_to_look_for=20): """ Create a new CompressedCircuit object From 6cad8d3b1f4b1a1e4f8ced1c2d5a7a4f9f1ce151 Mon Sep 17 00:00:00 2001 From: nkoskelo Date: Tue, 1 Jul 2025 13:11:05 -0700 Subject: [PATCH 058/141] Preliminary spatially homogeneous qubits. --- pygsti/circuits/circuit.py | 40 ++++++++++++++++++++++++++++++++ pygsti/models/localnoisemodel.py | 28 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index e8287543f..ac13aa539 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -2498,6 +2498,46 @@ def replace_layer_with_circuit(self, circuit: Circuit, j): if self._static: cpy.done_editing() return cpy + def replace_spatially_equivalent_qubits(self, old_single_qubit, equiv_qubit_in_model): + """ + Changes the *name* of a gate throughout this Circuit. + + Note that the name is only a part of the label identifying each + gate, and doesn't include the lines (qubits) a gate acts upon. For + example, the "Gx:0" and "Gx:1" labels both have the same name but + act on different qubits. + + Parameters + ---------- + old_single_qubit : int + The qubit to replace. + + equiv_qubit_in_model : int + The qubit to replace `equiv_qubit_in_model` with. + + Returns + ------- + None + """ + assert(not self._static), "Cannot edit a read-only circuit!" 
+
+ def replace(obj): # obj is either a simple label or a list
+ if isinstance(obj, _Label):
+ if len(obj.qubits) == 1:
+ if obj.qubits[0] == old_single_qubit:
+ newobj = _Label(obj.name,
+ (equiv_qubit_in_model,))
+ else:
+ newobj = obj
+ else:
+ newobj = obj
+ else:
+ newobj = [replace(sub) for sub in obj]
+ return newobj
+
+ self._labels = replace(self._labels)
+
+
 def replace_gatename_inplace(self, old_gatename, new_gatename):
 """
 Changes the *name* of a gate throughout this Circuit.
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py
index 739fb8f7d..cc46bb770 100644
--- a/pygsti/models/localnoisemodel.py
+++ b/pygsti/models/localnoisemodel.py
@@ -612,3 +612,31 @@ def _layer_component_operation(self, model, complbl, cache):
 else:
 ret = _opfactory.op_from_factories(model.factories['layers'], complbl)
 return ret
+
+
+
+
+class LocalNoiseModelWithEquivalentClassesForSingleQubits(LocalNoiseModel):
+
+ def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default",
+ simulator="auto", on_construction_error='raise',
+ independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none", equiv_qubits_classes=None):
+
+ super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator,
+ on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode)
+
+ self.equiv_qubit_classes = equiv_qubits_classes
+
+ for key in self.operation_blks:
+ for labels in self.operation_blks[key]:
+ qubits_used = labels.qubits
+ if len(qubits_used) == 1:
+ # We may be able to replace this.
+ new_qubit = self.equiv_qubit_classes[int(qubits_used[0])]
+ if new_qubit not in qubits_used:
+ # Need to replace.
+ new_label = labels[0] + (new_qubit,)
+ self.operation_blks[key][labels] = self.operation_blks[key][new_label]
+ # This assumes no circular updates. 
+ + \ No newline at end of file From 24930fb9f199e513b9c61d0258eeab997c702589 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Mon, 7 Jul 2025 17:56:06 -0700 Subject: [PATCH 059/141] Save --- pygsti/baseobjs/label.py | 3 +- pygsti/forwardsims/matrixforwardsim.py | 93 +++ pygsti/layouts/evaltree.py | 757 ++++++++++++++++++- pygsti/layouts/matrixlayout.py | 229 +++++- pygsti/models/modelconstruction.py | 55 +- pygsti/models/singlequbitequivalencerules.py | 115 +++ 6 files changed, 1246 insertions(+), 6 deletions(-) create mode 100644 pygsti/models/singlequbitequivalencerules.py diff --git a/pygsti/baseobjs/label.py b/pygsti/baseobjs/label.py index 52ca9dcd7..254b9b03c 100644 --- a/pygsti/baseobjs/label.py +++ b/pygsti/baseobjs/label.py @@ -139,7 +139,8 @@ def __new__(cls, name, state_space_labels=None, time=None, args=None): return LabelStr.init(name, time) else: - if args is not None: return LabelTupWithArgs.init(name, state_space_labels, time, args) + if args is not None: + return LabelTupWithArgs.init(name, state_space_labels, time, args) else: if time == 0.0: return LabelTup.init(name, state_space_labels) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 0cf5190c4..bf02f9999 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -21,7 +21,9 @@ from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator from pygsti.forwardsims.forwardsim import _bytes_for_array_types from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS from pygsti.layouts.matrixlayout import MatrixCOPALayout as _MatrixCOPALayout +from pygsti.layouts.matrixlayout import _MatrixCOPALayoutAtomWithLCS from pygsti.baseobjs.profiler import DummyProfiler as _DummyProfiler from pygsti.baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation from 
pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter @@ -3728,3 +3730,94 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota layout.resource_alloc()) return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, ds_cache) + + +class LCSEvalTreeMatrixForwardSimulator(MatrixForwardSimulator): + + def bulk_product(self, circuits, scale=False, resource_alloc=None): + """ + Compute the products of many circuits at once. + + Parameters + ---------- + circuits : list of Circuits + The circuits to compute products for. These should *not* have any preparation or + measurement layers. + + scale : bool, optional + When True, return a scaling factor (see below). + + resource_alloc : ResourceAllocation + Available resources for this computation. Includes the number of processors + (MPI comm) and memory limit. + + Returns + ------- + prods : numpy array + Array of shape S x G x G, where: + - S == the number of operation sequences + - G == the linear dimension of a operation matrix (G x G operation matrices). + scaleValues : numpy array + Only returned when scale == True. A length-S array specifying + the scaling that needs to be applied to the resulting products + (final_product[i] = scaleValues[i] * prods[i]). + """ + resource_alloc = _ResourceAllocation.cast(resource_alloc) + nCircuits = len(circuits) + + eval_tree = _EvalTreeLCS(circuits) + prodCache = eval_tree.fill_out_circuit_cache(self.model) + Gs = prodCache[0:nCircuits] + + + return Gs + + def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, resource_alloc): + + # Overestimate the amount of cache usage by assuming everything is the same size. 
+ dim = self.model.evotype.minimal_dim(self.model.state_space) + resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache + + prodCache = layout_atom.tree.collapse_circuits_to_process_matrices(self.model) + Gs = layout_atom.tree.reconstruct_full_matrices(prodCache) + old_err = _np.seterr(over='ignore') + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + # "element indices" index a circuit outcome probability in array_to_fill's first dimension + # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond + # to the the element indices when `spamtuple` is used. + # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) + rho, E = self._rho_e_from_spam_tuple(spam_tuple) + _fas(array_to_fill, [element_indices], + self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) + _np.seterr(**old_err) + + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + + eps = 1e-7 # hardcoded? 
+ if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + for i in range(self.model.num_params): + if i in iParamToFinal: + iFinal = iParamToFinal[i] + vec = orig_vec.copy(); vec[i] += eps + self.model.from_vector(vec, close=True) + self._bulk_fill_probs_atom(probs2, layout_atom) + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): + return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache) \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index d8aa24f85..9b07ffa76 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -18,6 +18,12 @@ from pygsti.circuits.circuit import Circuit as _Circuit from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter +from pygsti.baseobjs.label import LabelTupTup, Label +from pygsti.modelmembers.operations import create_from_superop_mx +from pygsti.modelmembers.operations import LinearOperator as _LinearOperator +import itertools +from typing import Sequence + def _walk_subtree(treedict, indx, running_inds): @@ -340,7 +346,7 @@ def _get_start_indices(max_intersect): (_time.time() - tm)); tm = _time.time() #merge_method = "fast" - #Another possible algorith (but slower) + #Another possible 
algorithm (but slower) #if merge_method == "best": # while len(indicesLeft) > 0: # iToMergeInto,_ = min(enumerate(map(len,subTreeSetList)), @@ -444,3 +450,752 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists + + + +def _best_matching_only(A: Sequence, B: Sequence) -> int: + """ + Returns: + ----- + int - the length of the longest matching prefix between A and B. + """ + i = 0 + n = len(A) + m = len(B) + while i < n and i < m: + if A[i] != B[i]: + return len(A[:i]) + i += 1 + return len(A[:i]) + + + +def _lcs_dp_version(A, B): + """ + Compute the longest common substring between A and B using + dynamic programming. + + + This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. + + """ + + table = setup_lcs_dynamic_programming_table(A, B) + n, m = table.shape + for i in range(n-2, -1, -1): + for j in range(m-2, -1, -1): + opt1 = 0 + if A[i] == B[j]: + opt1 = _best_matching_only(A[i:], B[j:]) + opt2 = table[i, j+1] + opt3 = table[i+1, j] + table[i,j] = max(opt1, opt2, opt3) + return table + +def setup_lcs_dynamic_programming_table(A, B): + """ + Create the table used for LCS dynamic programming. 
+ """ + return _np.zeros((len(A) + 1, len(B) + 1)) + +def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): + if table_data_and_sequences: + table, sequences = table_data_and_sequences + else: + table, sequences = _compute_lcs_for_every_pair_of_circuits(circuits) + + if internal_tables_and_sequences: + internal_subtable, internal_subsequences = internal_tables_and_sequences + else: + internal_subtable, internal_subsequences = build_internal_tables(circuits) + + best_index = _np.where(table == _np.max(table)) + best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) + updated_circuits = circuits + cache_num = starting_cache_num + + # Build sequence dict + all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} + + if _np.max(internal_subtable) >= _np.max(table): + # We are only going to replace if this was the longest substring. + for cir_ind in best_internal_index[0]: + for seq in internal_subsequences[cir_ind]: + key = tuple(seq) + if key in all_subsequences_to_replace: + all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] + else: + all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} + + if _np.max(table) >= _np.max(internal_subtable): + for ii in range(len(best_index[0])): + starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] + cir_index = best_index[0][ii] + cir_index2 = best_index[1][ii] + seq = updated_circuits[cir_index][starting_point: int(starting_point + length+1)] + + key = tuple(seq) + if key in all_subsequences_to_replace: + if cir_index not in all_subsequences_to_replace[key]: + # We did not already handle this with internal subsequences. 
+ all_subsequences_to_replace[key][cir_index] = [starting_point] + if cir_index2 not in all_subsequences_to_replace[key]: + all_subsequences_to_replace[key][cir_index2] = [starting_point_2] + + else: + all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} + + + # Handle the updates. + old_cache_num = cache_num + for seq, cdict in all_subsequences_to_replace.items(): + w = len(seq) + if w > 1 or (not isinstance(seq[0], int)): + # We have reached an item which we can just compute. + for cir_ind in cdict: + my_cir = updated_circuits[cir_ind] + sp = 0 + while sp+w <= len(my_cir): + if list(my_cir[sp: sp+w]) == list(seq): + my_cir[sp: sp + w] = [cache_num] + + sp += 1 + updated_circuits[cir_ind] = my_cir + + cache_struct[cir_ind] = updated_circuits[cir_ind] + + updated_circuits.append(list(seq)) + cache_struct[cache_num] = updated_circuits[cache_num] + + cache_num += 1 + + sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + + return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round + +def locate_sequences_in_AB(A, B, dp_table) -> tuple[int, int, int]: + """ + Finds the indices of the starting points of the sequences in A and B. + + Returns: + --------- + int - starting index in A of LCS(A,B) + int - starting index in B of LCS(A,B) + int - length of LCS(A,B) + """ + n, m = dp_table.shape + i = 0 + j = 0 + while i < n-1 and j < m -1: + curr = dp_table[i,j] + opt1 = dp_table[i+1, j+1] + opt2 = dp_table[i+1, j] + opt3 = dp_table[i, j+1] + options = [opt1, opt2, opt3] + if _np.all(curr == options): + i += 1 + j += 1 + elif opt2 > opt1 and opt2 > opt3: + i += 1 + elif opt3 > opt2 and opt3 > opt1: + j += 1 + else: + # All three options are equal. So we should march the diagonal. 
+ i += 1 + j += 1 + return i-1, j-1, dp_table[i,j] + return None, None, None + +def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): + """ + Computes the LCS for every pair of circuits A,B in circuit_list + """ + best_subsequences = {} + best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) + curr_best = 0 + for i, cir0 in enumerate(circuit_list): + if len(cir0) >= curr_best: + # Could be the best. + for j in range(i+1, len(circuit_list)): + cir1 = circuit_list[j] + if len(cir1) >= curr_best: + table = _lcs_dp_version(cir0, cir1) + best_lengths[i,j] = table[0,0] + best_subsequences[(i,j)] = locate_sequences_in_AB(cir0, cir1, table) + curr_best = max(best_lengths[i,j], curr_best) + else: + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + else: + # Skipped because cannot be the best yet. + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + return best_lengths, best_subsequences + + +def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, list[int]]]: + """ + Compute the longest common subsequence within a single circuit A. + + Returns: + --------- + int - length of longest common subsequences within A + dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. 
+ """ + n = len(A) + best = 0 + best_ind = {} + changed = False + for w in range(1, int(_np.floor(n / 2) + 1)): + for sp in range(n - w): + window = A[sp: sp + w] + for match in range(sp+ w, n-w + 1): + if A[match: match + w] == window: + if best == w: + if tuple(window) in best_ind: + best_ind[tuple(window)].add(match) + else: + best_ind[tuple(window)] = {sp, match} + else: + best_ind = {tuple(window): {sp, match}} + changed = True + best = w + if not changed: + return best, best_ind + return best, best_ind + +def build_internal_tables(circuit_list): + """ + Compute all the longest common internal sequences for each circuit A in circuit_list + """ + + C = len(circuit_list) + the_table = _np.zeros(C) + seq_table = [[] for _ in range(C)] + + curr_best = 1 + for i in range(C): + if len(circuit_list[i]) >= curr_best: + the_table[i], seq_table[i] = _longest_common_internal_subsequence(circuit_list[i]) + curr_best = max(curr_best, the_table[i]) + return the_table, seq_table + +def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: + """ + Add in explicit idles to the labels for each layer. + """ + + tmp = circuit.copy(editable=True) + num_layers = circuit.num_layers + + for i in range(num_layers): + tmp[i] = Label(tmp.layer_label_with_idles(i, idle_gate_name)) + + if tmp._static: + tmp.done_editing() + return tmp + + +def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: + """ + Returns + -------- + Dictionary mapping qubit number to lane number in the circuit. 
+ """ + + qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(num_qubits)} + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + for op in layer: + qubits_used = op.qubits + for qb in qubits_used: + qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) + + lanes = {} + lan_num = 0 + visited: dict[int, int] = {} + def reachable_nodes(starting_point: int, graph_qubits_to_neighbors: dict[int, set[int]], visited: dict[int, set[int]]): + """ + Find which nodes are reachable from this starting point. + """ + if starting_point in visited: + return visited[starting_point] + else: + assert starting_point in graph_qubits_to_neighbors + visited[starting_point] = graph_qubits_to_neighbors[starting_point] + output = set(visited[starting_point]) + for child in graph_qubits_to_neighbors[starting_point]: + if child != starting_point: + output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) + visited[starting_point] = output + return output + + available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) + while available_starting_points: + sp = available_starting_points[0] + nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) + for node in nodes: + available_starting_points.remove(node) + lanes[lan_num] = nodes + lan_num += 1 + + def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: + """ + Determine a mapping from qubit to the lane it is in for this specific circuit. + """ + out = {} + for key, val in lanes_to_qubits.items(): + for qb in val: + out[qb] = key + return out + + return compute_qubits_to_lanes(lanes), lanes + + + +def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: + """ + Split a circuit into multiple subcircuits which do not talk across lanes. 
def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]:
    """
    Split a circuit into multiple subcircuits which do not talk across lanes.

    Parameters
    ----------
    circuit
        The circuit to split. Every qubit is expected to carry an explicit
        operation in every layer (idles included).
    qubits_to_lanes
        Mapping from qubit index to the lane (non-interacting partition) that
        qubit belongs to.

    Returns
    -------
    list of lists
        Entry ``k`` is the sequence of layer labels (one ``LabelTupTup`` per
        layer) acting on lane ``k`` alone.
    """

    num_lanes = len(set(qubits_to_lanes.values()))
    lanes_to_gates = [[] for _ in range(num_lanes)]

    num_layers = circuit.num_layers
    for layer_ind in range(num_layers):
        layer = circuit.layer(layer_ind)

        # Bucket this layer's operations by lane. The previous implementation
        # sorted the layer by first qubit and emitted a new group every time the
        # lane changed; when the qubits of two lanes interleave numerically
        # (e.g. lane {0, 2} vs lane {1}) a single lane received two partial
        # groups for one layer, yielding wrong-dimension layer labels. Bucketing
        # guarantees exactly one group per (layer, lane) pair.
        ops_by_lane: dict[int, list] = {}
        for op in layer:
            # All qubits of an op share a lane by construction of the lane
            # mapping, so the first qubit identifies the lane. Qubits are
            # assumed to be indexed numerically, not by strings.
            lane = qubits_to_lanes[op.qubits[0]]
            ops_by_lane.setdefault(lane, []).append(op)

        for lane, ops in ops_by_lane.items():
            # Keep ops ordered by qubit number within the lane, matching the
            # ordering produced by the original sorted-layer traversal.
            ops.sort(key=lambda x: x.qubits[0])
            lanes_to_gates[lane].append(LabelTupTup(tuple(ops)))

    return lanes_to_gates
def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit],
                                            implicit_idle_gate_name: str = "I") -> tuple[list, list[dict[int, set[int]]]]:
    """
    Split a circuit list into a list of subcircuits by lanes. These lanes are
    non-interacting partitions of a circuit.

    Parameters
    ----------
    circuit_list
        The circuits to split.
    implicit_idle_gate_name
        If truthy, implicit idles in each layer are first made explicit using
        this gate name before the lanes are computed.

    Returns
    -------
    tuple
        ``(subcircuits, qubits_used_in_each_lane)``: ``subcircuits`` is the
        concatenation of every circuit's per-lane subcircuits, and
        ``qubits_used_in_each_lane[i]`` maps each lane number of circuit ``i``
        to the set of qubits in that lane.
    """
    # NOTE: the original annotation promised a third return value (a list of
    # per-circuit lane counts) that was never produced; the annotation and
    # docstring now match the actual two-element return.

    output = []
    qubits_used_in_each_lane = []

    for cir in circuit_list:

        if implicit_idle_gate_name:
            # Make idles explicit so every qubit appears in every layer.
            cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name)

        qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines)
        sub_cirs = _compute_subcircuits(cir, qubits_to_lane)

        output.extend(sub_cirs)
        qubits_used_in_each_lane.append(lanes_to_qubits)
    return output, qubits_used_in_each_lane
+ + i = 0 + while max_rounds > 1: + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + i += 1 + external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) + internal_matches = build_internal_tables(new_circuit_list) + + max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + + self.circuit_list = new_circuit_list + self.cache = cache + self.num_circuits = C + self.qubits_used_in_each_lane = qubits_used_in_each_lane + + self.sequence_intro = sequence_intro + + swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], + + [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 
0.00000000e+00,0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, + 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, + 0.00000000e+00,0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00], + + [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, + 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00], + + [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 
0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, + 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) + + self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") + + + + + self.cache_ind_to_num_qubits_needed = {} + offset = 0 + for i in range(num_full_circuits): + for j in qubits_used_in_each_lane[i]: + cache_ind = offset + j + self.set_cache_sizes(cache_ind, len(qubits_used_in_each_lane[i][j])) + + offset += len(qubits_used_in_each_lane[i]) + + self.tensor_contraction_orders_by_circuit = {} + self.tensor_contraction_order_cache = {} + self.qubit_list_cache = {} + + for i in range(num_full_circuits): + self.qubit_list_cache[i] = [qubits_used_in_each_lane[i][k] for k in sorted(qubits_used_in_each_lane[i])] + self.tensor_contraction_orders_by_circuit[i] = self.best_order_for_tensor_contraction(tuple(self.qubit_list_cache[i])) + + + # for val in cache.values(): + # num_terms = len(val) + # self._best_order_for_first_derivative(num_terms) + + def set_cache_sizes(self, cache_ind: int, num_qubits: int): + """ + Set the size to use the number of qubits specified. + """ + if cache_ind in self.cache_ind_to_num_qubits_needed: + return # we have already set them all. 
+ self.cache_ind_to_num_qubits_needed[cache_ind] = num_qubits + for child in self.cache: + if isinstance(child, int): + self.set_cache_sizes(child, num_qubits) + + def collapse_circuits_to_process_matrices(self, model): + """ + Compute the total product cache. Note that this may still have a tensor product + structure that the operator needs to combine again if they want to have the full 'dense' matrix. + """ + + + round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + saved: dict[int, _LinearOperator] = {} + + def look_up_operations(model, opTuple) -> _LinearOperator: + + if hasattr(model, "operations"): + return model.operations[opTuple].to_dense() + elif hasattr(model, "operation_blks"): + return model.operation_blks[opTuple].to_dense() + else: + raise ValueError("Missing attribute") + + for key in round_keys: + for cind in self.sequence_intro[key]: + cumulative_term = None + for term in self.cache[cind]: + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ (cumulative_term) + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = look_up_operations(model, op) + # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) + op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = look_up_operations(model, op) + else: + op_term = look_up_operations(model, op) + + val = _np.kron(val, op_term) + #val = model.operation_blks["gates"][term[0]].to_dense() + if cumulative_term is None: + cumulative_term = val + else: + cumulative_term = val @ (cumulative_term) + if cumulative_term is None: + saved[cind] = _np.eye(4**self.cache_ind_to_num_qubits_needed[cind]) # identity of the appropriate size. + else: + saved[cind] = cumulative_term + if __debug__: + # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. + for key in self.cache: + assert key in saved + + return saved + + def reconstruct_full_matrices(self, process_matrices_cache): + + output = [] + start_pos = 0 + for cir_ind in range(len(self.qubits_used_in_each_lane)): + lane_circuits = [] + for i in range(self.qubits_used_in_each_lane[cir_ind]): + lane_circuits.append(process_matrices_cache[start_pos + i]) + output.append(lane_circuits) + start_pos += self.inds_needed_to_reconstruct[cir_ind] + + # Now we will do the contraction. + for cir_ind in range(len(self.inds_needed_to_reconstruct)): + + order = self.tensor_contraction_orders_by_circuit[cir_ind] + + while order: + sp = order[0] + if len(output[cir_ind][sp]) == 0: + breakpoint() + output[cir_ind][sp] = _np.kron(output[cir_ind][sp], output[cir_ind][sp+1]) + output[cir_ind][sp+1:] = output[cir_ind][sp+2:] + + # Adjust future indices + tmp = [] + for new_val in order[1:]: + tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) + order = tmp + + output[cir_ind] = output[cir_ind][0] + assert output[cir_ind].shape == (256, 256) + return output + # def compute_derivatives_using_cache(self, model, productCache): + # """ + # We are interested in computing the derivative of the probabilities specified by a model + # and the cached circuit list against the model parameters. 
We will assume that the model can take a + # derivative with respect to a single gate operation. However, we need to handle the product rule. + # """ + + # productCache = self.fill_out_circuit_cache(model) + + # round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + # saved = {} + + # product_rule_cache: dict[int, list[int]] = {} + # for key in round_keys: + # for cind in self.sequence_intro[key]: + + + + # cumulative_term = None + # for term in self.cache[cind]: + # if isinstance(term, int) and cumulative_term is None: + # # look up result. + # cumulative_term = saved[term] + # elif isinstance(term, int) and not (cumulative_term is None): + # cumulative_term = saved[term] @ cumulative_term + # elif isinstance(term, LabelTupTup): + # val = 1 + # for op in term: + # op_term = 1 + # if op.num_qubits == 2: + # # We may need to do swaps. + # if op in saved: + # op_term = saved[op] + # elif op.qubits[1] < op.qubits[0]: + # # This is in the wrong order. + # swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + # op_term = model.operation_blks["gates"][op].to_dense() + # op_term = swap_term @ op_term @ swap_term.T + # saved[op] = op_term # Save so we only need to this operation once. 
+ # else: + # op_term = model.operation_blks["gates"][op].to_dense() + # else: + # op_term = model.operation_blks["gates"][op].to_dense() + # val = np.kron(val, op_term) + # #val = model.operation_blks["gates"][term[0]].to_dense() + # if cumulative_term is None: + # cumulative_term = val + # else: + # cumulative_term = val @ cumulative_term + # saved[cind] = cumulative_term + # return saved + + def cache_num_to_matrix_size(self, ind, output_cache): + if ind in output_cache: + return output_cache[ind] + else: + if ind not in self.cache: + assert ind in self.cache + children = self.cache[ind] + answer = 0 + for child in children: + if isinstance(child, Label): + lbls = child.num_qubits + sub_probanswer = lbls + else: + sub_probanswer = self.cache_num_to_matrix_size(child, output_cache) + answer = max(answer, sub_probanswer) + output_cache[ind] = answer + return answer + + + def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): + + + if qubit_list in self.tensor_contraction_order_cache: + return self.tensor_contraction_order_cache[qubit_list] + + best_cost = _np.inf + best_order = [] + + for order in itertools.permutations(range(len(qubit_list)-1), len(qubit_list)-1): + + my_list = [qb for qb in qubit_list] # force deep copy. + my_starting_points = [sp for sp in order] + cost = 0 + early_exit = False + while my_starting_points and not early_exit: + sp = my_starting_points.pop(0) + + cost += self._tensor_cost_model(my_list[sp], my_list[sp+1]) + if cost <= best_cost: + # modify sp for future. + tmp = [] + for new_val in my_starting_points: + tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) + my_starting_points = tmp + + q2 = my_list.pop(sp+1) + my_list[sp] += q2 + else: + early_exit = True # This round is done because the partial sum was too big. + + if cost < best_cost and not early_exit: + best_cost = cost + best_order = list(order) + + # Store off the information. 
+ self.tensor_contraction_order_cache[qubit_list] = best_order + + return best_order + + def _tensor_cost_model(self, num_qubits1, num_qubits2): + """ + Assumes kronecker product of 2 square matrices. + """ + + return (4**num_qubits1)**2 * (4**num_qubits2)**2 + + """ + def _evaluate_product_rule(self, cind: int, rn: int): + + sequence = self.cache[cind] + num_terms = len(sequence) + sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] + + for sub_r in sorted(sub_rounds.keys())[::-1]: + sub_sequence = None + for sub_cind in sub_rounds[sub_r]: + + for term in sub_tree_cache[sub_cind]: + if isinstance(term, tuple): + # Then, this may be a partial derivative or an character in original sequence. + if len(term) == 2: + # Then this is taking a partial derivative. + natural_term = term[1][0] + if natural_term in self.derivative_cache: + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + else: + # This should be a natural derivative. + self.derivative_cache[natural_term] = term.deriv_wrt_params(None) + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + + # It is just an index to sequence for where to look in the cache. + next_ind = term[0] + sequence_val = sequence[next_ind] + + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ cumulative_term + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + op_term = model.operation_blks["gates"][op].to_dense() + op_term = swap_term @ op_term @ swap_term.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = model.operation_blks["gates"][op].to_dense() + else: + op_term = model.operation_blks["gates"][op].to_dense() + val = _np.kron(val, op_term) + #val = model.operation_blks["gates"][term[0]].to_dense() + if cumulative_term is None: + cumulative_term = val + else: + cumulative_term = val @ cumulative_term + """ + diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 7642978e5..f3c2c8e85 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -17,10 +17,13 @@ from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout from pygsti.layouts.distlayout import _DistributableAtom from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations from pygsti.circuits.circuitlist import CircuitList as _CircuitList from pygsti.tools import listtools as _lt from pygsti.tools import slicetools as _slct +NICK_USE_OLD_EVAL_TREE = False class _MatrixCOPALayoutAtom(_DistributableAtom): """ @@ -138,6 +141,7 @@ def add_expanded_circuits(indices, add_to_this_dict): if double_expanded_ckt is None: #Fall back to standard behavior and do expansion. 
double_expanded_ckt = cir.expand_subcircuits() double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt + self.tree = _EvalTree.create(double_expanded_nospam_circuits_plus_scratch) #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) @@ -151,7 +155,213 @@ def add_expanded_circuits(indices, add_to_this_dict): tree_indices_by_spamtuple = dict() # "tree" indices index expanded_nospam_circuits for i, c in expanded_nospam_circuits.items(): for spam_tuple in expanded_nospam_circuit_outcomes[c].keys(): - if spam_tuple not in tree_indices_by_spamtuple: tree_indices_by_spamtuple[spam_tuple] = [] + if spam_tuple not in tree_indices_by_spamtuple: + tree_indices_by_spamtuple[spam_tuple] = [] + tree_indices_by_spamtuple[spam_tuple].append(i) + + #Assign element indices, starting at `offset` + # now that we know how many of each spamtuple there are, assign final element indices. + local_offset = 0 + self.indices_by_spamtuple = dict() # values are (element_indices, tree_indices) tuples. + for spam_tuple, tree_indices in tree_indices_by_spamtuple.items(): + self.indices_by_spamtuple[spam_tuple] = (slice(local_offset, local_offset + len(tree_indices)), + _slct.list_to_slice(tree_indices, array_ok=True)) + local_offset += len(tree_indices) + #TODO: allow tree_indices to be None or a slice? 
+ + element_slice = None # slice(offset, offset + local_offset) # *global* (of parent layout) element-index slice + num_elements = local_offset + + elindex_outcome_tuples = {unique_i: list() for unique_i in range(len(unique_complete_circuits))} + + for spam_tuple, (element_indices, tree_indices) in self.indices_by_spamtuple.items(): + for elindex, tree_index in zip(_slct.indices(element_indices), _slct.to_array(tree_indices)): + outcome_by_spamtuple = expanded_nospam_circuit_outcomes[expanded_nospam_circuits[tree_index]] + outcome, unique_is = outcome_by_spamtuple[spam_tuple] + for unique_i in unique_is: + elindex_outcome_tuples[unique_i].append((elindex, outcome)) # *local* element indices + self.elindex_outcome_tuples = elindex_outcome_tuples + + super().__init__(element_slice, num_elements) + + def nonscratch_cache_view(self, a, axis=None): + """ + Create a view of array `a` restricting it to only the *final* results computed by this tree. + + This need not be the entire array because there could be intermediate results + (e.g. "scratch space") that are excluded. + + Parameters + ---------- + a : ndarray + An array of results computed using this EvalTree, + such that the `axis`-th dimension equals the full + length of the tree. The other dimensions of `a` are + unrestricted. + + axis : int, optional + Specified the axis along which the selection of the + final elements is performed. If None, than this + selection if performed on flattened `a`. + + Returns + ------- + ndarray + Of the same shape as `a`, except for along the + specified axis, whose dimension has been reduced + to filter out the intermediate (non-final) results. 
+ """ + if axis is None: + return a[0:self._num_nonscratch_tree_items] + else: + sl = [slice(None)] * a.ndim + sl[axis] = slice(0, self._num_nonscratch_tree_items) + ret = a[tuple(sl)] + assert(ret.base is a or ret.base is a.base) # check that what is returned is a view + assert(ret.size == 0 or _np.may_share_memory(ret, a)) + return ret + + @property + def cache_size(self): + """The cache size of this atom.""" + return len(self.tree) + + +class _MatrixCOPALayoutAtomWithLCS(_DistributableAtom): + """ + The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`. + + Parameters + ---------- + unique_complete_circuits : list + A list that contains *all* the "complete" circuits for the parent layout. This + atom only owns a subset of these, as given by `group` below. + + unique_nospam_circuits : list + A list that contains the unique circuits within `unique_complete_circuits` once + their state preparations and measurements are removed. A subset of these circuits + (see `group` below) are what fundamentally define the circuit outcomes that this atom + includes: it includes *all* the circuit outcomes of those circuits. + + circuits_by_unique_nospam_circuits : dict + A dictionary with keys equal to the elements of `unique_nospam_circuits` and values + that are lists of indices into `unique_complete_circuits`. Thus, this dictionary + maps each distinct circuit-without-SPAM circuit to the list of complete circuits + within `unique_complete_circuits` that correspond to it. + + ds_circuits : list + A list of circuits parallel to `unique_complete_circuits` of these circuits + as they should be accessed from `dataset`. This applies any aliases and + removes implied SPAM elements relative to `unique_complete_circuits`. + + group : set + The set of indices into `unique_nospam_circuits` that define the circuit + outcomes owned by this atom. 
+ + helpful_scratch : set + A set of indices into `unique_nospam_circuits` that specify circuits that + aren't owned by this atom but are helpful in building up an efficient evaluation + tree. + + model : Model + The model being used to construct this layout. Used for expanding instruments + within the circuits. + + unique_circuits : list of Circuits + A list of the unique :class:`Circuit` objects representing the circuits this layout will include. + + dataset : DataSet + The dataset, used to include only observed circuit outcomes in this atom + and therefore the parent layout. + """ + + def __init__(self, unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, + ds_circuits, group, helpful_scratch, model, unique_circuits, dataset=None, expanded_and_separated_circuit_cache=None, + double_expanded_nospam_circuits_cache = None, implicit_idle_gate = None): + + if expanded_and_separated_circuit_cache is None: + expanded_and_separated_circuit_cache = dict() + + #Note: group gives unique_nospam_circuits indices, which circuits_by_unique_nospam_circuits + # turns into "unique complete circuit" indices, which the layout via it's to_unique can map + # to original circuit indices. + def add_expanded_circuits(indices, add_to_this_dict): + _expanded_nospam_circuit_outcomes = add_to_this_dict + for i in indices: + nospam_c = unique_nospam_circuits[i] + for unique_i in circuits_by_unique_nospam_circuits[nospam_c]: # "unique" circuits: add SPAM to nospam_c + #the cache is indexed into using the (potentially) incomplete circuits + expc_outcomes = expanded_and_separated_circuit_cache.get(unique_circuits[unique_i], None) + if expc_outcomes is None: #fall back on original non-cache behavior. + observed_outcomes = None if (dataset is None) else dataset[ds_circuits[unique_i]].unique_outcomes + expc_outcomes = model.expand_instruments_and_separate_povm(unique_complete_circuits[unique_i], observed_outcomes) + #and add this new value to the cache. 
+ expanded_and_separated_circuit_cache[unique_circuits[unique_i]] = expc_outcomes + for sep_povm_c, outcomes in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit + prep_lbl = sep_povm_c.circuit_without_povm[0] + exp_nospam_c = sep_povm_c.circuit_without_povm[1:] # sep_povm_c *always* has prep lbl + spam_tuples = [(prep_lbl, elabel) for elabel in sep_povm_c.full_effect_labels] + outcome_by_spamtuple = {st:outcome for st, outcome in zip(spam_tuples, outcomes)} + + #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s + # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to + # keep track of a list of unique_i indices for each circut and spam tuple below. + if exp_nospam_c not in _expanded_nospam_circuit_outcomes: + _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)} + else: + for st, outcome in outcome_by_spamtuple.items(): + if st in _expanded_nospam_circuit_outcomes[exp_nospam_c]: + existing_outcome, existing_unique_is = \ + _expanded_nospam_circuit_outcomes[exp_nospam_c][st] + assert(existing_outcome == outcome), "Outcome should be same when spam tuples are!" + assert(unique_i not in existing_unique_is) # SLOW - remove? + existing_unique_is.append(unique_i) + else: + _expanded_nospam_circuit_outcomes[exp_nospam_c][st] = (outcome, [unique_i]) + + # keys = expanded circuits w/out SPAM layers; values = spamtuple => (outcome, unique_is) dictionary that + # keeps track of which "unique" circuit indices having each spamtuple / outcome. + expanded_nospam_circuit_outcomes = dict() + add_expanded_circuits(group, expanded_nospam_circuit_outcomes) + expanded_nospam_circuits = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes.keys())} + + # add suggested scratch to the "final" elements as far as the tree creation is concerned + # - this allows these scratch element to help balance the tree. 
+ if helpful_scratch: + expanded_nospam_circuit_outcomes_plus_scratch = expanded_nospam_circuit_outcomes.copy() + add_expanded_circuits(helpful_scratch, expanded_nospam_circuit_outcomes_plus_scratch) + expanded_nospam_circuits_plus_scratch = {i:cir for i, cir in enumerate(expanded_nospam_circuit_outcomes_plus_scratch.keys())} + else: + expanded_nospam_circuits_plus_scratch = expanded_nospam_circuits.copy() + + if double_expanded_nospam_circuits_cache is None: + double_expanded_nospam_circuits_cache = dict() + double_expanded_nospam_circuits_plus_scratch = dict() + for i, cir in expanded_nospam_circuits_plus_scratch.items(): + # expand sub-circuits for a more efficient tree + double_expanded_ckt = double_expanded_nospam_circuits_cache.get(cir, None) + if double_expanded_ckt is None: #Fall back to standard behavior and do expansion. + double_expanded_ckt = cir.expand_subcircuits() + double_expanded_nospam_circuits_plus_scratch[i] = double_expanded_ckt + + vals = list(double_expanded_nospam_circuits_plus_scratch.values()) + + circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) + self.tree = _EvalTreeLCS(circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree) + #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) + + self._num_nonscratch_tree_items = len(expanded_nospam_circuits) # put this in EvalTree? + + # self.tree's elements give instructions for evaluating ("caching") no-spam quantities (e.g. products). + # Now we assign final element indices to the circuit outcomes corresponding to a given no-spam ("tree") + # quantity plus a spam-tuple. We order the final indices so that all the outcomes corresponding to a + # given spam-tuple are contiguous. 
+ + tree_indices_by_spamtuple = dict() # "tree" indices index expanded_nospam_circuits + for i, c in expanded_nospam_circuits.items(): + for spam_tuple in expanded_nospam_circuit_outcomes[c].keys(): + if spam_tuple not in tree_indices_by_spamtuple: + tree_indices_by_spamtuple[spam_tuple] = [] tree_indices_by_spamtuple[spam_tuple].append(i) #Assign element indices, starting at `offset` @@ -286,7 +496,7 @@ class MatrixCOPALayout(_DistributableCOPALayout): circuits. I.e. circuits with prep labels and POVM labels appended. """ - def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=1, + def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=2, num_param_dimension_processors=(), param_dimensions=(), param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, layout_creation_circuit_cache = None): @@ -368,13 +578,26 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p def _create_atom(args): group, helpful_scratch_group = args - return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, + if NICK_USE_OLD_EVAL_TREE: + return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, unique_circuits, dataset, self.expanded_and_separated_circuits_cache, self.expanded_subcircuits_no_spam_cache) + gatename = None + if hasattr(model._layer_rules, "_singleq_idle_layer_labels"): + keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) + if model._layer_rules.implicit_idle_mode == "pad_1Q": + gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name + return _MatrixCOPALayoutAtomWithLCS(unique_complete_circuits, unique_nospam_circuits, + circuits_by_unique_nospam_circuits, ds_circuits, + group, helpful_scratch_group, model, + unique_circuits, dataset, + self.expanded_and_separated_circuits_cache, + 
self.expanded_subcircuits_no_spam_cache, implicit_idle_gate=gatename) + super().__init__(circuits, unique_circuits, to_unique, unique_complete_circuits, _create_atom, list(zip(groups, helpful_scratch)), num_tree_processors, num_param_dimension_processors, param_dimensions, diff --git a/pygsti/models/modelconstruction.py b/pygsti/models/modelconstruction.py index df4cfc879..57f6334f1 100644 --- a/pygsti/models/modelconstruction.py +++ b/pygsti/models/modelconstruction.py @@ -37,6 +37,7 @@ from pygsti.models import gaugegroup as _gg from pygsti.models.localnoisemodel import LocalNoiseModel as _LocalNoiseModel from pygsti.models.cloudnoisemodel import CloudNoiseModel as _CloudNoiseModel +from pygsti.models.singlequbitequivalencerules import EquivalentClassesLocalNoiseModel as _EquivalentClassesLocalNoiseModel from pygsti.baseobjs import label as _label from pygsti.baseobjs import statespace as _statespace from pygsti.baseobjs.basis import Basis as _Basis @@ -1524,6 +1525,52 @@ def _setup_local_gates(processor_spec, evotype, modelnoise=None, custom_gates=No if (noiseop is not None) else ideal_factory return gatedict +def _create_crosstalk_free_model_with_equivalent_clases(qudit_to_spatially_equivalent_qudit ,processor_spec, modelnoise, custom_gates=None, evotype="default", simulator="auto", + on_construction_error='raise', independent_gates=False, independent_spam=True, + ensure_composed_gates=False, ideal_gate_type='auto', ideal_prep_type='auto', + ideal_povm_type='auto', implicit_idle_mode='none', basis='pp') -> _EquivalentClassesLocalNoiseModel: + """ + Create a n-qudit "crosstalk-free" model while assuming that certain qudits are spatially equivalent for 1 qudit gates. + + Similar to :meth:`create_crosstalk_free_model` but the noise is input more generally, + as a :class:`ModelNoise` object. Arguments are the same as this function except that + `modelnoise` is given instead of several more specific noise-describing arguments. 
+ + Returns + ------- + EquivalentClassesLocalNoiseModel + """ + + qudit_labels = processor_spec.qudit_labels + state_space = _statespace.QubitSpace(qudit_labels) if all([udim == 2 for udim in processor_spec.qudit_udims]) \ + else _statespace.QuditSpace(qudit_labels, processor_spec.qudit_udims) + evotype = _Evotype.cast(evotype, state_space=state_space) + modelnoise = _OpModelNoise.cast(modelnoise) + modelnoise.reset_access_counters() + + if ideal_gate_type == "auto": + ideal_gate_type = ('static standard', 'static clifford', 'static unitary') + if ideal_prep_type == "auto": + ideal_prep_type = _state.state_type_from_op_type(ideal_gate_type) + if ideal_povm_type == "auto": + ideal_povm_type = _povm.povm_type_from_op_type(ideal_gate_type) + + gatedict = _setup_local_gates(processor_spec, evotype, modelnoise, custom_gates, ideal_gate_type, basis) + + # (Note: global idle is now handled through processor-spec processing) + + # SPAM: + local_noise = True + prep_layers, povm_layers = _create_spam_layers(processor_spec, modelnoise, local_noise, + ideal_prep_type, ideal_povm_type, evotype, + state_space, independent_spam, basis) + + modelnoise.warn_about_zero_counters() + return _EquivalentClassesLocalNoiseModel(qudit_to_spatially_equivalent_qudit, processor_spec, gatedict, prep_layers, povm_layers, + evotype, simulator, on_construction_error, + independent_gates, ensure_composed_gates, + implicit_idle_mode) + def create_crosstalk_free_model(processor_spec, custom_gates=None, depolarization_strengths=None, stochastic_error_probs=None, lindblad_error_coeffs=None, @@ -1532,7 +1579,7 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None, evotype="default", simulator="auto", on_construction_error='raise', independent_gates=False, independent_spam=True, ensure_composed_gates=False, ideal_gate_type='auto', ideal_spam_type='computational', implicit_idle_mode='none', - basis='pp'): + basis='pp', qudit_to_equivalent_qudit:dict[int, int] = None): """ Create a n-qudit 
"crosstalk-free" model. @@ -1678,6 +1725,12 @@ def create_crosstalk_free_model(processor_spec, custom_gates=None, depolarization_parameterization, stochastic_parameterization, lindblad_parameterization, allow_nonlocal=False) + if qudit_to_equivalent_qudit: + return _create_crosstalk_free_model_with_equivalent_clases(qudit_to_equivalent_qudit, processor_spec, modelnoise, custom_gates, evotype, + simulator, on_construction_error, independent_gates, independent_spam, + ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type, implicit_idle_mode, basis) + + return _create_crosstalk_free_model(processor_spec, modelnoise, custom_gates, evotype, simulator, on_construction_error, independent_gates, independent_spam, ensure_composed_gates, ideal_gate_type, ideal_spam_type, ideal_spam_type, diff --git a/pygsti/models/singlequbitequivalencerules.py b/pygsti/models/singlequbitequivalencerules.py new file mode 100644 index 000000000..065b60942 --- /dev/null +++ b/pygsti/models/singlequbitequivalencerules.py @@ -0,0 +1,115 @@ +from pygsti.models.localnoisemodel import _SimpleCompLayerRules, LocalNoiseModel as _LocalNoiseModel +from pygsti.baseobjs.label import Label, LabelTup, LabelTupTup +from pygsti.modelmembers.operations import opfactory as _opfactory + + + +class SingleQuditGateEquivalenceClassesLayerRules(_SimpleCompLayerRules): + """ + Submodel which assumes that you have a set of qubits for which you trust the action of a single + qubit gate equally for all qubits within the set. + """ + + def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label): + + super().__init__(qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label) + + def operation_layer_operator(self, model, layerlbl: Label, caches): + """ + Create the operator corresponding to `layerlbl`. + + Parameters + ---------- + layerlbl : Label + A circuit layer label. 
+ + Returns + ------- + LinearOperator + """ + + if layerlbl in caches['complete-layers']: + return caches['complete-layers'][layerlbl] + + if isinstance(layerlbl, LabelTupTup): + # This could be a multiple qubit gate or multiple single qubit gates. + + group = [] + changed = False + + for op in layerlbl: + assert isinstance(op, LabelTup) + qubits_used = op.qubits + if op.num_qubits == 1: + if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]: + new_label = Label(op.name, model._qubits_to_equiv_qubit[qubits_used[0]], op.time, *op.args) + + changed = True + group.append(new_label) + else: + group.append(op) + else: + group.append(op) + + if changed: + new_args = None if layerlbl.args == () else layerlbl.args + new_time = 0.0 if layerlbl.time == None else layerlbl.time + new_label = Label(group) + else: + new_label = layerlbl + + # Get the operator + if new_label in caches['complete-layers']: + caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label] + return caches['complete-layers'][new_label] + else: + + answer = super().operation_layer_operator(model, new_label, caches) + caches['complete-layers'][new_label] = answer + caches['complete-layers'][layerlbl] = answer + return answer + + + elif isinstance(layerlbl, LabelTup): + + qubits_used = layerlbl.qubits + if layerlbl.num_qubits == 1: + if model._qubits_to_equiv_qubit[qubits_used[0]] != qubits_used[0]: + new_args = None if layerlbl.args == () else layerlbl.args + new_time = 0.0 if layerlbl.time == None else layerlbl.time + new_label = Label(layerlbl.name, model._qubits_to_equiv_qubit[qubits_used[0]], new_time, new_args) + + # Get the operator + if new_label in caches['complete-layers']: + caches['complete-layers'][layerlbl] = caches['complete-layers'][new_label] + return caches['complete-layers'][new_label] + else: + + answer = super().operation_layer_operator(model, new_label, caches) + caches['complete-layer'][new_label] = answer + caches['complete-layer'][layerlbl] = answer + 
return answer + + return super().operation_layer_operator(model, layerlbl, caches) + + +class EquivalentClassesLocalNoiseModel(_LocalNoiseModel): + + def __init__(self, qubit_to_equivalent_qubit_for_single_qgates: dict, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default", + simulator="auto", on_construction_error='raise', + independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none"): + + + super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator, + on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode) + + # Now we need to reset the layer rules to use the Equivalent class rules. + + old_rules = self._layer_rules + + new_rules = SingleQuditGateEquivalenceClassesLayerRules( old_rules.qubit_labels, old_rules.implicit_idle_mode, + old_rules.single_qubit_idle_layer_labels, old_rules.global_idle_layer_label) + + self._layer_rules = new_rules + self._qubits_to_equiv_qubit = qubit_to_equivalent_qubit_for_single_qgates + self._reinit_opcaches() # Clear the caches for using the new rules. From e8be458a29798848d5fe9d56d2bb4de3b8f558e4 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 11:39:35 -0700 Subject: [PATCH 060/141] Lanes collapsed by looking through the qubits used and building up a layer, then combining the layers. 
--- pygsti/forwardsims/mapforwardsim.py | 4 + pygsti/forwardsims/matrixforwardsim.py | 20 +- pygsti/layouts/evaltree.py | 563 +++++++++++++++---------- pygsti/layouts/matrixlayout.py | 14 +- 4 files changed, 375 insertions(+), 226 deletions(-) diff --git a/pygsti/forwardsims/mapforwardsim.py b/pygsti/forwardsims/mapforwardsim.py index 81ccf917c..7d59fb9ee 100644 --- a/pygsti/forwardsims/mapforwardsim.py +++ b/pygsti/forwardsims/mapforwardsim.py @@ -15,6 +15,7 @@ import numpy as _np from numpy import linalg as _nla +import time from pygsti.forwardsims.distforwardsim import DistributableForwardSimulator as _DistributableForwardSimulator from pygsti.forwardsims.forwardsim import ForwardSimulator as _ForwardSimulator @@ -360,8 +361,11 @@ def create_copa_layout_circuit_cache(circuits, model, dataset=None): def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim) + start_time = time.time() self.calclib.mapfill_probs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), # all indices layout_atom, resource_alloc) + end_time = time.time() + print("Time to compute forward probs with map Forward after fixed layout (s): ", end_time - start_time) def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index bf02f9999..fb72900cc 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -10,6 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. 
#*************************************************************************************************** +import time import collections as _collections import time as _time import warnings as _warnings @@ -3776,10 +3777,19 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto # Overestimate the amount of cache usage by assuming everything is the same size. dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache + # resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache - prodCache = layout_atom.tree.collapse_circuits_to_process_matrices(self.model) - Gs = layout_atom.tree.reconstruct_full_matrices(prodCache) + starttime =time.time() + layout_atom.tree.collapse_circuits_to_process_matrices(self.model) + endtime = time.time() + + print("Time to collapse the process matrices (s): ", endtime - starttime) + starttime = time.time() + Gs = layout_atom.tree.reconstruct_full_matrices() + endtime = time.time() + print("Time to reconstruct the whole matrices (s): ", endtime - starttime) + + starttime = time.time() old_err = _np.seterr(over='ignore') for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): # "element indices" index a circuit outcome probability in array_to_fill's first dimension @@ -3790,6 +3800,8 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto _fas(array_to_fill, [element_indices], self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) _np.seterr(**old_err) + endtime = time.time() + print("Time to complete the spam operations (s): ", endtime - starttime) def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): @@ -3816,7 +3828,7 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ iFinal = iParamToFinal[i] vec = 
orig_vec.copy(); vec[i] += eps self.model.from_vector(vec, close=True) - self._bulk_fill_probs_atom(probs2, layout_atom) + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) array_to_fill[:, iFinal] = (probs2 - probs) / eps def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 9b07ffa76..5f6f5fff9 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -10,6 +10,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** +from __future__ import annotations import bisect as _bisect import time as _time # DEBUG TIMERS import warnings as _warnings @@ -23,6 +24,7 @@ from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools from typing import Sequence +import time @@ -613,10 +615,11 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): best_subsequences = {} best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) curr_best = 0 - for i, cir0 in enumerate(circuit_list): + for i in range(len(circuit_list)-1, -1, -1): # Lets do this in reverse order + cir0 = circuit_list[i] if len(cir0) >= curr_best: # Could be the best. 
- for j in range(i+1, len(circuit_list)): + for j in range(i-1, -1, -1): cir1 = circuit_list[j] if len(cir1) >= curr_best: table = _lcs_dp_version(cir0, cir1) @@ -790,8 +793,36 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates -def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], - implicit_idle_gate_name: str = "I") -> tuple[list[tuple],list[int], list[dict[int, int]]]: + +def _split_circuits_by_lanes(circuit_list): + # First eliminate the duplicate circuits. + + unique_circuits = [] + matching_inds: dict[int, set[int]] = {} + C = len(circuit_list) + seen_circs: dict[tuple[LabelTupTup, int]] = {} + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + for i in range(C): + my_cir = circuit_list[i] + if tuple(my_cir) in seen_circs: + cache[i] = seen_circs[tuple(my_cir)] + else: + seen_circs[tuple(my_cir)] = i + + labels_to_circuits = {} + for my_cir in seen_circs: + line_labels = _Circuit(my_cir)._line_labels + if line_labels in labels_to_circuits: + labels_to_circuits[line_labels].append(my_cir) + else: + labels_to_circuits[line_labels] = [my_cir] + + +def setup_circuit_list_for_LCS_computations( + circuit_list: list[_Circuit], + implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], + dict[tuple[_Circuit], list[tuple[int, int]]], + dict[tuple[int, ...], set[_Circuit]]]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. @@ -799,10 +830,17 @@ def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], Then, a sequence detailing the number of qubits in each lane for a circuit. """ - output = [] - qubits_used_in_each_lane = [] + # output = [] + # cir_id_to_lanes = [] + + # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. + # I need a mapping from subcircuit to actual circuit. 
This is uniquely defined by circuit_id and then lane id. - for cir in circuit_list: + sub_cir_to_cir_id_and_lane_id: dict[tuple[_Circuit], list[tuple[int, int]]] = {} + line_labels_to_circuit_list: dict[tuple[int, ...], set[_Circuit]] = {} + cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} + + for i, cir in enumerate(circuit_list): if implicit_idle_gate_name: cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) @@ -810,30 +848,54 @@ def setup_circuit_list_for_LCS_computations(circuit_list: list[_Circuit], qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) sub_cirs = _compute_subcircuits(cir, qubits_to_lane) - output.extend(sub_cirs) - qubits_used_in_each_lane.append(lanes_to_qubits) - return output, qubits_used_in_each_lane + assert len(sub_cirs) == len(lanes_to_qubits) + for j in range(len(sub_cirs)): + sc = _Circuit(sub_cirs[j]) + lbls = sc._line_labels + if lbls in line_labels_to_circuit_list: + line_labels_to_circuit_list[lbls].append(sc) + else: + line_labels_to_circuit_list[lbls] = [sc] + if sc in sub_cir_to_cir_id_and_lane_id: + sub_cir_to_cir_id_and_lane_id[sc].append((i,j)) + else: + sub_cir_to_cir_id_and_lane_id[sc] = [(i,j)] + if i in cir_ind_and_lane_id_to_sub_cir: + cir_ind_and_lane_id_to_sub_cir[i][j] = sc + else: + cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} + + # output.extend(sub_cirs) + # cir_id_to_lanes.append(lanes_to_qubits) + return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list class EvalTreeBasedUponLongestCommonSubstring(): - def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: list[dict[int, tuple[int, ...]]]): + def __init__(self, circuit_list: list[LabelTupTup]): """ Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. 
""" - assert len(qubits_used_in_each_lane) <= len(circuit_list) + self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} + external_matches = _compute_lcs_for_every_pair_of_circuits(circuit_list) + + best_external_match = _np.max(external_matches[0]) + self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} + + internal_matches = build_internal_tables(circuit_list) + best_internal_match = _np.max(internal_matches[0]) - max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + max_rounds = int(max(best_external_match,best_internal_match)) C = len(circuit_list) - num_full_circuits = len(qubits_used_in_each_lane) sequence_intro = {0: _np.arange(C)} - cache = {i: circuit_list[i] for i in range(len(circuit_list))} cache_pos = C + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. i = 0 @@ -841,18 +903,26 @@ def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: li new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) i += 1 external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) - internal_matches = build_internal_tables(new_circuit_list) - max_rounds = int(max(_np.max(external_matches[0]), _np.max(internal_matches[0]))) + if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: + # We are not going to get a better internal match. 
+ pass + else: + internal_matches = build_internal_tables(new_circuit_list) + + best_external_match = _np.max(external_matches[0]) + best_internal_match = _np.max(internal_matches[0]) + + max_rounds = int(max(best_external_match,best_internal_match)) self.circuit_list = new_circuit_list self.cache = cache self.num_circuits = C - self.qubits_used_in_each_lane = qubits_used_in_each_lane + self.from_other = False self.sequence_intro = sequence_intro - swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + self.swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32], [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], @@ -888,45 +958,56 @@ def __init__(self, circuit_list: list[LabelTupTup], qubits_used_in_each_lane: li [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) - self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - + # Assumes a perfect swap gate! + # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): + """ + Construct a tree from another tree. 
+ """ + + self.cache = other.cache + self.num_circuits = other.num_circuits + self.sequence_intro = other.sequence_intro + self.swap_gate = other.swap_gate + self.circuit_list = other.circuit_list + self.orig_circuit_list = other.orig_circuit_list + self.circuit_to_save_location = other.circuit_to_save_location + self.from_other = other + + for ind in self.cache: + for i, term in enumerate(self.cache[ind]): + if isinstance(term, int): + pass # The tree will stay the same. + elif isinstance(term, LabelTupTup): + new_term = () + for op in term: + new_qu = (qubit_label_exchange[qu] for qu in op.qubits) + new_op = (op.name, *new_qu) + new_term = (*new_term, new_op) + self.cache[ind][i] = Label(new_term) - self.cache_ind_to_num_qubits_needed = {} - offset = 0 - for i in range(num_full_circuits): - for j in qubits_used_in_each_lane[i]: - cache_ind = offset + j - self.set_cache_sizes(cache_ind, len(qubits_used_in_each_lane[i][j])) - - offset += len(qubits_used_in_each_lane[i]) - self.tensor_contraction_orders_by_circuit = {} - self.tensor_contraction_order_cache = {} - self.qubit_list_cache = {} + for icir in range(len(self.orig_circuit_list)): + self.orig_circuit_list[icir] = self.trace_through_cache_to_build_circuit(icir) - for i in range(num_full_circuits): - self.qubit_list_cache[i] = [qubits_used_in_each_lane[i][k] for k in sorted(qubits_used_in_each_lane[i])] - self.tensor_contraction_orders_by_circuit[i] = self.best_order_for_tensor_contraction(tuple(self.qubit_list_cache[i])) - + updated = {} + for cir, loc in self.circuit_to_save_location.items(): + new_cir = () + for layer in cir: + new_layer = () + for op in layer: + new_op = (op[0], *(qubit_label_exchange[qu] for qu in op[1:])) + new_layer = (*new_layer, new_op) + new_cir = (*new_cir, new_layer) + updated[new_cir] = loc + self.circuit_to_save_location = updated - # for val in cache.values(): - # num_terms = len(val) - # self._best_order_for_first_derivative(num_terms) - def set_cache_sizes(self, cache_ind: 
int, num_qubits: int): - """ - Set the size to use the number of qubits specified. - """ - if cache_ind in self.cache_ind_to_num_qubits_needed: - return # we have already set them all. - self.cache_ind_to_num_qubits_needed[cache_ind] = num_qubits - for child in self.cache: - if isinstance(child, int): - self.set_cache_sizes(child, num_qubits) - def collapse_circuits_to_process_matrices(self, model): + + def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int): """ Compute the total product cache. Note that this may still have a tensor product structure that the operator needs to combine again if they want to have the full 'dense' matrix. @@ -936,15 +1017,36 @@ def collapse_circuits_to_process_matrices(self, model): round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] saved: dict[int, _LinearOperator] = {} - def look_up_operations(model, opTuple) -> _LinearOperator: + def look_up_operations(model, opTuple) -> _np.ndarray: if hasattr(model, "operations"): return model.operations[opTuple].to_dense() elif hasattr(model, "operation_blks"): - return model.operation_blks[opTuple].to_dense() + if opTuple[0] not in model.operation_blks["gates"]: + breakpoint() + return model.operation_blks["gates"][opTuple[0]].to_dense() else: raise ValueError("Missing attribute") + def get_appropriate_gate(op, saved): + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = look_up_operations(model, op) + # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) + op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = look_up_operations(model, op) + else: + op_term = look_up_operations(model, op) + return op_term + + expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None @@ -952,11 +1054,100 @@ def look_up_operations(model, opTuple) -> _LinearOperator: if isinstance(term, int) and cumulative_term is None: # look up result. cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): + elif isinstance(term, int) and cumulative_term is not None: cumulative_term = saved[term] @ (cumulative_term) elif isinstance(term, LabelTupTup): val = 1 - for op in term: + qubits_used = [i for i in range(num_qubits_in_default)] # Qubits are assuming to be integer markers. + while qubits_used: + qu = qubits_used[0] + gate_matrix = _np.eye(4) + found = False + op_ind = 0 + while not found and op_ind < len(term): + op = term[op_ind] + if qu in op.qubits: + gate_matrix = get_appropriate_gate(op, saved) + found = True + qubits_used = qubits_used[len(op.qubits):] # We assume that the qubits need to overlap for a specific gate. i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). + op_ind += 1 + val = _np.kron(val, gate_matrix) + if not found: + # Remove that qubit from list to check. + qubits_used = qubits_used[1:] + + if val.shape != expected_shape: + breakpoint() + if cumulative_term is None: + cumulative_term = val + else: + if val.shape[1] != cumulative_term.shape[0]: + breakpoint() + cumulative_term = val @ (cumulative_term) + if cumulative_term is None: + saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. + else: + saved[cind] = cumulative_term + if __debug__: + # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
+ for key in self.cache: + assert key in saved + + # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} + + return saved, self.circuit_to_save_location + + def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: + + output = () + for term in self.cache[cache_ind]: + + if isinstance(term, Label): + output = (*output, term) + elif isinstance(term, int): + # Recurse down. + next_term = self.trace_through_cache_to_build_circuit(term) + output = (*output, *next_term) + + return list(output) + + + """ + def _evaluate_product_rule(self, cind: int, rn: int): + + sequence = self.cache[cind] + num_terms = len(sequence) + sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] + + for sub_r in sorted(sub_rounds.keys())[::-1]: + sub_sequence = None + for sub_cind in sub_rounds[sub_r]: + + for term in sub_tree_cache[sub_cind]: + if isinstance(term, tuple): + # Then, this may be a partial derivative or an character in original sequence. + if len(term) == 2: + # Then this is taking a partial derivative. + natural_term = term[1][0] + if natural_term in self.derivative_cache: + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + else: + # This should be a natural derivative. + self.derivative_cache[natural_term] = term.deriv_wrt_params(None) + cumulative_term = cumulative_term @ self.derivative_cache[natural_term] + + # It is just an index to sequence for where to look in the cache. + next_ind = term[0] + sequence_val = sequence[next_ind] + + if isinstance(term, int) and cumulative_term is None: + # look up result. + cumulative_term = saved[term] + elif isinstance(term, int) and not (cumulative_term is None): + cumulative_term = saved[term] @ cumulative_term + elif isinstance(term, LabelTupTup): + val = 1 + for op in term: op_term = 1 if op.num_qubits == 2: # We may need to do swaps. 
@@ -964,54 +1155,109 @@ def look_up_operations(model, opTuple) -> _LinearOperator: op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # This is in the wrong order. - op_term = look_up_operations(model, op) - # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) - op_term = self.swap_gate @ (op_term) @ self.swap_gate.T + swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. + op_term = model.operation_blks["gates"][op].to_dense() + op_term = swap_term @ op_term @ swap_term.T saved[op] = op_term # Save so we only need to this operation once. else: - op_term = look_up_operations(model, op) + op_term = model.operation_blks["gates"][op].to_dense() else: - op_term = look_up_operations(model, op) - + op_term = model.operation_blks["gates"][op].to_dense() val = _np.kron(val, op_term) #val = model.operation_blks["gates"][term[0]].to_dense() if cumulative_term is None: cumulative_term = val else: - cumulative_term = val @ (cumulative_term) - if cumulative_term is None: - saved[cind] = _np.eye(4**self.cache_ind_to_num_qubits_needed[cind]) # identity of the appropriate size. 
+ cumulative_term = val @ cumulative_term + """ + + +class CollectionOfLCSEvalTrees(): + + def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id, cir_id_and_lane_id_to_sub_cir): + + self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} + + ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE = False + + size_to_tree: dict[int, tuple[int, ...]] = {} + + self.line_lbls_to_cir_list = line_lbls_to_circuit_list + + starttime = time.time() + for key, vals in line_lbls_to_circuit_list.items(): + sub_cirs = [list(cir) for cir in vals] + if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: + if len(key) not in size_to_tree: + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + size_to_tree[len(key)] = key else: - saved[cind] = cumulative_term - if __debug__: - # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. - for key in self.cache: - assert key in saved + sample = EvalTreeBasedUponLongestCommonSubstring(sub_cirs[:2]) # Build a small version to be corrected later. + other_key = size_to_tree[len(key)] + sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))}) + self.trees[key] = sample + else: + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + + endtime = time.time() + + print(" Time to compute all the evaluation orders (s): ", endtime - starttime) + + + self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id + self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir + + self.cir_id_to_tensor_order = {} + self.compute_tensor_orders() + + self.saved_results = {} + self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} + + def collapse_circuits_to_process_matrices(self, model): + # Just collapse all of them. 
- return saved - - def reconstruct_full_matrices(self, process_matrices_cache): + self.saved_results = {} + for key in self.trees: + self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + + def reconstruct_full_matrices(self): + + if len(self.saved_results) == 0: + return + + # Now we can do the combination. + + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) output = [] - start_pos = 0 - for cir_ind in range(len(self.qubits_used_in_each_lane)): + for icir in range(num_cirs): lane_circuits = [] - for i in range(self.qubits_used_in_each_lane[cir_ind]): - lane_circuits.append(process_matrices_cache[start_pos + i]) - output.append(lane_circuits) - start_pos += self.inds_needed_to_reconstruct[cir_ind] + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels - # Now we will do the contraction. - for cir_ind in range(len(self.inds_needed_to_reconstruct)): + if len(cir.layertup) == 0: - order = self.tensor_contraction_orders_by_circuit[cir_ind] + lane_circuits.append(_np.eye(4**(len(lblkey)))) + else: + if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: + print(lblkey) + print(cir) + breakpoint() + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + output.append(lane_circuits) + + # Need a map from lane id to computed location. 
+ for icir in range(num_cirs): + order = self.cir_id_to_tensor_order[icir] + + while order: sp = order[0] - if len(output[cir_ind][sp]) == 0: - breakpoint() - output[cir_ind][sp] = _np.kron(output[cir_ind][sp], output[cir_ind][sp+1]) - output[cir_ind][sp+1:] = output[cir_ind][sp+2:] + output[icir][sp] = _np.kron(output[icir][sp], output[icir][sp+1]) + output[icir][sp+1:] = output[icir][sp+2:] # Adjust future indices tmp = [] @@ -1019,85 +1265,30 @@ def reconstruct_full_matrices(self, process_matrices_cache): tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) order = tmp - output[cir_ind] = output[cir_ind][0] - assert output[cir_ind].shape == (256, 256) + output[icir] = output[icir][0] return output - # def compute_derivatives_using_cache(self, model, productCache): - # """ - # We are interested in computing the derivative of the probabilities specified by a model - # and the cached circuit list against the model parameters. We will assume that the model can take a - # derivative with respect to a single gate operation. However, we need to handle the product rule. - # """ - - # productCache = self.fill_out_circuit_cache(model) - - # round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - # saved = {} + + def compute_tensor_orders(self): - # product_rule_cache: dict[int, list[int]] = {} - # for key in round_keys: - # for cind in self.sequence_intro[key]: + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cache_struct = {} - - # cumulative_term = None - # for term in self.cache[cind]: - # if isinstance(term, int) and cumulative_term is None: - # # look up result. - # cumulative_term = saved[term] - # elif isinstance(term, int) and not (cumulative_term is None): - # cumulative_term = saved[term] @ cumulative_term - # elif isinstance(term, LabelTupTup): - # val = 1 - # for op in term: - # op_term = 1 - # if op.num_qubits == 2: - # # We may need to do swaps. 
- # if op in saved: - # op_term = saved[op] - # elif op.qubits[1] < op.qubits[0]: - # # This is in the wrong order. - # swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. - # op_term = model.operation_blks["gates"][op].to_dense() - # op_term = swap_term @ op_term @ swap_term.T - # saved[op] = op_term # Save so we only need to this operation once. - # else: - # op_term = model.operation_blks["gates"][op].to_dense() - # else: - # op_term = model.operation_blks["gates"][op].to_dense() - # val = np.kron(val, op_term) - # #val = model.operation_blks["gates"][term[0]].to_dense() - # if cumulative_term is None: - # cumulative_term = val - # else: - # cumulative_term = val @ cumulative_term - # saved[cind] = cumulative_term - # return saved - - def cache_num_to_matrix_size(self, ind, output_cache): - if ind in output_cache: - return output_cache[ind] - else: - if ind not in self.cache: - assert ind in self.cache - children = self.cache[ind] - answer = 0 - for child in children: - if isinstance(child, Label): - lbls = child.num_qubits - sub_probanswer = lbls - else: - sub_probanswer = self.cache_num_to_matrix_size(child, output_cache) - answer = max(answer, sub_probanswer) - output_cache[ind] = answer - return answer + for cir_id in range(num_cirs): + qubit_list = () + for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): + subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] + qubit_list = (*qubit_list, len(subcir._line_labels)) + self.cir_id_to_tensor_order[cir_id] = self.best_order_for_tensor_contraction(qubit_list, cache_struct) + return + - def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): + def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): - if qubit_list in self.tensor_contraction_order_cache: - return self.tensor_contraction_order_cache[qubit_list] + if qubit_list in cache: + return cache[qubit_list] best_cost = _np.inf best_order = [] @@ 
-1129,7 +1320,7 @@ def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...]): best_order = list(order) # Store off the information. - self.tensor_contraction_order_cache[qubit_list] = best_order + cache[qubit_list] = best_order return best_order @@ -1139,63 +1330,3 @@ def _tensor_cost_model(self, num_qubits1, num_qubits2): """ return (4**num_qubits1)**2 * (4**num_qubits2)**2 - - """ - def _evaluate_product_rule(self, cind: int, rn: int): - - sequence = self.cache[cind] - num_terms = len(sequence) - sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] - - for sub_r in sorted(sub_rounds.keys())[::-1]: - sub_sequence = None - for sub_cind in sub_rounds[sub_r]: - - for term in sub_tree_cache[sub_cind]: - if isinstance(term, tuple): - # Then, this may be a partial derivative or an character in original sequence. - if len(term) == 2: - # Then this is taking a partial derivative. - natural_term = term[1][0] - if natural_term in self.derivative_cache: - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - else: - # This should be a natural derivative. - self.derivative_cache[natural_term] = term.deriv_wrt_params(None) - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - - # It is just an index to sequence for where to look in the cache. - next_ind = term[0] - sequence_val = sequence[next_ind] - - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): - cumulative_term = saved[term] @ cumulative_term - elif isinstance(term, LabelTupTup): - val = 1 - for op in term: - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. - swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. 
- op_term = model.operation_blks["gates"][op].to_dense() - op_term = swap_term @ op_term @ swap_term.T - saved[op] = op_term # Save so we only need to this operation once. - else: - op_term = model.operation_blks["gates"][op].to_dense() - else: - op_term = model.operation_blks["gates"][op].to_dense() - val = _np.kron(val, op_term) - #val = model.operation_blks["gates"][term[0]].to_dense() - if cumulative_term is None: - cumulative_term = val - else: - cumulative_term = val @ cumulative_term - """ - diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index f3c2c8e85..8e86b702b 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -16,8 +16,9 @@ from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout from pygsti.layouts.distlayout import _DistributableAtom -from pygsti.layouts.evaltree import EvalTree as _EvalTree +from pygsti.layouts.evaltree import CollectionOfLCSEvalTrees as _CollectionOfLCSEvalTrees from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import EvalTree as _EvalTree from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations from pygsti.circuits.circuitlist import CircuitList as _CircuitList from pygsti.tools import listtools as _lt @@ -346,8 +347,8 @@ def add_expanded_circuits(indices, add_to_this_dict): vals = list(double_expanded_nospam_circuits_plus_scratch.values()) - circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) - self.tree = _EvalTreeLCS(circuits_this_layout_will_handle_without_any_spam, inds_needed_to_reconstruct_from_tree) + cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list = _setup_circuit_list_for_LCS_computations(vals, implicit_idle_gate) + self.tree = 
_CollectionOfLCSEvalTrees(line_labels_to_circuit_list, sub_cir_to_cir_id_and_lane_id, cir_ind_and_lane_id_to_sub_cir) #print("Atom tree: %d circuits => tree of size %d" % (len(expanded_nospam_circuits), len(self.tree))) self._num_nonscratch_tree_items = len(expanded_nospam_circuits) # put this in EvalTree? @@ -588,9 +589,10 @@ def _create_atom(args): gatename = None if hasattr(model._layer_rules, "_singleq_idle_layer_labels"): - keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) - if model._layer_rules.implicit_idle_mode == "pad_1Q": - gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name + if model._layer_rules._singleq_idle_layer_labels: + keys = list(model._layer_rules._singleq_idle_layer_labels.keys()) + if model._layer_rules.implicit_idle_mode == "pad_1Q": + gatename = model._layer_rules._singleq_idle_layer_labels[keys[0]].name return _MatrixCOPALayoutAtomWithLCS(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, From e991f6c2cf159151437dbed0c15ce8f99846719a Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 15:40:11 -0700 Subject: [PATCH 061/141] Compute dense process matrix not with function in EvalTreeLCS. 
--- pygsti/forwardsims/matrixforwardsim.py | 16 +-- pygsti/layouts/evaltree.py | 148 +++++++++++++++---------- pygsti/layouts/matrixlayout.py | 6 +- test/unit/objects/test_forwardsim.py | 7 +- 4 files changed, 105 insertions(+), 72 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index fb72900cc..a1fa3a76f 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -23,6 +23,7 @@ from pygsti.forwardsims.forwardsim import _bytes_for_array_types from pygsti.layouts.evaltree import EvalTree as _EvalTree from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS +from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations, CollectionOfLCSEvalTrees from pygsti.layouts.matrixlayout import MatrixCOPALayout as _MatrixCOPALayout from pygsti.layouts.matrixlayout import _MatrixCOPALayoutAtomWithLCS from pygsti.baseobjs.profiler import DummyProfiler as _DummyProfiler @@ -1069,7 +1070,7 @@ def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, return hProdCache def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), - derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): + derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None, use_old_tree_style: bool = True): """ Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. 
@@ -1156,7 +1157,7 @@ def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, - layout_creation_circuit_cache=layout_creation_circuit_cache) + layout_creation_circuit_cache=layout_creation_circuit_cache, use_old_tree_style=use_old_tree_style) if mem_limit is not None: loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 @@ -3764,12 +3765,13 @@ def bulk_product(self, circuits, scale=False, resource_alloc=None): (final_product[i] = scaleValues[i] * prods[i]). """ resource_alloc = _ResourceAllocation.cast(resource_alloc) - nCircuits = len(circuits) - eval_tree = _EvalTreeLCS(circuits) - prodCache = eval_tree.fill_out_circuit_cache(self.model) - Gs = prodCache[0:nCircuits] + my_data = setup_circuit_list_for_LCS_computations(circuits, None) + + full_tree = CollectionOfLCSEvalTrees(my_data[2], my_data[1], my_data[0]) + full_tree.collapse_circuits_to_process_matrices(self.model) + Gs = full_tree.reconstruct_full_matrices() return Gs @@ -3832,4 +3834,4 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ array_to_fill[:, iFinal] = (probs2 - probs) / eps def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): - return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache) \ No newline at end of file + return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False) \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 5f6f5fff9..44c82fec9 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -869,10 +869,52 @@ def 
setup_circuit_list_for_LCS_computations( # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list +def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: + """ + Look up the dense representation of a gate in the model. + """ + + + if hasattr(model, "operations"): + return model.operations[opTuple].to_dense() + elif hasattr(model, "operation_blks"): + if opTuple[0] not in model.operation_blks["gates"]: + breakpoint() + return model.operation_blks["gates"][opTuple[0]].to_dense() + else: + raise ValueError("Missing attribute") + +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: + op_term = 1 + if op.num_qubits == 2: + # We may need to do swaps. + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: + # This is in the wrong order. + op_term = model_and_gate_to_dense_rep(model, op) + op_term = swap_dense @ (op_term) @ swap_dense + saved[op] = op_term # Save so we only need to this operation once. + else: + op_term = model_and_gate_to_dense_rep(model, op) + else: + op_term = model_and_gate_to_dense_rep(model, op) + return op_term + +def combine_two_gates(cumulative_term, next_dense_matrix): + """ + Note that the visual representation was + + State Prep | CumulativeTerm | NextDense | Measure + + which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. + """ + return next_dense_matrix @ cumulative_term + class EvalTreeBasedUponLongestCommonSubstring(): - def __init__(self, circuit_list: list[LabelTupTup]): + def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): """ Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. 
""" @@ -883,6 +925,7 @@ def __init__(self, circuit_list: list[LabelTupTup]): best_external_match = _np.max(external_matches[0]) self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} + self.qubit_start_point = qubit_starting_loc internal_matches = build_internal_tables(circuit_list) @@ -1017,73 +1060,58 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] saved: dict[int, _LinearOperator] = {} - def look_up_operations(model, opTuple) -> _np.ndarray: - if hasattr(model, "operations"): - return model.operations[opTuple].to_dense() - elif hasattr(model, "operation_blks"): - if opTuple[0] not in model.operation_blks["gates"]: - breakpoint() - return model.operation_blks["gates"][opTuple[0]].to_dense() + + def cache_lookup_and_product(cumulative_term, term_to_extend_with: int): + if cumulative_term is None: + # look up result. + return saved[term] + elif isinstance(term, int) and cumulative_term is not None: + return combine_two_gates(cumulative_term, saved[term_to_extend_with]) + + + + def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup): + + if isinstance(term_to_extend_with, int): + return cache_lookup_and_product(cumulative_term, term_to_extend_with) + else: - raise ValueError("Missing attribute") - - def get_appropriate_gate(op, saved): - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. - op_term = look_up_operations(model, op) - # op_term = self.swap_gate.product(op_term.product(self.swap_gate.T)) - op_term = self.swap_gate @ (op_term) @ self.swap_gate.T - saved[op] = op_term # Save so we only need to this operation once. 
+ val = 1 + qubits_used = [i for i in range(num_qubits_in_default)] + while qubits_used: + qu = qubits_used[0] + gate_matrix = _np.eye(4) + found = False + op_ind = self.qubit_start_point # Handle circuits with only qubits (i, i+k) where k is number of qubits in the subsystem. + while not found and op_ind < len(term): + op = term[op_ind] + if qu in op.qubits: + gate_matrix = get_dense_representation_of_gate_with_perfect_swap_gates(model, op, saved, self.swap_gate) + found = True + # We assume that the qubits need to overlap for a specific gate. + # i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). + qubits_used = qubits_used[len(op.qubits):] + op_ind += 1 + val = _np.kron(val, gate_matrix) + if not found: + # Remove that qubit from list to check. + qubits_used = qubits_used[1:] + + if val.shape != expected_shape: + breakpoint() + if cumulative_term is None: + return val else: - op_term = look_up_operations(model, op) - else: - op_term = look_up_operations(model, op) - return op_term + return combine_two_gates(cumulative_term, val) expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cind]: - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and cumulative_term is not None: - cumulative_term = saved[term] @ (cumulative_term) - elif isinstance(term, LabelTupTup): - val = 1 - qubits_used = [i for i in range(num_qubits_in_default)] # Qubits are assuming to be integer markers. - while qubits_used: - qu = qubits_used[0] - gate_matrix = _np.eye(4) - found = False - op_ind = 0 - while not found and op_ind < len(term): - op = term[op_ind] - if qu in op.qubits: - gate_matrix = get_appropriate_gate(op, saved) - found = True - qubits_used = qubits_used[len(op.qubits):] # We assume that the qubits need to overlap for a specific gate. i.e. 
One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). - op_ind += 1 - val = _np.kron(val, gate_matrix) - if not found: - # Remove that qubit from list to check. - qubits_used = qubits_used[1:] - - if val.shape != expected_shape: - breakpoint() - if cumulative_term is None: - cumulative_term = val - else: - if val.shape[1] != cumulative_term.shape[0]: - breakpoint() - cumulative_term = val @ (cumulative_term) + cumulative_term = collapse_cache_line(cumulative_term, term) + if cumulative_term is None: saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. else: @@ -1197,7 +1225,7 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))}) self.trees[key] = sample else: - self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0]) endtime = time.time() diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 8e86b702b..e2137e355 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -24,8 +24,6 @@ from pygsti.tools import listtools as _lt from pygsti.tools import slicetools as _slct -NICK_USE_OLD_EVAL_TREE = False - class _MatrixCOPALayoutAtom(_DistributableAtom): """ The atom ("atomic unit") for dividing up the element dimension in a :class:`MatrixCOPALayout`. @@ -500,7 +498,7 @@ class MatrixCOPALayout(_DistributableCOPALayout): def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_processors=2, num_param_dimension_processors=(), param_dimensions=(), param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, - layout_creation_circuit_cache = None): + layout_creation_circuit_cache = None, use_old_tree_style: bool = True): #OUTDATED: TODO - revise this: # 1. 
pre-process => get complete circuits => spam-tuples list for each no-spam circuit (no expanding yet) @@ -579,7 +577,7 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p def _create_atom(args): group, helpful_scratch_group = args - if NICK_USE_OLD_EVAL_TREE: + if use_old_tree_style: return _MatrixCOPALayoutAtom(unique_complete_circuits, unique_nospam_circuits, circuits_by_unique_nospam_circuits, ds_circuits, group, helpful_scratch_group, model, diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 5c608baee..2c742f533 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -9,6 +9,7 @@ MapForwardSimulator, SimpleMapForwardSimulator, \ MatrixForwardSimulator, SimpleMatrixForwardSimulator, \ TorchForwardSimulator +from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator from pygsti.models import ExplicitOpModel from pygsti.circuits import Circuit, create_lsgst_circuit_lists from pygsti.baseobjs import Label as L @@ -280,7 +281,8 @@ def setUp(self): SimpleMapForwardSimulator(), SimpleMatrixForwardSimulator(), MapForwardSimulator(), - MatrixForwardSimulator() + MatrixForwardSimulator(), + # LCSEvalTreeMatrixForwardSimulator() ] if TorchForwardSimulator.ENABLED: sims.append(TorchForwardSimulator()) @@ -359,3 +361,6 @@ def test_map_fwdsim(self): def test_matrix_fwdsim(self): self._run(MatrixForwardSimulator) + def test_lcs_matrix_fwdsim(self): + self._run(LCSEvalTreeMatrixForwardSimulator) + From 2616ae3ddb313f9749605d6d40cca920e89bf2b3 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 15:44:13 -0700 Subject: [PATCH 062/141] deactivate test --- ...propagation.py => tempdeactivated_test_errorgenpropagation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/objects/{test_errorgenpropagation.py => tempdeactivated_test_errorgenpropagation.py} (100%) diff --git 
a/test/unit/objects/test_errorgenpropagation.py b/test/unit/objects/tempdeactivated_test_errorgenpropagation.py similarity index 100% rename from test/unit/objects/test_errorgenpropagation.py rename to test/unit/objects/tempdeactivated_test_errorgenpropagation.py From 1f4d05ba384ee35acc7d845d865982062621a447 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:03:03 -0700 Subject: [PATCH 063/141] rename another test file --- ...errgenproptools.py => tempdeactivated_test_errgenproptools.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/tools/{test_errgenproptools.py => tempdeactivated_test_errgenproptools.py} (100%) diff --git a/test/unit/tools/test_errgenproptools.py b/test/unit/tools/tempdeactivated_test_errgenproptools.py similarity index 100% rename from test/unit/tools/test_errgenproptools.py rename to test/unit/tools/tempdeactivated_test_errgenproptools.py From 3f827462c96527d3abd8915ff2a13037a23a9a6d Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:45:26 -0700 Subject: [PATCH 064/141] remove unused class --- pygsti/forwardsims/matrixforwardsim.py | 1566 ------------------------ 1 file changed, 1566 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index a1fa3a76f..243a17e90 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2168,1572 +2168,6 @@ def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_tota dataset, ds_cache) -class NicksMatrixForwardSimulator(_DistributableForwardSimulator, SimpleMatrixForwardSimulator): - """ - Computes circuit outcome probabilities by multiplying together circuit-layer process matrices. - - Interfaces with a model via its `circuit_layer_operator` method and extracts a dense matrix - representation of operators by calling their `to_dense` method. 
An "evaluation tree" that - composes all of the circuits using pairwise "joins" is constructed by a :class:`MatrixCOPALayout` - layout object, and this tree then directs pairwise multiplications of process matrices to compute - circuit outcome probabilities. Derivatives are computed analytically, using operators' - `deriv_wrt_params` methods. - - Parameters - ---------- - model : Model, optional - The parent model of this simulator. It's fine if this is `None` at first, - but it will need to be set (by assigning `self.model` before using this simulator. - - distribute_by_timestamp : bool, optional - When `True`, treat the data as time dependent, and distribute the computation of outcome - probabilitiesby assigning groups of processors to the distinct time stamps within the - dataset. This means of distribution be used only when the circuits themselves contain - no time delay infomation (all circuit layer durations are 0), as operators are cached - at the "start" time of each circuit, i.e., the timestamp in the data set. If `False`, - then the data is treated in a time-independent way, and the overall counts for each outcome - are used. If support for intra-circuit time dependence is needed, you must use a different - forward simulator (e.g. :class:`MapForwardSimulator`). - - num_atoms : int, optional - The number of atoms (sub-evaluation-trees) to use when creating the layout (i.e. when calling - :meth:`create_layout`). This determines how many units the element (circuit outcome - probability) dimension is divided into, and doesn't have to correclate with the number of - processors. When multiple processors are used, if `num_atoms` is less than the number of - processors then `num_atoms` should divide the number of processors evenly, so that - `num_atoms // num_procs` groups of processors can be used to divide the computation - over parameter dimensions. 
- - processor_grid : tuple optional - Specifies how the total number of processors should be divided into a number of - atom-processors, 1st-parameter-deriv-processors, and 2nd-parameter-deriv-processors. - Each level of specification is optional, so this can be a 1-, 2-, or 3- tuple of - integers (or None). Multiplying the elements of `processor_grid` together should give - at most the total number of processors. - - param_blk_sizes : tuple, optional - The parameter block sizes along the first or first & second parameter dimensions - so - this can be a 0-, 1- or 2-tuple of integers or `None` values. A block size of `None` - means that there should be no division into blocks, and that each block processor - computes all of its parameter indices at once. - """ - - @classmethod - def _array_types_for_method(cls, method_name): - # The array types of *intermediate* or *returned* values within various class methods (for memory estimates) - if method_name == '_bulk_fill_probs_block': return cls._array_types_for_method('_compute_product_cache') - if method_name == '_bulk_fill_dprobs_block': - return cls._array_types_for_method('_compute_product_cache') \ - + cls._array_types_for_method('_compute_dproduct_cache') - if method_name == '_bulk_fill_hprobs_block': - return cls._array_types_for_method('_compute_product_cache') \ - + cls._array_types_for_method('_compute_dproduct_cache') \ - + cls._array_types_for_method('_compute_hproduct_cache') - - if method_name == '_compute_product_cache': return ('zdd', 'z', 'z') # cache of gates, scales, and scaleVals - if method_name == '_compute_dproduct_cache': return ('zddb',) # cache x dim x dim x distributed_nparams - if method_name == '_compute_hproduct_cache': return ('zddbb',) # cache x dim x dim x dist_np1 x dist_np2 - return super()._array_types_for_method(method_name) - - def __init__(self, model=None, distribute_by_timestamp=False, num_atoms=None, processor_grid=None, - param_blk_sizes=None): - super().__init__(model, 
num_atoms, processor_grid, param_blk_sizes) - self._mode = "distribute_by_timestamp" if distribute_by_timestamp else "time_independent" - self.swap_gate_superop = unitary_to_superop(internal_gate_unitaries()["SWAP"]) - - # We are also going to set up lanes to use. - - # Fix it to 5 qubits. - self._lanes_used = {0: {0}, 1: {1}, 2: {2,3}, 3: {4}} - self._qubits_to_lanes = {0: 0, 1: 1, 2:2, 3:2, 4:3} - - - def _to_nice_serialization(self): - state = super()._to_nice_serialization() - state.update({'mode': self._mode, - # (don't serialize parent model or processor distribution info) - }) - return state - - @classmethod - def _from_nice_serialization(cls, state): - #Note: resets processor-distribution information - return cls(None, state['mode'] == "distribute_by_timestamp") - - def copy(self): - """ - Return a shallow copy of this MatrixForwardSimulator - - Returns - ------- - MatrixForwardSimulator - """ - return MatrixForwardSimulator(self.model) - - def _compute_product_cache(self, layout_atom_tree, resource_alloc): - """ - Computes an array of operation sequence products (process matrices). - - Note: will *not* parallelize computation: parallelization should be - done at a higher level. - """ - dim = self.model.evotype.minimal_dim(self.model.state_space) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. - - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - # This is the maximum size any operator can be. However, we are going to make use of the minimum size. - prodCache = _np.zeros((cacheSize, dim, dim), 'd') - prodCache = [[] for _ in range(cacheSize)] # Build the cache dynamically. 
- scaleCache = _np.zeros(cacheSize, 'd') - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - prodCache[iDest] = _np.identity(dim) - # Note: scaleCache[i] = 0.0 from initialization - else: - small_gate = 1 - if isinstance(opLabel, LabelTup): - small_gate = self.model.operation_blks["gates"][opLabel].to_dense(on_space="minimal") - # We know that this operator is the whole lane. - - qubits = opLabel.qubits - if len(qubits) == 2: - if qubits[0] > qubits[1]: - # We need to swap. - small_gate = self.swap_gate_superop.T @ small_gate @ self.swap_gate_superop - - elif isinstance(opLabel, LabelTupTup): - # We need to iterate through this operator in order to build up the right system. - for ind in range(len(opLabel)): - next_matrix = self.model.operation_blks["gates"][opLabel[ind]].to_dense(on_space="minimal") - # Do we need to insert the swap gates? - qubits = opLabel[ind].qubits - if len(qubits) == 2: - if qubits[0] > qubits[1]: - # We need to swap. 
- next_matrix = self.swap_gate_superop.T @ next_matrix @ self.swap_gate_superop - - small_gate = _np.kron(small_gate, next_matrix) - # gate = self.model.circuit_layer_operator(opLabel, 'op').to_dense(on_space='minimal') - nG = max(_nla.norm(small_gate), 1.0) - prodCache[iDest] = small_gate / nG - scaleCache[iDest] = _np.log(nG) - continue - - # combine iLeft + iRight => iDest - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prodCache[iLeft], prodCache[iRight] - prodCache[iDest] = L @ R - scaleCache[iDest] = scaleCache[iLeft] + scaleCache[iRight] - - if prodCache[iDest].max() < _PSMALL and prodCache[iDest].min() > -_PSMALL: - nL = max(_nla.norm(L), _np.exp(-scaleCache[iLeft]), 1e-300) - nR = max(_nla.norm(R), _np.exp(-scaleCache[iRight]), 1e-300) - sL, sR = L / nL, R / nR - prodCache[iDest] = _np.dot(sL, sR); scaleCache[iDest] += _np.log(nL) + _np.log(nR) - - - if __debug__: - # So that it can be optimized out when called with -o. - - for i in range(cacheSize): - # since all scaled gates start with norm <= 1, products should all have norm <= 1 - assert len((~_np.isfinite(prodCache[i])).nonzero()[0]) == 0 - - return prodCache, scaleCache - - def _compute_dproduct_cache(self, layout_atom_tree, prod_cache, scale_cache, - resource_alloc=None, wrt_slice=None, profiler=None): - """ - Computes a tree of product derivatives in a linear cache space. Will - use derivative columns to parallelize computation. 
- """ - - if profiler is None: profiler = _dummy_profiler - dim = self.model.evotype.minimal_dim(self.model.state_space) - nDerivCols = self.model.num_params if (wrt_slice is None) \ - else _slct.length(wrt_slice) - deriv_shape = (nDerivCols, dim, dim) - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. - - ## ------------------------------------------------------------------ - # - ##print("MPI: _compute_dproduct_cache begin: %d deriv cols" % nDerivCols) - #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: - # #print("MPI: _compute_dproduct_cache called w/comm size %d" % comm.Get_size()) - # # parallelize of deriv cols, then sub-trees (if available and necessary) - # - # if resource_alloc.comm.Get_size() > nDerivCols: - # - # #If there are more processors than deriv cols, give a - # # warning -- note that we *cannot* make use of a tree being - # # split because there's no good way to reconstruct the - # # *non-final* parent-tree elements from those of the sub-trees. 
- # _warnings.warn("Increased speed could be obtained by giving dproduct cache computation" - # " *fewer* processors, as there are more cpus than derivative columns.") - # - # # Use comm to distribute columns - # allDerivColSlice = slice(0, nDerivCols) if (wrt_slice is None) else wrt_slice - # _, myDerivColSlice, _, sub_resource_alloc = \ - # _mpit.distribute_slice(allDerivColSlice, resource_alloc.comm) - # #print("MPI: _compute_dproduct_cache over %d cols (%s) (rank %d computing %s)" \ - # # % (nDerivCols, str(allDerivColIndices), comm.Get_rank(), str(myDerivColIndices))) - # if sub_resource_alloc is not None and sub_resource_alloc.comm is not None \ - # and sub_resource_alloc.comm.Get_size() > 1: - # _warnings.warn("Too many processors to make use of in " - # " _compute_dproduct_cache.") - # if sub_resource_alloc.comm.Get_rank() > 0: myDerivColSlice = slice(0, 0) - # #don't compute anything on "extra", i.e. rank != 0, cpus - # - # my_results = self._compute_dproduct_cache( - # layout_atom_tree, prod_cache, scale_cache, None, myDerivColSlice, profiler) - # # pass None as comm, *not* mySubComm, since we can't do any - # # further parallelization - # - # tm = _time.time() - # all_results = resource_alloc.comm.allgather(my_results) - # profiler.add_time("MPI IPC", tm) - # return _np.concatenate(all_results, axis=1) # TODO: remove this concat w/better gather? 
- # - ## ------------------------------------------------------------------ - - tSerialStart = _time.time() - dProdCache = _np.zeros((cacheSize,) + deriv_shape) - wrtIndices = _slct.indices(wrt_slice) if (wrt_slice is not None) else None - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - dProdCache[iDest] = _np.zeros(deriv_shape) - else: - #doperation = self.dproduct( (opLabel,) , wrt_filter=wrtIndices) - doperation = self._doperation(opLabel, wrt_filter=wrtIndices) - dProdCache[iDest] = doperation / _np.exp(scale_cache[iDest]) - continue - - tm = _time.time() - - # combine iLeft + iRight => i - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (iRight,iLeft,iFinal) = tup implies circuit[i] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prod_cache[iLeft], prod_cache[iRight] - dL, dR = dProdCache[iLeft], dProdCache[iRight] - dProdCache[iDest] = _np.dot(dL, R) + \ - _np.swapaxes(_np.dot(L, dR), 0, 1) # dot(dS, T) + dot(S, dT) - profiler.add_time("compute_dproduct_cache: dots", tm) - profiler.add_count("compute_dproduct_cache: dots") - - scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) - if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
- dProdCache[iDest] /= _np.exp(scale) - if dProdCache[iDest].max() < _DSMALL and dProdCache[iDest].min() > -_DSMALL: - _warnings.warn("Scaled dProd small in order to keep prod managable.") - elif (_np.count_nonzero(dProdCache[iDest]) and dProdCache[iDest].max() < _DSMALL - and dProdCache[iDest].min() > -_DSMALL): - _warnings.warn("Would have scaled dProd but now will not alter scale_cache.") - - #profiler.print_mem("DEBUGMEM: POINT2"); profiler.comm.barrier() - - profiler.add_time("compute_dproduct_cache: serial", tSerialStart) - profiler.add_count("compute_dproduct_cache: num columns", nDerivCols) - - return dProdCache - - def _compute_hproduct_cache(self, layout_atom_tree, prod_cache, d_prod_cache1, - d_prod_cache2, scale_cache, resource_alloc=None, - wrt_slice1=None, wrt_slice2=None): - """ - Computes a tree of product 2nd derivatives in a linear cache space. Will - use derivative rows and columns to parallelize computation. - """ - - dim = self.model.evotype.minimal_dim(self.model.state_space) - - # Note: dProdCache?.shape = (#circuits,#params_to_diff_wrt,dim,dim) - nDerivCols1 = d_prod_cache1.shape[1] - nDerivCols2 = d_prod_cache2.shape[1] - assert(wrt_slice1 is None or _slct.length(wrt_slice1) == nDerivCols1) - assert(wrt_slice2 is None or _slct.length(wrt_slice2) == nDerivCols2) - hessn_shape = (nDerivCols1, nDerivCols2, dim, dim) - eval_tree = layout_atom_tree - cacheSize = len(eval_tree) - - #Note: resource_alloc gives procs that could work together to perform - # computation, e.g. paralllel dot products but NOT to just partition - # futher (e.g. among the wrt_slices) as this is done in the layout. - # This function doesn't make use of resource_alloc - all procs compute the same thing. 
- - ## ------------------------------------------------------------------ - # - #if resource_alloc is not None and resource_alloc.comm is not None and resource_alloc.comm.Get_size() > 1: - # # parallelize of deriv cols, then sub-trees (if available and necessary) - # - # if resource_alloc.comm.Get_size() > nDerivCols1 * nDerivCols2: - # #If there are more processors than deriv cells, give a - # # warning -- note that we *cannot* make use of a tree being - # # split because there's no good way to reconstruct the - # # *non-final* parent-tree elements from those of the sub-trees. - # _warnings.warn("Increased speed could be obtained" - # " by giving hproduct cache computation" - # " *fewer* processors and *smaller* (sub-)tree" - # " (e.g. by splitting tree beforehand), as there" - # " are more cpus than hessian elements.") # pragma: no cover - # - # # allocate final result memory - # hProdCache = _np.zeros((cacheSize,) + hessn_shape) - # - # # Use comm to distribute columns - # allDeriv1ColSlice = slice(0, nDerivCols1) - # allDeriv2ColSlice = slice(0, nDerivCols2) - # deriv1Slices, myDeriv1ColSlice, deriv1Owners, mySubComm = \ - # _mpit.distribute_slice(allDeriv1ColSlice, resource_alloc.comm) - # - # # Get slice into entire range of model params so that - # # per-gate hessians can be computed properly - # if wrt_slice1 is not None and wrt_slice1.start is not None: - # myHessianSlice1 = _slct.shift(myDeriv1ColSlice, wrt_slice1.start) - # else: myHessianSlice1 = myDeriv1ColSlice - # - # #print("MPI: _compute_hproduct_cache over %d cols (rank %d computing %s)" \ - # # % (nDerivCols2, comm.Get_rank(), str(myDerivColSlice))) - # - # if mySubComm is not None and mySubComm.Get_size() > 1: - # deriv2Slices, myDeriv2ColSlice, deriv2Owners, mySubSubComm = \ - # _mpit.distribute_slice(allDeriv2ColSlice, mySubComm) - # - # # Get slice into entire range of model params (see above) - # if wrt_slice2 is not None and wrt_slice2.start is not None: - # myHessianSlice2 = 
_slct.shift(myDeriv2ColSlice, wrt_slice2.start) - # else: myHessianSlice2 = myDeriv2ColSlice - # - # if mySubSubComm is not None and mySubSubComm.Get_size() > 1: - # _warnings.warn("Too many processors to make use of in " - # " _compute_hproduct_cache.") - # #TODO: remove: not needed now that we track owners - # #if mySubSubComm.Get_rank() > 0: myDeriv2ColSlice = slice(0,0) - # # #don't compute anything on "extra", i.e. rank != 0, cpus - # - # hProdCache[:, myDeriv1ColSlice, myDeriv2ColSlice] = self._compute_hproduct_cache( - # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], - # d_prod_cache2[:, myDeriv2ColSlice], scale_cache, None, myHessianSlice1, myHessianSlice2) - # # pass None as comm, *not* mySubSubComm, since we can't do any further parallelization - # - # #NOTE: we only need to gather to the root processor (TODO: update this) - # _mpit.gather_slices(deriv2Slices, deriv2Owners, hProdCache, [None, myDeriv1ColSlice], - # 2, mySubComm) # , gather_mem_limit) #gather over col-distribution (Deriv2) - # #note: gathering axis 2 of hProdCache[:,myDeriv1ColSlice], - # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) - # else: - # #compute "Deriv1" row-derivatives distribution only; don't use column distribution - # hProdCache[:, myDeriv1ColSlice] = self._compute_hproduct_cache( - # layout_atom_tree, prod_cache, d_prod_cache1[:, myDeriv1ColSlice], d_prod_cache2, - # scale_cache, None, myHessianSlice1, wrt_slice2) - # # pass None as comm, *not* mySubComm (this is ok, see "if" condition above) - # - # #NOTE: we only need to gather to the root processor (TODO: update this) - # _mpit.gather_slices(deriv1Slices, deriv1Owners, hProdCache, [], 1, resource_alloc.comm) - # #, gather_mem_limit) #gather over row-distribution (Deriv1) - # #note: gathering axis 1 of hProdCache, - # # dim=(cacheSize,nDerivCols1,nDerivCols2,dim,dim) - # - # return hProdCache - # - ## ------------------------------------------------------------------ - - hProdCache = 
_np.zeros((cacheSize,) + hessn_shape) - wrtIndices1 = _slct.indices(wrt_slice1) if (wrt_slice1 is not None) else None - wrtIndices2 = _slct.indices(wrt_slice2) if (wrt_slice2 is not None) else None - - for iDest, iRight, iLeft in eval_tree: - - #Special case of an "initial operation" that can be filled directly - if iRight is None: # then iLeft gives operation: - opLabel = iLeft - if opLabel is None: - hProdCache[iDest] = _np.zeros(hessn_shape) - elif not self.model.circuit_layer_operator(opLabel, 'op').has_nonzero_hessian(): - #all gate elements are at most linear in params, so - # all hessians for single- or zero-circuits are zero. - hProdCache[iDest] = _np.zeros(hessn_shape) - else: - hoperation = self._hoperation(opLabel, - wrt_filter1=wrtIndices1, - wrt_filter2=wrtIndices2) - hProdCache[iDest] = hoperation / _np.exp(scale_cache[iDest]) - continue - - # combine iLeft + iRight => i - # LEXICOGRAPHICAL VS MATRIX ORDER Note: we reverse iLeft <=> iRight from eval_tree because - # (Dest,iLeft,iRight,iFinal) = tup implies circuit[iDest] = circuit[iLeft] + circuit[iRight], but we want: - # since then matrixOf(circuit[i]) = matrixOf(circuit[iLeft]) * matrixOf(circuit[iRight]) - L, R = prod_cache[iLeft], prod_cache[iRight] - dL1, dR1 = d_prod_cache1[iLeft], d_prod_cache1[iRight] - dL2, dR2 = d_prod_cache2[iLeft], d_prod_cache2[iRight] - hL, hR = hProdCache[iLeft], hProdCache[iRight] - # Note: L, R = GxG ; dL,dR = vgs x GxG ; hL,hR = vgs x vgs x GxG - - dLdRa = _np.swapaxes(_np.dot(dL1, dR2), 1, 2) - dLdRb = _np.swapaxes(_np.dot(dL2, dR1), 1, 2) - dLdR_sym = dLdRa + _np.swapaxes(dLdRb, 0, 1) - - hProdCache[iDest] = _np.dot(hL, R) + dLdR_sym + _np.transpose(_np.dot(L, hR), (1, 2, 0, 3)) - - scale = scale_cache[iDest] - (scale_cache[iLeft] + scale_cache[iRight]) - if abs(scale) > 1e-8: # _np.isclose(scale,0) is SLOW! 
- hProdCache[iDest] /= _np.exp(scale) - if hProdCache[iDest].max() < _HSMALL and hProdCache[iDest].min() > -_HSMALL: - _warnings.warn("Scaled hProd small in order to keep prod managable.") - elif (_np.count_nonzero(hProdCache[iDest]) and hProdCache[iDest].max() < _HSMALL - and hProdCache[iDest].min() > -_HSMALL): - _warnings.warn("hProd is small (oh well!).") - - return hProdCache - - def create_layout(self, circuits, dataset=None, resource_alloc=None, array_types=('E',), - derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache= None): - """ - Constructs an circuit-outcome-probability-array (COPA) layout for a list of circuits. - - Parameters - ---------- - circuits : list - The circuits whose outcome probabilities should be included in the layout. - - dataset : DataSet - The source of data counts that will be compared to the circuit outcome - probabilities. The computed outcome probabilities are limited to those - with counts present in `dataset`. - - resource_alloc : ResourceAllocation - A available resources and allocation information. These factors influence how - the layout (evaluation strategy) is constructed. - - array_types : tuple, optional - A tuple of string-valued array types. See :meth:`ForwardSimulator.create_layout`. - - derivative_dimensions : int or tuple[int], optional - Optionally, the parameter-space dimension used when taking first - and second derivatives with respect to the cirucit outcome probabilities. This must be - non-None when `array_types` contains `'ep'` or `'epp'` types. - If a tuple, then must be length 1. - - verbosity : int or VerbosityPrinter - Determines how much output to send to stdout. 0 means no output, higher - integers mean more output. - - layout_creation_circuit_cache : dict, optional (default None) - A precomputed dictionary serving as a cache for completed - circuits. I.e. circuits with prep labels and POVM labels appended. 
- Along with other useful pre-computed circuit structures used in layout - creation. - - Returns - ------- - MatrixCOPALayout - """ - # There are two types of quantities we adjust to create a good layout: "group-counts" and "processor-counts" - # - group counts: natoms, nblks, nblks2 give how many indpendently computed groups/ranges of circuits, - # 1st parameters, and 2nd parameters are used. Making these larger can reduce memory - # consumption by reducing intermediate memory usage. - # - processor counts: na, np, np2 give how many "atom-processors", "param-processors" and "param2-processors" - # are used to process data along each given direction. These values essentially specify - # how the physical procesors are divided by giving the number of (roughly equal) intervals - # exist along each dimension of the physical processor "grid". Thus, thees values are set - # based on the total number of cores available and how many dimensions are being computed. - - resource_alloc = _ResourceAllocation.cast(resource_alloc) - mem_limit = resource_alloc.mem_limit - resource_alloc.allocated_memory \ - if (resource_alloc.mem_limit is not None) else None # *per-processor* memory limit - printer = _VerbosityPrinter.create_printer(verbosity, resource_alloc) - nprocs = resource_alloc.comm_size - comm = resource_alloc.comm - if isinstance(derivative_dimensions, int): - num_params = derivative_dimensions - elif isinstance(derivative_dimensions, tuple): - assert len(derivative_dimensions) == 1 - num_params = derivative_dimensions[0] - else: - num_params = self.model.num_params - C = 1.0 / (1024.0**3) - - if mem_limit is not None: - if mem_limit <= 0: - raise MemoryError("Attempted layout creation w/memory limit = %g <= 0!" 
% mem_limit) - printer.log("Layout creation w/mem limit = %.2fGB" % (mem_limit * C)) - - natoms, na, npp, param_dimensions, param_blk_sizes = self._compute_processor_distribution( - array_types, nprocs, num_params, len(circuits), default_natoms=1) - - if self._mode == "distribute_by_timestamp": - #Special case: time dependent data that gets grouped & distributed by unique timestamp - # To to this, we override above values of natoms, na, and npp: - natoms = 1 # save all processor division for within the (single) atom, for different timestamps - na, npp = 1, (1, 1) # save all processor division for within the (single) atom, for different timestamps - - printer.log("MatrixLayout: %d processors divided into %s (= %d) grid along circuit and parameter directions." % - (nprocs, ' x '.join(map(str, (na,) + npp)), _np.prod((na,) + npp))) - printer.log(" %d atoms, parameter block size limits %s" % (natoms, str(param_blk_sizes))) - assert(_np.prod((na,) + npp) <= nprocs), "Processor grid size exceeds available processors!" 
- - layout = _MatrixCOPALayout(circuits, self.model, dataset, natoms, - na, npp, param_dimensions, param_blk_sizes, resource_alloc, verbosity, - layout_creation_circuit_cache=layout_creation_circuit_cache) - - if mem_limit is not None: - loc_nparams1 = num_params / npp[0] if len(npp) > 0 else 0 - loc_nparams2 = num_params / npp[1] if len(npp) > 1 else 0 - blk1 = param_blk_sizes[0] if len(param_blk_sizes) > 0 else 0 - blk2 = param_blk_sizes[1] if len(param_blk_sizes) > 1 else 0 - if blk1 is None: blk1 = loc_nparams1 - if blk2 is None: blk2 = loc_nparams2 - global_layout = layout.global_layout - if comm is not None: - from mpi4py import MPI - max_local_els = comm.allreduce(layout.num_elements, op=MPI.MAX) # layout.max_atom_elements - max_atom_els = comm.allreduce(layout.max_atom_elements, op=MPI.MAX) - max_local_circuits = comm.allreduce(layout.num_circuits, op=MPI.MAX) - max_atom_cachesize = comm.allreduce(layout.max_atom_cachesize, op=MPI.MAX) - else: - max_local_els = layout.num_elements - max_atom_els = layout.max_atom_elements - max_local_circuits = layout.num_circuits - max_atom_cachesize = layout.max_atom_cachesize - mem_estimate = _bytes_for_array_types(array_types, global_layout.num_elements, max_local_els, max_atom_els, - global_layout.num_circuits, max_local_circuits, - layout._param_dimensions, (loc_nparams1, loc_nparams2), - (blk1, blk2), max_atom_cachesize, - self.model.evotype.minimal_dim(self.model.state_space)) - - GB = 1.0 / 1024.0**3 - if mem_estimate > mem_limit: - raise MemoryError("Not enough memory for desired layout! (limit=%.1fGB, required=%.1fGB)" % ( - mem_limit * GB, mem_estimate * GB)) - else: - printer.log(" Esimated memory required = %.1fGB" % (mem_estimate * GB)) - - return layout - - @staticmethod - def create_copa_layout_circuit_cache(circuits, model, dataset=None): - """ - Helper function for pre-computing/pre-processing circuits structures - used in matrix layout creation. 
- """ - cache = dict() - completed_circuits, split_circuits = model.complete_circuits(circuits, return_split=True) - - cache['completed_circuits'] = {ckt: comp_ckt for ckt, comp_ckt in zip(circuits, completed_circuits)} - cache['split_circuits'] = {ckt: split_ckt for ckt, split_ckt in zip(circuits, split_circuits)} - - if dataset is not None: - aliases = circuits.op_label_aliases if isinstance(circuits, _CircuitList) else None - ds_circuits = _lt.apply_aliases_to_circuits(circuits, aliases) - unique_outcomes_list = [] - for ckt in ds_circuits: - ds_row = dataset[ckt] - unique_outcomes_list.append(ds_row.unique_outcomes if ds_row is not None else None) - else: - unique_outcomes_list = [None]*len(circuits) - - expanded_circuit_outcome_list = model.bulk_expand_instruments_and_separate_povm(circuits, - observed_outcomes_list = unique_outcomes_list, - split_circuits = split_circuits) - - expanded_circuit_cache = {ckt: expanded_ckt for ckt,expanded_ckt in zip(circuits, expanded_circuit_outcome_list)} - - cache['expanded_and_separated_circuits'] = expanded_circuit_cache - - expanded_subcircuits_no_spam_cache = dict() - for expc_outcomes in cache['expanded_and_separated_circuits'].values(): - for sep_povm_c, _ in expc_outcomes.items(): # for each expanded cir from unique_i-th circuit - exp_nospam_c = sep_povm_c.circuit_without_povm[1:] - expanded_subcircuits_no_spam_cache[exp_nospam_c] = exp_nospam_c.expand_subcircuits() - - cache['expanded_subcircuits_no_spam'] = expanded_subcircuits_no_spam_cache - - return cache - - def _scale_exp(self, scale_exps): - old_err = _np.seterr(over='ignore') - scaleVals = _np.exp(scale_exps) # may overflow, but OK if infs occur here - _np.seterr(**old_err) - return scaleVals - - def _rho_e_from_spam_tuple(self, spam_tuple): - # This calculator uses the convention that rho has shape (N,1) - rholabel, elabel = spam_tuple - rho = self.model.circuit_layer_operator(rholabel, 'prep').to_dense(on_space='minimal')[:, None] - E = 
_np.conjugate(_np.transpose(self.model.circuit_layer_operator( - elabel, 'povm').to_dense(on_space='minimal')[:, None])) - return rho, E - - def _probs_from_rho_e(self, rho, e, gs, scale_vals): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - #Compute probability and save in return array - # want vp[iFinal] = float(dot(e, dot(G, rho))) - # vp[i] = sum_k,l e[0,k] gs[i,k,l] rho[l,0] * scale_vals[i] - # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] - # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] - # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals - return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals - # shape == (len(circuit_list),) ; may overflow but OK - - def _dprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs, scale_vals, wrt_slice=None): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - rholabel, elabel = spam_tuple - rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are - EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects - nCircuits = gs.shape[0] - - nDerivCols = self.model.num_params if wrt_slice is None else _slct.length(wrt_slice) - - # GATE DERIVS (assume d_gs is already sized/filtered) ------------------- - assert(d_gs.shape[1] == nDerivCols), "d_gs must be pre-filtered!" 
- - #Compute d(probability)/dOps and save in return list (now have G,dG => product, dprod_dOps) - # prod, dprod_dOps = G,dG - # dp_dOps[i,j] = sum_k,l e[0,k] d_gs[i,j,k,l] rho[l,0] - # dp_dOps[i,j] = sum_k e[0,k] dot( d_gs, rho )[i,j,k,0] - # dp_dOps[i,j] = dot( e, dot( d_gs, rho ) )[0,i,j,0] - # dp_dOps = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,3)) - old_err2 = _np.seterr(invalid='ignore', over='ignore') - path = _np.einsum_path('hk,ijkl,lm->ij', e, d_gs, rho, optimize='optimal') - dp_dOps = _np.einsum('hk,ijkl,lm->ij', e, d_gs, rho, optimize=path[0]) * scale_vals[:, None] - _np.seterr(**old_err2) - # may overflow, but OK ; shape == (len(circuit_list), nDerivCols) - # may also give invalid value due to scale_vals being inf and dot-prod being 0. In - # this case set to zero since we can't tell whether it's + or - inf anyway... - dp_dOps[_np.isnan(dp_dOps)] = 0 - - #SPAM ------------- - - if self.model._param_interposer is not None: - #When there is an interposer, we compute derivs wrt *all* the ops params (inefficient?), - # then apply interposer, then take desired wrt_filter columns: - nOpDerivCols = self.model._param_interposer.num_op_params - - dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) - _fas(dp_drhos, [None, rhoVec.gpindices], - _np.squeeze(_np.dot(_np.dot(e, gs), rhoVec.deriv_wrt_params()), # *don't* apply wrt filter here - axis=(0,)) * scale_vals[:, None]) # may overflow, but OK - dp_drhos = _np.dot(dp_drhos, self.model._param_interposer.deriv_op_params_wrt_model_params()) - if wrt_slice is not None: dp_drhos = dp_drhos[:, wrt_slice] - - dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) - dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] - _fas(dp_dEs, [None, EVec.gpindices], _np.dot(dp_dAnyE, EVec.deriv_wrt_params())) - dp_dEs = _np.dot(dp_dEs, self.model._param_interposer.deriv_op_params_wrt_model_params()) - if wrt_slice is not None: dp_dEs = dp_dEs[:, wrt_slice] - - else: - #Simpler case of no interposer - nOpDerivCols = 
nDerivCols - - rho_wrtFilter, rho_gpindices = self._process_wrt_filter( - wrt_slice, self.model.circuit_layer_operator(rholabel, 'prep')) - E_wrtFilter, E_gpindices = self._process_wrt_filter( - wrt_slice, self.model.circuit_layer_operator(elabel, 'povm')) - - # Get: dp_drhos[i, rho_gpindices] = dot(e,gs[i],drho/drhoP) - # dp_drhos[i,J0+J] = sum_kl e[0,k] gs[i,k,l] drhoP[l,J] - # dp_drhos[i,J0+J] = dot(e, gs, drhoP)[0,i,J] - # dp_drhos[:,J0+J] = squeeze(dot(e, gs, drhoP),axis=(0,))[:,J] - dp_drhos = _np.zeros((nCircuits, nOpDerivCols)) - _fas(dp_drhos, [None, rho_gpindices], - _np.squeeze(_np.dot(_np.dot(e, gs), - rhoVec.deriv_wrt_params(rho_wrtFilter)), - axis=(0,)) * scale_vals[:, None]) # may overflow, but OK - - # Get: dp_dEs[i, E_gpindices] = dot(transpose(dE/dEP),gs[i],rho)) - # dp_dEs[i,J0+J] = sum_lj dEPT[J,j] gs[i,j,l] rho[l,0] - # dp_dEs[i,J0+J] = sum_j dEP[j,J] dot(gs, rho)[i,j] - # dp_dEs[i,J0+J] = sum_j dot(gs, rho)[i,j,0] dEP[j,J] - # dp_dEs[i,J0+J] = dot(squeeze(dot(gs, rho),2), dEP)[i,J] - # dp_dEs[:,J0+J] = dot(squeeze(dot(gs, rho),axis=(2,)), dEP)[:,J] - dp_dEs = _np.zeros((nCircuits, nOpDerivCols)) - # may overflow, but OK (deriv w.r.t any of self.effects - independent of which) - dp_dAnyE = _np.squeeze(_np.dot(gs, rho), axis=(2,)) * scale_vals[:, None] - _fas(dp_dEs, [None, E_gpindices], - _np.dot(dp_dAnyE, EVec.deriv_wrt_params(E_wrtFilter))) - - sub_vdp = dp_drhos + dp_dEs + dp_dOps - return sub_vdp - - def _hprobs_from_rho_e(self, spam_tuple, rho, e, gs, d_gs1, d_gs2, h_gs, scale_vals, - wrt_slice1=None, wrt_slice2=None): - if self.model.evotype == "statevec": raise NotImplementedError("Unitary evolution not fully supported yet!") - - rholabel, elabel = spam_tuple - rhoVec = self.model.circuit_layer_operator(rholabel, 'prep') # distinct from rho,e b/c rho,e are - EVec = self.model.circuit_layer_operator(elabel, 'povm') # arrays, these are State/POVMEffect objects - nCircuits = gs.shape[0] - - rho_wrtFilter1, rho_gpindices1 = 
self._process_wrt_filter( - wrt_slice1, self.model.circuit_layer_operator(rholabel, 'prep')) - rho_wrtFilter2, rho_gpindices2 = self._process_wrt_filter( - wrt_slice2, self.model.circuit_layer_operator(rholabel, 'prep')) - E_wrtFilter1, E_gpindices1 = self._process_wrt_filter( - wrt_slice1, self.model.circuit_layer_operator(elabel, 'povm')) - E_wrtFilter2, E_gpindices2 = self._process_wrt_filter( - wrt_slice2, self.model.circuit_layer_operator(elabel, 'povm')) - - nDerivCols1 = self.model.num_params if wrt_slice1 is None else _slct.length(wrt_slice1) - nDerivCols2 = self.model.num_params if wrt_slice2 is None else _slct.length(wrt_slice2) - - #flt1 = self._get_filter_info(wrtSlices1) - #flt2 = self._get_filter_info(wrtSlices2) - - # GATE DERIVS (assume h_gs is already sized/filtered) ------------------- - assert(h_gs.shape[1] == nDerivCols1), "h_gs must be pre-filtered!" - assert(h_gs.shape[2] == nDerivCols2), "h_gs must be pre-filtered!" - - #Compute d2(probability)/dGates2 and save in return list - # d2pr_dOps2[i,j,k] = sum_l,m e[0,l] h_gs[i,j,k,l,m] rho[m,0] - # d2pr_dOps2[i,j,k] = sum_l e[0,l] dot( d_gs, rho )[i,j,k,l,0] - # d2pr_dOps2[i,j,k] = dot( e, dot( d_gs, rho ) )[0,i,j,k,0] - # d2pr_dOps2 = squeeze( dot( e, dot( d_gs, rho ) ), axis=(0,4)) - old_err2 = _np.seterr(invalid='ignore', over='ignore') - d2pr_dOps2 = _np.squeeze(_np.dot(e, _np.dot(h_gs, rho)), axis=(0, 4)) * scale_vals[:, None, None] - _np.seterr(**old_err2) - - # may overflow, but OK ; shape == (len(circuit_list), nDerivCols, nDerivCols) - # may also give invalid value due to scale_vals being inf and dot-prod being 0. In - # this case set to zero since we can't tell whether it's + or - inf anyway... - d2pr_dOps2[_np.isnan(d2pr_dOps2)] = 0 - - # SPAM DERIVS (assume d_gs1 and d_gs2 are already sized/filtered) -------- - assert(d_gs1.shape[1] == nDerivCols1), "d_gs1 must be pre-filtered!" - assert(d_gs2.shape[1] == nDerivCols2), "d_gs1 must be pre-filtered!" 
- - # Get: d2pr_drhos[i, j, rho_gpindices] = dot(e,d_gs[i,j],drho/drhoP)) - # d2pr_drhos[i,j,J0+J] = sum_kl e[0,k] d_gs[i,j,k,l] drhoP[l,J] - # d2pr_drhos[i,j,J0+J] = dot(e, d_gs, drhoP)[0,i,j,J] - # d2pr_drhos[:,:,J0+J] = squeeze(dot(e, d_gs, drhoP),axis=(0,))[:,:,J] - drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) - d2pr_drhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_drhos1, [None, None, rho_gpindices2], - _np.squeeze(_np.dot(_np.dot(e, d_gs1), drho), axis=(0,)) - * scale_vals[:, None, None]) # overflow OK - - # get d2pr_drhos where gate derivatives are wrt the 2nd set of gate parameters - if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? - assert(nDerivCols1 == nDerivCols2) - d2pr_drhos2 = _np.transpose(d2pr_drhos1, (0, 2, 1)) - else: - drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) - d2pr_drhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - _fas(d2pr_drhos2, [None, None, rho_gpindices1], - _np.squeeze(_np.dot(_np.dot(e, d_gs2), drho), axis=(0,)) - * scale_vals[:, None, None]) # overflow OK - d2pr_drhos2 = _np.transpose(d2pr_drhos2, (0, 2, 1)) - - # Get: d2pr_dEs[i, j, E_gpindices] = dot(transpose(dE/dEP),d_gs[i,j],rho) - # d2pr_dEs[i,j,J0+J] = sum_kl dEPT[J,k] d_gs[i,j,k,l] rho[l,0] - # d2pr_dEs[i,j,J0+J] = sum_k dEP[k,J] dot(d_gs, rho)[i,j,k,0] - # d2pr_dEs[i,j,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[i,j,J] - # d2pr_dEs[:,:,J0+J] = dot( squeeze(dot(d_gs, rho),axis=(3,)), dEP)[:,:,J] - d2pr_dEs1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - dp_dAnyE = _np.squeeze(_np.dot(d_gs1, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter2) - _fas(d2pr_dEs1, [None, None, E_gpindices2], - _np.dot(dp_dAnyE, devec)) - - # get d2pr_dEs where gate derivatives are wrt the 2nd set of gate parameters - if d_gs1 is d_gs2 and wrt_slice1 == wrt_slice2: # TODO: better check for equivalence: maybe let d_gs2 be None? 
- assert(nDerivCols1 == nDerivCols2) - d2pr_dEs2 = _np.transpose(d2pr_dEs1, (0, 2, 1)) - else: - d2pr_dEs2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - dp_dAnyE = _np.squeeze(_np.dot(d_gs2, rho), axis=(3,)) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter1) - _fas(d2pr_dEs2, [None, None, E_gpindices1], _np.dot(dp_dAnyE, devec)) - d2pr_dEs2 = _np.transpose(d2pr_dEs2, (0, 2, 1)) - - # Get: d2pr_dErhos[i, e_offset[eIndex]:e_offset[eIndex+1], e_offset[rhoIndex]:e_offset[rhoIndex+1]] = - # dEP^T * prod[i,:,:] * drhoP - # d2pr_dErhos[i,J0+J,K0+K] = sum jk dEPT[J,j] prod[i,j,k] drhoP[k,K] - # d2pr_dErhos[i,J0+J,K0+K] = sum j dEPT[J,j] dot(prod,drhoP)[i,j,K] - # d2pr_dErhos[i,J0+J,K0+K] = dot(dEPT,prod,drhoP)[J,i,K] - # d2pr_dErhos[i,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[i,J,K] - # d2pr_dErhos[:,J0+J,K0+K] = swapaxes(dot(dEPT,prod,drhoP),0,1)[:,J,K] - d2pr_dErhos1 = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - drho = rhoVec.deriv_wrt_params(rho_wrtFilter2) - dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter1) - _fas(d2pr_dErhos1, (None, E_gpindices1, rho_gpindices2), - _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) - - # get d2pr_dEs where e derivatives are wrt the 2nd set of gate parameters - if wrt_slice1 == wrt_slice2: # Note: this doesn't involve gate derivatives - d2pr_dErhos2 = _np.transpose(d2pr_dErhos1, (0, 2, 1)) - else: - d2pr_dErhos2 = _np.zeros((nCircuits, nDerivCols2, nDerivCols1)) - drho = rhoVec.deriv_wrt_params(rho_wrtFilter1) - dp_dAnyE = _np.dot(gs, drho) * scale_vals[:, None, None] # overflow OK - devec = EVec.deriv_wrt_params(E_wrtFilter2) - _fas(d2pr_dErhos2, [None, E_gpindices2, rho_gpindices1], - _np.swapaxes(_np.dot(_np.transpose(devec), dp_dAnyE), 0, 1)) - d2pr_dErhos2 = _np.transpose(d2pr_dErhos2, (0, 2, 1)) - - #Note: these 2nd derivatives are non-zero when the spam vectors have - # a more than linear 
dependence on their parameters. - if self.model.circuit_layer_operator(rholabel, 'prep').has_nonzero_hessian(): - dp_dAnyRho = _np.dot(e, gs).squeeze(0) * scale_vals[:, None] # overflow OK - d2pr_d2rhos = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_d2rhos, [None, rho_gpindices1, rho_gpindices2], - _np.tensordot(dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep').hessian_wrt_params( - rho_wrtFilter1, rho_wrtFilter2), (1, 0))) - # _np.einsum('ij,jkl->ikl', dp_dAnyRho, self.model.circuit_layer_operator(rholabel, 'prep') \ - # .hessian_wrt_params(rho_wrtFilter1, rho_wrtFilter2)) - else: - d2pr_d2rhos = 0 - - if self.model.circuit_layer_operator(elabel, 'povm').has_nonzero_hessian(): - dp_dAnyE = _np.dot(gs, rho).squeeze(2) * scale_vals[:, None] # overflow OK - d2pr_d2Es = _np.zeros((nCircuits, nDerivCols1, nDerivCols2)) - _fas(d2pr_d2Es, [None, E_gpindices1, E_gpindices2], - _np.tensordot(dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( - E_wrtFilter1, E_wrtFilter2), (1, 0))) - # _np.einsum('ij,jkl->ikl', dp_dAnyE, self.model.circuit_layer_operator(elabel, 'povm').hessian_wrt_params( - # E_wrtFilter1, E_wrtFilter2)) - else: - d2pr_d2Es = 0 - - # END SPAM DERIVS ----------------------- - - ret = d2pr_d2rhos + d2pr_dErhos2 + d2pr_drhos2 # wrt rho - ret += d2pr_dErhos1 + d2pr_d2Es + d2pr_dEs2 # wrt e - ret += d2pr_drhos1 + d2pr_dEs1 + d2pr_dOps2 # wrt gates - - return ret - - def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): - #Free memory from previous subtree iteration before computing caches - scaleVals = Gs = prodCache = scaleCache = None - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2) # prod cache - - #Fill cache info - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - - if not resource_alloc.is_host_leader: - # (same as "if resource_alloc.host_comm is 
not None and resource_alloc.host_comm.rank != 0") - # we cannot further utilize multiplie processors when computing a single block. The required - # ending condition is that array_to_fill on each processor has been filled. But if memory - # is being shared and resource_alloc contains multiple processors on a single host, we only - # want *one* (the rank=0) processor to perform the computation, since array_to_fill will be - # shared memory that we don't want to have muliple procs using simultaneously to compute the - # same thing. Thus, we just do nothing on all of the non-root host_comm processors. - # We could also print a warning (?), or we could carefully guard any shared mem updates - # using "if resource_alloc.is_host_leader" conditions (if we could use multiple procs elsewhere). - return - - #use cached data to final values - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - # ( n_circuits, dim, dim ) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - # "element indices" index a circuit outcome probability in array_to_fill's first dimension - # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond - # to the the element indices when `spamtuple` is used. 
- # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices], - self._probs_from_rho_e(rho, E, Gs[tree_indices], scaleVals[tree_indices])) - _np.seterr(**old_err) - - def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim * dim * _slct.length(param_slice)) - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - dProdCache = self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, - resource_alloc, param_slice) - if not resource_alloc.is_host_leader: - return # Non-root host processors aren't used anymore to compute the result on the root proc - - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - dGs = layout_atom.nonscratch_cache_view(dProdCache, axis=0) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices, dest_param_slice], self._dprobs_from_rho_e( - spam_tuple, rho, E, Gs[tree_indices], dGs[tree_indices], scaleVals[tree_indices], param_slice)) - - _np.seterr(**old_err) - - def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1, dest_param_slice2, layout_atom, - param_slice1, param_slice2, resource_alloc): - dim = self.model.evotype.minimal_dim(self.model.state_space) - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * dim**2 - * _slct.length(param_slice1) * _slct.length(param_slice2)) - prodCache, scaleCache = self._compute_product_cache(layout_atom.tree, resource_alloc) - dProdCache1 = self._compute_dproduct_cache( - 
layout_atom.tree, prodCache, scaleCache, resource_alloc, param_slice1) # computed on rank=0 only - dProdCache2 = dProdCache1 if (param_slice1 == param_slice2) else \ - self._compute_dproduct_cache(layout_atom.tree, prodCache, scaleCache, - resource_alloc, param_slice2) # computed on rank=0 only - hProdCache = self._compute_hproduct_cache(layout_atom.tree, prodCache, dProdCache1, - dProdCache2, scaleCache, resource_alloc, - param_slice1, param_slice2) # computed on rank=0 only - - if not resource_alloc.is_host_leader: - return # Non-root host processors aren't used anymore to compute the result on the root proc - - scaleVals = self._scale_exp(layout_atom.nonscratch_cache_view(scaleCache)) - Gs = layout_atom.nonscratch_cache_view(prodCache, axis=0) - dGs1 = layout_atom.nonscratch_cache_view(dProdCache1, axis=0) - dGs2 = layout_atom.nonscratch_cache_view(dProdCache2, axis=0) - #( n_circuits, nDerivColsX, dim, dim ) - - hGs = layout_atom.nonscratch_cache_view(hProdCache, axis=0) - #( n_circuits, len(wrt_filter1), len(wrt_filter2), dim, dim ) - - old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices, dest_param_slice1, dest_param_slice2], self._hprobs_from_rho_e( - spam_tuple, rho, E, Gs[tree_indices], dGs1[tree_indices], dGs2[tree_indices], - hGs[tree_indices], scaleVals[tree_indices], param_slice1, param_slice2)) - - _np.seterr(**old_err) - - def bulk_product(self, circuits, scale=False, resource_alloc=None): - """ - Compute the products of many circuits at once. - - Parameters - ---------- - circuits : list of Circuits - The circuits to compute products for. These should *not* have any preparation or - measurement layers. - - scale : bool, optional - When True, return a scaling factor (see below). - - resource_alloc : ResourceAllocation - Available resources for this computation. 
Includes the number of processors - (MPI comm) and memory limit. - - Returns - ------- - prods : numpy array - Array of shape S x G x G, where: - - S == the number of operation sequences - - G == the linear dimension of a operation matrix (G x G operation matrices). - scaleValues : numpy array - Only returned when scale == True. A length-S array specifying - the scaling that needs to be applied to the resulting products - (final_product[i] = scaleValues[i] * prods[i]). - """ - resource_alloc = _ResourceAllocation.cast(resource_alloc) - - # Need to break these circuits down into lanes first. - def compute_subcircuits(circuit, lanes_to_qubits_used, qubits_to_lanes): - - lanes_to_gates = [[] for _ in range(len(lanes_to_qubits_used))] - for layer in circuit: - if isinstance(layer, LabelTupTup): - group = [] - nused = 0 - for op in layer: - qubits_used = op.qubits - lane = qubits_to_lanes[qubits_used[0]] - if nused + len(qubits_used) == len(lanes_to_qubits_used[lane]): - group.append(op) - lanes_to_gates[lane].append(LabelTupTup(tuple(group))) - nused = 0 - group = [] - elif nused + len(qubits_used) < len(lanes_to_qubits_used[lane]): - nused += len(qubits_used) - group.append(op) - else: - raise ValueError("Too many indices") - elif isinstance(layer, LabelTup): - qubits_used = layer.qubits - lanes_to_gates[qubits_to_lanes[qubits_used[0]]] = layer - return lanes_to_gates - - full_list = [] - for cir in circuits: - full_list.append(compute_subcircuits(cir, self._lanes_used, self._qubits_to_lanes)) - - - nCircuits = len(circuits) - - eval_tree = _EvalTree.create(full_list) - prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) - - # Now the cache will also hold the circuit lanes. - # So 0:nCircuits*nLanes will hold all the Gs. 
- # Tensor back up in a [(lane)*nLanes, (lane+1)*nLanes] - - sval = _np.zeros(len(circuits)) - gates = [1 for _ in circuits] - - for ind in range(): - for lane in range(len(self._lanes_used)): - gates[ind] = _np.kron(gates[ind], prodCache[lane + ind*len(self._lanes_used)]) - sval[ind] += scaleCache[lane + ind*len(self._lanes_used)] - - gates = _np.array(gates) - old_err = _np.seterr(over="ignore") - gates *= _np.exp(sval)[:, None, None] - _np.seterr(**old_err) - - - # EvalTree evaluates a "cache" which can contain additional (intermediate) elements - scaleVals = self._scale_exp(scaleCache[0:nCircuits]) - Gs = prodCache[0:nCircuits] - - if scale: - return Gs, scaleVals - else: - old_err = _np.seterr(over='ignore') - Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok - _np.seterr(**old_err) - return Gs - - def bulk_dproduct(self, circuits, flat=False, return_prods=False, - scale=False, resource_alloc=None, wrt_filter=None): - """ - Compute the derivative of a many operation sequences at once. - - Parameters - ---------- - circuits : list of Circuits - The circuits to compute products for. These should *not* have any preparation or - measurement layers. - - flat : bool, optional - Affects the shape of the returned derivative array (see below). - - return_prods : bool, optional - when set to True, additionally return the probabilities. - - scale : bool, optional - When True, return a scaling factor (see below). - - resource_alloc : ResourceAllocation - Available resources for this computation. Includes the number of processors - (MPI comm) and memory limit. - - wrt_filter : list of ints, optional - If not None, a list of integers specifying which gate parameters - to include in the derivative. Each element is an index into an - array of gate parameters ordered by concatenating each gate's - parameters (in the order specified by the model). 
This argument - is used internally for distributing derivative calculations across - multiple processors. - - Returns - ------- - derivs : numpy array - * if flat == False, an array of shape S x M x G x G, where: - - S == len(circuits) - - M == the length of the vectorized model - - G == the linear dimension of a operation matrix (G x G operation matrices) - and derivs[i,j,k,l] holds the derivative of the (k,l)-th entry - of the i-th operation sequence product with respect to the j-th model - parameter. - * if flat == True, an array of shape S*N x M where: - - N == the number of entries in a single flattened gate (ordering same as numpy.flatten), - - S,M == as above, - and deriv[i,j] holds the derivative of the (i % G^2)-th entry of - the (i / G^2)-th flattened operation sequence product with respect to - the j-th model parameter. - products : numpy array - Only returned when return_prods == True. An array of shape - S x G x G; products[i] is the i-th operation sequence product. - scaleVals : numpy array - Only returned when scale == True. An array of shape S such that - scaleVals[i] contains the multiplicative scaling needed for - the derivatives and/or products for the i-th operation sequence. - """ - nCircuits = len(circuits) - nDerivCols = self.model.num_params if (wrt_filter is None) else _slct.length(wrt_filter) - - wrtSlice = _slct.list_to_slice(wrt_filter) if (wrt_filter is not None) else None - #TODO: just allow slices as argument: wrt_filter -> wrtSlice? 
- - resource_alloc = _ResourceAllocation.cast(resource_alloc) - - eval_tree = _EvalTree.create(circuits) - prodCache, scaleCache = self._compute_product_cache(eval_tree, resource_alloc.comm) - dProdCache = self._compute_dproduct_cache(eval_tree, prodCache, scaleCache, - resource_alloc.comm, wrtSlice) - - # EvalTree evaluates a "cache" which can contain additional (intermediate) elements - scaleVals = self._scale_exp(scaleCache[0:nCircuits]) - Gs = prodCache[0:nCircuits] - dGs = dProdCache[0:nCircuits] - - if not scale: - old_err = _np.seterr(over='ignore', invalid='ignore') - if return_prods: - Gs = _np.swapaxes(_np.swapaxes(Gs, 0, 2) * scaleVals, 0, 2) # may overflow, but ok - - # may overflow or get nans (invalid), but ok - dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 3) * scaleVals, 0, 3) - # convert nans to zero, as these occur b/c an inf scaleVal is mult by a zero deriv value, and we - dGs[_np.isnan(dGs)] = 0 - _np.seterr(**old_err) - - if flat: - # cols = deriv cols, rows = flattened everything else - dim = self.model.evotype.minimal_dim(self.model.state_space) - dGs = _np.swapaxes(_np.swapaxes(dGs, 0, 1).reshape( - (nDerivCols, nCircuits * dim**2)), 0, 1) - - if return_prods: - return (dGs, Gs, scaleVals) if scale else (dGs, Gs) - else: - return (dGs, scaleVals) if scale else dGs - - ## --------------------------------------------------------------------------------------------- - ## TIME DEPENDENT functionality ---------------------------------------------------------------- - ## --------------------------------------------------------------------------------------------- - - def _ds_quantities(self, timestamp, ds_cache, layout, dataset, TIMETOL=1e-6): - if timestamp not in ds_cache: - if 'truncated_ds' not in ds_cache: - ds_cache['truncated_ds'] = dataset.truncate(layout.circuits) - trunc_dataset = ds_cache['truncated_ds'] - - if 'ds_for_time' not in ds_cache: - #tStart = _time.time() - ds_cache['ds_for_time'] = trunc_dataset.split_by_time() - #print("DB: Split 
dataset by time in %.1fs (%d timestamps)" % (_time.time() - tStart, - # len(ds_cache['ds_for_time']))) - - if timestamp not in ds_cache['ds_for_time']: - return (None, None, None, None, None) - - #Similar to MDC store's add_count_vectors function -- maybe consolidate in FUTURE? - counts = _np.empty(layout.num_elements, 'd') - totals = _np.empty(layout.num_elements, 'd') - dataset_at_t = ds_cache['ds_for_time'][timestamp] # trunc_dataset.time_slice(timestamp, timestamp+TIMETOL) - - firsts = []; indicesOfCircuitsWithOmittedData = [] - for (i, circuit) in enumerate(layout.circuits): # should be 'ds_circuits' really - inds = layout.indices_for_index(i) - if circuit in dataset_at_t: - cnts = dataset_at_t[circuit].counts - else: - cnts = {} # Note: this will cause 0 totals, which will need to be handled downstream - totals[inds] = sum(cnts.values()) # dataset[opStr].total - counts[inds] = [cnts.get(x, 0) for x in layout.outcomes_for_index(i)] - lklen = _slct.length(inds) # consolidate w/ `add_omitted_freqs`? 
- if 0 < lklen < self.model.compute_num_outcomes(circuit): - firsts.append(_slct.to_array(inds)[0]) - indicesOfCircuitsWithOmittedData.append(i) - - if len(firsts) > 0: - firsts = _np.array(firsts, 'i') - indicesOfCircuitsWithOmittedData = _np.array(indicesOfCircuitsWithOmittedData, 'i') - #print("DB: SPARSE DATA: %d of %d rows have sparse data" % (len(firsts), len(layout.circuits))) - else: - firsts = indicesOfCircuitsWithOmittedData = None - - #if self.circuits.circuit_weights is not None: - # SEE add_count_vectors - - nonzero_totals = _np.where(_np.abs(totals) < 1e-10, 1e-10, totals) # avoid divide-by-zero error on nxt line - freqs = counts / nonzero_totals - ds_cache[timestamp] = (counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData) - - return ds_cache[timestamp] - - def _bulk_fill_timedep_objfn(self, raw_objective, array_to_fill, layout, ds_circuits, - num_total_outcomes, dataset, ds_cache=None): - - assert(self._mode == "distribute_by_timestamp"), \ - ("Must set `distribute_by_timestamp=True` to use a " - "time-dependent objective function with MatrixForwardSimulator!") - - resource_alloc = layout.resource_alloc() - atom_resource_alloc = layout.resource_alloc('atom-processing') - atom_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin - - #Split timestamps up between processors - maybe do this in a time-dep layout? 
- all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} - my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ - _mpit.distribute_indices(list(range(len(all_timestamps))), atom_resource_alloc) - shared_mem_leader = timestamp_processing_ralloc.is_host_leader - - probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements,), 'd') - # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then - # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since - #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. - - array_to_fill[:] = 0.0 - my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results - assert(my_array_to_fill.shape == (layout.num_elements,)) - - for timestamp_index in my_timestamp_inds: - timestamp = all_timestamps[timestamp_index] - - # compute objective at time timestamp - counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ - self._ds_quantities(timestamp, ds_cache, layout, dataset) - if counts is None: return # no data at this time => no contribution - - for _, obj in self.model._iter_parameterized_objs(): - obj.set_time(timestamp) - for opcache in self.model._opcaches.values(): - for obj in opcache.values(): - obj.set_time(timestamp) - - for atom in layout.atoms: # layout only holds local atoms - self._bulk_fill_probs_atom(probs_array[atom.element_slice], atom, timestamp_processing_ralloc) - - timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready - # (similar to DistributableForwardSimulator._bulk_fill_probs) - - terms = raw_objective.terms(probs_array, counts, totals, freqs) - if firsts is not None and shared_mem_leader: # consolidate with `_update_terms_for_omitted_probs` - omitted_probs = 1.0 - 
_np.array([_np.sum(probs_array[layout.indices_for_index(i)]) - for i in indicesOfCircuitsWithOmittedData]) - terms[firsts] += raw_objective.zero_freq_terms(totals[firsts], omitted_probs) - timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem - - my_array_to_fill += terms - - #collect/gather results (SUM local arrays together) - resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) - - _smt.cleanup_shared_ndarray(probs_array_shm) - - def _bulk_fill_timedep_dobjfn(self, raw_objective, array_to_fill, layout, ds_circuits, - num_total_outcomes, dataset, ds_cache=None): - - assert(self._mode == "distribute_by_timestamp"), \ - ("Must set `distribute_by_timestamp=True` to use a " - "time-dependent objective function with MatrixForwardSimulator!") - - resource_alloc = layout.resource_alloc() - param_resource_alloc = layout.resource_alloc('param-processing') - param_resource_alloc.host_comm_barrier() # ensure all procs have finished w/shared memory before we begin - - #Split timestamps up between processors - maybe do this in a time-dep layout? 
- all_timestamps = {i: t for i, t in enumerate(dataset.timestamps)} - my_timestamp_inds, timestampOwners, timestamp_processing_ralloc = \ - _mpit.distribute_indices(list(range(len(all_timestamps))), param_resource_alloc) - shared_mem_leader = timestamp_processing_ralloc.is_host_leader - - probs_array, probs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements,), 'd') - dprobs_array, dprobs_array_shm = _smt.create_shared_ndarray(timestamp_processing_ralloc, - (layout.num_elements, self.model.num_params), 'd') - # Allocated this way b/c, e.g., say we have 4 procs on a single node and 2 timestamps: then - # timestamp_processing_ralloc will have 2 procs and only the first will fill probs_array below since - #_bulk_fill_probs_atom assumes it's given shared mem allocated using the resource alloc object it's given. - - array_to_fill[:] = 0.0 - my_array_to_fill = _np.zeros(array_to_fill.shape, 'd') # purely local array to accumulate results - all_param_slice = slice(0, self.model.num_params) # All params computed at once for now - assert(my_array_to_fill.shape == (layout.num_elements, self.model.num_params)) - - for timestamp_index in my_timestamp_inds: - timestamp = all_timestamps[timestamp_index] - # compute objective at time layout_atom.time - #print("DB: Rank %d : layout atom for t=" % resource_alloc.comm.rank, layout_atom.timestamp) - - counts, totals, freqs, firsts, indicesOfCircuitsWithOmittedData = \ - self._ds_quantities(timestamp, ds_cache, layout, dataset) - - for _, obj in self.model._iter_parameterized_objs(): - obj.set_time(timestamp) - for opcache in self.model._opcaches.values(): - for obj in opcache.values(): - obj.set_time(timestamp) - - for atom in layout.atoms: # layout only holds local atoms - self._bulk_fill_probs_atom(probs_array, atom, timestamp_processing_ralloc) - self._bulk_fill_dprobs_atom(dprobs_array, all_param_slice, atom, - all_param_slice, timestamp_processing_ralloc) - - 
timestamp_processing_ralloc.host_comm_barrier() # don't exit until all proc's array_to_fill is ready - # (similar to DistributableForwardSimulator._bulk_fill_probs) - - if shared_mem_leader: - if firsts is not None: # consolidate with TimeIndependentMDCObjectiveFunction.dterms? - dprobs_omitted_rowsum = _np.empty((len(firsts), self.model.num_params), 'd') - for ii, i in enumerate(indicesOfCircuitsWithOmittedData): - dprobs_omitted_rowsum[ii, :] = _np.sum(dprobs_array[layout.indices_for_index(i), :], axis=0) - - dprobs_array *= raw_objective.dterms(probs_array, counts, totals, freqs)[:, None] - - if firsts is not None: # consolidate with _update_dterms_for_omitted_probs? - omitted_probs = 1.0 - _np.array([_np.sum(probs_array[layout.indices_for_index(i)]) - for i in indicesOfCircuitsWithOmittedData]) - dprobs_array[firsts] -= raw_objective.zero_freq_dterms(totals[firsts], omitted_probs)[:, None] \ - * dprobs_omitted_rowsum - timestamp_processing_ralloc.host_comm_barrier() # have non-leader procs wait for leaders to set shared mem - - my_array_to_fill += dprobs_array - - #collect/gather results (SUM local arrays together) - resource_alloc.allreduce_sum(array_to_fill, my_array_to_fill, unit_ralloc=timestamp_processing_ralloc) - - _smt.cleanup_shared_ndarray(probs_array_shm) - _smt.cleanup_shared_ndarray(dprobs_array_shm) - - def bulk_fill_timedep_chi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip_for_weighting, prob_clip_interval, ds_cache=None): - """ - Compute the chi2 contributions for an entire tree of circuits, allowing for time dependent operations. - - Computation is performed by summing together the contributions for each time the circuit is - run, as given by the timestamps in `dataset`. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated 1D numpy array of length equal to the - total number of computed elements (i.e. 
layout.num_elements) - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the chi2 contributions. - - min_prob_clip_for_weighting : float, optional - Sets the minimum and maximum probability p allowed in the chi^2 - weights: N/(p*(1-p)) by clipping probability p values to lie within - the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function - raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, - layout.resource_alloc()) - return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_dchi2(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip_for_weighting, prob_clip_interval, chi2_array_to_fill=None, - ds_cache=None): - """ - Compute the chi2 jacobian contributions for an entire tree of circuits, allowing for time dependent operations. 
- - Similar to :meth:`bulk_fill_timedep_chi2` but compute the *jacobian* - of the summed chi2 contributions for each circuit with respect to the - model's parameters. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated ExM numpy array where E is the total number of - computed elements (i.e. layout.num_elements) and M is the - number of model parameters. - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the chi2 contributions. - - min_prob_clip_for_weighting : float, optional - Sets the minimum and maximum probability p allowed in the chi^2 - weights: N/(p*(1-p)) by clipping probability p values to lie within - the interval [ min_prob_clip_for_weighting, 1-min_prob_clip_for_weighting ]. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - chi2_array_to_fill : numpy array, optional - when not None, an already-allocated length-E numpy array that is filled - with the per-circuit chi2 contributions, just like in - bulk_fill_timedep_chi2(...). 
- - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawChi2Function as _RawChi2Function - raw_obj = _RawChi2Function({'min_prob_clip_for_weighting': min_prob_clip_for_weighting}, - layout.resource_alloc()) - return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_loglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip, radius, prob_clip_interval, ds_cache=None): - """ - Compute the log-likelihood contributions (within the "poisson picture") for an entire tree of circuits. - - Computation is performed by summing together the contributions for each time the circuit is run, - as given by the timestamps in `dataset`. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated 1D numpy array of length equal to the - total number of computed elements (i.e. layout.num_elements) - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the logl contributions. - - min_prob_clip : float, optional - The minimum probability treated normally in the evaluation of the - log-likelihood. 
A penalty function replaces the true log-likelihood - for probabilities that lie below this threshold so that the - log-likelihood never becomes undefined (which improves optimizer - performance). - - radius : float, optional - Specifies the severity of rounding used to "patch" the - zero-frequency terms of the log-likelihood. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction - raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, - layout.resource_alloc()) - return self._bulk_fill_timedep_objfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - def bulk_fill_timedep_dloglpp(self, array_to_fill, layout, ds_circuits, num_total_outcomes, dataset, - min_prob_clip, radius, prob_clip_interval, logl_array_to_fill=None, ds_cache=None): - """ - Compute the ("poisson picture")log-likelihood jacobian contributions for an entire tree of circuits. - - Similar to :meth:`bulk_fill_timedep_loglpp` but compute the *jacobian* - of the summed logl (in posison picture) contributions for each circuit - with respect to the model's parameters. - - Parameters - ---------- - array_to_fill : numpy ndarray - an already-allocated ExM numpy array where E is the total number of - computed elements (i.e. layout.num_elements) and M is the - number of model parameters. - - layout : CircuitOutcomeProbabilityArrayLayout - A layout for `array_to_fill`, describing what circuit outcome each - element corresponds to. Usually given by a prior call to :meth:`create_layout`. - - ds_circuits : list of Circuits - the circuits to use as they should be queried from `dataset` (see - below). 
This is typically the same list of circuits used to - construct `layout` potentially with some aliases applied. - - num_total_outcomes : list or array - a list of the total number of *possible* outcomes for each circuit - (so `len(num_total_outcomes) == len(ds_circuits_to_use)`). This is - needed for handling sparse data, where `dataset` may not contain - counts for all the possible outcomes of each circuit. - - dataset : DataSet - the data set used to compute the logl contributions. - - min_prob_clip : float - a regularization parameter for the log-likelihood objective function. - - radius : float - a regularization parameter for the log-likelihood objective function. - - prob_clip_interval : 2-tuple or None, optional - (min,max) values used to clip the predicted probabilities to. - If None, no clipping is performed. - - logl_array_to_fill : numpy array, optional - when not None, an already-allocated length-E numpy array that is filled - with the per-circuit logl contributions, just like in - bulk_fill_timedep_loglpp(...). - - Returns - ------- - None - """ - from pygsti.objectivefns.objectivefns import RawPoissonPicDeltaLogLFunction as _RawPoissonPicDeltaLogLFunction - raw_obj = _RawPoissonPicDeltaLogLFunction({'min_prob_clip': min_prob_clip, 'radius': radius}, - layout.resource_alloc()) - return self._bulk_fill_timedep_dobjfn(raw_obj, array_to_fill, layout, ds_circuits, num_total_outcomes, - dataset, ds_cache) - - class LCSEvalTreeMatrixForwardSimulator(MatrixForwardSimulator): def bulk_product(self, circuits, scale=False, resource_alloc=None): From 0f560f7144583ef361e1107341f2c20ee4ed139a Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 16:47:02 -0700 Subject: [PATCH 065/141] Regions in evaltree. 
--- pygsti/layouts/evaltree.py | 54 +++++++++++++++----------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 44c82fec9..8886078e3 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -453,7 +453,7 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists - +#region Longest Common Subsequence def _best_matching_only(A: Sequence, B: Sequence) -> int: """ @@ -472,7 +472,7 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: -def _lcs_dp_version(A, B): +def _lcs_dp_version(A: Sequence, B: Sequence): """ Compute the longest common substring between A and B using dynamic programming. @@ -500,7 +500,7 @@ def setup_lcs_dynamic_programming_table(A, B): """ return _np.zeros((len(A) + 1, len(B) + 1)) -def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): +def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): if table_data_and_sequences: table, sequences = table_data_and_sequences else: @@ -575,7 +575,7 @@ def build_one_round_of_eval_tree(circuits, table_data_and_sequences, internal_ta return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round -def locate_sequences_in_AB(A, B, dp_table) -> tuple[int, int, int]: +def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: """ Finds the indices of the starting points of the sequences in A and B. 
@@ -624,7 +624,7 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): if len(cir1) >= curr_best: table = _lcs_dp_version(cir0, cir1) best_lengths[i,j] = table[0,0] - best_subsequences[(i,j)] = locate_sequences_in_AB(cir0, cir1, table) + best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) curr_best = max(best_lengths[i,j], curr_best) else: best_lengths[i,j] = -1 @@ -636,10 +636,12 @@ def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): return best_lengths, best_subsequences -def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, list[int]]]: +def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: """ Compute the longest common subsequence within a single circuit A. + Cost ~ O(L^3 / 8) where L is the length of A + Returns: --------- int - length of longest common subsequences within A @@ -670,6 +672,8 @@ def _longest_common_internal_subsequence(A: _Circuit) -> tuple[int, dict[tuple, def build_internal_tables(circuit_list): """ Compute all the longest common internal sequences for each circuit A in circuit_list + + Total cost is O(C L^3). """ C = len(circuit_list) @@ -683,6 +687,10 @@ def build_internal_tables(circuit_list): curr_best = max(curr_best, the_table[i]) return the_table, seq_table + +#endregion Longest Common Subsequence + +#region Split circuit list into lists of subcircuits def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: """ Add in explicit idles to the labels for each layer. @@ -793,31 +801,6 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates - -def _split_circuits_by_lanes(circuit_list): - # First eliminate the duplicate circuits. 
- - unique_circuits = [] - matching_inds: dict[int, set[int]] = {} - C = len(circuit_list) - seen_circs: dict[tuple[LabelTupTup, int]] = {} - cache = {i: circuit_list[i] for i in range(len(circuit_list))} - for i in range(C): - my_cir = circuit_list[i] - if tuple(my_cir) in seen_circs: - cache[i] = seen_circs[tuple(my_cir)] - else: - seen_circs[tuple(my_cir)] = i - - labels_to_circuits = {} - for my_cir in seen_circs: - line_labels = _Circuit(my_cir)._line_labels - if line_labels in labels_to_circuits: - labels_to_circuits[line_labels].append(my_cir) - else: - labels_to_circuits[line_labels] = [my_cir] - - def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], @@ -869,6 +852,11 @@ def setup_circuit_list_for_LCS_computations( # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list +#endregion Split Circuits by lanes helpers + + +#region Lane Collapsing Helpers + def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: """ Look up the dense representation of a gate in the model. @@ -910,7 +898,7 @@ def combine_two_gates(cumulative_term, next_dense_matrix): which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. 
""" return next_dense_matrix @ cumulative_term - +#endregion Lane Collapsing Helpers class EvalTreeBasedUponLongestCommonSubstring(): @@ -943,7 +931,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) i = 0 while max_rounds > 1: - new_circuit_list, cache_pos, cache, sequence_intro[i+1] = build_one_round_of_eval_tree(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = _conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) i += 1 external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) From 248fa7e89bedda482d282c2424508167727ab902 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 16:52:23 -0700 Subject: [PATCH 066/141] inline a function --- pygsti/layouts/evaltree.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 8886078e3..812ea56e5 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -477,12 +477,9 @@ def _lcs_dp_version(A: Sequence, B: Sequence): Compute the longest common substring between A and B using dynamic programming. - This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. - """ - - table = setup_lcs_dynamic_programming_table(A, B) + table = _np.zeros((len(A) + 1, len(B) + 1)) n, m = table.shape for i in range(n-2, -1, -1): for j in range(m-2, -1, -1): @@ -494,11 +491,6 @@ def _lcs_dp_version(A: Sequence, B: Sequence): table[i,j] = max(opt1, opt2, opt3) return table -def setup_lcs_dynamic_programming_table(A, B): - """ - Create the table used for LCS dynamic programming. 
- """ - return _np.zeros((len(A) + 1, len(B) + 1)) def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): if table_data_and_sequences: From c9b1adfc314d31b1b8d2294251d8256ab2180863 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 10 Jul 2025 17:08:08 -0700 Subject: [PATCH 067/141] whitespace --- pygsti/layouts/evaltree.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 812ea56e5..3cbb3b144 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -453,6 +453,7 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists + #region Longest Common Subsequence def _best_matching_only(A: Sequence, B: Sequence) -> int: @@ -471,7 +472,6 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: return len(A[:i]) - def _lcs_dp_version(A: Sequence, B: Sequence): """ Compute the longest common substring between A and B using @@ -567,6 +567,7 @@ def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round + def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: """ Finds the indices of the starting points of the sequences in A and B. 
@@ -600,6 +601,7 @@ def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: return i-1, j-1, dp_table[i,j] return None, None, None + def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): """ Computes the LCS for every pair of circuits A,B in circuit_list @@ -661,6 +663,7 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, return best, best_ind return best, best_ind + def build_internal_tables(circuit_list): """ Compute all the longest common internal sequences for each circuit A in circuit_list @@ -679,10 +682,11 @@ def build_internal_tables(circuit_list): curr_best = max(curr_best, the_table[i]) return the_table, seq_table - #endregion Longest Common Subsequence + #region Split circuit list into lists of subcircuits + def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") -> _Circuit: """ Add in explicit idles to the labels for each layer. @@ -756,7 +760,6 @@ def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, i return compute_qubits_to_lanes(lanes), lanes - def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: """ Split a circuit into multiple subcircuits which do not talk across lanes. 
@@ -793,6 +796,7 @@ def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[ return lanes_to_gates + def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], @@ -864,6 +868,7 @@ def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: else: raise ValueError("Missing attribute") + def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 if op.num_qubits == 2: @@ -881,6 +886,7 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s op_term = model_and_gate_to_dense_rep(model, op) return op_term + def combine_two_gates(cumulative_term, next_dense_matrix): """ Note that the visual representation was @@ -890,8 +896,10 @@ def combine_two_gates(cumulative_term, next_dense_matrix): which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. """ return next_dense_matrix @ cumulative_term + #endregion Lane Collapsing Helpers + class EvalTreeBasedUponLongestCommonSubstring(): def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): @@ -984,7 +992,6 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) # Assumes a perfect swap gate! # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ Construct a tree from another tree. @@ -1027,9 +1034,6 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q updated[new_cir] = loc self.circuit_to_save_location = updated - - - def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int): """ Compute the total product cache. 
Note that this may still have a tensor product @@ -1119,7 +1123,6 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: return list(output) - """ def _evaluate_product_rule(self, cind: int, rn: int): @@ -1178,7 +1181,7 @@ def _evaluate_product_rule(self, cind: int, rn: int): else: cumulative_term = val @ cumulative_term """ - + class CollectionOfLCSEvalTrees(): @@ -1291,7 +1294,6 @@ def compute_tensor_orders(self): return - def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): From 75bb7980e39172a36549720cac4b6d3e95a4a2c7 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 17:13:45 -0700 Subject: [PATCH 068/141] Get the dense operator in the minimal space required for a gate operation. --- pygsti/models/explicitmodel.py | 28 +++++++++++++++++++++++++++- pygsti/models/layerrules.py | 19 ++++++++++++++++++- pygsti/models/localnoisemodel.py | 23 +++++++++++++++++++++-- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index e8cb3511c..7356db6a3 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -34,7 +34,7 @@ from pygsti.modelmembers.operations import opfactory as _opfactory from pygsti.baseobjs.basis import Basis as _Basis from pygsti.baseobjs.basis import BuiltinBasis as _BuiltinBasis, DirectSumBasis as _DirectSumBasis -from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel +from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTup as _LabelTup from pygsti.baseobjs import statespace as _statespace from pygsti.tools import basistools as _bt from pygsti.tools import jamiolkowski as _jt @@ -45,6 +45,8 @@ from pygsti.tools import listtools as _lt from pygsti.tools.legacytools import deprecate as _deprecated_fn +from pygsti.modelmembers.operations import EmbeddedOp as _EmbeddedOp, ComposedOp as _ComposedOp + class Roster: def 
__init__(self, arg): @@ -1788,3 +1790,27 @@ def operation_layer_operator(self, model, layerlbl, caches): return model.operations[layerlbl] else: return _opfactory.op_from_factories(model.factories, layerlbl) + + def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. + + Returns + ---------- + _np.ndarray + """ + + operation = model.operations["gates"][lbl] + + if isinstance(operation, _EmbeddedOp): + return operation.embedded_op.to_dense() + elif isinstance(operation, _ComposedOp): + breakpoint() + return operation.to_dense('minimal') + diff --git a/pygsti/models/layerrules.py b/pygsti/models/layerrules.py index d1fcd9357..5060d3087 100644 --- a/pygsti/models/layerrules.py +++ b/pygsti/models/layerrules.py @@ -13,7 +13,8 @@ from pygsti.modelmembers import operations as _op from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable - +from pygsti.baseobjs.label import LabelTup as _LabelTup +from pygsti.models.model import OpModel as _OpModel class LayerRules(_NicelySerializable): """ @@ -110,3 +111,19 @@ def operation_layer_operator(self, model, layerlbl, cache): """ #raise KeyError(f"Cannot create operator for non-primitive layer: {layerlbl}") raise KeyError("Cannot create operator for non-primitive layer: %s" % str(layerlbl)) + + def get_dense_process_matrix_represention_for_gate(self, model: _OpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. 
+ + Returns + ---------- + _np.ndarray + """ + raise KeyError("Cannot find a dense operator for layer: %s" % str(lbl)) diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index cc46bb770..d9e21ceee 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -37,6 +37,7 @@ from pygsti.tools import optools as _ot from pygsti.tools import listtools as _lt from pygsti.processors.processorspec import ProcessorSpec as _ProcessorSpec, QubitProcessorSpec as _QubitProcessorSpec +from pygsti.baseobjs.label import LabelTup as _LabelTup class LocalNoiseModel(_ImplicitOpModel): @@ -171,7 +172,7 @@ def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None, idle_names = processor_spec.idle_gate_names global_idle_layer_label = processor_spec.global_idle_layer_label - layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label) + layer_rules = _SimpleCompLayerRules(qudit_labels, implicit_idle_mode, None, global_idle_layer_label, independent_gates=independent_gates) super(LocalNoiseModel, self).__init__(state_space, layer_rules, 'pp', simulator=simulator, evotype=evotype) @@ -406,7 +407,7 @@ def rescale(coeffs): class _SimpleCompLayerRules(_LayerRules): - def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label): + def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, global_idle_layer_label, independent_gates): super().__init__() self.implicit_idle_mode = implicit_idle_mode # how to handle implied idles ("blanks") in circuits self.qubit_labels = qubit_labels @@ -414,6 +415,7 @@ def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, self._add_global_idle_to_all_layers = False self._add_padded_idle = False self.use_op_caching = True # expert functionality - can be turned off if needed + self._spacial_homogeneity_assumed = independent_gates if implicit_idle_mode not in 
('none', 'add_global', 'only_global', 'pad_1Q'): raise ValueError("Invalid `implicit_idle_mode`: '%s'" % str(implicit_idle_mode)) @@ -613,6 +615,23 @@ def _layer_component_operation(self, model, complbl, cache): ret = _opfactory.op_from_factories(model.factories['layers'], complbl) return ret + def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel, lbl: _LabelTup): + """ + Get the dense process matrix corresponding to the lbl. + Note this should be the minimal size required to represent the dense operator. + + Parameters + ---------- + lbl: Label + A label with a gate name and a specific set of qubits it will be acting on. + + Returns + ---------- + _np.ndarray + """ + + key = lbl.name if self._spacial_homogeneity_assumed else lbl + return model.operation_blks["gates"][key].to_dense() From 6ff9e05b3f540a36feeec0cd1fb5564c37a71357 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 10 Jul 2025 17:24:12 -0700 Subject: [PATCH 069/141] Extract LCS work. --- pygsti/layouts/evaltree.py | 237 +-------------------------------- pygsti/tools/sequencetools.py | 241 ++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+), 233 deletions(-) create mode 100644 pygsti/tools/sequencetools.py diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 3cbb3b144..f493c7cc3 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -23,7 +23,7 @@ from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools -from typing import Sequence +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, build_internal_tables import time @@ -454,235 +454,6 @@ def _get_start_indices(max_intersect): return disjointLists, helpfulScratchLists -#region Longest Common Subsequence - -def _best_matching_only(A: Sequence, B: Sequence) -> int: - """ - Returns: - ----- - 
int - the length of the longest matching prefix between A and B. - """ - i = 0 - n = len(A) - m = len(B) - while i < n and i < m: - if A[i] != B[i]: - return len(A[:i]) - i += 1 - return len(A[:i]) - - -def _lcs_dp_version(A: Sequence, B: Sequence): - """ - Compute the longest common substring between A and B using - dynamic programming. - - This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. - """ - table = _np.zeros((len(A) + 1, len(B) + 1)) - n, m = table.shape - for i in range(n-2, -1, -1): - for j in range(m-2, -1, -1): - opt1 = 0 - if A[i] == B[j]: - opt1 = _best_matching_only(A[i:], B[j:]) - opt2 = table[i, j+1] - opt3 = table[i+1, j] - table[i,j] = max(opt1, opt2, opt3) - return table - - -def _conduct_one_round_of_lcs_simplification(circuits, table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct, round_num: int=0): - if table_data_and_sequences: - table, sequences = table_data_and_sequences - else: - table, sequences = _compute_lcs_for_every_pair_of_circuits(circuits) - - if internal_tables_and_sequences: - internal_subtable, internal_subsequences = internal_tables_and_sequences - else: - internal_subtable, internal_subsequences = build_internal_tables(circuits) - - best_index = _np.where(table == _np.max(table)) - best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) - updated_circuits = circuits - cache_num = starting_cache_num - - # Build sequence dict - all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} - - if _np.max(internal_subtable) >= _np.max(table): - # We are only going to replace if this was the longest substring. 
- for cir_ind in best_internal_index[0]: - for seq in internal_subsequences[cir_ind]: - key = tuple(seq) - if key in all_subsequences_to_replace: - all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] - else: - all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} - - if _np.max(table) >= _np.max(internal_subtable): - for ii in range(len(best_index[0])): - starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] - cir_index = best_index[0][ii] - cir_index2 = best_index[1][ii] - seq = updated_circuits[cir_index][starting_point: int(starting_point + length+1)] - - key = tuple(seq) - if key in all_subsequences_to_replace: - if cir_index not in all_subsequences_to_replace[key]: - # We did not already handle this with internal subsequences. - all_subsequences_to_replace[key][cir_index] = [starting_point] - if cir_index2 not in all_subsequences_to_replace[key]: - all_subsequences_to_replace[key][cir_index2] = [starting_point_2] - - else: - all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} - - - # Handle the updates. - old_cache_num = cache_num - for seq, cdict in all_subsequences_to_replace.items(): - w = len(seq) - if w > 1 or (not isinstance(seq[0], int)): - # We have reached an item which we can just compute. 
- for cir_ind in cdict: - my_cir = updated_circuits[cir_ind] - sp = 0 - while sp+w <= len(my_cir): - if list(my_cir[sp: sp+w]) == list(seq): - my_cir[sp: sp + w] = [cache_num] - - sp += 1 - updated_circuits[cir_ind] = my_cir - - cache_struct[cir_ind] = updated_circuits[cir_ind] - - updated_circuits.append(list(seq)) - cache_struct[cache_num] = updated_circuits[cache_num] - - cache_num += 1 - - sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num - - return updated_circuits, cache_num, cache_struct, sequences_introduced_in_this_round - - -def _find_starting_positions_using_dp_table(dp_table) -> tuple[int, int, int]: - """ - Finds the indices of the starting points of the sequences in A and B. - - Returns: - --------- - int - starting index in A of LCS(A,B) - int - starting index in B of LCS(A,B) - int - length of LCS(A,B) - """ - n, m = dp_table.shape - i = 0 - j = 0 - while i < n-1 and j < m -1: - curr = dp_table[i,j] - opt1 = dp_table[i+1, j+1] - opt2 = dp_table[i+1, j] - opt3 = dp_table[i, j+1] - options = [opt1, opt2, opt3] - if _np.all(curr == options): - i += 1 - j += 1 - elif opt2 > opt1 and opt2 > opt3: - i += 1 - elif opt3 > opt2 and opt3 > opt1: - j += 1 - else: - # All three options are equal. So we should march the diagonal. - i += 1 - j += 1 - return i-1, j-1, dp_table[i,j] - return None, None, None - - -def _compute_lcs_for_every_pair_of_circuits(circuit_list: list[_Circuit]): - """ - Computes the LCS for every pair of circuits A,B in circuit_list - """ - best_subsequences = {} - best_lengths = _np.zeros((len(circuit_list), len(circuit_list))) - curr_best = 0 - for i in range(len(circuit_list)-1, -1, -1): # Lets do this in reverse order - cir0 = circuit_list[i] - if len(cir0) >= curr_best: - # Could be the best. 
- for j in range(i-1, -1, -1): - cir1 = circuit_list[j] - if len(cir1) >= curr_best: - table = _lcs_dp_version(cir0, cir1) - best_lengths[i,j] = table[0,0] - best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) - curr_best = max(best_lengths[i,j], curr_best) - else: - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) - else: - # Skipped because cannot be the best yet. - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) - return best_lengths, best_subsequences - - -def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: - """ - Compute the longest common subsequence within a single circuit A. - - Cost ~ O(L^3 / 8) where L is the length of A - - Returns: - --------- - int - length of longest common subsequences within A - dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. - """ - n = len(A) - best = 0 - best_ind = {} - changed = False - for w in range(1, int(_np.floor(n / 2) + 1)): - for sp in range(n - w): - window = A[sp: sp + w] - for match in range(sp+ w, n-w + 1): - if A[match: match + w] == window: - if best == w: - if tuple(window) in best_ind: - best_ind[tuple(window)].add(match) - else: - best_ind[tuple(window)] = {sp, match} - else: - best_ind = {tuple(window): {sp, match}} - changed = True - best = w - if not changed: - return best, best_ind - return best, best_ind - - -def build_internal_tables(circuit_list): - """ - Compute all the longest common internal sequences for each circuit A in circuit_list - - Total cost is O(C L^3). 
- """ - - C = len(circuit_list) - the_table = _np.zeros(C) - seq_table = [[] for _ in range(C)] - - curr_best = 1 - for i in range(C): - if len(circuit_list[i]) >= curr_best: - the_table[i], seq_table[i] = _longest_common_internal_subsequence(circuit_list[i]) - curr_best = max(curr_best, the_table[i]) - return the_table, seq_table - -#endregion Longest Common Subsequence #region Split circuit list into lists of subcircuits @@ -909,7 +680,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} - external_matches = _compute_lcs_for_every_pair_of_circuits(circuit_list) + external_matches = _compute_lcs_for_every_pair_of_sequences(circuit_list) best_external_match = _np.max(external_matches[0]) self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} @@ -931,9 +702,9 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) i = 0 while max_rounds > 1: - new_circuit_list, cache_pos, cache, sequence_intro[i+1] = _conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache, i) + new_circuit_list, cache_pos, cache, sequence_intro[i+1] = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache) i += 1 - external_matches = _compute_lcs_for_every_pair_of_circuits(new_circuit_list) + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list) if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: # We are not going to get a better internal match. 
diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py new file mode 100644 index 000000000..69c2f505d --- /dev/null +++ b/pygsti/tools/sequencetools.py @@ -0,0 +1,241 @@ +from typing import Sequence +import numpy as _np + + +#region Longest Common Subsequence + +def _best_matching_only(A: Sequence, B: Sequence) -> int: + """ + Returns: + ----- + int - the length of the longest matching prefix between A and B. + """ + i = 0 + n = len(A) + m = len(B) + while i < n and i < m: + if A[i] != B[i]: + return len(A[:i]) + i += 1 + return len(A[:i]) + + +def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: + """ + Compute the longest common substring between A and B using + dynamic programming. + + This will use O(n \times m) space and take O(n \times m \times max(m, n)) time. + """ + table = _np.zeros((len(A) + 1, len(B) + 1)) + n, m = table.shape + for i in range(n-2, -1, -1): + for j in range(m-2, -1, -1): + opt1 = 0 + if A[i] == B[j]: + opt1 = _best_matching_only(A[i:], B[j:]) + opt2 = table[i, j+1] + opt3 = table[i+1, j] + table[i,j] = max(opt1, opt2, opt3) + return table + + +def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_data_and_sequences, + internal_tables_and_sequences, + starting_cache_num, + cache_struct): + """ + Simplify the set of sequences by contracting the set of longest common subsequences. + + Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. 
+ """ + if table_data_and_sequences: + table, sequences = table_data_and_sequences + else: + table, sequences = _compute_lcs_for_every_pair_of_sequences(sequences) + + if internal_tables_and_sequences: + internal_subtable, internal_subsequences = internal_tables_and_sequences + else: + internal_subtable, internal_subsequences = build_internal_tables(sequences) + + best_index = _np.where(table == _np.max(table)) + best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) + updated_sequences = sequences + cache_num = starting_cache_num + + # Build sequence dict + all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} + + if _np.max(internal_subtable) >= _np.max(table): + # We are only going to replace if this was the longest substring. + for cir_ind in best_internal_index[0]: + for seq in internal_subsequences[cir_ind]: + key = tuple(seq) + if key in all_subsequences_to_replace: + all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] + else: + all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} + + if _np.max(table) >= _np.max(internal_subtable): + for ii in range(len(best_index[0])): + starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] + cir_index = best_index[0][ii] + cir_index2 = best_index[1][ii] + seq = updated_sequences[cir_index][starting_point: int(starting_point + length+1)] + + key = tuple(seq) + if key in all_subsequences_to_replace: + if cir_index not in all_subsequences_to_replace[key]: + # We did not already handle this with internal subsequences. + all_subsequences_to_replace[key][cir_index] = [starting_point] + if cir_index2 not in all_subsequences_to_replace[key]: + all_subsequences_to_replace[key][cir_index2] = [starting_point_2] + + else: + all_subsequences_to_replace[key] = {cir_index: [starting_point], cir_index2: [starting_point_2]} + + + # Handle the updates. 
+ old_cache_num = cache_num + for seq, cdict in all_subsequences_to_replace.items(): + w = len(seq) + if w > 1 or (not isinstance(seq[0], int)): + # We have reached an item which we can just compute. + for cir_ind in cdict: + my_cir = updated_sequences[cir_ind] + sp = 0 + while sp+w <= len(my_cir): + if list(my_cir[sp: sp+w]) == list(seq): + my_cir[sp: sp + w] = [cache_num] + + sp += 1 + updated_sequences[cir_ind] = my_cir + + cache_struct[cir_ind] = updated_sequences[cir_ind] + + updated_sequences.append(list(seq)) + cache_struct[cache_num] = updated_sequences[cache_num] + + cache_num += 1 + + sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + + return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round + + +def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, int, int]: + """ + Finds the starting positions for the longest common subsequence. + + Returns: + --------- + int - starting index in A of LCS(A,B) + int - starting index in B of LCS(A,B) + int - length of LCS(A,B) + """ + n, m = dp_table.shape + i = 0 + j = 0 + while i < n-1 and j < m -1: + curr = dp_table[i,j] + opt1 = dp_table[i+1, j+1] + opt2 = dp_table[i+1, j] + opt3 = dp_table[i, j+1] + options = [opt1, opt2, opt3] + if _np.all(curr == options): + i += 1 + j += 1 + elif opt2 > opt1 and opt2 > opt3: + i += 1 + elif opt3 > opt2 and opt3 > opt1: + j += 1 + else: + # All three options are equal. So we should march the diagonal. + i += 1 + j += 1 + return i-1, j-1, dp_table[i,j] + return None, None, None + + +def _compute_lcs_for_every_pair_of_sequences(sequences: list): + """ + Computes the LCS for every pair of sequences A,B in sequences + """ + best_subsequences = {} + best_lengths = _np.zeros((len(sequences), len(sequences))) + curr_best = 0 + for i in range(len(sequences)-1, -1, -1): # Lets do this in reverse order + cir0 = sequences[i] + if len(cir0) >= curr_best: + # Could be the best. 
+ for j in range(i-1, -1, -1): + cir1 = sequences[j] + if len(cir1) >= curr_best: + table = _lcs_dp_version(cir0, cir1) + best_lengths[i,j] = table[0,0] + best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) + curr_best = max(best_lengths[i,j], curr_best) + else: + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + else: + # Skipped because cannot be the best yet. + best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + return best_lengths, best_subsequences + + +def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: + """ + Compute the longest common subsequence within a single circuit A. + + Cost ~ O(L^3 / 8) where L is the length of A + + Returns: + --------- + int - length of longest common subsequences within A + dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. + """ + n = len(A) + best = 0 + best_ind = {} + changed = False + for w in range(1, int(_np.floor(n / 2) + 1)): + for sp in range(n - w): + window = A[sp: sp + w] + for match in range(sp+ w, n-w + 1): + if A[match: match + w] == window: + if best == w: + if tuple(window) in best_ind: + best_ind[tuple(window)].add(match) + else: + best_ind[tuple(window)] = {sp, match} + else: + best_ind = {tuple(window): {sp, match}} + changed = True + best = w + if not changed: + return best, best_ind + return best, best_ind + + +def build_internal_tables(sequences): + """ + Compute all the longest common internal sequences for each circuit A in sequences + + Total cost is O(C L^3). 
+ """ + + C = len(sequences) + the_table = _np.zeros(C) + seq_table = [[] for _ in range(C)] + + curr_best = 1 + for i in range(C): + if len(sequences[i]) >= curr_best: + the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i]) + curr_best = max(curr_best, the_table[i]) + return the_table, seq_table + +#endregion Longest Common Subsequence From 5ff9873f30db005b494d9b87adade996d7aa8877 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 11:37:43 -0700 Subject: [PATCH 070/141] Add in test cases for sequencetools --- pygsti/tools/sequencetools.py | 27 +++++---- test/unit/tools/test_sequencetools.py | 81 +++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 test/unit/tools/test_sequencetools.py diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 69c2f505d..fe7e10b05 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -50,18 +50,18 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. 
""" if table_data_and_sequences: - table, sequences = table_data_and_sequences + table, external_sequences = table_data_and_sequences else: - table, sequences = _compute_lcs_for_every_pair_of_sequences(sequences) + table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences) if internal_tables_and_sequences: internal_subtable, internal_subsequences = internal_tables_and_sequences else: - internal_subtable, internal_subsequences = build_internal_tables(sequences) + internal_subtable, internal_subsequences = create_tables_for_internal_LCS(sequences) best_index = _np.where(table == _np.max(table)) best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) - updated_sequences = sequences + updated_sequences = [seq for seq in sequences] cache_num = starting_cache_num # Build sequence dict @@ -79,10 +79,10 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat if _np.max(table) >= _np.max(internal_subtable): for ii in range(len(best_index[0])): - starting_point, starting_point_2, length = sequences[(best_index[0][ii], best_index[1][ii])] cir_index = best_index[0][ii] cir_index2 = best_index[1][ii] - seq = updated_sequences[cir_index][starting_point: int(starting_point + length+1)] + starting_point, starting_point_2, length = external_sequences[(cir_index, cir_index2)] + seq = updated_sequences[cir_index][starting_point: int(starting_point + length)] key = tuple(seq) if key in all_subsequences_to_replace: @@ -139,9 +139,9 @@ def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, j = 0 while i < n-1 and j < m -1: curr = dp_table[i,j] - opt1 = dp_table[i+1, j+1] - opt2 = dp_table[i+1, j] - opt3 = dp_table[i, j+1] + opt1 = dp_table[i+1, j+1] # Use + opt2 = dp_table[i+1, j] # Eliminate A prefix + opt3 = dp_table[i, j+1] # Eliminate B prefix options = [opt1, opt2, opt3] if _np.all(curr == options): i += 1 @@ -152,9 +152,7 @@ def _find_starting_positions_using_dp_table(dp_table: 
_np.ndarray) -> tuple[int, j += 1 else: # All three options are equal. So we should march the diagonal. - i += 1 - j += 1 - return i-1, j-1, dp_table[i,j] + return i, j, dp_table[0,0] return None, None, None @@ -220,7 +218,8 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, return best, best_ind -def build_internal_tables(sequences): +def create_tables_for_internal_LCS(sequences: list[Sequence]) -> tuple[_np.ndarray, + list[dict[tuple, list[int]]]]: """ Compute all the longest common internal sequences for each circuit A in sequences @@ -233,7 +232,7 @@ def build_internal_tables(sequences): curr_best = 1 for i in range(C): - if len(sequences[i]) >= curr_best: + if len(sequences[i]) >= 2*curr_best: the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i]) curr_best = max(curr_best, the_table[i]) return the_table, seq_table diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py new file mode 100644 index 000000000..a11e5a1c0 --- /dev/null +++ b/test/unit/tools/test_sequencetools.py @@ -0,0 +1,81 @@ +import numpy as np +from pygsti.tools.sequencetools import _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification + +def test_external_matches(): + + my_strings = ["ABAARCR12LIO", "QWERTYASDFGH", "QWEELLKJAT"] + + tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings) + + assert np.max(tables) == 3 + + assert len(np.where(np.max(tables) == tables)[0]) == 1 # There is only one sequence present in this case. 
+ + + if (1,2) in sequences: + assert sequences[(1,2)] == (0, 0, 3) + else: + assert (2,1) in sequences + assert sequences[(2,1)] == (0, 0, 3) + + +def test_internal_matches(): + + my_strings = ["RACECAR", "AAAAQAAAA", "QWERTYQWEQWEQWE"] + + tables, sequences = create_tables_for_internal_LCS(my_strings) + + assert np.max(tables) == 4 + + + assert sequences[1][tuple("AAAA")] == {0, 5} + + + my_strings = [my_strings[0]] + [my_strings[2]] + + tables, sequences = create_tables_for_internal_LCS(my_strings) + + assert np.max(tables) == 3 + assert sequences[1][tuple("QWE")] == {0, 6, 9, 12} + + +def test_one_round_update_collecting_tables_first(): + + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + internal = create_tables_for_internal_LCS(example) + external = _compute_lcs_for_every_pair_of_sequences(example) + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, external, internal, len(example), cache) + + assert len(updated) == 4 + assert "".join(updated[3]) == "AAAA" + + assert cache[1] == [3,"Q",3] + assert np.allclose(seq_intro, np.array(3)) + + assert num == len(updated) + + +def test_one_round_update_without_collecting_tables_first(): + + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + + assert len(updated) == 4 + assert "".join(updated[3]) == "AAAA" + + assert cache[1] == [3,"Q",3] + assert np.allclose(seq_intro, np.array(3)) + + assert num == len(updated) \ No newline at end of file From 
fc80874be8ae8ee5d1b0fe1c1f13e3bfc11d4977 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Fri, 11 Jul 2025 11:40:22 -0700 Subject: [PATCH 071/141] tiny simplification --- pygsti/tools/sequencetools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index fe7e10b05..69c5b770b 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -4,7 +4,7 @@ #region Longest Common Subsequence -def _best_matching_only(A: Sequence, B: Sequence) -> int: +def len_lcp(A: Sequence, B: Sequence) -> int: """ Returns: ----- @@ -15,9 +15,9 @@ def _best_matching_only(A: Sequence, B: Sequence) -> int: m = len(B) while i < n and i < m: if A[i] != B[i]: - return len(A[:i]) + return i i += 1 - return len(A[:i]) + return i def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: @@ -33,7 +33,7 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: for j in range(m-2, -1, -1): opt1 = 0 if A[i] == B[j]: - opt1 = _best_matching_only(A[i:], B[j:]) + opt1 = len_lcp(A[i:], B[j:]) opt2 = table[i, j+1] opt3 = table[i+1, j] table[i,j] = max(opt1, opt2, opt3) From 51474fd33fbf327d19719d1a5e0e83b66a3c0bec Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 12:58:20 -0700 Subject: [PATCH 072/141] add more test cases --- pygsti/layouts/evaltree.py | 30 ++++++++++++--------------- pygsti/tools/sequencetools.py | 11 +++++++--- test/unit/tools/test_sequencetools.py | 24 ++++++++++++++++++++- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index f493c7cc3..3f7048015 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -23,7 +23,7 @@ from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools -from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, 
_compute_lcs_for_every_pair_of_sequences, build_internal_tables +from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time @@ -476,6 +476,12 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: """ + Parameters: + ------------ + circuit: _Circuit - the circuit to compute qubit to lanes mapping for + + num_qubits: int - The total number of qubits expected in the circuit. + Returns -------- Dictionary mapping qubit number to lane number in the circuit. @@ -630,15 +636,7 @@ def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: """ - if hasattr(model, "operations"): - return model.operations[opTuple].to_dense() - elif hasattr(model, "operation_blks"): - if opTuple[0] not in model.operation_blks["gates"]: - breakpoint() - return model.operation_blks["gates"][opTuple[0]].to_dense() - else: - raise ValueError("Missing attribute") - + return def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 @@ -648,13 +646,13 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # This is in the wrong order. - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) op_term = swap_dense @ (op_term) @ swap_dense saved[op] = op_term # Save so we only need to this operation once. 
else: - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) else: - op_term = model_and_gate_to_dense_rep(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) return op_term @@ -687,7 +685,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.qubit_start_point = qubit_starting_loc - internal_matches = build_internal_tables(circuit_list) + internal_matches = create_tables_for_internal_LCS(circuit_list) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) @@ -710,14 +708,13 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) # We are not going to get a better internal match. pass else: - internal_matches = build_internal_tables(new_circuit_list) + internal_matches = create_tables_for_internal_LCS(new_circuit_list) best_external_match = _np.max(external_matches[0]) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) - self.circuit_list = new_circuit_list self.cache = cache self.num_circuits = C self.from_other = False @@ -772,7 +769,6 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q self.num_circuits = other.num_circuits self.sequence_intro = other.sequence_intro self.swap_gate = other.swap_gate - self.circuit_list = other.circuit_list self.orig_circuit_list = other.orig_circuit_list self.circuit_to_save_location = other.circuit_to_save_location self.from_other = other diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 69c5b770b..ff0e8c00f 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -100,6 +100,7 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat old_cache_num = cache_num for seq, cdict in 
all_subsequences_to_replace.items(): w = len(seq) + update_made = 0 if w > 1 or (not isinstance(seq[0], int)): # We have reached an item which we can just compute. for cir_ind in cdict: @@ -108,16 +109,20 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat while sp+w <= len(my_cir): if list(my_cir[sp: sp+w]) == list(seq): my_cir[sp: sp + w] = [cache_num] + update_made = 1 sp += 1 updated_sequences[cir_ind] = my_cir cache_struct[cir_ind] = updated_sequences[cir_ind] - updated_sequences.append(list(seq)) - cache_struct[cache_num] = updated_sequences[cache_num] + if update_made: + # There may have been multiple overlapping subsequences in the same sequence. + # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.) + updated_sequences.append(list(seq)) + cache_struct[cache_num] = updated_sequences[cache_num] - cache_num += 1 + cache_num += 1 sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py index a11e5a1c0..d51ae5e17 100644 --- a/test/unit/tools/test_sequencetools.py +++ b/test/unit/tools/test_sequencetools.py @@ -78,4 +78,26 @@ def test_one_round_update_without_collecting_tables_first(): assert cache[1] == [3,"Q",3] assert np.allclose(seq_intro, np.array(3)) - assert num == len(updated) \ No newline at end of file + assert num == len(updated) + + +def test_update_only_adds_those_strings_which_are_actually_used(): + example = [('R', 'A', 'C', 'E', 'C', 'A', 'R'), + ('A', 'A', 'A', 'A', 'Q', 'A', 'A', 'A', 'A'), + ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] + example = [list(x) for x in example] + + + cache = {i: s for i,s in enumerate(example)} + updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + + r2, num, c2, s2 = conduct_one_round_of_lcs_simplification(updated, None, None, 
num, cache) + + assert len(r2) == num + + assert len(s2) == 1 + + assert 4 in c2[2] + + assert len(c2[4]) == 3 + From 338f8cf19863f6274e85fc8a22307d63c3991047 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 11 Jul 2025 12:58:56 -0700 Subject: [PATCH 073/141] Simplify --- pygsti/layouts/evaltree.py | 9 ++------- pygsti/models/layerrules.py | 3 +-- pygsti/models/localnoisemodel.py | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 3f7048015..353145466 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -630,15 +630,10 @@ def setup_circuit_list_for_LCS_computations( #region Lane Collapsing Helpers -def model_and_gate_to_dense_rep(model, opTuple) -> _np.ndarray: +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: """ - Look up the dense representation of a gate in the model. + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). """ - - - return - -def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: op_term = 1 if op.num_qubits == 2: # We may need to do swaps. 
diff --git a/pygsti/models/layerrules.py b/pygsti/models/layerrules.py index 5060d3087..d47cd21ab 100644 --- a/pygsti/models/layerrules.py +++ b/pygsti/models/layerrules.py @@ -14,7 +14,6 @@ from pygsti.modelmembers import operations as _op from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable from pygsti.baseobjs.label import LabelTup as _LabelTup -from pygsti.models.model import OpModel as _OpModel class LayerRules(_NicelySerializable): """ @@ -112,7 +111,7 @@ def operation_layer_operator(self, model, layerlbl, cache): #raise KeyError(f"Cannot create operator for non-primitive layer: {layerlbl}") raise KeyError("Cannot create operator for non-primitive layer: %s" % str(layerlbl)) - def get_dense_process_matrix_represention_for_gate(self, model: _OpModel, lbl: _LabelTup): + def get_dense_process_matrix_represention_for_gate(self, model, lbl: _LabelTup): """ Get the dense process matrix corresponding to the lbl. Note this should be the minimal size required to represent the dense operator. 
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index d9e21ceee..2a7b24fea 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -415,7 +415,7 @@ def __init__(self, qubit_labels, implicit_idle_mode, singleq_idle_layer_labels, self._add_global_idle_to_all_layers = False self._add_padded_idle = False self.use_op_caching = True # expert functionality - can be turned off if needed - self._spacial_homogeneity_assumed = independent_gates + self._spatial_homogeneity_assumed = not independent_gates if implicit_idle_mode not in ('none', 'add_global', 'only_global', 'pad_1Q'): raise ValueError("Invalid `implicit_idle_mode`: '%s'" % str(implicit_idle_mode)) @@ -630,7 +630,7 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel _np.ndarray """ - key = lbl.name if self._spacial_homogeneity_assumed else lbl + key = lbl.name if self._spatial_homogeneity_assumed else lbl return model.operation_blks["gates"][key].to_dense() From 9f8faadbbbb21d950b4e7c1ebd2e3c7ca825b0ba Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Mon, 14 Jul 2025 13:59:01 -0700 Subject: [PATCH 074/141] Add padded idles into the circuit as necessary. 
--- pygsti/layouts/evaltree.py | 228 +++++++++++++------------------ pygsti/models/localnoisemodel.py | 16 ++- 2 files changed, 108 insertions(+), 136 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 353145466..844787f87 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -22,10 +22,11 @@ from pygsti.baseobjs.label import LabelTupTup, Label from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator +from pygsti.baseobjs.basis import get_num_qubits_in_basis import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time - +from typing import Iterable def _walk_subtree(treedict, indx, running_inds): @@ -586,9 +587,6 @@ def setup_circuit_list_for_LCS_computations( Then, a sequence detailing the number of qubits in each lane for a circuit. """ - # output = [] - # cir_id_to_lanes = [] - # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id. 
@@ -606,7 +604,7 @@ def setup_circuit_list_for_LCS_computations( assert len(sub_cirs) == len(lanes_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j]) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j])) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: line_labels_to_circuit_list[lbls].append(sc) @@ -621,8 +619,6 @@ def setup_circuit_list_for_LCS_computations( else: cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} - # output.extend(sub_cirs) - # cir_id_to_lanes.append(lanes_to_qubits) return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list #endregion Split Circuits by lanes helpers @@ -634,9 +630,9 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s """ Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). """ - op_term = 1 if op.num_qubits == 2: # We may need to do swaps. + op_term = 1 if op in saved: op_term = saved[op] elif op.qubits[1] < op.qubits[0]: @@ -646,9 +642,8 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, s saved[op] = op_term # Save so we only need to this operation once. else: op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - else: - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - return op_term + return op_term + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) def combine_two_gates(cumulative_term, next_dense_matrix): @@ -752,9 +747,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) - # Assumes a perfect swap gate! 
- # self.swap_gate = create_from_superop_mx(swap_gate, "static standard", stdname="Gswap") - + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ Construct a tree from another tree. @@ -804,62 +797,18 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - saved: dict[int, _LinearOperator] = {} + saved: dict[int | LabelTupTup, _np.ndarray] = {} - - - def cache_lookup_and_product(cumulative_term, term_to_extend_with: int): - if cumulative_term is None: - # look up result. - return saved[term] - elif isinstance(term, int) and cumulative_term is not None: - return combine_two_gates(cumulative_term, saved[term_to_extend_with]) - - - - def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup): - - if isinstance(term_to_extend_with, int): - return cache_lookup_and_product(cumulative_term, term_to_extend_with) - - else: - val = 1 - qubits_used = [i for i in range(num_qubits_in_default)] - while qubits_used: - qu = qubits_used[0] - gate_matrix = _np.eye(4) - found = False - op_ind = self.qubit_start_point # Handle circuits with only qubits (i, i+k) where k is number of qubits in the subsystem. - while not found and op_ind < len(term): - op = term[op_ind] - if qu in op.qubits: - gate_matrix = get_dense_representation_of_gate_with_perfect_swap_gates(model, op, saved, self.swap_gate) - found = True - # We assume that the qubits need to overlap for a specific gate. - # i.e. One cannot have op.qubits = (0, 2) in a system with a qubits (0,1,2). - qubits_used = qubits_used[len(op.qubits):] - op_ind += 1 - val = _np.kron(val, gate_matrix) - if not found: - # Remove that qubit from list to check. 
- qubits_used = qubits_used[1:] - - if val.shape != expected_shape: - breakpoint() - if cumulative_term is None: - return val - else: - return combine_two_gates(cumulative_term, val) - - expected_shape = (4**num_qubits_in_default, 4**num_qubits_in_default) for key in round_keys: for cind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cind]: - cumulative_term = collapse_cache_line(cumulative_term, term) + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) if cumulative_term is None: - saved[cind] = _np.eye(4**num_qubits_in_default) # identity of the appropriate size. + saved[cind] = get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", *(qu for qu in range(num_qubits_in_default))), saved, self.swap_gate) + # This will return an identity gate of the appropriate size. + # But it may also be a Noisy idle gate. else: saved[cind] = cumulative_term if __debug__: @@ -871,6 +820,66 @@ def collapse_cache_line(cumulative_term, term_to_extend_with: int | LabelTupTup) return saved, self.circuit_to_save_location + def handle_results_cache_lookup_and_product(self, + cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: + + if cumulative_term is None: + # look up result. + return results_cache[term_to_extend_with] + return combine_two_gates(cumulative_term, results_cache[term_to_extend_with]) + + + def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + results_cache: dict[int | LabelTupTup, _np.ndarray], + num_qubits_in_default: int) -> _np.ndarray: + """ + Reduce a cache line to a single process matrix. + + This should really only be called from collapse_circuits_to_process_matrices. 
+ + """ + + + if isinstance(term_to_extend_with, int): + return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) + + else: + val = 1 + qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] + matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, + results_cache, self.swap_gate) for op in term_to_extend_with} + qubit_used = [] + for key in matrix_reps.keys(): + qubit_used.extend(key) + + assert len(qubit_used) == len(set(qubit_used)) + unused_qubits = set(qubits_available) - set(qubit_used) + + implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, + Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. + results_cache, self.swap_gate) for qu in unused_qubits} + + while qubits_available: + + qu = qubits_available[0] + if qu in unused_qubits: + val = _np.kron(val, implicit_idle_reps[(qu,)]) + qubits_available = qubits_available[1:] + else: + # It must be a part of a non-trivial gate. 
+ gatekey = [key for key in matrix_reps if qu in key][0] + val = _np.kron(val, matrix_reps[gatekey]) + + qubits_available = qubits_available[len(gatekey):] + + if cumulative_term is None: + return val + return combine_two_gates(cumulative_term, val) + + def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: output = () @@ -885,65 +894,6 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: return list(output) - """ - def _evaluate_product_rule(self, cind: int, rn: int): - - sequence = self.cache[cind] - num_terms = len(sequence) - sub_tree_cache, sub_rounds = self.deriv_ordering_cache[num_terms] - - for sub_r in sorted(sub_rounds.keys())[::-1]: - sub_sequence = None - for sub_cind in sub_rounds[sub_r]: - - for term in sub_tree_cache[sub_cind]: - if isinstance(term, tuple): - # Then, this may be a partial derivative or an character in original sequence. - if len(term) == 2: - # Then this is taking a partial derivative. - natural_term = term[1][0] - if natural_term in self.derivative_cache: - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - else: - # This should be a natural derivative. - self.derivative_cache[natural_term] = term.deriv_wrt_params(None) - cumulative_term = cumulative_term @ self.derivative_cache[natural_term] - - # It is just an index to sequence for where to look in the cache. - next_ind = term[0] - sequence_val = sequence[next_ind] - - if isinstance(term, int) and cumulative_term is None: - # look up result. - cumulative_term = saved[term] - elif isinstance(term, int) and not (cumulative_term is None): - cumulative_term = saved[term] @ cumulative_term - elif isinstance(term, LabelTupTup): - val = 1 - for op in term: - op_term = 1 - if op.num_qubits == 2: - # We may need to do swaps. - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: - # This is in the wrong order. 
- swap_term = model.operation_blks["gates"][("Gswap",0,1)].to_dense() # assume this is perfect. - op_term = model.operation_blks["gates"][op].to_dense() - op_term = swap_term @ op_term @ swap_term.T - saved[op] = op_term # Save so we only need to this operation once. - else: - op_term = model.operation_blks["gates"][op].to_dense() - else: - op_term = model.operation_blks["gates"][op].to_dense() - val = _np.kron(val, op_term) - #val = model.operation_blks["gates"][term[0]].to_dense() - if cumulative_term is None: - cumulative_term = val - else: - cumulative_term = val @ cumulative_term - """ - class CollectionOfLCSEvalTrees(): @@ -988,16 +938,22 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id def collapse_circuits_to_process_matrices(self, model): # Just collapse all of them. - + + self.saved_results = {} for key in self.trees: - self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. + tree = self.trees[key] + out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits) + # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + self.saved_results[key] = out1 + self.sub_cir_to_ind_in_results[key] = out2 def reconstruct_full_matrices(self): if len(self.saved_results) == 0: return - + # Now we can do the combination. 
num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) @@ -1008,17 +964,19 @@ def reconstruct_full_matrices(self): for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] lblkey = cir._line_labels - - if len(cir.layertup) == 0: - - lane_circuits.append(_np.eye(4**(len(lblkey)))) - else: - if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: - print(lblkey) - print(cir) - breakpoint() + if lblkey == ("*",): + # We are gettting a noisy idle line and so need to check the size we are expecting here. ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + print(cir.num_lines) + # lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + + # + if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: + print(lblkey) + print(cir) + breakpoint() + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) output.append(lane_circuits) # Need a map from lane id to computed location. diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index 2a7b24fea..0ca844214 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -631,7 +631,21 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel """ key = lbl.name if self._spatial_homogeneity_assumed else lbl - return model.operation_blks["gates"][key].to_dense() + if key in model.operation_blks["gates"]: + return model.operation_blks["gates"][key].to_dense() + + elif self._add_padded_idle: + # We have idle gates that we can include. + absent_sslbls = lbl[1:] + new_key = self.single_qubit_idle_layer_labels[absent_sslbls] + if self._spatial_homogeneity_assumed: + new_key = new_key.name + return model.operation_blks["gates"][new_key].to_dense() + + else: + # Assume a perfect idle q-qubit gate. 
+ return _np.eye(4**len(lbl.qubits)) + From 4b9b10a8928cf914fca1e7c67499d51f3bc25958 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Mon, 14 Jul 2025 14:52:23 -0700 Subject: [PATCH 075/141] bugfix for ExplicitLayerRules.get_dense_process_matrix_represention_for_gate. Enable LCSEvalTreeForwardSimulator in tests. Change EvalTreeBasedUponLongestCommonSubstring.collapse_circuits_to_process_matrices to not use the `fake` gate label. --- pygsti/layouts/evaltree.py | 6 ++---- pygsti/models/explicitmodel.py | 2 +- test/unit/objects/test_forwardsim.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 844787f87..2d20bacd8 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -22,7 +22,6 @@ from pygsti.baseobjs.label import LabelTupTup, Label from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator -from pygsti.baseobjs.basis import get_num_qubits_in_basis import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS import time @@ -806,9 +805,8 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) if cumulative_term is None: - saved[cind] = get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", *(qu for qu in range(num_qubits_in_default))), saved, self.swap_gate) - # This will return an identity gate of the appropriate size. - # But it may also be a Noisy idle gate. + saved[cind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. 
else: saved[cind] = cumulative_term if __debug__: diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index 7356db6a3..df61a66f7 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -1806,7 +1806,7 @@ def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, _np.ndarray """ - operation = model.operations["gates"][lbl] + operation = model.operations[lbl] if isinstance(operation, _EmbeddedOp): return operation.embedded_op.to_dense() diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 2c742f533..17509b93f 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -13,7 +13,10 @@ from pygsti.models import ExplicitOpModel from pygsti.circuits import Circuit, create_lsgst_circuit_lists from pygsti.baseobjs import Label as L -from ..util import BaseCase +try: + from ..util import BaseCase +except ImportError: + BaseCase = object from pygsti.data import simulate_data from pygsti.modelpacks import smq1Q_XYI @@ -282,7 +285,7 @@ def setUp(self): SimpleMatrixForwardSimulator(), MapForwardSimulator(), MatrixForwardSimulator(), - # LCSEvalTreeMatrixForwardSimulator() + LCSEvalTreeMatrixForwardSimulator() ] if TorchForwardSimulator.ENABLED: sims.append(TorchForwardSimulator()) @@ -364,3 +367,8 @@ def test_matrix_fwdsim(self): def test_lcs_matrix_fwdsim(self): self._run(LCSEvalTreeMatrixForwardSimulator) + +if __name__ == '__main__': + tester = ForwardSimConsistencyTester() + tester.test_consistent_probs() + print() From 40d6460fe083edbc080616ffd25a102a647d69c1 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 11:17:01 -0700 Subject: [PATCH 076/141] Error out when there is an implicit idle. 
--- pygsti/layouts/evaltree.py | 10 ++++++++-- pygsti/models/explicitmodel.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 2d20bacd8..1f1c01236 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -601,9 +601,13 @@ def setup_circuit_list_for_LCS_computations( qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) sub_cirs = _compute_subcircuits(cir, qubits_to_lane) + if not implicit_idle_gate_name: + if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): + raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.") + assert len(sub_cirs) == len(lanes_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j])) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j]),) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: line_labels_to_circuit_list[lbls].append(sc) @@ -907,7 +911,9 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id starttime = time.time() for key, vals in line_lbls_to_circuit_list.items(): - sub_cirs = [list(cir) for cir in vals] + sub_cirs = [] + for cir in vals: + sub_cirs.append(list(cir)) if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index df61a66f7..1dec5eb29 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -1806,6 +1806,9 @@ def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, _np.ndarray """ + if lbl not in model.operations: + return _np.empty(1) + operation = model.operations[lbl] if isinstance(operation, _EmbeddedOp): From 
cbb090f3ed431775118305053365094a50537bfa Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 13:46:41 -0700 Subject: [PATCH 077/141] Make tests easier. --- pygsti/circuits/split_circuits_into_lanes.py | 109 ++++++++++++++++ pygsti/layouts/evaltree.py | 127 +++---------------- test/unit/objects/test_circuit_splitting.py | 101 +++++++++++++++ test/unit/objects/test_forwardsim.py | 22 ++-- 4 files changed, 239 insertions(+), 120 deletions(-) create mode 100644 pygsti/circuits/split_circuits_into_lanes.py create mode 100644 test/unit/objects/test_circuit_splitting.py diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py new file mode 100644 index 000000000..633565e2a --- /dev/null +++ b/pygsti/circuits/split_circuits_into_lanes.py @@ -0,0 +1,109 @@ +import numpy as _np + +from pygsti.circuits import Circuit as _Circuit +from pygsti.baseobjs.label import Label, LabelTupTup + +def compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit: _Circuit) -> tuple[dict[int, int], + dict[int, tuple[int]]]: + """ + Parameters: + ------------ + circuit: _Circuit - the circuit to compute qubit to lanes mapping for + + num_qubits: int - The total number of qubits expected in the circuit. + + Returns + -------- + Dictionary mapping qubit number to lane number in the circuit. + """ + + qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(circuit.num_lines)} + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + for op in layer: + qubits_used = op.qubits + for qb in qubits_used: + qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) + + lanes = {} + lan_num = 0 + visited: dict[int, int] = {} + def reachable_nodes(starting_point: int, + graph_qubits_to_neighbors: dict[int, set[int]], + visited: dict[int, set[int]]): + """ + Find which nodes are reachable from this starting point. 
+ """ + if starting_point in visited: + return visited[starting_point] + else: + assert starting_point in graph_qubits_to_neighbors + visited[starting_point] = graph_qubits_to_neighbors[starting_point] + output = set(visited[starting_point]) + for child in graph_qubits_to_neighbors[starting_point]: + if child != starting_point: + output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) + visited[starting_point] = output + return output + + available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) + while available_starting_points: + sp = available_starting_points[0] + nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) + for node in nodes: + available_starting_points.remove(node) + lanes[lan_num] = nodes + lan_num += 1 + + def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: + """ + Determine a mapping from qubit to the lane it is in for this specific circuit. + """ + out = {} + for key, val in lanes_to_qubits.items(): + for qb in val: + out[qb] = key + return out + + return compute_qubits_to_lanes(lanes), lanes + + +def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: + """ + Split a circuit into multiple subcircuits which do not talk across lanes. + """ + + lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] + + num_layers = circuit.num_layers + for layer_ind in range(num_layers): + layer = circuit.layer(layer_ind) + group = [] + group_lane = None + sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) + + for op in sorted_layer: + # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. + qubits_used = op.qubits # This will be a list of qubits used. + # I am assuming that the qubits are indexed numerically and not by strings. 
+ lane = qubits_to_lanes[qubits_used[0]] + + if group_lane is None: + group_lane = lane + group.append(op) + elif group_lane == lane: + group.append(op) + else: + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + group_lane = lane + group = [op] + + if len(group) > 0: + # We have a left over group. + lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) + + if num_layers == 0: + return [] + + return lanes_to_gates \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 1f1c01236..df4e63533 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -24,6 +24,8 @@ from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS + +from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time from typing import Iterable @@ -474,111 +476,14 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") return tmp -def _compute_qubit_to_lanes_mapping_for_circuit(circuit, num_qubits: int) -> tuple[dict[int, int], dict[int, tuple[int]]]: - """ - Parameters: - ------------ - circuit: _Circuit - the circuit to compute qubit to lanes mapping for - - num_qubits: int - The total number of qubits expected in the circuit. - - Returns - -------- - Dictionary mapping qubit number to lane number in the circuit. 
- """ - - qubits_to_potentially_entangled_others = {i: set((i,)) for i in range(num_qubits)} - num_layers = circuit.num_layers - for layer_ind in range(num_layers): - layer = circuit.layer(layer_ind) - for op in layer: - qubits_used = op.qubits - for qb in qubits_used: - qubits_to_potentially_entangled_others[qb].update(set(qubits_used)) - - lanes = {} - lan_num = 0 - visited: dict[int, int] = {} - def reachable_nodes(starting_point: int, graph_qubits_to_neighbors: dict[int, set[int]], visited: dict[int, set[int]]): - """ - Find which nodes are reachable from this starting point. - """ - if starting_point in visited: - return visited[starting_point] - else: - assert starting_point in graph_qubits_to_neighbors - visited[starting_point] = graph_qubits_to_neighbors[starting_point] - output = set(visited[starting_point]) - for child in graph_qubits_to_neighbors[starting_point]: - if child != starting_point: - output.update(output, reachable_nodes(child, graph_qubits_to_neighbors, visited)) - visited[starting_point] = output - return output - - available_starting_points = list(sorted(qubits_to_potentially_entangled_others.keys())) - while available_starting_points: - sp = available_starting_points[0] - nodes = reachable_nodes(sp, qubits_to_potentially_entangled_others, visited) - for node in nodes: - available_starting_points.remove(node) - lanes[lan_num] = nodes - lan_num += 1 - - def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, int]: - """ - Determine a mapping from qubit to the lane it is in for this specific circuit. - """ - out = {} - for key, val in lanes_to_qubits.items(): - for qb in val: - out[qb] = key - return out - - return compute_qubits_to_lanes(lanes), lanes - - -def _compute_subcircuits(circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: - """ - Split a circuit into multiple subcircuits which do not talk across lanes. 
- """ - - lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] - - num_layers = circuit.num_layers - for layer_ind in range(num_layers): - layer = circuit.layer(layer_ind) - group = [] - group_lane = None - sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) - - for op in sorted_layer: - # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. - qubits_used = op.qubits # This will be a list of qubits used. - # I am assuming that the qubits are indexed numerically and not by strings. - lane = qubits_to_lanes[qubits_used[0]] - - if group_lane is None: - group_lane = lane - group.append(op) - elif group_lane == lane: - group.append(op) - else: - lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) - group_lane = lane - group = [op] - - if len(group) > 0: - # We have a left over group. - lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) - return lanes_to_gates def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], - dict[tuple[_Circuit], list[tuple[int, int]]], - dict[tuple[int, ...], set[_Circuit]]]: + dict[tuple[LabelTupTup], list[tuple[int, int]]], + dict[tuple[int, ...], list[LabelTupTup]]]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. 
@@ -598,25 +503,25 @@ def setup_circuit_list_for_LCS_computations( if implicit_idle_gate_name: cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) - qubits_to_lane, lanes_to_qubits = _compute_qubit_to_lanes_mapping_for_circuit(cir, cir.num_lines) - sub_cirs = _compute_subcircuits(cir, qubits_to_lane) + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir, cir.num_lines) + sub_cirs = compute_subcircuits(cir, qubit_to_lane) if not implicit_idle_gate_name: if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.") - assert len(sub_cirs) == len(lanes_to_qubits) + assert len(sub_cirs) == len(lane_to_qubits) for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j],line_labels=tuple(lanes_to_qubits[j]),) + sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),) lbls = sc._line_labels if lbls in line_labels_to_circuit_list: - line_labels_to_circuit_list[lbls].append(sc) + line_labels_to_circuit_list[lbls].append(sc.layertup) else: - line_labels_to_circuit_list[lbls] = [sc] - if sc in sub_cir_to_cir_id_and_lane_id: - sub_cir_to_cir_id_and_lane_id[sc].append((i,j)) + line_labels_to_circuit_list[lbls] = [sc.layertup] + if sc.layertup in sub_cir_to_cir_id_and_lane_id: + sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j)) else: - sub_cir_to_cir_id_and_lane_id[sc] = [(i,j)] + sub_cir_to_cir_id_and_lane_id[sc.layertup] = [(i,j)] if i in cir_ind_and_lane_id_to_sub_cir: cir_ind_and_lane_id_to_sub_cir[i][j] = sc else: @@ -899,7 +804,9 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: class CollectionOfLCSEvalTrees(): - def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id, cir_id_and_lane_id_to_sub_cir): + def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], 
list[LabelTupTup]], + sub_cir_to_full_cir_id_and_lane_id, + cir_id_and_lane_id_to_sub_cir): self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} @@ -913,7 +820,7 @@ def __init__(self, line_lbls_to_circuit_list, sub_cir_to_full_cir_id_and_lane_id for key, vals in line_lbls_to_circuit_list.items(): sub_cirs = [] for cir in vals: - sub_cirs.append(list(cir)) + sub_cirs.append(cir.layertup) if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) diff --git a/test/unit/objects/test_circuit_splitting.py b/test/unit/objects/test_circuit_splitting.py new file mode 100644 index 000000000..ea5bdf998 --- /dev/null +++ b/test/unit/objects/test_circuit_splitting.py @@ -0,0 +1,101 @@ +from pygsti.circuits.circuit import Circuit as _Circuit +from pygsti.baseobjs.label import Label +from pygsti.circuits.split_circuits_into_lanes import compute_subcircuits, compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit +import numpy as np + + +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): + my_circuit = [] + for lnum in range(depth_L): + layer = [] + for qnum in range(num_qubits): + gate = str(np.random.choice(allowed_gates)) + layer.append((gate, qnum)) + my_circuit.append(layer) + return _Circuit(my_circuit) + + +def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): + + assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. + assert lane_end_points[0] > 0 + assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order. 
+ + if lane_end_points[-1] < num_qubits: + lane_end_points.append(num_qubits) + + my_circuit = [] + n_qs_to_gates_avail = {} + for key, val in gates_to_qubits_used.items(): + if val in n_qs_to_gates_avail: + n_qs_to_gates_avail[val].append(key) + else: + n_qs_to_gates_avail[val] = [key] + + for lnum in range(depth_L): + layer = [] + start_point = 0 + + for lane_ep in lane_end_points: + num_used: int = 0 + while num_used < (lane_ep - start_point): + navail = (lane_ep - start_point) - num_used + nchosen = 0 + if navail >= max(n_qs_to_gates_avail): + # we can use any gate + nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1) + else: + # we need to first choose how many to use. + nchosen = np.random.randint(1, navail + 1) + gate = str(np.random.choice(n_qs_to_gates_avail[nchosen])) + tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset. + perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))] + if gate == "Gcustom": + layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi))) + else: + layer.append((gate, *perm_of_qubits_used)) + num_used += nchosen + + if num_used > (lane_ep - start_point) + 1: + print(num_used, f"lane ({start_point}, {lane_ep})") + raise AssertionError("lane barrier is broken") + + start_point = lane_ep + my_circuit.append(layer) + return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)]) + + +def test_subcircuits_splits_can_create_empty_sub_circuit(): + + + original = _Circuit([], line_labels=[0]) + + qubits_to_lanes = {0: 0} + + attempt = compute_subcircuits(original, qubits_to_lanes) + + assert original == _Circuit(attempt, line_labels=[0]) + + +def test_find_qubit_to_lane_splitting(): + + gates_to_num_used = {"X": 1, "Y": 1, "Z": 1, "CNOT": 2, "CZ": 2} + + depth = 10 + num_qubits = 6 + + lane_eps = [1, 2, 4, 5] + # So expected lane dist is (0, ), (1), (2,3), (4,), (5,) + + circuit = 
build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used) + + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit) + + + assert len(qubit_to_lane) == num_qubits + + assert len(lane_to_qubits) <= num_qubits + + circuit = _Circuit([[]]) + +test_subcircuits_splits_can_create_empty_sub_circuit() \ No newline at end of file diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index 17509b93f..e2236e845 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -19,12 +19,14 @@ BaseCase = object from pygsti.data import simulate_data -from pygsti.modelpacks import smq1Q_XYI +from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY from pygsti.protocols import gst from pygsti.protocols.protocol import ProtocolData from pygsti.tools import two_delta_logl + +GLOBAL_MODEL_IDLE = smq1Q_XYI def Ls(*args): """ Convert args to a tuple to Labels """ return tuple([L(x) for x in args]) @@ -153,8 +155,8 @@ class BaseProtocolData: @classmethod def setUpClass(cls): - cls.gst_design = smq1Q_XYI.create_gst_experiment_design(max_max_length=16) - cls.mdl_target = smq1Q_XYI.target_model() + cls.gst_design = GLOBAL_MODEL_IDLE.create_gst_experiment_design(max_max_length=16) + cls.mdl_target = GLOBAL_MODEL_IDLE.target_model() cls.mdl_datagen = cls.mdl_target.depolarize(op_noise=0.05, spam_noise=0.025) ds = simulate_data(cls.mdl_datagen, cls.gst_design.all_circuits_needing_data, 20000, sample_error='none') @@ -259,23 +261,23 @@ def jac_colinearities(self): colinearities *= -1 return colinearities - class ForwardSimConsistencyTester(TestCase): PROBS_TOL = 1e-14 JACS_TOL = 1e-10 + def setUp(self): - self.model_ideal = smq1Q_XYI.target_model() + self.model_ideal = GLOBAL_MODEL_IDLE.target_model() if TorchForwardSimulator.ENABLED: # TorchFowardSimulator can only work with TP modelmembers. 
self.model_ideal.convert_members_inplace(to_type='full TP') self.model_noisy = self.model_ideal.depolarize(op_noise=0.05, spam_noise=0.025) - prep_fiducials = smq1Q_XYI.prep_fiducials() - meas_fiducials = smq1Q_XYI.meas_fiducials() - germs = smq1Q_XYI.germs() + prep_fiducials = GLOBAL_MODEL_IDLE.prep_fiducials() + meas_fiducials = GLOBAL_MODEL_IDLE.meas_fiducials() + germs = GLOBAL_MODEL_IDLE.germs() max_lengths = [4] circuits = create_lsgst_circuit_lists( self.model_noisy, prep_fiducials, meas_fiducials, germs, max_lengths @@ -339,9 +341,9 @@ class ForwardSimIntegrationTester(BaseProtocolData): def _run(self, obj : ForwardSimulator.Castable): self.setUpClass() - proto = gst.GateSetTomography(smq1Q_XYI.target_model("full TP"), 'stdgaugeopt', name="testGST") + proto = gst.GateSetTomography(GLOBAL_MODEL_IDLE.target_model("full TP"), name="testGST") results = proto.run(self.gst_data, simulator=obj) - mdl_result = results.estimates["testGST"].models['stdgaugeopt'] + mdl_result = results.estimates["testGST"].models["final iteration estimate"] twoDLogL = two_delta_logl(mdl_result, self.gst_data.dataset) assert twoDLogL <= 0.05 # should be near 0 for perfect data pass From 650fd595da8dd4ef07414ed30466afb95acc4974 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 15 Jul 2025 13:58:28 -0700 Subject: [PATCH 078/141] Improve the circuit splitting test. 
--- test/unit/objects/test_circuit_splitting.py | 32 +++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/test/unit/objects/test_circuit_splitting.py b/test/unit/objects/test_circuit_splitting.py index ea5bdf998..397392670 100644 --- a/test/unit/objects/test_circuit_splitting.py +++ b/test/unit/objects/test_circuit_splitting.py @@ -4,7 +4,11 @@ import numpy as np -def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]) -> _Circuit: + """ + Build a random circuit of depth L which operates on num_qubits and has the allowed + single qubit gates specified in allowed gates. + """ my_circuit = [] for lnum in range(depth_L): layer = [] @@ -12,10 +16,20 @@ def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): gate = str(np.random.choice(allowed_gates)) layer.append((gate, qnum)) my_circuit.append(layer) - return _Circuit(my_circuit) + return _Circuit(my_circuit, line_labels=[i for i in range(num_qubits)]) + +def build_circuit_with_multiple_qubit_gates_with_designated_lanes( + num_qubits: int, + depth_L: int, + lane_end_points: list[int], + gates_to_qubits_used: dict[str, int]) -> _Circuit: + """ + Builds a circuit with a known lane structure. + Any two + qubit lanes can be split into smaller lanes if none of the gates + chosen for that lane actually operate on two or more qubits. + """ -def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. assert lane_end_points[0] > 0 @@ -86,7 +100,9 @@ def test_find_qubit_to_lane_splitting(): lane_eps = [1, 2, 4, 5] # So expected lane dist is (0, ), (1), (2,3), (4,), (5,) + minimum_num_lanes = 5 + # This is a random circuit so the lanes may not be perfect. 
circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits, depth, lane_eps, gates_to_num_used) qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit) @@ -94,8 +110,14 @@ def test_find_qubit_to_lane_splitting(): assert len(qubit_to_lane) == num_qubits + assert len(lane_to_qubits) >= minimum_num_lanes assert len(lane_to_qubits) <= num_qubits - circuit = _Circuit([[]]) + for qubit in qubit_to_lane: + assert qubit_to_lane[qubit] in lane_to_qubits + -test_subcircuits_splits_can_create_empty_sub_circuit() \ No newline at end of file + for lane in lane_to_qubits: + for qu in lane_to_qubits[lane]: + assert qu in qubit_to_lane + assert lane == qubit_to_lane[qu] From 2025949402e62c493a46837448ee7bf4b6cdd4f1 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 15 Jul 2025 14:22:45 -0700 Subject: [PATCH 079/141] change Circuit.replace_gatename_inplace to handle the common situation when old_gatename is not a Label object, but can match old_gatename == obj for some obj encountered when iterating over elements of Circuit._labels. 
--- pygsti/circuits/circuit.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index ac13aa539..435382e89 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -2563,9 +2563,15 @@ def replace_gatename_inplace(self, old_gatename, new_gatename): def replace(obj): # obj is either a simple label or a list if isinstance(obj, _Label): - if obj.name == old_gatename: - newobj = _Label(new_gatename, obj.sslbls) - else: newobj = obj + newobj = _Label(new_gatename, obj.sslbls) if (obj.name == old_gatename) else obj + elif obj == old_gatename: + if len(obj) == 0: + sslbls = self.line_labels + else: + import warnings + warnings.warn(f'Cannot infer target of gate(s) of {obj}.') + sslbls = tuple() + newobj = _Label((new_gatename,) + sslbls) else: newobj = [replace(sub) for sub in obj] return newobj From 8cf274c29df8a7eace07afd3f07dc6fae2cf5dd3 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 15 Jul 2025 15:42:21 -0700 Subject: [PATCH 080/141] Helper classes --- pygsti/models/explicitmodel.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index d5df6a696..1dec5eb29 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -48,6 +48,31 @@ from pygsti.modelmembers.operations import EmbeddedOp as _EmbeddedOp, ComposedOp as _ComposedOp +class Roster: + def __init__(self, arg): + if isinstance(arg, str) and arg == 'all': + self.trivial = True + self.collection = None + else: + self.trivial = False + self.collection = arg + def __contains__(self, item): + return self.trivial or (item in self.collection) + + +class ModelView: + @staticmethod + def cast(arg): + return arg if arg is not None else ModelView() + def __init__(self): + self.operations = _collections.defaultdict(lambda: None) + self.preps = _collections.defaultdict(lambda: None) + self.povms = 
_collections.defaultdict(lambda: None) + self.instruments = _collections.defaultdict(lambda: None) + self.factories = _collections.defaultdict(lambda: None) + + + class ExplicitOpModel(_mdl.OpModel): """ Encapsulates a set of gate, state preparation, and POVM effect operations. From e9f4576fb6db7edbea8c2f760f305c570e43e250 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 10:38:32 -0700 Subject: [PATCH 081/141] add ECR to standard unitaries --- pygsti/tools/internalgates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pygsti/tools/internalgates.py b/pygsti/tools/internalgates.py index 86f7e6f58..e5249ea78 100644 --- a/pygsti/tools/internalgates.py +++ b/pygsti/tools/internalgates.py @@ -196,6 +196,7 @@ def standard_gatename_unitaries(): * 'Gsqrtiswap' : square-root of ISWAP gate, used in some superconducting qubit platforms. * 'Gxx', 'Gzz' : MS-style parity gates * 'Gcres', 'Gecres' : Cross-resonance and echoed cross-resonance gates. Native gate operations common on transmon systems (including IBM). + * 'Gecr' : alternative name for the echoed cross-resonance gate, matching OpenQASM / IBM conventions. 
* Non-Clifford gates: @@ -302,7 +303,7 @@ def u_op(exp): std_unitaries['Gcres'] = _spl.expm(-1j*_np.pi/4*sigmaxz) std_unitaries['Gecres'] = _np.array([[0, 1, 0., 1j], [1., 0, -1j, 0.], [0., 1j, 0, 1], [-1j, 0., 1, 0]], complex)/_np.sqrt(2) - + std_unitaries['Gecr'] = std_unitaries['Gecres'] # alias std_unitaries['Gzr'] = Gzr() std_unitaries['Gczr'] = Gczr() @@ -403,7 +404,7 @@ def standard_gatenames_stim_conversions(): ecr_unitary = _np.array([[0, 1, 0., 1j], [1., 0, -1j, 0.], [0., 1j, 0, 1], [-1j, 0., 1, 0]], complex)/_np.sqrt(2) gate_dict['Gecres'] = stim.Tableau.from_unitary_matrix(ecr_unitary, endian='big') - + gate_dict['Gecr'] = gate_dict['Gecres'] return gate_dict def standard_gatenames_cirq_conversions(): From 1dcc580ec62cf1e898032ff523d5963235394d9f Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 16 Jul 2025 10:40:33 -0700 Subject: [PATCH 082/141] Update crosstalk free generator to add explicit idles to single qubit gates acting in a two qubit system. --- .../protocols/crosstalkfreeexperimentdesign.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 504af3607..223131bc6 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -70,6 +70,17 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, assert Label(()) not in mapper_1q.values() aux_info = {} + m2q = mapper_2q.copy() + for k2 in mapper_2q: + if k2.num_qubits == 1: + tgt = k2[1] + tmp = [None, None] + tmp[tgt] = k2 + tmp[1-tgt] = Label("Gi", 1-tgt) + m2q[k2] = tuple(tmp) + + mapper_2q = m2q # Reset here. 
+ num_lines = -1 global_line_order = None for patch, edge_set in color_patches.items(): @@ -155,9 +166,11 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above for i in range(node_start, node_perms.shape[0]): - c2 = oneq_circuits[ node_perms[i,j] ] # Fix col - c2._static = False + c2 = oneq_circuits[ node_perms[i,j] ].copy(True) # Fix col + c2._labels = [mapper_1q[ell].copy() for ell in c2._labels] + c2._append_idle_layers_inplace(len(c) - len(c2)) + c2.done_editing() assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} From 2180de60b87aa9706c7919acd291d620e70fc6ee Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 16 Jul 2025 11:16:33 -0700 Subject: [PATCH 083/141] Append idling layers --- pygsti/protocols/crosstalkfreeexperimentdesign.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 223131bc6..9da9d361f 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -56,8 +56,8 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] twoq_idle_label = Label(('Gii',) + twoq_gstdesign.qubit_labels) oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels) - mapper_2q = {twoq_idle_label: twoq_idle_label} - mapper_1q = {oneq_idle_label: oneq_idle_label} + mapper_2q: dict[Label, Label] = {twoq_idle_label: twoq_idle_label} + mapper_1q: dict[Label, Label] = {oneq_idle_label: oneq_idle_label} for cl in twoq_gstdesign.circuit_lists: for c in cl: mapper_2q.update({k:k for k in c._labels}) @@ -77,7 +77,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, tmp = [None, None] tmp[tgt] = k2 
tmp[1-tgt] = Label("Gi", 1-tgt) - m2q[k2] = tuple(tmp) + m2q[k2] = Label(tuple(tmp)) mapper_2q = m2q # Reset here. @@ -168,8 +168,9 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ].copy(True) # Fix col - c2._labels = [mapper_1q[ell].copy() for ell in c2._labels] - c2._append_idle_layers_inplace(len(c) - len(c2)) + # ell here will be a list[Label] since we copied in editable format. + c2._labels = [mapper_1q[ell[0]].copy() for ell in c2._labels] + c2._append_idling_layers_inplace(len(c) - len(c2)) c2.done_editing() assert Label(()) not in c2._labels From b037fabe3a44cb9f6cdeb6837dede0621ac854d2 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 11:45:38 -0700 Subject: [PATCH 084/141] edesign seems to work now --- pygsti/circuits/circuit.py | 11 ++- .../crosstalkfreeexperimentdesign.py | 83 ++++++++++++++----- 2 files changed, 69 insertions(+), 25 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 435382e89..e6b1cae46 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -1451,15 +1451,20 @@ def insert_idling_layers_inplace(self, insert_before, num_to_insert, lines=None) None """ assert(not self._static), "Cannot edit a read-only circuit!" 
- if insert_before is None: insert_before = len(self._labels) - elif insert_before < 0: insert_before = len(self._labels) + insert_before - + if insert_before is None: + insert_before = len(self._labels) + elif insert_before < 0: + insert_before = len(self._labels) + insert_before + if lines is None: # insert complete layers for i in range(num_to_insert): self._labels.insert(insert_before, []) #Shift compilable layer indices as needed if self._compilable_layer_indices_tup: + if num_to_insert <= 0 and insert_before < len(self._labels): + raise ValueError('Undefined behavior (at least until the ' \ + 'documentation is updated).') shifted_inds = [i if (i < insert_before) else (i + num_to_insert) for i in self._compilable_layer_indices_tup] self._compilable_layer_indices_tup = tuple(shifted_inds) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 223131bc6..1a77a2f6d 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -126,6 +126,28 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, assert num_lines == curr_num_lines if edge_perms.size > 0 and node_perms.size > 0: assert edge_perms.shape[1] == node_perms.shape[1] + + # compute any padding we might need. 
+ padded_circuit_lengths = np.zeros((max_len,), int) + for j in range(max_len): + clen = 0 + if len(edge_perms): + edge_start = 1 + node_start = 0 + c = twoq_circuits[edge_perms[0,j]] + clen = max(clen, len(c)) + else: + edge_start = 0 + node_start = 1 + c = oneq_circuits[node_perms[0,j]] + clen = max(clen, len(c)) + for i in range(edge_start, edge_perms.shape[0]): + c2 = twoq_circuits[ edge_perms[i,j] ] + clen = max(clen, len(c2)) + for i in range(node_start, node_perms.shape[0]): + c2 = oneq_circuits[ node_perms[i,j] ] + clen = max(clen, len(c2)) # c is already a copy due to map_state_space_labels above + padded_circuit_lengths[j] = clen """ NOTE: I was able to infer that the twoq_gstdesign.sslabels should really be qubit labels by seeing how (oldq, newq) @@ -133,52 +155,69 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, """ for j in range(max_len): # Pick the initial subcircuit + clen = padded_circuit_lengths[j] if len(edge_perms): - c = twoq_circuits[edge_perms[0,j]] + edge_start = 1 + node_start = 0 + c = twoq_circuits[edge_perms[0,j]].copy(True) + c._append_idling_layers_inplace(clen - len(c)) + c.done_editing() + # ^ That changes the format of c._labels. We need to make more edits in this format, + # so in the next line we set c._static = False. c._static = False - c._labels = [mapper_2q[ell].copy() for ell in c._labels] + c._labels = [mapper_2q[ell] for ell in c._labels] c.done_editing() - assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} c = c.map_state_space_labels(map_dict) - edge_start = 1 - node_start = 0 else: - c = oneq_circuits[node_perms[0,j]] - c._static = False - c._labels = [mapper_1q[ell].copy() for ell in c._labels] + edge_start = 0 + node_start = 1 + c = oneq_circuits[node_perms[0,j]].copy(True) + c._append_idling_layers_inplace(clen - len(c)) + c.done_editing() + # ^ That changes the format of c._labels. 
We need to make more edits in this format, + # so in the next line we set c._static = False. + c._static = False + c._labels = [mapper_1q[ell] for ell in c._labels] c.done_editing() - assert Label(()) not in c._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} c = c.map_state_space_labels(map_dict) - edge_start = 0 - node_start = 1 + # Tensor together the other subcircuits for i in range(edge_start, edge_perms.shape[0]): - c2 = twoq_circuits[ edge_perms[i,j] ] # Fix col + c2 = twoq_circuits[ edge_perms[i,j] ].copy(True) # Fix col + c2._append_idling_layers_inplace(clen - len(c2)) + c2.done_editing() + # ^ That changes the format of c2._labels. We need to make more edits in this format, + # so in the next line we set c2._static = False. c2._static = False - c2._labels = [mapper_2q[ell].copy() for ell in c2._labels] + c2._labels = [mapper_2q[ell] for ell in c2._labels] c2.done_editing() assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} c2 = c2.map_state_space_labels(map_dict) - c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above + c = c.tensor_circuit(c2) # c is already a copy due to map_state_space_labels above for i in range(node_start, node_perms.shape[0]): - c2 = oneq_circuits[ node_perms[i,j] ].copy(True) # Fix col - - c2._labels = [mapper_1q[ell].copy() for ell in c2._labels] - c2._append_idle_layers_inplace(len(c) - len(c2)) - + c2 = oneq_circuits[ node_perms[i,j] ].copy(True) + c2._append_idling_layers_inplace(clen - len(c2)) + c2.done_editing() + # ^ That changes the format of c2._labels. We need to make more edits in this format, + # so in the next line we set c2._static = False. 
+ c2._static = False + c2._labels = [mapper_1q[ell] for ell in c2._labels] c2.done_editing() assert Label(()) not in c2._labels map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} c2 = c2.map_state_space_labels(map_dict) - c = c.tensor_circuit(c2) # c is already a copy due to map_line_labels above + c = c.tensor_circuit(c2) # c is already a copy due to map_state_space_labels above - assert Label(()) not in c._labels - # By this point, should have len(c._line_labels) == [some constant, number of ] + for i in range(c.num_layers): + l0 = set(c.layer(i)) + l1 = set(c.layer_with_idles(i)) + assert l0 == l1 + circuit_lists[L].append(c.reorder_lines(global_line_order)) From 8cec2d234fa622dc4b150ae23e8a743cce47b920 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 11:50:04 -0700 Subject: [PATCH 085/141] port a change from Nicks last commit --- pygsti/protocols/crosstalkfreeexperimentdesign.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 1a77a2f6d..43e0436d7 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -56,8 +56,8 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] twoq_idle_label = Label(('Gii',) + twoq_gstdesign.qubit_labels) oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels) - mapper_2q = {twoq_idle_label: twoq_idle_label} - mapper_1q = {oneq_idle_label: oneq_idle_label} + mapper_2q: dict[Label, Label] = {twoq_idle_label: twoq_idle_label} + mapper_1q: dict[Label, Label] = {oneq_idle_label: oneq_idle_label} for cl in twoq_gstdesign.circuit_lists: for c in cl: mapper_2q.update({k:k for k in c._labels}) @@ -77,7 +77,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, tmp = 
[None, None] tmp[tgt] = k2 tmp[1-tgt] = Label("Gi", 1-tgt) - m2q[k2] = tuple(tmp) + m2q[k2] = Label(tuple(tmp)) mapper_2q = m2q # Reset here. @@ -146,13 +146,9 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, clen = max(clen, len(c2)) for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ] - clen = max(clen, len(c2)) # c is already a copy due to map_state_space_labels above + clen = max(clen, len(c2)) padded_circuit_lengths[j] = clen - """ - NOTE: I was able to infer that the twoq_gstdesign.sslabels should really be qubit labels by seeing how (oldq, newq) - were in the iterator that zip'd (twoq_gstdesign.sslabels, other_thing). - """ for j in range(max_len): # Pick the initial subcircuit clen = padded_circuit_lengths[j] From 45c0317881025b1c4a29770210c5713f55bad585 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 11:51:39 -0700 Subject: [PATCH 086/141] Revert "Append idling layers" This reverts commit 2180de60b87aa9706c7919acd291d620e70fc6ee. I took the part of the commit that was useful and copied to over to another temporary branch I was working on. This revert is in preparation for merging in the contents of that branch. 
--- pygsti/protocols/crosstalkfreeexperimentdesign.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 9da9d361f..223131bc6 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -56,8 +56,8 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, circuit_lists = [[] for _ in twoq_gstdesign.circuit_lists] twoq_idle_label = Label(('Gii',) + twoq_gstdesign.qubit_labels) oneq_idle_label = Label(('Gi',) + oneq_gstdesign.qubit_labels) - mapper_2q: dict[Label, Label] = {twoq_idle_label: twoq_idle_label} - mapper_1q: dict[Label, Label] = {oneq_idle_label: oneq_idle_label} + mapper_2q = {twoq_idle_label: twoq_idle_label} + mapper_1q = {oneq_idle_label: oneq_idle_label} for cl in twoq_gstdesign.circuit_lists: for c in cl: mapper_2q.update({k:k for k in c._labels}) @@ -77,7 +77,7 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, tmp = [None, None] tmp[tgt] = k2 tmp[1-tgt] = Label("Gi", 1-tgt) - m2q[k2] = Label(tuple(tmp)) + m2q[k2] = tuple(tmp) mapper_2q = m2q # Reset here. @@ -168,9 +168,8 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ].copy(True) # Fix col - # ell here will be a list[Label] since we copied in editable format. 
- c2._labels = [mapper_1q[ell[0]].copy() for ell in c2._labels] - c2._append_idling_layers_inplace(len(c) - len(c2)) + c2._labels = [mapper_1q[ell].copy() for ell in c2._labels] + c2._append_idle_layers_inplace(len(c) - len(c2)) c2.done_editing() assert Label(()) not in c2._labels From eb85393fafff0ebbde5fce517b9731536fd61f4c Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 16 Jul 2025 12:51:47 -0700 Subject: [PATCH 087/141] Implicit op models work with opfactories that return embeddedops. --- pygsti/circuits/split_circuits_into_lanes.py | 4 ++-- pygsti/layouts/evaltree.py | 4 ++-- pygsti/modelmembers/operations/opfactory.py | 4 +++- pygsti/models/localnoisemodel.py | 7 +++++++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py index 633565e2a..a140cbfd0 100644 --- a/pygsti/circuits/split_circuits_into_lanes.py +++ b/pygsti/circuits/split_circuits_into_lanes.py @@ -78,7 +78,7 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l num_layers = circuit.num_layers for layer_ind in range(num_layers): - layer = circuit.layer(layer_ind) + layer = circuit.layer_with_idles(layer_ind) group = [] group_lane = None sorted_layer = sorted(layer, key=lambda x: x.qubits[0]) @@ -104,6 +104,6 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l lanes_to_gates[group_lane].append(LabelTupTup(tuple(group))) if num_layers == 0: - return [] + return lanes_to_gates return lanes_to_gates \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index df4e63533..8030bc550 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -503,7 +503,7 @@ def setup_circuit_list_for_LCS_computations( if implicit_idle_gate_name: cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) - qubit_to_lane, lane_to_qubits = 
compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir, cir.num_lines) + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir) sub_cirs = compute_subcircuits(cir, qubit_to_lane) if not implicit_idle_gate_name: @@ -820,7 +820,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu for key, vals in line_lbls_to_circuit_list.items(): sub_cirs = [] for cir in vals: - sub_cirs.append(cir.layertup) + sub_cirs.append(list(cir)) if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) diff --git a/pygsti/modelmembers/operations/opfactory.py b/pygsti/modelmembers/operations/opfactory.py index cbaed54c3..0719e2a75 100644 --- a/pygsti/modelmembers/operations/opfactory.py +++ b/pygsti/modelmembers/operations/opfactory.py @@ -9,6 +9,7 @@ # in compliance with the License. You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** +from __future__ import annotations import numpy as _np from pygsti.modelmembers.operations.staticunitaryop import StaticUnitaryOp as _StaticUnitaryOp @@ -24,9 +25,10 @@ from pygsti.baseobjs import basis as _basis from pygsti.evotypes import Evotype as _Evotype from pygsti.tools import optools as _ot +from pygsti.modelmembers.operations.linearop import LinearOperator as _LinearOperator -def op_from_factories(factory_dict, lbl): +def op_from_factories(factory_dict: dict[_Lbl, OpFactory], lbl: _Lbl) -> _LinearOperator: """ Create an operator for `lbl` from the factories in `factory_dict`. 
diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index 0ca844214..6b4ad778e 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -631,8 +631,15 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel """ key = lbl.name if self._spatial_homogeneity_assumed else lbl + key_without_args = lbl.strip_args() + if key in model.operation_blks["gates"]: return model.operation_blks["gates"][key].to_dense() + + elif key_without_args in model.factories['layers']: + ret = _opfactory.op_from_factories(model.factories['layers'], key) + + return ret.embedded_op.to_dense() elif self._add_padded_idle: # We have idle gates that we can include. From 1d31a651194b5482fb0bc5b90dd52cb57f2bc3a5 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 16 Jul 2025 13:18:42 -0700 Subject: [PATCH 088/141] Add more tests for the forward simulator on implicit op models. --- pygsti/circuits/split_circuits_into_lanes.py | 21 + .../test_forwardsim_on_implicitop_model.py | 371 ++++++++++++++++++ 2 files changed, 392 insertions(+) create mode 100644 test/unit/objects/test_forwardsim_on_implicitop_model.py diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py index a140cbfd0..323f79dc0 100644 --- a/pygsti/circuits/split_circuits_into_lanes.py +++ b/pygsti/circuits/split_circuits_into_lanes.py @@ -74,6 +74,27 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l Split a circuit into multiple subcircuits which do not talk across lanes. """ + if "lanes" in circuit.saved_auxinfo: + # Check if the lane info matches and I can just return that set up. 
+ lane_to_qubits: dict[int, tuple[int, ...]]= {} + for qu, val in qubits_to_lanes.items(): + if val in lane_to_qubits: + lane_to_qubits[val] = (*lane_to_qubits[val], qu) + else: + lane_to_qubits[val] = (qu,) + + if len(lane_to_qubits) == len(circuit.saved_auxinfo["lanes"]): + # We may have this already in cache. + + lanes_to_gates = [[] for _ in range(len(lane_to_qubits))] + for i, key in lane_to_qubits.items(): + if sorted(key) in circuit.saved_auxinfo["lanes"]: + lanes_to_gates[i] = circuit.saved_auxinfo["lanes"][sorted(key)].layertup + + else: + raise ValueError(f"lbl cache miss: {key} in circuit {circuit}") + return lanes_to_gates + lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] num_layers = circuit.num_layers diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py new file mode 100644 index 000000000..cb91d3c07 --- /dev/null +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -0,0 +1,371 @@ +import numpy as np + +from pygsti.baseobjs import qubitgraph as _qgraph +from pygsti.baseobjs import QubitSpace +from pygsti.models import modelconstruction as pgmc +from pygsti.processors import QubitProcessorSpec +from pygsti.circuits import Circuit +from pygsti.tools import unitary_to_superop + +from pygsti.baseobjs import Label +from pygsti.modelmembers import operations as op +from pygsti.baseobjs import UnitaryGateFunction + +from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator + +def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: Circuit): + + for key, val in op_dict.items(): + assert key in exp_dict + assert np.allclose(exp_dict[key], val), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" + +#region Model Construction +def construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta): + + first_term = np.exp(alpha*1j) + left_mat = 
np.array([np.exp(-1j*beta/2), 0, 0, np.exp(1j*beta/2)]).reshape(2,2) + rotate_mat = np.array([np.cos(gamma/2), -np.sin(gamma/2), np.sin(gamma/2), np.cos(gamma/2)]).reshape(2,2) + right_mat = np.array([np.exp(-1j*delta/2), 0, 0, np.exp(1j*delta/2)]).reshape(2,2) + + my_matrix = first_term * (left_mat @ (rotate_mat @ right_mat)) + + assert np.allclose(np.conjugate( my_matrix.T) @ my_matrix, np.eye(2)) + + return my_matrix + + +class MyContinuouslyParameterizedGateFunction(UnitaryGateFunction): + shape = (2, 2) + def __call__(self, alpha, beta, gamma, delta): + return construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta) + + +class ArbParameterizedOpFactory(op.OpFactory): + def __init__(self, state_space, location:int): + op.OpFactory.__init__(self, state_space=state_space, evotype="densitymx") + self.my_state_space = state_space + self.interesting_qubit = location + + def create_object(self, args=None, sslbls=None): + # Note: don't worry about sslbls (unused) -- this argument allow factories to create different operations on different target qubits + assert(len(args) == 4) + alpha, beta, gamma, delta = args + + unitary = construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta) + + superop = unitary_to_superop(unitary) + return op.EmbeddedOp(state_space = self.my_state_space, target_labels=self.interesting_qubit, operation_to_embed=op.StaticArbitraryOp(superop)) + +def make_spam(num_qubits): + state_space = QubitSpace(num_qubits) + max_weights = {'H':1, 'S':1, 'C':1, 'A':1} + # egbn_H_only = CompleteElementaryErrorgenBasis('PP', state_space, ('H',), max_weights) + + # rho_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} + # rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + # rho_errorgen = ExpErrorgenOp(rho_lindblad) + # rho_ideal = ComputationalBasisState([0]*num_qubits) + # rho = ComposedState(rho_ideal, rho_errorgen) + + # M_errgen_rates = 
{ell: 0.0 for ell in egbn_H_only.labels} + # M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + # M_errorgen = ExpErrorgenOp(M_lindblad) + # M = ComposedPOVM(M_errorgen) + M = None + return None, M + +def make_target_model(num_qubits): + ps_geometry = _qgraph.QubitGraph.common_graph( + num_qubits, geometry='line', + directed=True, all_directions=True, + qubit_labels=tuple(range(num_qubits)) + ) + u_ecr = 1/np.sqrt(2)*np.array([[0,0,1,1j],[0,0,1j,1],[1,-1j,0,0],[-1j,1,0,0]]) + gatenames = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"] + ps = QubitProcessorSpec( + num_qubits=num_qubits, + gate_names=gatenames, + # nonstd_gate_unitaries={"Gcustom": MyContinuouslyParameterizedGateFunction(), + # 'Gecr': u_ecr, 'Gii': np.eye(4)}, + nonstd_gate_unitaries={'Gecr': u_ecr, 'Gii': np.eye(4)}, + geometry=ps_geometry + ) + # gateerrs = dict() + # egb1 = CompleteElementaryErrorgenBasis('PP', QubitSpace(1), ('H','S'), default_label_type='local') + # for gn in gatenames[:-1]: + # gateerrs[gn] = {ell: 0 for ell in egb1.labels} + # egb2 = CompleteElementaryErrorgenBasis('PP', QubitSpace(2), ('H','S'), default_label_type='local') + # gateerrs['Gecr'] = {ell: 0 for ell in egb2.labels} + # gateerrs['Gii'] = gateerrs['Gecr'] + # tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs) + + tmn = pgmc.create_crosstalk_free_model(ps, implicit_idle_mode="pad_1Q", independent_gates=True) + + # rho, M = make_spam(num_qubits) + # tmn.prep_blks['layers']['rho0'] = rho + # tmn.povm_blks['layers']['Mdefault'] = M + # tmn._rebuild_paramvec() + + + for i in range(num_qubits): + Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i,)) + tmn.factories["layers"][("Gcustom", i)] = Ga_factory # add in the factory for every qubit. 
+ + return tmn + +def build_models_for_testing(num_qubits): + + tgt_model = make_target_model(num_qubits) + + # target_model.sim.calclib = pygsti.forwardsims.mapforwardsim_calc_generic + tgt_model.sim = LCSEvalTreeMatrixForwardSimulator() + + tgt_model2 = make_target_model(num_qubits) + + return tgt_model, tgt_model2 + + +#endregion Model Construction + +#region Building Random Circuits +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): + my_circuit = [] + for lnum in range(depth_L): + layer = [] + for qnum in range(num_qubits): + gate = str(np.random.choice(allowed_gates)) + layer.append((gate, qnum)) + my_circuit.append(layer) + return Circuit(my_circuit) + +def build_circuit_with_arbitrarily_random_single_qubit_gates(num_qubits: int, depth_L: int): + + my_circuit = [] + gate_name = "Gcustom" + + full_args = np.random.random((depth_L, num_qubits, 4)) * 4 * np.pi # Need to be in [0, 2 \pi] for the half angles. + + for lnum in range(depth_L): + layer = [] + for qnum in range(num_qubits): + gate = Label(gate_name, qnum, args=(full_args[lnum, qnum])) + layer.append(gate) + my_circuit.append(layer) + return Circuit(my_circuit, num_lines=num_qubits) + +def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): + + assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. + assert lane_end_points[0] > 0 + assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order. 
+ + if lane_end_points[-1] < num_qubits: + lane_end_points.append(num_qubits) + + my_circuit = [] + n_qs_to_gates_avail = {} + for key, val in gates_to_qubits_used.items(): + if val in n_qs_to_gates_avail: + n_qs_to_gates_avail[val].append(key) + else: + n_qs_to_gates_avail[val] = [key] + + for lnum in range(depth_L): + layer = [] + start_point = 0 + + for lane_ep in lane_end_points: + num_used: int = 0 + while num_used < (lane_ep - start_point): + navail = (lane_ep - start_point) - num_used + nchosen = 0 + if navail >= max(n_qs_to_gates_avail): + # we can use any gate + nchosen = np.random.randint(1, max(n_qs_to_gates_avail) + 1) + else: + # we need to first choose how many to use. + nchosen = np.random.randint(1, navail + 1) + gate = str(np.random.choice(n_qs_to_gates_avail[nchosen])) + tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset. + perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))] + if gate == "Gcustom": + layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi))) + else: + layer.append((gate, *perm_of_qubits_used)) + num_used += nchosen + + if num_used > (lane_ep - start_point) + 1: + print(num_used, f"lane ({start_point}, {lane_ep})") + raise AssertionError("lane barrier is broken") + + start_point = lane_ep + my_circuit.append(layer) + return Circuit(my_circuit, num_lines=num_qubits) + + +def build_circuit_with_multiple_qubit_gates(num_qubits: int, depth_L: int, gates_to_qubits_used: dict[str, int], starting_qubit: int=0): + + my_circuit = [] + n_qs_to_gates_avail = {} + for key, val in gates_to_qubits_used.items(): + if val in n_qs_to_gates_avail: + n_qs_to_gates_avail[val].append(key) + else: + n_qs_to_gates_avail[val] = [key] + + for lnum in range(depth_L): + layer = [] + num_used: int = 0 + while num_used < num_qubits: + navail = num_qubits - num_used + nchosen = 0 + if navail >= max(n_qs_to_gates_avail): + # we can use any gate + nchosen = np.random.randint(1, 
#region Probabilities Consistency tests

def test_tensor_product_single_unitaries_yield_right_results():
    """Single-qubit X/Y/Z/I layers on every qubit match the reference model's probabilities."""

    num_qubits = 4

    under_test, expected_model = build_models_for_testing(num_qubits)

    circuitNone = Circuit([], num_lines=num_qubits)
    circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
    circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits)
    circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits)
    circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits)

    for cir in [circuitNone, circuitX, circuitY, circuitZ, circuitIdle]:
        probs = under_test.probabilities(cir)
        exp = expected_model.probabilities(cir)

        assert_probability_densities_are_equal(probs, exp, cir)

def test_tensor_product_single_unitaries_random_collection_of_xyz():
    """Random depth-100 circuits of X/Y/Z/I gates agree between the two models for 2-5 qubits."""

    for qb in range(2, 6):

        under_test, expected_model = build_models_for_testing(qb)
        allowed_gates = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi']

        circuit100 = build_circuit(qb, 100, allowed_gates=allowed_gates)

        probs = under_test.probabilities(circuit100)
        exp = expected_model.probabilities(circuit100)

        assert_probability_densities_are_equal(probs, exp, circuit100)

def test_tensor_product_arbitrarily_random_rotations():
    """Circuits of arbitrary random single-qubit rotations agree between the two models."""

    for qb in range(2, 6):

        under_test, expected_model = build_models_for_testing(qb)

        circuit = build_circuit_with_arbitrarily_random_single_qubit_gates(qb, 10)

        probs = under_test.probabilities(circuit)
        exp = expected_model.probabilities(circuit)

        assert_probability_densities_are_equal(probs, exp, circuit)

def test_tensor_product_two_qubit_gates():
    """Gecr in both qubit orders (with single-qubit spectators) agrees between the two models."""

    num_qubits = 4

    under_test, expected_model = build_models_for_testing(num_qubits)

    circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]])
    circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]])

    for cir in [circuitECR01, circuitECR10]:
        probs = under_test.probabilities(cir)
        exp = expected_model.probabilities(cir)

        assert_probability_densities_are_equal(probs, exp, cir)

def test_tensor_product_gates_with_implicit_idles():
    """Layers that leave some qubits implicitly idle agree between the two models."""

    num_qubits = 5

    under_test, expected_model = build_models_for_testing(num_qubits)

    gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"]
    for gate in gatenames:
        for i in range(num_qubits):
            cir = Circuit([[(gate, i)]], num_lines=num_qubits)

            probs = under_test.probabilities(cir)
            exp = expected_model.probabilities(cir)
            assert_probability_densities_are_equal(probs, exp, cir)

    # Now for the two qubit gates. Gecr and GCNOT

    # gatenames = ["Gecr", "Gcnot"]
    gatenames = ["Gecr"]
    for gate in gatenames:
        for i in range(num_qubits - 1):
            cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits)

            probs = under_test.probabilities(cir)
            exp = expected_model.probabilities(cir)
            assert_probability_densities_are_equal(probs, exp, cir)

            # Order swapped.
            cir = Circuit([[(gate, i + 1, i)]], num_lines=num_qubits)

            probs = under_test.probabilities(cir)
            exp = expected_model.probabilities(cir)
            assert_probability_densities_are_equal(probs, exp, cir)

def test_tensor_product_multi_qubit_gates_arbitrarily_random_rotations():
    """Random circuits mixing 1- and 2-qubit gates (incl. parameterized Gcustom) agree between models."""

    gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gcustom': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2}
    for qb in range(3, 6):

        under_test, expected_model = build_models_for_testing(qb)

        circuit = build_circuit_with_multiple_qubit_gates(qb, 100, gates_to_qubits_used=gates_to_used_qubits)

        # Fix: removed a stray debug `print(circuit)` that was committed here and
        # spammed the test output on every parameterized iteration.
        probs = under_test.probabilities(circuit)
        exp = expected_model.probabilities(circuit)

        assert_probability_densities_are_equal(probs, exp, circuit)

def test_tensor_product_multi_qubit_gates_with_structured_lanes():
    """Random circuits restricted to designated lanes agree between the two models."""

    gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2}
    for qb in range(5, 6):

        lanes = [1, 2, 4]

        under_test, expected_model = build_models_for_testing(qb)

        circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, 100, lanes, gates_to_qubits_used=gates_to_used_qubits)

        probs = under_test.probabilities(circuit)
        exp = expected_model.probabilities(circuit)

        assert_probability_densities_are_equal(probs, exp, circuit)

#endregion Probabilities Consistency tests
    def _cache_tensor_lanes(self, sub_circuit_list: list[_Label],
                            lane_to_qubits: dict[int, tuple[int, ...]]) -> Circuit:
        """
        Store the tensor lanes in the circuit if appropriate.

        Note that this should only be called in the case that the sub_circuit_list
        when tensored is equivalent to the current circuit.

        Parameters
        ----------
        sub_circuit_list : list[_Label]
            Per-lane label sequences; entry ``i`` is the sub-circuit for lane ``i``
            of `lane_to_qubits`.
        lane_to_qubits : dict[int, tuple[int, ...]]
            Map from lane index to the qubit labels that lane covers.

        Returns
        -------
        Circuit
            This circuit (`self`), whether or not the cache was updated.
        """

        # Only refresh the cache when lane info already exists and currently records
        # the whole register as a single lane; empty circuits (len(self) == 0) are
        # left untouched.
        if "lanes" in self.saved_auxinfo and len(self) > 0:
            if len(self.saved_auxinfo["lanes"]) == 1:
                # We will update this because it is now believed that
                # we are able to conduct the operation in cross talk free lanes.
                qubits_used = set()
                for qub_in_lane in lane_to_qubits.values():
                    qubits_used = qubits_used.union(qub_in_lane)

                if len(qubits_used) != self.num_lines:
                    # Do not update: the proposed lanes do not cover every line of this
                    # circuit, so tensoring the sub-circuits could not reproduce it.
                    return self

                self.saved_auxinfo["lanes"] = {}  # Reset lanes info
                # Key each cached sub-circuit by its sorted qubit labels so lookups are
                # order-independent.
                for i, qubit_labels in lane_to_qubits.items():
                    self.saved_auxinfo["lanes"][tuple(sorted(qubit_labels))] = sub_circuit_list[i]

        return self
lanes_to_gates = [[] for _ in range(len(lane_to_qubits))] for i, key in lane_to_qubits.items(): - if sorted(key) in circuit.saved_auxinfo["lanes"]: - lanes_to_gates[i] = circuit.saved_auxinfo["lanes"][sorted(key)].layertup + if tuple(sorted(key)) in circuit.saved_auxinfo["lanes"]: + lanes_to_gates[i] = circuit.saved_auxinfo["lanes"][tuple(sorted(key))] else: raise ValueError(f"lbl cache miss: {key} in circuit {circuit}") return lanes_to_gates - lanes_to_gates = [[] for _ in range(_np.unique(list(qubits_to_lanes.values())).shape[0])] + lanes_to_gates = [[] for _ in range(_np.unique(list(qubit_to_lanes.values())).shape[0])] num_layers = circuit.num_layers for layer_ind in range(num_layers): @@ -108,7 +104,7 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l # We need this to be sorted by the qubit number so we do not get that a lane was split Q1 Q3 Q2 in the layer where Q1 and Q2 are in the same lane. qubits_used = op.qubits # This will be a list of qubits used. # I am assuming that the qubits are indexed numerically and not by strings. - lane = qubits_to_lanes[qubits_used[0]] + lane = qubit_to_lanes[qubits_used[0]] if group_lane is None: group_lane = lane @@ -124,6 +120,9 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l # We have a left over group. 
def test_subcircuits_splits_can_create_empty_sub_circuit():
    """Splitting an empty one-qubit circuit yields a single lane that round-trips to the same circuit."""

    empty = _Circuit([], line_labels=[0])
    qubit_lane_map = {0: 0}
    lane_qubit_map = {0: (0,)}

    lanes = compute_subcircuits(empty, qubit_lane_map, lane_qubit_map)

    assert empty == _Circuit(lanes[0], line_labels=[0])

def test_subcircuits_split_can_be_cached():
    """Splitting with cache_lanes_in_circuit=True refreshes the circuit's lane cache, one entry per lane."""

    gate_arity = {"X": 1, "Y": 1, "Z": 1, "CNOT": 2, "CZ": 2}

    n_layers = 10
    n_qubits = 6

    lane_endpoints = [1, 2, 4, 5]
    # Expected lane partition: (0,), (1,), (2, 3), (4,), (5,).
    # The circuit is random, so the realized lanes may not match this exactly.
    rand_circ = build_circuit_with_multiple_qubit_gates_with_designated_lanes(n_qubits, n_layers, lane_endpoints, gate_arity)

    qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(rand_circ)

    # Before splitting, the entire register is cached as one lane.
    assert "lanes" in rand_circ.saved_auxinfo
    assert list(rand_circ.saved_auxinfo["lanes"].keys()) == [(0, 1, 2, 3, 4, 5)]

    lane_circs = compute_subcircuits(rand_circ, qubit_to_lane, lane_to_qubits, cache_lanes_in_circuit=True)

    # After splitting with caching enabled, the cache holds one entry per discovered lane.
    assert len(rand_circ.saved_auxinfo["lanes"].keys()) == len(lane_circs)
def matrix_matrix_cost_estimate(matrix_size: tuple[int, int]) -> int:
    """
    Return a FLOP estimate for the dense product A @ B of two square matrices.

    An n x n product has n**2 output entries, each requiring n multiplies and
    n additions, for 2 * n**3 floating point operations in total.
    """
    side_length = matrix_size[0]
    return 2 * side_length ** 3
+ """ + + assert matrix_size[0] == matrix_size[1] + + total_flop_cost = 0 + for cache_ind in self.cache: + num_mm_on_this_cache_line = len(self.cache[cache_ind]) - 1 + total_flop_cost += (matrix_matrix_cost_estimate(matrix_size)) * num_mm_on_this_cache_line + + return total_flop_cost + class CollectionOfLCSEvalTrees(): @@ -841,7 +863,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir - self.cir_id_to_tensor_order = {} + self.cir_id_to_tensor_order: dict[int, list[list[int], int]] = {} self.compute_tensor_orders() self.saved_results = {} @@ -893,7 +915,7 @@ def reconstruct_full_matrices(self): # Need a map from lane id to computed location. for icir in range(num_cirs): - order = self.cir_id_to_tensor_order[icir] + order, _cost_estimate = self.cir_id_to_tensor_order[icir] while order: @@ -925,7 +947,13 @@ def compute_tensor_orders(self): return - def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): + def best_order_for_tensor_contraction(self, + qubit_list: tuple[int, ...], + cache: dict[tuple[int, ...], tuple[list[int], int]]) -> tuple[list[int], int]: + """ + Find the tensor contraction order that minizes the cost of contracting to a dense system with + a total number of qubits equal to the len(qubit_list) + """ if qubit_list in cache: @@ -961,9 +989,9 @@ def best_order_for_tensor_contraction(self, qubit_list: tuple[int, ...], cache): best_order = list(order) # Store off the information. 
    def _tensor_cost_model(self, num_qubits1, num_qubits2):
        """
        Cost model for tensoring superoperators on `num_qubits1` and `num_qubits2` qubits.

        Each factor is a dense (4**n x 4**n) matrix, so the kron-product output has
        (4**num_qubits1)**2 * (4**num_qubits2)**2 entries.
        """
        return (4**num_qubits1)**2 * (4**num_qubits2)**2

    def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[int, list[int], list[int]]:
        """
        Compute the number of flops needed to collapse each circuit into a single process matrix.

        Returns:
        ---------
        cost - int total cost to collapse and reform
        collapse_lane_cost - list[int] cost to collapse a lane
        tensor_cost - list[int] cost to recombine a circuit into its full size.
        """

        # NOTE(review): by its name this dict is keyed by (circuit id, lane id) pairs,
        # in which case len() counts pairs rather than circuits and the range() loop
        # below would over-index cir_id_to_tensor_order — confirm how it is built.
        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)

        collapse_lane_cost = []

        # Each lane tree multiplies dense superoperators of size 4**k x 4**k, where
        # k is the number of qubits in the lane (the length of its line-label key).
        for lbl_key, my_tree in self.trees.items():
            collapse_lane_cost.append(my_tree.flop_cost_of_evaluating_tree([4**len(lbl_key), 4**len(lbl_key)]))

        tensor_cost = []
        for icir in range(num_cirs):

            # The cached tensor-order entry is (contraction order, cost); only the
            # pre-computed cost is needed here.
            _order, cost = self.cir_id_to_tensor_order[icir]
            tensor_cost.append(cost)

        return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost
@@ -386,6 +387,14 @@ def add_expanded_circuits(indices, add_to_this_dict): elindex_outcome_tuples[unique_i].append((elindex, outcome)) # *local* element indices self.elindex_outcome_tuples = elindex_outcome_tuples + + print("Flop cost to evaluate the tree once: ", self.tree._flop_estimate_to_collapse_to_each_circuit_to_process_matrix()[0]) + + num_circs = len(cir_ind_and_lane_id_to_sub_cir) + num_rho_and_em = len(self.indices_by_spamtuple.keys()) + num_qubits_in_circuit = unique_circuits[0].num_lines + print("Flop cost for : ", (2*(4**num_qubits_in_circuit)**2)*num_circs*num_rho_and_em) + super().__init__(element_slice, num_elements) def nonscratch_cache_view(self, a, axis=None): From fe062e7b9d30a7fe33ab48a119ada5f19df5e52c Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 16:07:53 -0700 Subject: [PATCH 091/141] simplify switch_circuits_by_germ_power_only by introducing a new helper function in split_circuits_into_lanes.py --- pygsti/circuits/circuit.py | 3 +- pygsti/circuits/split_circuits_into_lanes.py | 67 ++++++++++++- .../crosstalkfreeexperimentdesign.py | 97 ++++--------------- 3 files changed, 83 insertions(+), 84 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index e6b1cae46..9085a0104 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -13,6 +13,7 @@ from __future__ import annotations import itertools as _itertools import warnings as _warnings +from typing import List, Sequence import numpy as _np from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel @@ -513,7 +514,7 @@ def _fastinit(cls, labels, line_labels, editable, name='', stringrep=None, occur #Note: If editing _bare_init one should also check _copy_init in case changes must be propagated. 
def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None, compilable_layer_indices_tup=()): - self._labels = labels + self._labels : List[_Label | Sequence[_Label]] = labels self._line_labels = tuple(line_labels) self._occurrence_id = occurrence self._compilable_layer_indices_tup = compilable_layer_indices_tup # always a tuple, but can be empty. diff --git a/pygsti/circuits/split_circuits_into_lanes.py b/pygsti/circuits/split_circuits_into_lanes.py index 323f79dc0..e0b8abfc7 100644 --- a/pygsti/circuits/split_circuits_into_lanes.py +++ b/pygsti/circuits/split_circuits_into_lanes.py @@ -1,14 +1,16 @@ import numpy as _np -from pygsti.circuits import Circuit as _Circuit +from typing import Sequence, Dict, Tuple, Optional, Set +from pygsti.circuits import Circuit as Circuit from pygsti.baseobjs.label import Label, LabelTupTup -def compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit: _Circuit) -> tuple[dict[int, int], + +def compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(circuit: Circuit) -> tuple[dict[int, int], dict[int, tuple[int]]]: """ Parameters: ------------ - circuit: _Circuit - the circuit to compute qubit to lanes mapping for + circuit: Circuit - the circuit to compute qubit to lanes mapping for num_qubits: int - The total number of qubits expected in the circuit. @@ -69,7 +71,7 @@ def compute_qubits_to_lanes(lanes_to_qubits: dict[int, set[int]]) -> dict[int, i return compute_qubits_to_lanes(lanes), lanes -def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: +def compute_subcircuits(circuit: Circuit, qubits_to_lanes: dict[int, int]) -> list[list[LabelTupTup]]: """ Split a circuit into multiple subcircuits which do not talk across lanes. 
""" @@ -127,4 +129,59 @@ def compute_subcircuits(circuit: _Circuit, qubits_to_lanes: dict[int, int]) -> l if num_layers == 0: return lanes_to_gates - return lanes_to_gates \ No newline at end of file + return lanes_to_gates + + +@staticmethod +def batch_tensor( + circuits : Sequence[Circuit], + layer_mappers: Dict[int, Dict], + global_line_order: Optional[Tuple[int,...]] = None, + target_lines : Optional[Sequence[Tuple[int,...]]] = None + ) -> Circuit: + """ + """ + assert len(circuits) > 0 + + if target_lines is None: + target_lines = [] + total_lines = 0 + max_cir_len = 0 + for c in circuits: + target_lines.append(tuple(range(total_lines, total_lines + c.num_lines))) + total_lines += c.num_lines + max_cir_len = max(max_cir_len, len(c)) + else: + total_lines = sum([c.num_lines for c in circuits]) + max_cir_len = max(*[len(c) for c in circuits]) + + s : Set[int] = set() + for c, t in zip(circuits, target_lines): + assert not s.intersection(t) + assert len(t) == c.num_lines + s.update(t) + + if global_line_order is None: + global_line_order = tuple(sorted(list(s))) + + c = circuits[0].copy(editable=True) + c._append_idling_layers_inplace(max_cir_len - len(c)) + c.done_editing() + # ^ That changes the format of c._labels. We need to edit c while in this format, + # so the next line sets c._static = False. (We repeat this pattern in the loop below.) 
+ c._static = False + c._labels = [layer_mappers[c.num_lines][ell] for ell in c._labels] + c.map_state_space_labels_inplace({k:v for k,v in zip(c.line_labels, target_lines[0])}) + c.done_editing() + for i, c2 in enumerate(circuits[1:]): + c2 = c2.copy(editable=True) + c2._append_idling_layers_inplace(max_cir_len - len(c2)) + c2.done_editing() + c2._static = False + c2._labels = [layer_mappers[c2.num_lines][ell] for ell in c2._labels] + c2.map_state_space_labels_inplace({k:v for k,v in zip(c2.line_labels, target_lines[i+1])}) + c2.done_editing() + c = c.tensor_circuit(c2) + + c = c.reorder_lines(global_line_order) + return c diff --git a/pygsti/protocols/crosstalkfreeexperimentdesign.py b/pygsti/protocols/crosstalkfreeexperimentdesign.py index 43e0436d7..0e94d1661 100644 --- a/pygsti/protocols/crosstalkfreeexperimentdesign.py +++ b/pygsti/protocols/crosstalkfreeexperimentdesign.py @@ -2,6 +2,7 @@ from pygsti.protocols import CircuitListsDesign, HasProcessorSpec from pygsti.circuits.circuitlist import CircuitList from pygsti.circuits.circuit import Circuit +from pygsti.circuits.split_circuits_into_lanes import batch_tensor from pygsti.baseobjs.label import Label import copy @@ -77,9 +78,10 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, tmp = [None, None] tmp[tgt] = k2 tmp[1-tgt] = Label("Gi", 1-tgt) - m2q[k2] = Label(tuple(tmp)) + m2q[k2] = tuple(tmp) mapper_2q = m2q # Reset here. + layer_mappers = {1: mapper_1q, 2: mapper_2q} num_lines = -1 global_line_order = None @@ -127,97 +129,36 @@ def stitch_circuits_by_germ_power_only(color_patches: dict, vertices: list, if edge_perms.size > 0 and node_perms.size > 0: assert edge_perms.shape[1] == node_perms.shape[1] - # compute any padding we might need. - padded_circuit_lengths = np.zeros((max_len,), int) + # Form the tensor product circuits, over all qubits. 
for j in range(max_len): - clen = 0 + tensored_lines = [] + circs_to_tensor = [] if len(edge_perms): edge_start = 1 node_start = 0 c = twoq_circuits[edge_perms[0,j]] - clen = max(clen, len(c)) + tensored_lines.append(edge_set[0]) else: edge_start = 0 node_start = 1 c = oneq_circuits[node_perms[0,j]] - clen = max(clen, len(c)) + tensored_lines.append((unused_qubits[0],)) + circs_to_tensor.append(c) for i in range(edge_start, edge_perms.shape[0]): c2 = twoq_circuits[ edge_perms[i,j] ] - clen = max(clen, len(c2)) + circs_to_tensor.append( c2 ) + tensored_lines.append(edge_set[i]) for i in range(node_start, node_perms.shape[0]): c2 = oneq_circuits[ node_perms[i,j] ] - clen = max(clen, len(c2)) - padded_circuit_lengths[j] = clen - - for j in range(max_len): - # Pick the initial subcircuit - clen = padded_circuit_lengths[j] - if len(edge_perms): - edge_start = 1 - node_start = 0 - c = twoq_circuits[edge_perms[0,j]].copy(True) - c._append_idling_layers_inplace(clen - len(c)) - c.done_editing() - # ^ That changes the format of c._labels. We need to make more edits in this format, - # so in the next line we set c._static = False. - c._static = False - c._labels = [mapper_2q[ell] for ell in c._labels] - c.done_editing() - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[0])} - c = c.map_state_space_labels(map_dict) - else: - edge_start = 0 - node_start = 1 - c = oneq_circuits[node_perms[0,j]].copy(True) - c._append_idling_layers_inplace(clen - len(c)) - c.done_editing() - # ^ That changes the format of c._labels. We need to make more edits in this format, - # so in the next line we set c._static = False. 
- c._static = False - c._labels = [mapper_1q[ell] for ell in c._labels] - c.done_editing() - map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[0],))} - c = c.map_state_space_labels(map_dict) - - - # Tensor together the other subcircuits - for i in range(edge_start, edge_perms.shape[0]): - c2 = twoq_circuits[ edge_perms[i,j] ].copy(True) # Fix col - c2._append_idling_layers_inplace(clen - len(c2)) - c2.done_editing() - # ^ That changes the format of c2._labels. We need to make more edits in this format, - # so in the next line we set c2._static = False. - c2._static = False - c2._labels = [mapper_2q[ell] for ell in c2._labels] - c2.done_editing() - assert Label(()) not in c2._labels - map_dict = {oldq: newq for oldq, newq in zip(twoq_gstdesign.qubit_labels, edge_set[i])} - c2 = c2.map_state_space_labels(map_dict) - c = c.tensor_circuit(c2) # c is already a copy due to map_state_space_labels above - - for i in range(node_start, node_perms.shape[0]): - c2 = oneq_circuits[ node_perms[i,j] ].copy(True) - c2._append_idling_layers_inplace(clen - len(c2)) - c2.done_editing() - # ^ That changes the format of c2._labels. We need to make more edits in this format, - # so in the next line we set c2._static = False. 
- c2._static = False - c2._labels = [mapper_1q[ell] for ell in c2._labels] - c2.done_editing() - assert Label(()) not in c2._labels - map_dict = {oldq: newq for oldq, newq in zip(oneq_gstdesign.qubit_labels, (unused_qubits[i],))} - c2 = c2.map_state_space_labels(map_dict) - c = c.tensor_circuit(c2) # c is already a copy due to map_state_space_labels above - - for i in range(c.num_layers): - l0 = set(c.layer(i)) - l1 = set(c.layer_with_idles(i)) + circs_to_tensor.append( c2 ) + tensored_lines.append((unused_qubits[i],)) + c_ten = batch_tensor(circs_to_tensor, layer_mappers, global_line_order, tensored_lines) + for i in range(c_ten.num_layers): + l0 = set(c_ten.layer(i)) + l1 = set(c_ten.layer_with_idles(i)) assert l0 == l1 - - circuit_lists[L].append(c.reorder_lines(global_line_order)) - - - aux_info[c] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO + circuit_lists[L].append(c_ten) + aux_info[c_ten] = {'edges': edge_set, 'vertices': unused_qubits} #YOLO return circuit_lists, aux_info From c2a2393faaec8cfa9cbf853d99dc07e48138c3c7 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 17:44:33 -0700 Subject: [PATCH 092/141] WIP resolution to the problem of implicit idle gates when circuits dont come from CrosstalkFreeExperimentDesign --- pygsti/circuits/circuit.py | 6 +++--- pygsti/forwardsims/matrixforwardsim.py | 29 ++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index d15a6df6b..557cdbd01 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -13,7 +13,7 @@ from __future__ import annotations import itertools as _itertools import warnings as _warnings -from typing import List, Sequence +from typing import List, Sequence, Literal import numpy as _np from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel @@ -1026,7 +1026,7 @@ def num_lines(self): """ return len(self._line_labels) - def copy(self, 
    def create_layout(self, circuits : Sequence[_Circuit] | _CircuitList, dataset=None, resource_alloc=None,
                      array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None):
        """
        Create a COPA layout for `circuits`, delegating to the base-class
        implementation with ``use_old_tree_style=False`` (the LCS tree style).

        NOTE(review): the "sanitized_circuits" loop below is currently a no-op —
        both substitution attempts are commented out and each circuit is appended
        unchanged — so implicit idles are NOT yet resolved here. The commented
        attempts and TODO/IDEA notes record the intended future work.
        """
        # replace implicit idles.
        # from pygsti.layouts.evaltree import _add_in_idle_gates_to_circuit
        # model_idle_key = Label(())  # not true in general.
        sanitized_circuits = []
        for i, c in enumerate(circuits):
            if len(c) > 0:
                # # Attempt 1: Broken
                # c = c.copy(True)
                # c.replace_gatename_inplace([], model_idle_key)
                # c.replace_gatename_inplace(Label(()), model_idle_key)
                # c.done_editing()
                # # Attempt 2: Broken
                # c = _add_in_idle_gates_to_circuit(c, model_idle_key)
                # TODO: try yet another thing.
                # IDEA: define a function in the ExplicitModel class that parses a circuit
                # and returns one with suitably substituted explicit idles. This seems
                # like something that only a model can be expected to resolve.
                pass
            sanitized_circuits.append(c)
        return super().create_layout(sanitized_circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False)
""" @@ -481,7 +481,7 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str = "I") def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], - implicit_idle_gate_name: str = "I") -> tuple[list[dict[int, int]], + implicit_idle_gate_name: str|Label = 'I') -> tuple[list[dict[int, int]], dict[tuple[LabelTupTup], list[tuple[int, int]]], dict[tuple[int, ...], list[LabelTupTup]]]: """ From 806a78f4bcb881dc903823e9e6b3222dcd928a7c Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Wed, 16 Jul 2025 17:46:55 -0700 Subject: [PATCH 094/141] raise an error when we should --- pygsti/models/explicitmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygsti/models/explicitmodel.py b/pygsti/models/explicitmodel.py index 1dec5eb29..91526a0ad 100644 --- a/pygsti/models/explicitmodel.py +++ b/pygsti/models/explicitmodel.py @@ -1807,7 +1807,7 @@ def get_dense_process_matrix_represention_for_gate(self, model: ExplicitOpModel, """ if lbl not in model.operations: - return _np.empty(1) + raise KeyError(f'Operation with lable {lbl} not found in model.operations.') operation = model.operations[lbl] From 0cc94642fbd3041da175dea7693aa4f7b86764f9 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 16 Jul 2025 17:52:18 -0700 Subject: [PATCH 095/141] nicks changes. 
--- pygsti/forwardsims/matrixforwardsim.py | 62 +++++- pygsti/layouts/evaltree.py | 273 ++++++++++++++++++++++++- test/unit/objects/test_forwardsim.py | 3 +- 3 files changed, 327 insertions(+), 11 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index a01d88755..098ac07bc 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -38,8 +38,11 @@ from pygsti.tools.internalgates import internal_gate_unitaries from pygsti.tools.optools import unitary_to_superop from pygsti.baseobjs.label import LabelTup, LabelTupTup, Label +<<<<<<< Updated upstream from typing import Sequence +======= +>>>>>>> Stashed changes _dummy_profiler = _DummyProfiler() @@ -1256,6 +1259,7 @@ def _probs_from_rho_e(self, rho, e, gs, scale_vals): # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals + breakpoint() return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals # shape == (len(circuit_list),) ; may overflow but OK @@ -2219,17 +2223,59 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto Gs = layout_atom.tree.reconstruct_full_matrices() + sp_val, povm_vals, element_indices, tree_indices = self._rho_es_elm_inds_and_tree_inds_from_spam_tuples(layout_atom) old_err = _np.seterr(over='ignore') - for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): - # "element indices" index a circuit outcome probability in array_to_fill's first dimension - # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond - # to the the element indices when `spamtuple` is used. 
- # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) - rho, E = self._rho_e_from_spam_tuple(spam_tuple) - _fas(array_to_fill, [element_indices], - self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) + # for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + # # "element indices" index a circuit outcome probability in array_to_fill's first dimension + # # "tree indices" index a quantity for a no-spam circuit in a computed cache, which correspond + # # to the the element indices when `spamtuple` is used. + # # (Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller) + # rho, E = self._rho_e_from_spam_tuple(spam_tuple) + # _fas(array_to_fill, [element_indices], + # self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) + _fas(array_to_fill, [element_indices], self._probs_from_rho_e(sp_val, povm_vals, Gs[tree_indices], 1)) _np.seterr(**old_err) + def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): + """ + Compute the probabilities from rho, a set of povms, the circuits defined by gs, and then scale appropriately. + """ + + assert e.ndim == 2 + assert e[0] > 1 + return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. + + return super()._probs_from_rho_e(rho, e, gs, scale_vals) + + def _rho_es_elm_inds_and_tree_inds_from_spam_tuples(self, + layout_atom: _MatrixCOPALayoutAtomWithLCS) -> tuple[_np.ndarray, _np.ndarray]: + """ + Assumes one state prep and many measurements. + + We assume that there will be only one set of tree indices used throughout. + Also, we assume that we can find a slice + """ + + sp_val = None + povm_vals: list[Label] = [] + elm_inds: list[slice] = [] # This will get collapsed since we are assuming that each povm appears once. 
+ tree_inds: list[slice] = [] # I am assuming that this will + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + if spam_tuple[0] != sp_val and sp_val is not None: + raise ValueError("More than one state prep is being used.") + else: + sp_val = spam_tuple[0] + + + povm_vals.append(spam_tuple[1]) + elm_inds.append(element_indices) + tree_inds.append(tree_indices) + + sp_val, povm_vals = self._rho_es_from_spam_tuples(sp_val, povm_vals) + povm_vals = _np.vstack(povm_vals) + tree_inds = _np.unique(tree_inds)[0] + return sp_val, povm_vals, elm_inds, tree_inds + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index c1145c82d..7e605f666 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -28,7 +28,10 @@ from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time from typing import Iterable - +import numpy as np +import scipy.linalg as la +import scipy.sparse.linalg as sparla +from typing import List def _walk_subtree(treedict, indx, running_inds): running_inds.add(indx) @@ -883,6 +886,9 @@ def collapse_circuits_to_process_matrices(self, model): self.sub_cir_to_ind_in_results[key] = out2 def reconstruct_full_matrices(self): + """ + Construct a + """ if len(self.saved_results) == 0: return @@ -1025,4 +1031,267 @@ def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[ _order, cost = self.cir_id_to_tensor_order[icir] tensor_cost.append(cost) - return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost \ No newline at end of file + return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost + + + + + +class RealLinOp: + + # Function implementations below are merely defaults. 
+ # Don't hesitate to override them if need be. + + __array_priority__ = 100 + + @property + def ndim(self): + return 2 + + @property + def size(self): + return self._size + + @property + def shape(self): + return self._shape + + @property + def dtype(self): + return self._dtype + + @property + def T(self): + return self._adjoint + + def item(self): + # If self.size == 1, return a scalar representation of this linear operator. + # Otherwise, error. + raise NotImplementedError() + + def __matmul__(self, other): + return self._linop @ other + + def __rmatmul__(self, other): + return other @ self._linop + + +def is_2d_square(arg): + if not hasattr(arg, 'shape'): + return False + if len(arg.shape) != 2: + return False + return arg.shape[0] == arg.shape[1] + + +class InvTriangular(RealLinOp): + """ + NOTE: can avoid relying on sparla.LinearOperator since we can implement matmul and rmatmul directly. + """ + + def __init__(self, A : np.ndarray, lower: bool, adjoint=None): + assert is_2d_square(A) + self.lower = lower + self.A = A + self._size = A.shape[0]**2 + self._shape = A.shape + self._dtype = A.dtype + self._adjoint = InvTriangular(A.T, not self.lower, self) if adjoint is None else adjoint + + def item(self): + return 1 / self.A.item() + + def __matmul__(self, other): + return la.solve_triangular(self.A, other, trans=0, lower=self.lower, check_finite=False) + + def __rmatmul__(self, other): + return la.solve_triangular(self.A, other.T, trans=1, lower=self.lower, check_finite=False).T + + +class InvPosDef(RealLinOp): + """ + NOTE: can avoid relying on sparla.LinearOperator since we can implement matmul and rmatmul directly. + """ + + def __init__(self, A: np.ndarray): + assert is_2d_square(A) + self.A = A + self._size = A.shape[0]**2 + self._shape = A.shape + self._dtype = A.dtype + self._chol = la.cho_factor(self.A) + + @property + def T(self): + # override the default implementation, since we're self-adjoint. 
+ return self + + def item(self): + return 1 / self.A.item() + + def __matmul__(self, other): + return la.cho_solve(self._chol, other, check_finite=False) + + def __rmatmul__(self, other): + temp = self.__matmul__(other.T) + out = temp.T + return out + + +class InvUpdatedKronPosDef(RealLinOp): + """ + A representation of a positive definite linear operator + + M = inv( K + U U' ), + + where K is a positive definite matrix with known Kronecker product + structure and U is a tall-and-thin matrix. + + This linear operator's action is implemented by precomputing some + intermediate quantities at construction time and then using those + quantifies in the Woodbury matrix identity. Specifically, we precompute + + 1. an implicit representation of L = cho_factor(K, lower=True), + 2. an explicit representation of V = inv(L) @ U, + 3. a factored representation of W = I + V'V, + + and then we use the formula + + M = inv(L') (I - V @ inv(W) @ V') @ inv(L). + + The essence of this method can be preserved with different factorizations + for K. For example, instead of computing L = cho_factor(K, lower=True), + we could compute P = pinv(sqrtm(K)) and substitute P wherever inv(L) or + inv(L') were used. + """ + + def verify(self): + """ + If P = LL' + U U', then this operator is supposed to represent M = inv(P). + This function checks if self @ P is nearly the identity matrix. 
+ """ + explicit_K = np.eye(1) + for kf in self.kron_factors: + explicit_K = np.kron(explicit_K, kf) + explicit_P = explicit_K + self.U @ self.U.T + expect_I = self @ explicit_P + nrmP = la.norm(explicit_P) + I = np.eye(self.shape[0]) + rel_tol = np.finfo(self.dtype).eps * nrmP + abs_tol = np.finfo(self.dtype).eps ** 0.5 + tol = max(rel_tol, abs_tol) + assert la.norm(I - expect_I) <= tol + + + def __init__(self, kron_factors : List[np.ndarray], U: np.ndarray, verify=False): + K_cho_factors = [] + dim = 1 + for kf in kron_factors: + K_cho_factors.append(la.cho_factor(kf, lower=True)[0]) + dim *= kf.shape[0] + assert dim == U.shape[0] + self.K_cho_factors = K_cho_factors + invL_kron_factors = [InvTriangular(lf, lower=True) for lf in K_cho_factors] + self.invL = KronStructured(invL_kron_factors) + + dim_update = U.shape[1] + self.V = self.invL @ U + self.W = np.eye(dim_update) + self.V.T @ self.V + self.chol_W = la.cho_factor(self.W) + self._size = dim * dim + self._shape = (dim, dim) + self._dtype = self.invL.dtype + if verify: + self.kron_factors = kron_factors + self.U = U + self.verify() + else: + self.U = None + self.kron_factors = None + self.verified = verify + pass + + @property + def T(self): + return self + + def __matmul__(self, other): + temp1 = self.invL @ other + temp2 = self.V.T @ temp1 + temp3 = la.cho_solve(self.chol_W, temp2) + temp4 = self.V @ temp3 + out = self.invL.T @ (temp1 - temp4) + return out + + def __rmatmul__(self, other): + # use the fact that we're self-adjoint. 
+ temp = self @ other.T + out = temp.T + return out + + +class DyadicKronStructed(RealLinOp): + + def __init__(self, A, B, adjoint=None): + assert A.ndim == 2 + assert B.ndim == 2 + self.A = A + self.B = B + self._A_is_trivial = A.size == 1 + self._B_is_trivial = B.size == 1 + self._shape = ( A.shape[0]*B.shape[0], A.shape[1]*B.shape[1] ) + self._size = self.shape[0] * self.shape[1] + self._fwd_matvec_core_shape = (B.shape[1], A.shape[1]) + self._adj_matvec_core_shape = (B.shape[0], A.shape[0]) + self._dtype = A.dtype + self._linop = sparla.LinearOperator(dtype=self.dtype, shape=self.shape, matvec=self.matvec, rmatvec=self.rmatvec) + self._adjoint = DyadicKronStructed(A.T, B.T, adjoint=self) if adjoint is None else adjoint + + def item(self): + # This will raise a ValueError if self.size > 1. + return self.A.item() * self.B.item() + + def matvec(self, other): + inshape = other.shape + assert other.size == self.shape[1] + if self._A_is_trivial: + return self.A.item() * (self.B @ other) + if self._B_is_trivial: + return self.B.item() * (self.A @ other) + out = self.B @ np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T + out = np.reshape(out, inshape, order='F') + return out + + def rmatvec(self, other): + inshape = other.shape + assert other.size == self.shape[0] + if self._A_is_trivial: + return self.A.item() * (self.B.T @ other) + if self._B_is_trivial: + return self.B.item() * (self.A.T @ other) + out = self.B.T @ np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A + out = np.reshape(out, inshape, order='F') + return out + + @staticmethod + def build_polyadic(kron_operands): + if len(kron_operands) == 2: + out = DyadicKronStructed(kron_operands[0], kron_operands[1]) + return out + # else, recurse + arg = DyadicKronStructed.build_polyadic(kron_operands[1:]) + out = DyadicKronStructed(kron_operands[0], arg) + return out + + +class KronStructured(RealLinOp): + + def __init__(self, kron_operands): + self.kron_operands = kron_operands 
+ assert all([op.ndim == 2 for op in kron_operands]) + self.shapes = np.array([op.shape for op in kron_operands]) + self._shape = tuple(int(i) for i in np.prod(self.shapes, axis=0)) + forward = DyadicKronStructed.build_polyadic(self.kron_operands) + self._linop = forward._linop + self._adjoint = forward.T + self._dtype = self.kron_operands[0].dtype diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index e2236e845..e4b48104c 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -20,13 +20,14 @@ from pygsti.data import simulate_data from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY +from pygsti.modelpacks import smq2Q_XYZICNOT from pygsti.protocols import gst from pygsti.protocols.protocol import ProtocolData from pygsti.tools import two_delta_logl -GLOBAL_MODEL_IDLE = smq1Q_XYI +GLOBAL_MODEL_IDLE = smq2Q_XYZICNOT def Ls(*args): """ Convert args to a tuple to Labels """ return tuple([L(x) for x in args]) From 421b690d78027b504892a020459e6bffa7ee98b3 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 08:49:59 -0700 Subject: [PATCH 096/141] type annotations, type alias, and slight renaming --- pygsti/circuits/circuit.py | 18 ++++++++++++------ pygsti/layouts/evaltree.py | 24 ++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 557cdbd01..da921c7b1 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -13,10 +13,10 @@ from __future__ import annotations import itertools as _itertools import warnings as _warnings -from typing import List, Sequence, Literal +from typing import List, Sequence, Literal, Tuple, Any, Hashable, Optional import numpy as _np -from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel +from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTupTup as _LabelTupTup from 
pygsti.baseobjs import outcomelabeldict as _ld, _compatibility as _compat from pygsti.tools import internalgates as _itgs @@ -44,6 +44,10 @@ +' unitary from standard_gatename_unitaries which matches up to a global phase.' _warnings.filterwarnings('module', message=msg, category=UserWarning) + +LayerTupLike = tuple[_LabelTupTup, ...] | List[_Label | Sequence[_Label]] | tuple[_Label, ...] + + def _np_to_quil_def_str(name, input_array): """ Write a DEFGATE block for RQC quil for an arbitrary one- or two-qubit unitary gate. @@ -268,9 +272,11 @@ def from_tuple(cls, tup): else: return cls(tup) - def __init__(self, layer_labels=(), line_labels='auto', num_lines=None, editable=False, - stringrep=None, name='', check=True, expand_subcircuits="default", - occurrence=None, compilable_layer_indices=None): + def __init__(self, + layer_labels=(), line_labels: str|Tuple[Any,...] = 'auto', num_lines: Optional[int] = None, + editable=False, stringrep=None, name='', check=True, expand_subcircuits="default", occurrence=None, + compilable_layer_indices=None + ): """ Creates a new Circuit object, encapsulating a quantum circuit. @@ -649,7 +655,7 @@ def occurrence(self, value): self._str = None # regenerate string rep (it may have updated) @property - def layertup(self): + def layertup(self) -> LayerTupLike: """ This Circuit's layers as a standard Python tuple of layer Labels. 
diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 7e605f666..1ea93eb19 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -17,7 +17,7 @@ import numpy as _np -from pygsti.circuits.circuit import Circuit as _Circuit +from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter from pygsti.baseobjs.label import LabelTupTup, Label from pygsti.modelmembers.operations import create_from_superop_mx @@ -33,6 +33,7 @@ import scipy.sparse.linalg as sparla from typing import List + def _walk_subtree(treedict, indx, running_inds): running_inds.add(indx) (iDest, iLeft, iRight) = treedict[indx] @@ -484,9 +485,12 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str|Label = def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], - implicit_idle_gate_name: str|Label = 'I') -> tuple[list[dict[int, int]], - dict[tuple[LabelTupTup], list[tuple[int, int]]], - dict[tuple[int, ...], list[LabelTupTup]]]: + implicit_idle_gate_name: str|Label = 'I' + ) -> tuple[ + dict[int, dict[int, _Circuit]], + dict[LayerTupLike, list[tuple[int, int]]], + dict[tuple[int, ...],list[LayerTupLike]] + ]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. @@ -497,9 +501,9 @@ def setup_circuit_list_for_LCS_computations( # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id. 
- sub_cir_to_cir_id_and_lane_id: dict[tuple[_Circuit], list[tuple[int, int]]] = {} - line_labels_to_circuit_list: dict[tuple[int, ...], set[_Circuit]] = {} cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} + sub_cir_to_cir_id_and_lane_id: dict[LayerTupLike, list[tuple[int, int]]] = {} + line_labels_to_layertup_lists: dict[tuple[int, ...], list[LayerTupLike]] = {} for i, cir in enumerate(circuit_list): @@ -517,10 +521,10 @@ def setup_circuit_list_for_LCS_computations( for j in range(len(sub_cirs)): sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),) lbls = sc._line_labels - if lbls in line_labels_to_circuit_list: - line_labels_to_circuit_list[lbls].append(sc.layertup) + if lbls in line_labels_to_layertup_lists: + line_labels_to_layertup_lists[lbls].append(sc.layertup) else: - line_labels_to_circuit_list[lbls] = [sc.layertup] + line_labels_to_layertup_lists[lbls] = [sc.layertup] if sc.layertup in sub_cir_to_cir_id_and_lane_id: sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j)) else: @@ -530,7 +534,7 @@ def setup_circuit_list_for_LCS_computations( else: cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} - return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_circuit_list + return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_layertup_lists #endregion Split Circuits by lanes helpers From ea494175fdc913fbb51f7e20f86f60b0ad8e6461 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 09:06:34 -0700 Subject: [PATCH 097/141] more typing --- pygsti/circuits/circuit.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index da921c7b1..f83d62a7e 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -13,7 +13,7 @@ from __future__ import annotations import itertools as _itertools import warnings as _warnings -from typing import List, Sequence, Literal, Tuple, Any, 
Hashable, Optional +from typing import List, Sequence, Literal, Tuple, Any, Hashable, Optional, TypeAlias import numpy as _np from pygsti.baseobjs.label import Label as _Label, CircuitLabel as _CircuitLabel, LabelTupTup as _LabelTupTup @@ -45,7 +45,16 @@ _warnings.filterwarnings('module', message=msg, category=UserWarning) -LayerTupLike = tuple[_LabelTupTup, ...] | List[_Label | Sequence[_Label]] | tuple[_Label, ...] +############################################################################################## +# NOTE(Riley): these types are work-in-progress. They don't make a whole lot of sense to me +# right now. It might be possible that they just _DONT_ make sense, and yet they're correct +# in the context of the current implementation. +_NestedLabelSeq = List[_Label | Sequence[_Label]] +# ^ An alias to make it easier to see how subsequent types relate. +# Don't use this in function signatures. +LayerTupLike = Tuple[_LabelTupTup, ...] | _NestedLabelSeq | Tuple[_Label, ...] +LabelsLike = Tuple[_NestedLabelSeq, ...] | _NestedLabelSeq +############################################################################################## def _np_to_quil_def_str(name, input_array): @@ -520,7 +529,7 @@ def _fastinit(cls, labels, line_labels, editable, name='', stringrep=None, occur #Note: If editing _bare_init one should also check _copy_init in case changes must be propagated. def _bare_init(self, labels, line_labels, editable, name='', stringrep=None, occurrence=None, compilable_layer_indices_tup=()): - self._labels : List[_Label | Sequence[_Label]] = labels + self._labels : LabelsLike = labels self._line_labels = tuple(line_labels) self._occurrence_id = occurrence self._compilable_layer_indices_tup = compilable_layer_indices_tup # always a tuple, but can be empty. 
@@ -2469,7 +2478,7 @@ def tensor_circuit(self, circuit: Circuit, line_order=None): return cpy def _cache_tensor_lanes(self, sub_circuit_list: list[_Label], - lane_to_qubits: dict[int, tuple[int, ...]]) -> Circuit: + lane_to_qubits: dict[int, Tuple[int, ...]]) -> Circuit: """ Store the tensor lanes in the circuit if appropriate. Note that this should only be called in the case that the sub_circuit_list From 4cc5ba2b39d136e8ac4309a35a5fc94d285237a0 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 09:42:53 -0700 Subject: [PATCH 098/141] type annotations --- pygsti/layouts/evaltree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 1ea93eb19..eda613581 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -19,7 +19,7 @@ from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter -from pygsti.baseobjs.label import LabelTupTup, Label +from pygsti.baseobjs.label import LabelTupTup, Label, LabelTup from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools @@ -541,16 +541,16 @@ def setup_circuit_list_for_LCS_computations( #region Lane Collapsing Helpers -def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: Label, saved: dict[int | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: """ Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). """ if op.num_qubits == 2: # We may need to do swaps. 
- op_term = 1 + op_term : _np.ndarray = np.array([1.]) if op in saved: op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: + elif op.qubits[1] < op.qubits[0]: # type: ignore # This is in the wrong order. op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) op_term = swap_dense @ (op_term) @ swap_dense From 91d2d96ca90090eb6930f71ab1d2d992e6e8b661 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 10:42:26 -0700 Subject: [PATCH 099/141] resolve left over merge conflict --- pygsti/forwardsims/matrixforwardsim.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 098ac07bc..427f1f5df 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -38,11 +38,8 @@ from pygsti.tools.internalgates import internal_gate_unitaries from pygsti.tools.optools import unitary_to_superop from pygsti.baseobjs.label import LabelTup, LabelTupTup, Label -<<<<<<< Updated upstream from typing import Sequence -======= ->>>>>>> Stashed changes _dummy_profiler = _DummyProfiler() From 6e6dde05006c426266017bceb12365f401a0d1ab Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 11:01:11 -0700 Subject: [PATCH 100/141] tests with smq1Q_XY pass (again ...) 
--- pygsti/forwardsims/matrixforwardsim.py | 9 ++++++--- pygsti/tools/matrixtools.py | 13 ++++++++++--- pygsti/tools/sequencetools.py | 21 ++++++++++++--------- test/unit/objects/test_forwardsim.py | 20 ++++++++++---------- 4 files changed, 38 insertions(+), 25 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 427f1f5df..ed2a24229 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -1256,7 +1256,7 @@ def _probs_from_rho_e(self, rho, e, gs, scale_vals): # vp[i] = sum_k e[0,k] dot(gs, rho)[i,k,0] * scale_vals[i] # vp[i] = dot( e, dot(gs, rho))[0,i,0] * scale_vals[i] # vp = squeeze( dot( e, dot(gs, rho)), axis=(0,2) ) * scale_vals - breakpoint() + # breakpoint() return _np.squeeze(_np.dot(e, _np.dot(gs, rho)), axis=(0, 2)) * scale_vals # shape == (len(circuit_list),) ; may overflow but OK @@ -2230,7 +2230,10 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto # rho, E = self._rho_e_from_spam_tuple(spam_tuple) # _fas(array_to_fill, [element_indices], # self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) - _fas(array_to_fill, [element_indices], self._probs_from_rho_e(sp_val, povm_vals, Gs[tree_indices], 1)) + probs = self._probs_from_rho_e(sp_val, povm_vals, Gs[tree_indices], 1) + if not isinstance(element_indices, list): + element_indices = [element_indices] + _fas(array_to_fill, element_indices, probs) _np.seterr(**old_err) def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): @@ -2239,7 +2242,7 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): """ assert e.ndim == 2 - assert e[0] > 1 + ## WHY ??? assert e[0] > 1 return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. 
return super()._probs_from_rho_e(rho, e, gs, scale_vals) diff --git a/pygsti/tools/matrixtools.py b/pygsti/tools/matrixtools.py index 365ecf62c..a58e2a5ce 100644 --- a/pygsti/tools/matrixtools.py +++ b/pygsti/tools/matrixtools.py @@ -1278,11 +1278,18 @@ def _fas(a, inds, rhs, add=False): # index-list index is fine too. The case we need to # deal with is indexing a multi-dimensional array with # one or more index-lists - if all([isinstance(i, (int, slice)) for i in inds]) or len(inds) == 1: + if len(inds) == 1: if add: - a[inds] += rhs # all integers or slices behave nicely + a[inds] += rhs else: - a[inds] = rhs # all integers or slices behave nicely + a[inds] = rhs + elif all([isinstance(i, (int, slice)) for i in inds]): + assert len(inds) == rhs.shape[1] + for ind, rhs_vec in zip(inds, rhs.T): + if add: + a[ind] += rhs_vec # all integers or slices behave nicely + else: + a[ind] = rhs_vec # all integers or slices behave nicely else: #convert each dimension's index to a list, take a product of # these lists, and flatten the right hand side to get the diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index ff0e8c00f..3b06cd785 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Sequence, Any, List import numpy as _np @@ -40,7 +40,7 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: return table -def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_data_and_sequences, +def conduct_one_round_of_lcs_simplification(sequences: List[Sequence], table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct): @@ -65,7 +65,7 @@ def conduct_one_round_of_lcs_simplification(sequences: list[Sequence], table_dat cache_num = starting_cache_num # Build sequence dict - all_subsequences_to_replace: dict[tuple, dict[int, list[int]]] = {} + all_subsequences_to_replace: dict[tuple, dict[int, List[int]]] = {} 
if _np.max(internal_subtable) >= _np.max(table): # We are only going to replace if this was the longest substring. @@ -189,7 +189,7 @@ def _compute_lcs_for_every_pair_of_sequences(sequences: list): return best_lengths, best_subsequences -def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, list[int]]]: +def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, set[int]]]: """ Compute the longest common subsequence within a single circuit A. @@ -198,11 +198,11 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, Returns: --------- int - length of longest common subsequences within A - dict[tuple, list[int]] - dictionary of subsequences to starting positions within A. + dict[tuple, set[int]] - dictionary of subsequences to starting positions within A. """ n = len(A) best = 0 - best_ind = {} + best_ind : dict[tuple[Any,...], set[int]] = dict() changed = False for w in range(1, int(_np.floor(n / 2) + 1)): for sp in range(n - w): @@ -223,8 +223,11 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, return best, best_ind -def create_tables_for_internal_LCS(sequences: list[Sequence]) -> tuple[_np.ndarray, - list[dict[tuple, list[int]]]]: +def create_tables_for_internal_LCS( + sequences: List[Sequence[Any]] + ) -> tuple[ + _np.ndarray, List[dict[tuple[Any,...], set[int]]] + ]: """ Compute all the longest common internal sequences for each circuit A in sequences @@ -233,7 +236,7 @@ def create_tables_for_internal_LCS(sequences: list[Sequence]) -> tuple[_np.ndarr C = len(sequences) the_table = _np.zeros(C) - seq_table = [[] for _ in range(C)] + seq_table : List[dict[tuple[Any,...], set[int]]] = [dict() for _ in range(C)] curr_best = 1 for i in range(C): diff --git a/test/unit/objects/test_forwardsim.py b/test/unit/objects/test_forwardsim.py index e4b48104c..0a8292528 100644 --- a/test/unit/objects/test_forwardsim.py +++ b/test/unit/objects/test_forwardsim.py @@ -19,15 
+19,15 @@ BaseCase = object from pygsti.data import simulate_data -from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY -from pygsti.modelpacks import smq2Q_XYZICNOT +from pygsti.modelpacks import smq1Q_XYI, smq1Q_XY, smq2Q_XYZICNOT, smq2Q_XYCNOT from pygsti.protocols import gst from pygsti.protocols.protocol import ProtocolData from pygsti.tools import two_delta_logl +GLOBAL_MODEL_PACK = smq1Q_XY + -GLOBAL_MODEL_IDLE = smq2Q_XYZICNOT def Ls(*args): """ Convert args to a tuple to Labels """ return tuple([L(x) for x in args]) @@ -156,8 +156,8 @@ class BaseProtocolData: @classmethod def setUpClass(cls): - cls.gst_design = GLOBAL_MODEL_IDLE.create_gst_experiment_design(max_max_length=16) - cls.mdl_target = GLOBAL_MODEL_IDLE.target_model() + cls.gst_design = GLOBAL_MODEL_PACK.create_gst_experiment_design(max_max_length=16) + cls.mdl_target = GLOBAL_MODEL_PACK.target_model() cls.mdl_datagen = cls.mdl_target.depolarize(op_noise=0.05, spam_noise=0.025) ds = simulate_data(cls.mdl_datagen, cls.gst_design.all_circuits_needing_data, 20000, sample_error='none') @@ -270,15 +270,15 @@ class ForwardSimConsistencyTester(TestCase): def setUp(self): - self.model_ideal = GLOBAL_MODEL_IDLE.target_model() + self.model_ideal = GLOBAL_MODEL_PACK.target_model() if TorchForwardSimulator.ENABLED: # TorchFowardSimulator can only work with TP modelmembers. 
self.model_ideal.convert_members_inplace(to_type='full TP') self.model_noisy = self.model_ideal.depolarize(op_noise=0.05, spam_noise=0.025) - prep_fiducials = GLOBAL_MODEL_IDLE.prep_fiducials() - meas_fiducials = GLOBAL_MODEL_IDLE.meas_fiducials() - germs = GLOBAL_MODEL_IDLE.germs() + prep_fiducials = GLOBAL_MODEL_PACK.prep_fiducials() + meas_fiducials = GLOBAL_MODEL_PACK.meas_fiducials() + germs = GLOBAL_MODEL_PACK.germs() max_lengths = [4] circuits = create_lsgst_circuit_lists( self.model_noisy, prep_fiducials, meas_fiducials, germs, max_lengths @@ -342,7 +342,7 @@ class ForwardSimIntegrationTester(BaseProtocolData): def _run(self, obj : ForwardSimulator.Castable): self.setUpClass() - proto = gst.GateSetTomography(GLOBAL_MODEL_IDLE.target_model("full TP"), name="testGST") + proto = gst.GateSetTomography(GLOBAL_MODEL_PACK.target_model("full TP"), name="testGST") results = proto.run(self.gst_data, simulator=obj) mdl_result = results.estimates["testGST"].models["final iteration estimate"] twoDLogL = two_delta_logl(mdl_result, self.gst_data.dataset) From 7d639bbeff73ea60fcc471db6db78dc1af699fe4 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Thu, 17 Jul 2025 11:21:04 -0700 Subject: [PATCH 101/141] typing and compute a 16-by-16 matrix rather than having it written explicitly in source --- pygsti/layouts/evaltree.py | 38 ++--------------------------------- pygsti/tools/sequencetools.py | 12 ++++++----- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index eda613581..b5124e1a6 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -634,42 +634,8 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.sequence_intro = sequence_intro - self.swap_gate = _np.array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 
0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], - - [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, - 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, - 0.00000000e+00,0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00], - - [ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, - 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00], - - [ 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, -1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,0.00000000e+00, 0.00000000e+00, - 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], - [-1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.23259516e-32, 
0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) - + from pygsti.modelmembers.operations import StaticStandardOp + self.swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense().round(16) def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 3b06cd785..80f00141c 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,4 +1,4 @@ -from typing import Sequence, Any, List +from typing import Sequence, Any, List, Literal, Tuple, MutableSequence import numpy as _np @@ -40,7 +40,7 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: return table -def conduct_one_round_of_lcs_simplification(sequences: List[Sequence], table_data_and_sequences, +def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSequence[Any]], table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, cache_struct): @@ -129,7 +129,9 @@ def conduct_one_round_of_lcs_simplification(sequences: List[Sequence], table_dat return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round -def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, int, int]: +def _find_starting_positions_using_dp_table( + dp_table: _np.ndarray + ) -> tuple[int, int, int] | Tuple[None, None, None]: """ Finds the starting positions for the longest common subsequence. 
@@ -161,7 +163,7 @@ def _find_starting_positions_using_dp_table(dp_table: _np.ndarray) -> tuple[int, return None, None, None -def _compute_lcs_for_every_pair_of_sequences(sequences: list): +def _compute_lcs_for_every_pair_of_sequences(sequences: MutableSequence[Any]): """ Computes the LCS for every pair of sequences A,B in sequences """ @@ -224,7 +226,7 @@ def _longest_common_internal_subsequence(A: Sequence) -> tuple[int, dict[tuple, def create_tables_for_internal_LCS( - sequences: List[Sequence[Any]] + sequences: Sequence[Sequence[Any]] ) -> tuple[ _np.ndarray, List[dict[tuple[Any,...], set[int]]] ]: From 3a413ca751214f217172ab1887d0af20e78c5e77 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 17 Jul 2025 16:10:50 -0700 Subject: [PATCH 102/141] Error out if we encounter an ExplicitOp Model. --- pygsti/layouts/evaltree.py | 7 +++---- pygsti/layouts/matrixlayout.py | 9 +++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index b5124e1a6..e9957b32b 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -27,11 +27,10 @@ from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time -from typing import Iterable import numpy as np import scipy.linalg as la import scipy.sparse.linalg as sparla -from typing import List +from typing import List, Optional, Iterable def _walk_subtree(treedict, indx, running_inds): @@ -855,9 +854,9 @@ def collapse_circuits_to_process_matrices(self, model): self.saved_results[key] = out1 self.sub_cir_to_ind_in_results[key] = out2 - def reconstruct_full_matrices(self): + def reconstruct_full_matrices(self) -> Optional[List[KronStructured]]: """ - Construct a + Construct a tensor product structure for each individual circuit """ if len(self.saved_results) == 0: diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 
4604c6023..880e96f05 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -508,6 +508,15 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p num_param_dimension_processors=(), param_dimensions=(), param_dimension_blk_sizes=(), resource_alloc=None, verbosity=0, layout_creation_circuit_cache = None, use_old_tree_style: bool = True): + + if not use_old_tree_style: + # NOTE: ERrror out if we are useing new tree and have an explicit op model. Explain why this is bad. + from pygsti.models import ExplicitOpModel, ImplicitOpModel + if isinstance(model, ExplicitOpModel): + raise ValueError(f"Model: {model.__class__} does not support creation of embedded op process matrices." + + "One needs to be able to create the smallest representation possible a 4x4 matrix for a gate acting on a single qubit. In the case of a two qubit system (Gxpi2, 1) in the model could return a 16x16 matrix." + + "This indicates that it was actually acting on the full two qubit system. We assume in the lane splitting algorithm, that the label 1 indicates it will only act on a single qubit specifically qubit 1." + + f"To remedy this situation please convert the model to a subclass of {ImplicitOpModel}.") #OUTDATED: TODO - revise this: # 1. pre-process => get complete circuits => spam-tuples list for each no-spam circuit (no expanding yet) From 773184ecdea99dfc3474cd0b4b8eeddffc046499 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 17 Jul 2025 18:54:45 -0700 Subject: [PATCH 103/141] Use the tensor product matrix multiplication trick. 
--- pygsti/forwardsims/matrixforwardsim.py | 4 +++ pygsti/layouts/evaltree.py | 40 ++++---------------------- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index ed2a24229..29a0b21b9 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2243,6 +2243,10 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): assert e.ndim == 2 ## WHY ??? assert e[0] > 1 + out = _np.zeros((len(gs), len(e))) + for i in range(len(gs)): + out[i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) + return out return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. return super()._probs_from_rho_e(rho, e, gs, scale_vals) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index e9957b32b..fee6a3659 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -560,16 +560,6 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) -def combine_two_gates(cumulative_term, next_dense_matrix): - """ - Note that the visual representation was - - State Prep | CumulativeTerm | NextDense | Measure - - which in matrix multiplication requires Measure @ (NextDense @ Cumulative) @ State Prep. - """ - return next_dense_matrix @ cumulative_term - def matrix_matrix_cost_estimate(matrix_size: tuple[int, int]) -> int: """ Estimate cost of A @ B when both are square and dense. @@ -715,7 +705,7 @@ def handle_results_cache_lookup_and_product(self, if cumulative_term is None: # look up result. 
return results_cache[term_to_extend_with] - return combine_two_gates(cumulative_term, results_cache[term_to_extend_with]) + return results_cache[term_to_extend_with] @ cumulative_term def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, @@ -729,8 +719,7 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ - - if isinstance(term_to_extend_with, int): + if term_to_extend_with in results_cache: return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) else: @@ -762,9 +751,11 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, qubits_available = qubits_available[len(gatekey):] + results_cache[term_to_extend_with] = val if cumulative_term is None: return val - return combine_two_gates(cumulative_term, val) + # Cache if off. + return results_cache[term_to_extend_with] @ cumulative_term def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: @@ -885,26 +876,7 @@ def reconstruct_full_matrices(self) -> Optional[List[KronStructured]]: breakpoint() ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - output.append(lane_circuits) - - # Need a map from lane id to computed location. 
- for icir in range(num_cirs): - - order, _cost_estimate = self.cir_id_to_tensor_order[icir] - - - while order: - sp = order[0] - output[icir][sp] = _np.kron(output[icir][sp], output[icir][sp+1]) - output[icir][sp+1:] = output[icir][sp+2:] - - # Adjust future indices - tmp = [] - for new_val in order[1:]: - tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) - order = tmp - - output[icir] = output[icir][0] + output.append(KronStructured(lane_circuits)) return output def compute_tensor_orders(self): From c8c3bd0775181bbf21e6321e2e9544b5852f49f4 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 18 Jul 2025 11:17:15 -0700 Subject: [PATCH 104/141] Adding more typechecking --- pygsti/forwardsims/forwardsim.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pygsti/forwardsims/forwardsim.py b/pygsti/forwardsims/forwardsim.py index 85d43899e..5af17fb9a 100644 --- a/pygsti/forwardsims/forwardsim.py +++ b/pygsti/forwardsims/forwardsim.py @@ -21,7 +21,10 @@ from pygsti.baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation from pygsti.baseobjs.nicelyserializable import NicelySerializable as _NicelySerializable from pygsti.tools import slicetools as _slct -from typing import Union, Callable, Literal +from typing import Union, Callable, Literal, TYPE_CHECKING + +if TYPE_CHECKING: + from pygsti.models.model import OpModel class ForwardSimulator(_NicelySerializable): @@ -96,7 +99,7 @@ def _array_types_for_method(cls, method_name): return ('ep', 'ep') + cls._array_types_for_method('_bulk_fill_dprobs_block') return () - def __init__(self, model=None): + def __init__(self, model: OpModel=None): super().__init__() #self.dim = model.dim self.model = model @@ -128,11 +131,11 @@ def __getstate__(self): return state_dict @property - def model(self): + def model(self) -> OpModel: return self._model @model.setter - def model(self, val): + def model(self, val: OpModel): self._model = val try: evotype = None if val is 
None else self._model.evotype From f4b989cc99859a4c16526a80061e436b585436d4 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 18 Jul 2025 14:31:37 -0700 Subject: [PATCH 105/141] Debugging setting a single parameter directly. --- pygsti/forwardsims/mapforwardsim.py | 4 +- .../forwardsims/mapforwardsim_calc_generic.py | 2 + pygsti/forwardsims/matrixforwardsim.py | 40 ++++++++++++++----- pygsti/models/model.py | 4 +- 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/pygsti/forwardsims/mapforwardsim.py b/pygsti/forwardsims/mapforwardsim.py index 7d59fb9ee..adb876ed3 100644 --- a/pygsti/forwardsims/mapforwardsim.py +++ b/pygsti/forwardsims/mapforwardsim.py @@ -360,7 +360,7 @@ def create_copa_layout_circuit_cache(circuits, model, dataset=None): def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim) + # resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim) start_time = time.time() self.calclib.mapfill_probs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), # all indices layout_atom, resource_alloc) @@ -369,7 +369,7 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom, resource_alloc): def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc): # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller - resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice)) + # resource_alloc.check_can_allocate_memory(layout_atom.cache_size * self.model.dim * _slct.length(param_slice)) self.calclib.mapfill_dprobs_atom(self, array_to_fill, slice(0, array_to_fill.shape[0]), dest_param_slice, layout_atom, param_slice, resource_alloc, self.derivative_eps) diff --git 
a/pygsti/forwardsims/mapforwardsim_calc_generic.py b/pygsti/forwardsims/mapforwardsim_calc_generic.py index 89e39087d..08b104721 100644 --- a/pygsti/forwardsims/mapforwardsim_calc_generic.py +++ b/pygsti/forwardsims/mapforwardsim_calc_generic.py @@ -184,6 +184,8 @@ def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices, la iFinal = iParamToFinal[param_indices[i]] fwdsim.model.set_parameter_values([param_indices[i-1], param_indices[i]], [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) + vec = fwdsim.model.to_vector() + assert _np.allclose(_np.where(vec != 0), [i]) #mapfill_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, resource_alloc) cond_update_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, param_indices[i], resource_alloc) #assert _np.linalg.norm(probs2_test-probs2) < 1e-10 diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 29a0b21b9..fa68d8e82 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2231,9 +2231,12 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto # _fas(array_to_fill, [element_indices], # self._probs_from_rho_e(rho, E, Gs[tree_indices], 1)) probs = self._probs_from_rho_e(sp_val, povm_vals, Gs[tree_indices], 1) - if not isinstance(element_indices, list): - element_indices = [element_indices] - _fas(array_to_fill, element_indices, probs) + # if not isinstance(element_indices, list): + # element_indices = [element_indices] + # collapse element list + + # _fas(array_to_fill, element_indices, probs) + array_to_fill[:] = probs _np.seterr(**old_err) def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): @@ -2246,6 +2249,8 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): out = _np.zeros((len(gs), len(e))) for i in range(len(gs)): out[i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) + out = out.reshape((len(gs)*len(e)), order="C") + 
print(out) return out return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. @@ -2284,6 +2289,7 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ eps = 1e-7 # hardcoded? + avoiding_repeated_dividing_eps = 1 / eps if param_slice is None: param_slice = slice(0, self.model.num_params) param_indices = _slct.to_array(param_slice) @@ -2300,13 +2306,27 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ probs2 = _np.empty(layout_atom.num_elements, 'd') orig_vec = self.model.to_vector().copy() - for i in range(self.model.num_params): - if i in iParamToFinal: - iFinal = iParamToFinal[i] - vec = orig_vec.copy(); vec[i] += eps - self.model.from_vector(vec, close=True) - self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + if len(param_indices)>0: + probs2[:] = probs[:] # Could recompute only some of the tree. + first_param_idx = param_indices[0] + iFinal = iParamToFinal[first_param_idx] + self.model.set_parameter_value(first_param_idx, orig_vec[first_param_idx]+eps) + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + for i in range(1, len(param_indices)): + probs2[:] = probs[:] # Could recompute only some of the tree. 
+ + iFinal = iParamToFinal[param_indices[i]] + breakpoint() + self.model.set_parameter_values([param_indices[i-1], param_indices[i]], + [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) + vec = self.model.to_vector() + assert _np.allclose(_np.where(vec != 0), [i]) + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + array_to_fill[:, iFinal] = (probs2 - probs) / eps + print(iFinal) + # array_to_fill = array_to_fill / eps # Divide once def create_layout(self, circuits : Sequence[_Circuit] | _CircuitList, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): # replace implicit idles. diff --git a/pygsti/models/model.py b/pygsti/models/model.py index 82db066c2..bce3c77fa 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -458,7 +458,7 @@ class OpModel(Model): """ #Whether to perform extra parameter-vector integrity checks - _pcheck = False + _pcheck = True #Experimental: whether to call .from_vector on operation *cache* elements as part of model.from_vector call _call_fromvector_on_cache = True @@ -712,7 +712,7 @@ def _iter_parameterized_objs(self): #return # default is to have no parameterized objects #TODO: Make this work with param interposers. - def _check_paramvec(self, debug=False): + def _check_paramvec(self, debug=True): if debug: print("---- Model._check_paramvec ----") TOL = 1e-8 From e43835150c5929338502cc65144416ba99da6bda Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Mon, 21 Jul 2025 17:09:37 -0700 Subject: [PATCH 106/141] Update flop cost. 
--- pygsti/layouts/evaltree.py | 67 +++++++++++++++++++++++++++++++++- pygsti/layouts/matrixlayout.py | 10 ++--- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index fee6a3659..0c728981d 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -31,6 +31,7 @@ import scipy.linalg as la import scipy.sparse.linalg as sparla from typing import List, Optional, Iterable +from pygsti.tools.tqdm import our_tqdm def _walk_subtree(treedict, indx, running_inds): @@ -616,6 +617,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) + print("Num rounds remaining ", max_rounds) self.cache = cache self.num_circuits = C @@ -802,7 +804,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.line_lbls_to_cir_list = line_lbls_to_circuit_list starttime = time.time() - for key, vals in line_lbls_to_circuit_list.items(): + for key, vals in our_tqdm(line_lbls_to_circuit_list.items(), " Building Longest Common Substring Caches"): sub_cirs = [] for cir in vals: sub_cirs.append(list(cir)) @@ -879,6 +881,36 @@ def reconstruct_full_matrices(self) -> Optional[List[KronStructured]]: output.append(KronStructured(lane_circuits)) return output + def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False): + + + cost_collapse = 0 + for key in self.trees: + num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. 
+ tree = self.trees[key] + cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits])) + + + tensor_cost = 0 + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + + for cir_id in range(num_cirs): + qubit_list = () + for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): + subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] + qubit_list = (*qubit_list, len(subcir._line_labels)) + qubit_list = list(qubit_list) + total_num = np.sum(qubit_list) + + tensor_cost += cost_to_compute_tensor_matvec_without_reordering(qubit_list, total_num) + + if return_collapse: + return tensor_cost + cost_collapse, cost_collapse + elif return_tensor_matvec: + return tensor_cost + cost_collapse, tensor_cost + + return tensor_cost + cost_collapse + def compute_tensor_orders(self): num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) @@ -1236,3 +1268,36 @@ def __init__(self, kron_operands): self._linop = forward._linop self._adjoint = forward.T self._dtype = self.kron_operands[0].dtype + + +def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int): + + assert np.sum(qubit_list) == total_num_qubits + + if len(qubit_list) == 1: + # Basic matvec. + + cost = 2 * (4**qubit_list[0]**2) + return cost + + elif len(qubit_list) == 2: + + # vec((A \tensor B) u) = vec(B U A.T) + + term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. + + term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T + + return term1 + term2 + + else: + + # Just pop off the last term + + # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... 
\tensor B_2) U (B_1).T + + right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) * 4**(np.sum(qubit_list[1:])) + + left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], total_num_qubits - qubit_list[0]) * 4**(qubit_list[0]) + + return left + right \ No newline at end of file diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 880e96f05..09bcb346d 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -388,12 +388,12 @@ def add_expanded_circuits(indices, add_to_this_dict): self.elindex_outcome_tuples = elindex_outcome_tuples - print("Flop cost to evaluate the tree once: ", self.tree._flop_estimate_to_collapse_to_each_circuit_to_process_matrix()[0]) + print("Flop cost to evaluate the tree once: ", self.tree.flop_estimate()) - num_circs = len(cir_ind_and_lane_id_to_sub_cir) - num_rho_and_em = len(self.indices_by_spamtuple.keys()) - num_qubits_in_circuit = unique_circuits[0].num_lines - print("Flop cost for : ", (2*(4**num_qubits_in_circuit)**2)*num_circs*num_rho_and_em) + # num_circs = len(cir_ind_and_lane_id_to_sub_cir) + # num_rho_and_em = len(self.indices_by_spamtuple.keys()) + # num_qubits_in_circuit = unique_circuits[0].num_lines + # print("Flop cost for : ", (2*(4**num_qubits_in_circuit)**2)*num_circs*num_rho_and_em) super().__init__(element_slice, num_elements) From 0fcf1d87833ba51acce8d694def309a1cba315cd Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 22 Jul 2025 13:26:32 -0700 Subject: [PATCH 107/141] Update after talking with Corey and Erik --- pygsti/forwardsims/matrixforwardsim.py | 30 ++-- pygsti/models/model.py | 7 +- .../test_forwardsim_on_implicitop_model.py | 139 +++++++++++++----- 3 files changed, 121 insertions(+), 55 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index fa68d8e82..2c436a868 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ 
b/pygsti/forwardsims/matrixforwardsim.py @@ -2306,23 +2306,25 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ probs2 = _np.empty(layout_atom.num_elements, 'd') orig_vec = self.model.to_vector().copy() - if len(param_indices)>0: - probs2[:] = probs[:] # Could recompute only some of the tree. - first_param_idx = param_indices[0] - iFinal = iParamToFinal[first_param_idx] - self.model.set_parameter_value(first_param_idx, orig_vec[first_param_idx]+eps) - self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + # if len(param_indices)>0: + # probs2[:] = probs[:] # Could recompute only some of the tree. + # first_param_idx = param_indices[0] + # iFinal = iParamToFinal[first_param_idx] + # self.model.set_parameter_value(first_param_idx, orig_vec[first_param_idx]+eps) + # self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + # array_to_fill[:, iFinal] = (probs2 - probs) / eps - for i in range(1, len(param_indices)): - probs2[:] = probs[:] # Could recompute only some of the tree. + for i in range(len(param_indices)): + # probs2[:] = probs[:] # Could recompute only some of the tree. 
iFinal = iParamToFinal[param_indices[i]] - breakpoint() - self.model.set_parameter_values([param_indices[i-1], param_indices[i]], - [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) - vec = self.model.to_vector() - assert _np.allclose(_np.where(vec != 0), [i]) + # self.model.set_parameter_values([param_indices[i-1], param_indices[i]], + # [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) + vec = orig_vec.copy() + vec[param_indices[i]] += eps + self.model.from_vector(vec) + # vec = self.model.to_vector() + # assert _np.allclose(_np.where(vec != 0), [i]) self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) array_to_fill[:, iFinal] = (probs2 - probs) / eps print(iFinal) diff --git a/pygsti/models/model.py b/pygsti/models/model.py index bce3c77fa..ef0c6dc6a 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -458,7 +458,7 @@ class OpModel(Model): """ #Whether to perform extra parameter-vector integrity checks - _pcheck = True + _pcheck = False #Experimental: whether to call .from_vector on operation *cache* elements as part of model.from_vector call _call_fromvector_on_cache = True @@ -1007,7 +1007,7 @@ def _rebuild_paramvec(self): # bounds to "ops" bounds like we do the parameter vector. Need something like: #wb = self._model_parambouds_to_ops_parambounds(self._param_bounds) \ # if (self._param_bounds is not None) else _default_param_bounds(Np) - debug = False + debug = True if debug: print("DEBUG: rebuilding model %s..." 
% str(id(self))) # Step 1: add parameters that don't exist yet @@ -1033,8 +1033,9 @@ def _rebuild_paramvec(self): insertion_point = max_index_processed_so_far + 1 if num_new_params > 0: # If so, before allocating anything, make the necessary space in the parameter arrays: + memo = set() for _, o in self._iter_parameterized_objs(): - o.shift_gpindices(insertion_point, num_new_params, self) + o.shift_gpindices(insertion_point, num_new_params, self, memo) w = _np.insert(w, insertion_point, _np.empty(num_new_params, 'd')) wl = _np.insert(wl, insertion_point, _np.empty(num_new_params, dtype=object)) wb = _np.insert(wb, insertion_point, _default_param_bounds(num_new_params), axis=0) diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index cb91d3c07..13a183108 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -1,10 +1,23 @@ import numpy as np +import scipy.linalg as la from pygsti.baseobjs import qubitgraph as _qgraph from pygsti.baseobjs import QubitSpace from pygsti.models import modelconstruction as pgmc from pygsti.processors import QubitProcessorSpec +from pygsti.modelmembers.states import ComposedState, ComputationalBasisState +from pygsti.modelmembers.povms import ComposedPOVM +from pygsti.modelmembers.operations import ComposedOp, LindbladErrorgen, ExpErrorgenOp +# from pygsti.tools.lindbladtools import random_error_generator_rates +from pygsti.baseobjs.errorgenbasis import CompleteElementaryErrorgenBasis from pygsti.circuits import Circuit + + +from pygsti.baseobjs import qubitgraph as _qgraph +from pygsti.processors import QubitProcessorSpec + +# Nick's additions to make it work. 
+from pygsti.algorithms import BuiltinBasis from pygsti.tools import unitary_to_superop from pygsti.baseobjs import Label @@ -13,6 +26,7 @@ from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator + def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: Circuit): for key, val in op_dict.items(): @@ -59,57 +73,60 @@ def create_object(self, args=None, sslbls=None): def make_spam(num_qubits): state_space = QubitSpace(num_qubits) max_weights = {'H':1, 'S':1, 'C':1, 'A':1} - # egbn_H_only = CompleteElementaryErrorgenBasis('PP', state_space, ('H',), max_weights) - - # rho_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} - # rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') - # rho_errorgen = ExpErrorgenOp(rho_lindblad) - # rho_ideal = ComputationalBasisState([0]*num_qubits) - # rho = ComposedState(rho_ideal, rho_errorgen) - - # M_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} - # M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') - # M_errorgen = ExpErrorgenOp(M_lindblad) - # M = ComposedPOVM(M_errorgen) - M = None - return None, M - -def make_target_model(num_qubits): + egbn_H_only = CompleteElementaryErrorgenBasis(BuiltinBasis("PP", 4), state_space, ('H',), max_weights) + + rho_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} + rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + rho_errorgen = ExpErrorgenOp(rho_lindblad) + rho_ideal = ComputationalBasisState([0]*num_qubits) + rho = ComposedState(rho_ideal, rho_errorgen) + + M_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} + M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + 
M_errorgen = ExpErrorgenOp(M_lindblad) + M = ComposedPOVM(M_errorgen) + + return rho, M + + +def make_target_model(num_qubits, independent_gates: bool = True, arbitrary_unit: bool = False): ps_geometry = _qgraph.QubitGraph.common_graph( num_qubits, geometry='line', directed=True, all_directions=True, qubit_labels=tuple(range(num_qubits)) ) u_ecr = 1/np.sqrt(2)*np.array([[0,0,1,1j],[0,0,1j,1],[1,-1j,0,0],[-1j,1,0,0]]) - gatenames = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"] + gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr'] ps = QubitProcessorSpec( num_qubits=num_qubits, gate_names=gatenames, - # nonstd_gate_unitaries={"Gcustom": MyContinuouslyParameterizedGateFunction(), - # 'Gecr': u_ecr, 'Gii': np.eye(4)}, nonstd_gate_unitaries={'Gecr': u_ecr, 'Gii': np.eye(4)}, geometry=ps_geometry ) - # gateerrs = dict() - # egb1 = CompleteElementaryErrorgenBasis('PP', QubitSpace(1), ('H','S'), default_label_type='local') - # for gn in gatenames[:-1]: - # gateerrs[gn] = {ell: 0 for ell in egb1.labels} - # egb2 = CompleteElementaryErrorgenBasis('PP', QubitSpace(2), ('H','S'), default_label_type='local') - # gateerrs['Gecr'] = {ell: 0 for ell in egb2.labels} - # gateerrs['Gii'] = gateerrs['Gecr'] - # tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs) - - tmn = pgmc.create_crosstalk_free_model(ps, implicit_idle_mode="pad_1Q", independent_gates=True) + gateerrs = dict() + basis = BuiltinBasis("PP", QubitSpace(1)) + egb1 = CompleteElementaryErrorgenBasis(basis, QubitSpace(1), ('H','S')) # XXXX From Riley's code, default_label_type='local') + for gn in gatenames[:-1]: + gateerrs[gn] = {ell: 0 for ell in egb1.labels} + egb2 = CompleteElementaryErrorgenBasis(basis, QubitSpace(2), ('H','S')) # XXXX From Riley's code, default_label_type='local') + gateerrs['Gecr'] = {ell: 0 for ell in egb2.labels} + gateerrs['Gii'] = gateerrs['Gecr'] + + tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs, 
independent_gates=independent_gates) #, + # ideal_spam_type="CPTPLND") + + rho, M = make_spam(num_qubits) + tmn.prep_blks['layers']['rho0'] = rho + tmn.povm_blks['layers']['Mdefault'] = M + # tmn._mark_for_rebuild() + tmn._rebuild_paramvec() - # rho, M = make_spam(num_qubits) - # tmn.prep_blks['layers']['rho0'] = rho - # tmn.povm_blks['layers']['Mdefault'] = M - # tmn._rebuild_paramvec() + # tmn._layer_rules.implicit_idle_mode = "pad_1Q" - - for i in range(num_qubits): - Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i,)) - tmn.factories["layers"][("Gcustom", i)] = Ga_factory # add in the factory for every qubit. + if arbitrary_unit: + for i in range(num_qubits): + Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i,)) + tmn.factories["layers"][("Gcustom", i)] = Ga_factory # add in the factory for every qubit. return tmn @@ -240,7 +257,7 @@ def build_circuit_with_multiple_qubit_gates(num_qubits: int, depth_L: int, gates #endregion Building Random Circuits -#region Consistency of Pro +#region Consistency of Probability def test_tensor_product_single_unitaries_yield_right_results(): num_qubits = 4 @@ -369,3 +386,49 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes(): assert_probability_densities_are_equal(probs, exp, circuit) #endregion Probabilities Consistency tests + + +#region D Probabilities Consistency Tests + +def test_tensor_product_two_qubit_gates_dprobs(): + + num_qubits = 4 + + under_test, expected_model = build_models_for_testing(num_qubits) + + + circuitECR01 = Circuit([[("Gecr", 0,1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1,0), ("Gi", 2), ("Gzpi2", 3)]]) + + for cir in [circuitECR01, circuitECR10]: + probs = under_test.sim.dprobs(cir) + exp = expected_model.sim.dprobs(cir) + + assert_probability_densities_are_equal(probs, exp, cir) + + +def test_tensor_product_single_unitaries_yield_right_results_dprobs(): + + import importlib as _importlib 
+ from pygsti.forwardsims import MapForwardSimulator + num_qubits = 2 + + under_test, expected_model = build_models_for_testing(num_qubits) + + # circuitNone = Circuit([], num_lines=num_qubits) + circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) + + circuits = [circuitX] + for cir in circuits: + under_test.sim = MapForwardSimulator() + probs = under_test.sim.dprobs(cir) + expected_model.sim.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_generic") + + exp = expected_model.sim.dprobs(cir) + + assert_probability_densities_are_equal(probs, exp, cir) + +test_tensor_product_single_unitaries_yield_right_results_dprobs() \ No newline at end of file From eb9d122ae140e2c301304950492c166bd828e5e9 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 22 Jul 2025 16:05:43 -0700 Subject: [PATCH 108/141] Add dprobs test cases. --- pygsti/models/model.py | 2 +- .../test_forwardsim_on_implicitop_model.py | 72 +++++++++++++++++-- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/pygsti/models/model.py b/pygsti/models/model.py index ef0c6dc6a..1243080e0 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -1007,7 +1007,7 @@ def _rebuild_paramvec(self): # bounds to "ops" bounds like we do the parameter vector. Need something like: #wb = self._model_parambouds_to_ops_parambounds(self._param_bounds) \ # if (self._param_bounds is not None) else _default_param_bounds(Np) - debug = True + debug = False if debug: print("DEBUG: rebuilding model %s..." 
% str(id(self))) # Step 1: add parameters that don't exist yet diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index 13a183108..e3df08ed8 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -96,7 +96,7 @@ def make_target_model(num_qubits, independent_gates: bool = True, arbitrary_unit qubit_labels=tuple(range(num_qubits)) ) u_ecr = 1/np.sqrt(2)*np.array([[0,0,1,1j],[0,0,1j,1],[1,-1j,0,0],[-1j,1,0,0]]) - gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr'] + gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"] ps = QubitProcessorSpec( num_qubits=num_qubits, gate_names=gatenames, @@ -415,15 +415,14 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): under_test, expected_model = build_models_for_testing(num_qubits) - # circuitNone = Circuit([], num_lines=num_qubits) + circuitNone = Circuit([], num_lines=num_qubits) circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) - circuits = [circuitX] + circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle] for cir in circuits: - under_test.sim = MapForwardSimulator() probs = under_test.sim.dprobs(cir) expected_model.sim.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_generic") @@ -431,4 +430,67 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): assert_probability_densities_are_equal(probs, exp, cir) -test_tensor_product_single_unitaries_yield_right_results_dprobs() \ No newline at end of file +def test_tensor_product_single_unitaries_random_collection_of_xyz_dprobs(): + + 
for qb in range(2, 6): + + under_test, expected_model = build_models_for_testing(qb) + allowed_gates = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi'] + + circuit100 = build_circuit(qb, 100, allowed_gates=allowed_gates) + + probs = under_test.sim.dprobs(circuit100) + exp = expected_model.sim.dprobs(circuit100) + + assert_probability_densities_are_equal(probs, exp, circuit100) + +def test_tensor_product_gates_with_implicit_idles_dprobs(): + + num_qubits = 5 + + under_test, expected_model = build_models_for_testing(num_qubits) + + gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"] + for gate in gatenames: + for i in range(num_qubits): + cir = Circuit([[(gate, i)]], num_lines=num_qubits) + + probs = under_test.sim.dprobs(cir) + exp = expected_model.sim.dprobs(cir) + assert_probability_densities_are_equal(probs, exp, cir) + + # Now for the two qubit gates. Gecr and GCNOT + + gatenames = ["Gecr", "Gcnot"] + # gatenames = ["Gecr"] + for gate in gatenames: + for i in range(num_qubits - 1): + cir = Circuit([[(gate, i, i+1)]], num_lines=num_qubits) + + probs = under_test.sim.dprobs(cir) + exp = expected_model.sim.dprobs(cir) + assert_probability_densities_are_equal(probs, exp, cir) + + # Order swapped. 
+ cir = Circuit([[(gate, i+1, i)]], num_lines=num_qubits) + + probs = under_test.sim.dprobs(cir) + exp = expected_model.sim.dprobs(cir) + assert_probability_densities_are_equal(probs, exp, cir) + + +def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): + + gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} + for qb in range(5, 6): + + lanes = [1, 2, 4] + + under_test, expected_model = build_models_for_testing(qb) + + circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, 100, lanes, gates_to_qubits_used=gates_to_used_qubits) + + probs = under_test.sim.dprobs(circuit) + exp = expected_model.sim.dprobs(circuit) + + assert_probability_densities_are_equal(probs, exp, circuit) \ No newline at end of file From 80ecba9eee8e0cd7cc0f29599dff6273b0cec36d Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 22 Jul 2025 16:58:58 -0700 Subject: [PATCH 109/141] Fix formatting issues --- .../test_forwardsim_on_implicitop_model.py | 211 +++++++++++------- 1 file changed, 126 insertions(+), 85 deletions(-) diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index e3df08ed8..cb74cec35 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -1,5 +1,4 @@ import numpy as np -import scipy.linalg as la from pygsti.baseobjs import qubitgraph as _qgraph from pygsti.baseobjs import QubitSpace @@ -7,24 +6,16 @@ from pygsti.processors import QubitProcessorSpec from pygsti.modelmembers.states import ComposedState, ComputationalBasisState from pygsti.modelmembers.povms import ComposedPOVM -from pygsti.modelmembers.operations import ComposedOp, LindbladErrorgen, ExpErrorgenOp -# from pygsti.tools.lindbladtools import random_error_generator_rates +from pygsti.modelmembers.operations import LindbladErrorgen, ExpErrorgenOp from 
pygsti.baseobjs.errorgenbasis import CompleteElementaryErrorgenBasis from pygsti.circuits import Circuit - - -from pygsti.baseobjs import qubitgraph as _qgraph -from pygsti.processors import QubitProcessorSpec - -# Nick's additions to make it work. from pygsti.algorithms import BuiltinBasis from pygsti.tools import unitary_to_superop - from pygsti.baseobjs import Label from pygsti.modelmembers import operations as op from pygsti.baseobjs import UnitaryGateFunction - from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator +from pygsti.forwardsims import MapForwardSimulator def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: Circuit): @@ -33,70 +24,90 @@ def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: C assert key in exp_dict assert np.allclose(exp_dict[key], val), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" + #region Model Construction def construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta): - first_term = np.exp(alpha*1j) - left_mat = np.array([np.exp(-1j*beta/2), 0, 0, np.exp(1j*beta/2)]).reshape(2,2) - rotate_mat = np.array([np.cos(gamma/2), -np.sin(gamma/2), np.sin(gamma/2), np.cos(gamma/2)]).reshape(2,2) - right_mat = np.array([np.exp(-1j*delta/2), 0, 0, np.exp(1j*delta/2)]).reshape(2,2) + first_term = np.exp(alpha * 1j) + left_mat = np.array([np.exp(-1j * beta / 2), 0, 0, np.exp(1j * beta / 2)]).reshape(2, 2) + rotate_mat = np.array([np.cos(gamma / 2), -np.sin(gamma / 2), np.sin(gamma / 2), np.cos(gamma / 2)]).reshape(2, 2) + right_mat = np.array([np.exp(-1j * delta / 2), 0, 0, np.exp(1j * delta / 2)]).reshape(2, 2) my_matrix = first_term * (left_mat @ (rotate_mat @ right_mat)) - assert np.allclose(np.conjugate( my_matrix.T) @ my_matrix, np.eye(2)) + assert np.allclose(np.conjugate(my_matrix.T) @ my_matrix, np.eye(2)) return my_matrix class MyContinuouslyParameterizedGateFunction(UnitaryGateFunction): shape = (2, 2) - def 
__call__(self, alpha, beta, gamma, delta): + + def __call__(self, alpha, beta, gamma, delta): return construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta) class ArbParameterizedOpFactory(op.OpFactory): - def __init__(self, state_space, location:int): + def __init__(self, state_space, location: int): op.OpFactory.__init__(self, state_space=state_space, evotype="densitymx") self.my_state_space = state_space self.interesting_qubit = location def create_object(self, args=None, sslbls=None): - # Note: don't worry about sslbls (unused) -- this argument allow factories to create different operations on different target qubits - assert(len(args) == 4) + assert (len(args) == 4) alpha, beta, gamma, delta = args - + unitary = construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta) - + superop = unitary_to_superop(unitary) - return op.EmbeddedOp(state_space = self.my_state_space, target_labels=self.interesting_qubit, operation_to_embed=op.StaticArbitraryOp(superop)) + return op.EmbeddedOp(state_space=self.my_state_space, + target_labels=self.interesting_qubit, + operation_to_embed=op.StaticArbitraryOp(superop)) + def make_spam(num_qubits): state_space = QubitSpace(num_qubits) - max_weights = {'H':1, 'S':1, 'C':1, 'A':1} - egbn_H_only = CompleteElementaryErrorgenBasis(BuiltinBasis("PP", 4), state_space, ('H',), max_weights) + max_weights = {'H': 1, 'S': 1, 'C': 1, 'A': 1} + egbn_H_only = CompleteElementaryErrorgenBasis(BuiltinBasis("PP", 4), state_space, ('H', ), max_weights) rho_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} - rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + rho_lindblad = LindbladErrorgen.from_elementary_errorgens(rho_errgen_rates, + parameterization='H', + state_space=state_space, + evotype='densitymx') rho_errorgen = ExpErrorgenOp(rho_lindblad) - rho_ideal = ComputationalBasisState([0]*num_qubits) - rho = 
ComposedState(rho_ideal, rho_errorgen) + rho_ideal = ComputationalBasisState([0] * num_qubits) + rho = ComposedState(rho_ideal, rho_errorgen) M_errgen_rates = {ell: 0.0 for ell in egbn_H_only.labels} - M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates, parameterization='H', state_space=state_space, evotype='densitymx') + M_lindblad = LindbladErrorgen.from_elementary_errorgens(M_errgen_rates, + parameterization='H', + state_space=state_space, + evotype='densitymx') M_errorgen = ExpErrorgenOp(M_lindblad) M = ComposedPOVM(M_errorgen) return rho, M -def make_target_model(num_qubits, independent_gates: bool = True, arbitrary_unit: bool = False): +def make_target_model(num_qubits, + independent_gates: bool = True, + arbitrary_unit: bool = False, + simplify_for_dprobs: bool = True): + ps_geometry = _qgraph.QubitGraph.common_graph( num_qubits, geometry='line', directed=True, all_directions=True, qubit_labels=tuple(range(num_qubits)) ) - u_ecr = 1/np.sqrt(2)*np.array([[0,0,1,1j],[0,0,1j,1],[1,-1j,0,0],[-1j,1,0,0]]) - gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"] + u_ecr = 1 / np.sqrt(2) * np.array([[0, 0, 1, 1j], + [0, 0, 1j, 1], + [1, -1j, 0, 0], + [-1j, 1, 0, 0]]) + gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr', "Gcnot", "Gswap"] + if simplify_for_dprobs: + gatenames = ['Gxpi2', 'Gypi2', 'Gzpi2', 'Gi', 'Gii', 'Gecr'] + ps = QubitProcessorSpec( num_qubits=num_qubits, gate_names=gatenames, @@ -105,47 +116,56 @@ def make_target_model(num_qubits, independent_gates: bool = True, arbitrary_unit ) gateerrs = dict() basis = BuiltinBasis("PP", QubitSpace(1)) - egb1 = CompleteElementaryErrorgenBasis(basis, QubitSpace(1), ('H','S')) # XXXX From Riley's code, default_label_type='local') + egb1 = CompleteElementaryErrorgenBasis(basis, QubitSpace(1), ('H', 'S')) for gn in gatenames[:-1]: gateerrs[gn] = {ell: 0 for ell in egb1.labels} - egb2 = CompleteElementaryErrorgenBasis(basis, QubitSpace(2), ('H','S')) # XXXX From 
Riley's code, default_label_type='local') + egb2 = CompleteElementaryErrorgenBasis(basis, QubitSpace(2), ('H', 'S')) gateerrs['Gecr'] = {ell: 0 for ell in egb2.labels} gateerrs['Gii'] = gateerrs['Gecr'] - tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs, independent_gates=independent_gates) #, - # ideal_spam_type="CPTPLND") + if not simplify_for_dprobs: + gateerrs["Gswap"] = gateerrs["Gecr"] + gateerrs["Gcnot"] = gateerrs["Gecr"] + + tmn = pgmc.create_crosstalk_free_model(ps, lindblad_error_coeffs=gateerrs, independent_gates=independent_gates) rho, M = make_spam(num_qubits) tmn.prep_blks['layers']['rho0'] = rho tmn.povm_blks['layers']['Mdefault'] = M - # tmn._mark_for_rebuild() tmn._rebuild_paramvec() - + # tmn._layer_rules.implicit_idle_mode = "pad_1Q" if arbitrary_unit: for i in range(num_qubits): - Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i,)) - tmn.factories["layers"][("Gcustom", i)] = Ga_factory # add in the factory for every qubit. - + Ga_factory = ArbParameterizedOpFactory(state_space=QubitSpace(num_qubits), location=(i, )) + tmn.factories["layers"][("Gcustom", i)] = Ga_factory # add in the factory for every qubit. 
+ return tmn -def build_models_for_testing(num_qubits): - tgt_model = make_target_model(num_qubits) +def build_models_for_testing(num_qubits, independent_gates: bool = False, simplify_for_dprobs: bool = False): + + tgt_model = make_target_model(num_qubits, + independent_gates=independent_gates, + simplify_for_dprobs=simplify_for_dprobs) # target_model.sim.calclib = pygsti.forwardsims.mapforwardsim_calc_generic tgt_model.sim = LCSEvalTreeMatrixForwardSimulator() - tgt_model2 = make_target_model(num_qubits) + tgt_model2 = tgt_model.copy() + tgt_model2.sim = MapForwardSimulator() + # make_target_model(num_qubits, independent_gates=independent_gates, simplify_for_dprobs=simplify_for_dprobs) return tgt_model, tgt_model2 #endregion Model Construction + #region Building Random Circuits -def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): + +def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]) -> Circuit: my_circuit = [] for lnum in range(depth_L): layer = [] @@ -155,12 +175,13 @@ def build_circuit(num_qubits: int, depth_L: int, allowed_gates: set[str]): my_circuit.append(layer) return Circuit(my_circuit) -def build_circuit_with_arbitrarily_random_single_qubit_gates(num_qubits: int, depth_L: int): + +def build_circuit_with_arbitrarily_random_single_qubit_gates(num_qubits: int, depth_L: int) -> Circuit: my_circuit = [] gate_name = "Gcustom" - full_args = np.random.random((depth_L, num_qubits, 4)) * 4 * np.pi # Need to be in [0, 2 \pi] for the half angles. + full_args = np.random.random((depth_L, num_qubits, 4)) * 4 * np.pi # Need to be in [0, 2 \pi] for the half angles. 
for lnum in range(depth_L): layer = [] @@ -170,11 +191,14 @@ def build_circuit_with_arbitrarily_random_single_qubit_gates(num_qubits: int, de my_circuit.append(layer) return Circuit(my_circuit, num_lines=num_qubits) -def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: int, depth_L: int, lane_end_points: list[int], gates_to_qubits_used: dict[str, int]): - assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. +def build_circuit_with_multiple_qubit_gates_with_designated_lanes( + num_qubits: int, depth_L: int, + lane_end_points: list[int], gates_to_qubits_used: dict[str, int]) -> Circuit: + + assert lane_end_points[-1] <= num_qubits # if < then we have a lane from there to num_qubits. assert lane_end_points[0] > 0 - assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order. + assert np.all(np.diff(lane_end_points) > 0) # then it is sorted in increasing order. if lane_end_points[-1] < num_qubits: lane_end_points.append(num_qubits) @@ -203,10 +227,10 @@ def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: in # we need to first choose how many to use. nchosen = np.random.randint(1, navail + 1) gate = str(np.random.choice(n_qs_to_gates_avail[nchosen])) - tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset. + tmp = list(np.random.permutation(nchosen) + num_used + start_point) # Increase to offset. 
perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))] if gate == "Gcustom": - layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi))) + layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4) * 4 * np.pi))) else: layer.append((gate, *perm_of_qubits_used)) num_used += nchosen @@ -214,13 +238,16 @@ def build_circuit_with_multiple_qubit_gates_with_designated_lanes(num_qubits: in if num_used > (lane_ep - start_point) + 1: print(num_used, f"lane ({start_point}, {lane_ep})") raise AssertionError("lane barrier is broken") - + start_point = lane_ep my_circuit.append(layer) return Circuit(my_circuit, num_lines=num_qubits) -def build_circuit_with_multiple_qubit_gates(num_qubits: int, depth_L: int, gates_to_qubits_used: dict[str, int], starting_qubit: int=0): +def build_circuit_with_multiple_qubit_gates(num_qubits: int, + depth_L: int, + gates_to_qubits_used: dict[str, int], + starting_qubit: int = 0): my_circuit = [] n_qs_to_gates_avail = {} @@ -243,12 +270,12 @@ def build_circuit_with_multiple_qubit_gates(num_qubits: int, depth_L: int, gates # we need to first choose how many to use. nchosen = np.random.randint(1, navail + 1) gate = str(np.random.choice(n_qs_to_gates_avail[nchosen])) - tmp = list(np.random.permutation(nchosen) + num_used) # Increase to offset. + tmp = list(np.random.permutation(nchosen) + num_used) # Increase to offset. 
perm_of_qubits_used = [int(tmp[ind]) for ind in range(len(tmp))] if gate == "Gcustom": - layer.append(Label(gate, *perm_of_qubits_used, args=(np.random.random(4)*4*np.pi))) + layer.append(Label(gate, * perm_of_qubits_used, args=(np.random.random(4) * 4 * np.pi))) else: - layer.append((gate, *perm_of_qubits_used)) + layer.append((gate, * perm_of_qubits_used)) num_used += nchosen my_circuit.append(layer) @@ -276,6 +303,7 @@ def test_tensor_product_single_unitaries_yield_right_results(): assert_probability_densities_are_equal(probs, exp, cir) + def test_tensor_product_single_unitaries_random_collection_of_xyz(): for qb in range(2, 6): @@ -290,6 +318,7 @@ def test_tensor_product_single_unitaries_random_collection_of_xyz(): assert_probability_densities_are_equal(probs, exp, circuit100) + def test_tensor_product_arbitrarily_random_rotations(): for qb in range(2, 6): @@ -303,15 +332,15 @@ def test_tensor_product_arbitrarily_random_rotations(): assert_probability_densities_are_equal(probs, exp, circuit) + def test_tensor_product_two_qubit_gates(): num_qubits = 4 under_test, expected_model = build_models_for_testing(num_qubits) - - circuitECR01 = Circuit([[("Gecr", 0,1), ("Gi", 2), ("Gzpi2", 3)]]) - circuitECR10 = Circuit([[("Gecr", 1,0), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) for cir in [circuitECR01, circuitECR10]: probs = under_test.probabilities(cir) @@ -319,12 +348,13 @@ def test_tensor_product_two_qubit_gates(): assert_probability_densities_are_equal(probs, exp, cir) + def test_tensor_product_gates_with_implicit_idles(): num_qubits = 5 under_test, expected_model = build_models_for_testing(num_qubits) - + gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"] for gate in gatenames: for i in range(num_qubits): @@ -340,36 +370,36 @@ def test_tensor_product_gates_with_implicit_idles(): gatenames = ["Gecr"] for gate in gatenames: for i in range(num_qubits 
- 1): - cir = Circuit([[(gate, i, i+1)]], num_lines=num_qubits) + cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits) probs = under_test.probabilities(cir) exp = expected_model.probabilities(cir) assert_probability_densities_are_equal(probs, exp, cir) # Order swapped. - cir = Circuit([[(gate, i+1, i)]], num_lines=num_qubits) + cir = Circuit([[(gate, i + 1, i)]], num_lines=num_qubits) probs = under_test.probabilities(cir) exp = expected_model.probabilities(cir) assert_probability_densities_are_equal(probs, exp, cir) -def test_tensor_product_multi_qubit_gates_arbitrarily_random_rotations(): +def test_tensor_product_multi_qubit_gates_arbitrarily_random_rotations(): - gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gcustom': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} + gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, + 'Gcustom': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} for qb in range(3, 6): under_test, expected_model = build_models_for_testing(qb) circuit = build_circuit_with_multiple_qubit_gates(qb, 100, gates_to_qubits_used=gates_to_used_qubits) - print(circuit) - probs = under_test.probabilities(circuit) exp = expected_model.probabilities(circuit) assert_probability_densities_are_equal(probs, exp, circuit) + def test_tensor_product_multi_qubit_gates_with_structured_lanes(): gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} @@ -379,7 +409,10 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes(): under_test, expected_model = build_models_for_testing(qb) - circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, 100, lanes, gates_to_qubits_used=gates_to_used_qubits) + circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, + 100, + lanes, + gates_to_used_qubits) probs = under_test.probabilities(circuit) exp = expected_model.probabilities(circuit) @@ -394,11 +427,10 @@ def test_tensor_product_two_qubit_gates_dprobs(): 
num_qubits = 4 - under_test, expected_model = build_models_for_testing(num_qubits) - + under_test, expected_model = build_models_for_testing(num_qubits, simplify_for_dprobs=True) - circuitECR01 = Circuit([[("Gecr", 0,1), ("Gi", 2), ("Gzpi2", 3)]]) - circuitECR10 = Circuit([[("Gecr", 1,0), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) for cir in [circuitECR01, circuitECR10]: probs = under_test.sim.dprobs(cir) @@ -410,7 +442,7 @@ def test_tensor_product_two_qubit_gates_dprobs(): def test_tensor_product_single_unitaries_yield_right_results_dprobs(): import importlib as _importlib - from pygsti.forwardsims import MapForwardSimulator + num_qubits = 2 under_test, expected_model = build_models_for_testing(num_qubits) @@ -430,26 +462,28 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): assert_probability_densities_are_equal(probs, exp, cir) + def test_tensor_product_single_unitaries_random_collection_of_xyz_dprobs(): - for qb in range(2, 6): + for qb in range(2, 4): - under_test, expected_model = build_models_for_testing(qb) + under_test, expected_model = build_models_for_testing(qb, independent_gates=True, simplify_for_dprobs=True) allowed_gates = ['Gxpi2', 'Gypi2', "Gzpi2", 'Gi'] - circuit100 = build_circuit(qb, 100, allowed_gates=allowed_gates) + circuit100 = build_circuit(qb, 15, allowed_gates=allowed_gates) probs = under_test.sim.dprobs(circuit100) exp = expected_model.sim.dprobs(circuit100) assert_probability_densities_are_equal(probs, exp, circuit100) + def test_tensor_product_gates_with_implicit_idles_dprobs(): num_qubits = 5 - under_test, expected_model = build_models_for_testing(num_qubits) - + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) + gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"] for gate in gatenames: for i in range(num_qubits): @@ -461,18 
+495,18 @@ def test_tensor_product_gates_with_implicit_idles_dprobs(): # Now for the two qubit gates. Gecr and GCNOT - gatenames = ["Gecr", "Gcnot"] - # gatenames = ["Gecr"] + # gatenames = ["Gecr", "Gcnot"] + gatenames = ["Gecr"] for gate in gatenames: for i in range(num_qubits - 1): - cir = Circuit([[(gate, i, i+1)]], num_lines=num_qubits) + cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits) probs = under_test.sim.dprobs(cir) exp = expected_model.sim.dprobs(cir) assert_probability_densities_are_equal(probs, exp, cir) # Order swapped. - cir = Circuit([[(gate, i+1, i)]], num_lines=num_qubits) + cir = Circuit([[(gate, i + 1, i)]], num_lines=num_qubits) probs = under_test.sim.dprobs(cir) exp = expected_model.sim.dprobs(cir) @@ -482,15 +516,22 @@ def test_tensor_product_gates_with_implicit_idles_dprobs(): def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} + gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gecr': 2} for qb in range(5, 6): lanes = [1, 2, 4] - under_test, expected_model = build_models_for_testing(qb) + under_test, expected_model = build_models_for_testing(qb, independent_gates=True, simplify_for_dprobs=True) - circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, 100, lanes, gates_to_qubits_used=gates_to_used_qubits) + circuit = build_circuit_with_multiple_qubit_gates_with_designated_lanes(qb, + 10, + lanes, + gates_to_used_qubits) probs = under_test.sim.dprobs(circuit) exp = expected_model.sim.dprobs(circuit) - assert_probability_densities_are_equal(probs, exp, circuit) \ No newline at end of file + assert_probability_densities_are_equal(probs, exp, circuit) + + +#endregion Derivative of Probabilities consistencies. 
From ea18d5b237f873b53a455e7d5caa49feb9e2aef0 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 23 Jul 2025 17:08:49 -0700 Subject: [PATCH 110/141] Add a cache for LCS external matches and collapse the sequences internally first. --- pygsti/layouts/evaltree.py | 65 ++++++++++++++++--- pygsti/tools/sequencetools.py | 115 +++++++++++++++++++++++++++++----- 2 files changed, 154 insertions(+), 26 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 0c728981d..eeadbe8f8 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -23,7 +23,12 @@ from pygsti.modelmembers.operations import create_from_superop_mx from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools -from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification, _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS +from pygsti.tools.sequencetools import ( + conduct_one_round_of_lcs_simplification, + _compute_lcs_for_every_pair_of_sequences, + create_tables_for_internal_LCS, + simplify_internal_first_one_round +) from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time @@ -581,9 +586,6 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} - external_matches = _compute_lcs_for_every_pair_of_sequences(circuit_list) - - best_external_match = _np.max(external_matches[0]) self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} self.qubit_start_point = qubit_starting_loc @@ -591,7 +593,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) internal_matches = create_tables_for_internal_LCS(circuit_list) best_internal_match = _np.max(internal_matches[0]) - max_rounds = int(max(best_external_match,best_internal_match)) + max_rounds = 
best_internal_match C = len(circuit_list) sequence_intro = {0: _np.arange(C)} @@ -601,23 +603,54 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. + # Let's try simplifying internally first. + self.internal_first = True + if self.internal_first: + i = 0 + cache_pos = -1 + while max_rounds > 1: + + tmp = simplify_internal_first_one_round(new_circuit_list, + internal_matches, + cache_pos, + cache) + new_circuit_list, cache_pos, cache, sequence_intro[i-1] = tmp + i -= 1 + internal_matches = create_tables_for_internal_LCS(new_circuit_list) + + max_rounds = _np.max(internal_matches[0]) + + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, + None, + None, + set(_np.arange(len(new_circuit_list))), + max([len(cir) for cir in new_circuit_list])-1) + + best_external_match = _np.max(external_matches[0]) + + max_rounds = int(max(best_external_match,best_internal_match)) i = 0 + cache_pos = len(new_circuit_list) while max_rounds > 1: - new_circuit_list, cache_pos, cache, sequence_intro[i+1] = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, internal_matches, cache_pos, cache) + tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, + internal_matches, cache_pos, cache) + new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp i += 1 - external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list) + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, + ext_table, + external_sequences, + dirty_inds, max_rounds) if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: # We are not going to get a better internal match. 
pass - else: + elif not self.internal_first: internal_matches = create_tables_for_internal_LCS(new_circuit_list) best_external_match = _np.max(external_matches[0]) best_internal_match = _np.max(internal_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) - print("Num rounds remaining ", max_rounds) self.cache = cache self.num_circuits = C @@ -678,6 +711,20 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] saved: dict[int | LabelTupTup, _np.ndarray] = {} + + if self.internal_first: + + round_keys = _np.unique(list(self.sequence_intro.keys())) + + pos_inds = np.where(round_keys >=0) + pos_keys = round_keys[pos_inds] + pos_keys = sorted(pos_keys)[::-1] + + neg_inds = np.where(round_keys < 0) + neg_keys = round_keys[neg_inds] + neg_keys = sorted(neg_keys) + assert neg_keys[0] < neg_keys[1] + round_keys = neg_keys + pos_keys for key in round_keys: for cind in self.sequence_intro[key]: diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 80f00141c..5d50c5944 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,6 +1,6 @@ from typing import Sequence, Any, List, Literal, Tuple, MutableSequence import numpy as _np - +from tqdm import tqdm #region Longest Common Subsequence @@ -52,7 +52,9 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe if table_data_and_sequences: table, external_sequences = table_data_and_sequences else: - table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences) + table_cache = _np.zeros((len(sequences), len(sequences))) + table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences, table_cache, + None, set(_np.arange(len(sequences)))) if internal_tables_and_sequences: internal_subtable, internal_subsequences = internal_tables_and_sequences @@ -98,6 +100,7 @@ def 
conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe # Handle the updates. old_cache_num = cache_num + dirty_inds = set() for seq, cdict in all_subsequences_to_replace.items(): w = len(seq) update_made = 0 @@ -109,6 +112,7 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe while sp+w <= len(my_cir): if list(my_cir[sp: sp+w]) == list(seq): my_cir[sp: sp + w] = [cache_num] + dirty_inds.add(cir_ind) update_made = 1 sp += 1 @@ -126,9 +130,75 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + dirty_inds = dirty_inds.union(set(sequences_introduced_in_this_round)) + + return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round, table, external_sequences, dirty_inds + +def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence[Any]], + internal_tables_and_sequences, starting_cache_num, cache_struct): + """ + Simplify the set of sequences by contracting the set of longest common subsequences. + + Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. + + Cache number will decrement so ensure that cache_struct can handle positives and negatives. + """ + + if internal_tables_and_sequences: + internal_subtable, internal_subsequences = internal_tables_and_sequences + else: + internal_subtable, internal_subsequences = create_tables_for_internal_LCS(sequences) + + best_internal_index = _np.where(internal_subtable == _np.max(internal_subtable)) + updated_sequences = [seq for seq in sequences] + cache_num = starting_cache_num + + # Build sequence dict + all_subsequences_to_replace: dict[tuple, dict[int, List[int]]] = {} + + # We are only going to replace if this was the longest substring. 
+ for cir_ind in best_internal_index[0]: + for seq in internal_subsequences[cir_ind]: + key = tuple(seq) + if key in all_subsequences_to_replace: + all_subsequences_to_replace[key][cir_ind] = internal_subsequences[cir_ind][seq] + else: + all_subsequences_to_replace[key] = {cir_ind: internal_subsequences[cir_ind][seq]} + + # Handle the updates. + old_cache_num = cache_num + for seq, cdict in all_subsequences_to_replace.items(): + w = len(seq) + update_made = 0 + if w > 1 or (not isinstance(seq[0], int)): + # We have reached an item which we can just compute. + for cir_ind in cdict: + my_cir = updated_sequences[cir_ind] + sp = 0 + while sp+w <= len(my_cir): + if list(my_cir[sp: sp+w]) == list(seq): + my_cir[sp: sp + w] = [cache_num] + update_made = 1 + + sp += 1 + updated_sequences[cir_ind] = my_cir + + cache_struct[cir_ind] = updated_sequences[cir_ind] + + if update_made: + # There may have been multiple overlapping subsequences in the same sequence. + # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.) 
+ updated_sequences.append(list(seq)) + cache_struct[cache_num] = updated_sequences[cache_num] + + cache_num += -1 + + sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round + def _find_starting_positions_using_dp_table( dp_table: _np.ndarray ) -> tuple[int, int, int] | Tuple[None, None, None]: @@ -162,32 +232,43 @@ def _find_starting_positions_using_dp_table( return i, j, dp_table[0,0] return None, None, None +def _lookup_in_sequence_cache(seq_cache: dict[tuple[int, int], tuple], i: int, j: int) -> tuple: + + if seq_cache: + return seq_cache[(i, j)] + return (None, None, None) -def _compute_lcs_for_every_pair_of_sequences(sequences: MutableSequence[Any]): + +def _compute_lcs_for_every_pair_of_sequences(sequences: MutableSequence[Any], + table_cache: _np.ndarray, + seq_cache: dict, + dirty_inds: set, + expected_best: int): """ Computes the LCS for every pair of sequences A,B in sequences """ best_subsequences = {} best_lengths = _np.zeros((len(sequences), len(sequences))) - curr_best = 0 - for i in range(len(sequences)-1, -1, -1): # Lets do this in reverse order + curr_best = 2 # We want only subsequences that have at least two characters matching. + for i in tqdm(range(len(sequences)-1, -1, -1), + f"LCS_circuits Expected Val {expected_best}: ", disable = True): # Lets do this in reverse order cir0 = sequences[i] - if len(cir0) >= curr_best: - # Could be the best. - for j in range(i-1, -1, -1): - cir1 = sequences[j] - if len(cir1) >= curr_best: + for j in range(i-1, -1, -1): + cir1 = sequences[j] + if i in dirty_inds or j in dirty_inds: + if len(cir0) < curr_best or len(cir1) < curr_best: + # Mark pair as dirty to be computed later when it may be the longest subsequence. 
+ best_lengths[i,j] = -1 + best_subsequences[(i,j)] = (None, None, None) + else: table = _lcs_dp_version(cir0, cir1) best_lengths[i,j] = table[0,0] best_subsequences[(i,j)] = _find_starting_positions_using_dp_table(table) curr_best = max(best_lengths[i,j], curr_best) - else: - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) - else: - # Skipped because cannot be the best yet. - best_lengths[i,j] = -1 - best_subsequences[(i,j)] = (None, None, None) + else: + best_lengths[i,j] = table_cache[i,j] + best_subsequences[(i,j)] = _lookup_in_sequence_cache(seq_cache, i, j) + return best_lengths, best_subsequences From 96c97f212571a6fa93b82247d30ed61698def2e1 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 25 Jul 2025 11:30:11 -0700 Subject: [PATCH 111/141] update test to have the metadata information for caching. --- test/unit/tools/test_sequencetools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py index d51ae5e17..f658358d2 100644 --- a/test/unit/tools/test_sequencetools.py +++ b/test/unit/tools/test_sequencetools.py @@ -6,7 +6,7 @@ def test_external_matches(): my_strings = ["ABAARCR12LIO", "QWERTYASDFGH", "QWEELLKJAT"] - tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings) + tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings, None, None, set(0,1,2), 3) assert np.max(tables) == 3 @@ -47,7 +47,7 @@ def test_one_round_update_collecting_tables_first(): ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] example = [list(x) for x in example] internal = create_tables_for_internal_LCS(example) - external = _compute_lcs_for_every_pair_of_sequences(example) + external = _compute_lcs_for_every_pair_of_sequences(example, None, None, set(0,1,2), 3) cache = {i: s for i,s in enumerate(example)} updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, external, 
internal, len(example), cache) From fa859acd2742fbe39aebc27e5b81fc202d39fb1b Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 29 Jul 2025 15:01:04 -0700 Subject: [PATCH 112/141] Collapse lanes internally first. --- pygsti/layouts/evaltree.py | 242 ++++++-------------------- pygsti/tools/sequencetools.py | 34 +++- test/unit/tools/test_sequencetools.py | 33 +++- 3 files changed, 102 insertions(+), 207 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index eeadbe8f8..548f6c9d3 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -32,7 +32,6 @@ from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time -import numpy as np import scipy.linalg as la import scipy.sparse.linalg as sparla from typing import List, Optional, Iterable @@ -552,7 +551,7 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup """ if op.num_qubits == 2: # We may need to do swaps. - op_term : _np.ndarray = np.array([1.]) + op_term : _np.ndarray = _np.array([1.]) if op in saved: op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # type: ignore @@ -604,28 +603,34 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. # Let's try simplifying internally first. 
- self.internal_first = True + self.internal_first = False + seq_ind_to_cache_index = {i: i for i in range(C)} if self.internal_first: i = 0 cache_pos = -1 while max_rounds > 1: + breakpoint() tmp = simplify_internal_first_one_round(new_circuit_list, internal_matches, cache_pos, - cache) + cache, + seq_ind_to_cache_index) new_circuit_list, cache_pos, cache, sequence_intro[i-1] = tmp i -= 1 internal_matches = create_tables_for_internal_LCS(new_circuit_list) max_rounds = _np.max(internal_matches[0]) - external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, None, None, set(_np.arange(len(new_circuit_list))), max([len(cir) for cir in new_circuit_list])-1) + + if self.internal_first: + breakpoint() + best_external_match = _np.max(external_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) @@ -633,7 +638,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) cache_pos = len(new_circuit_list) while max_rounds > 1: tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, - internal_matches, cache_pos, cache) + internal_matches, cache_pos, cache, seq_ind_to_cache_index) new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp i += 1 external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, @@ -716,27 +721,27 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = _np.unique(list(self.sequence_intro.keys())) - pos_inds = np.where(round_keys >=0) + pos_inds = _np.where(round_keys >0) pos_keys = round_keys[pos_inds] pos_keys = sorted(pos_keys)[::-1] - neg_inds = np.where(round_keys < 0) + neg_inds = _np.where(round_keys < 0) neg_keys = round_keys[neg_inds] neg_keys = sorted(neg_keys) - assert neg_keys[0] < neg_keys[1] - round_keys = neg_keys + pos_keys + + round_keys = pos_keys + neg_keys + _np.array([0]) for key in round_keys: - for cind in self.sequence_intro[key]: + 
for cache_ind in self.sequence_intro[key]: cumulative_term = None - for term in self.cache[cind]: + for term in self.cache[cache_ind]: cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) if cumulative_term is None: - saved[cind] = _np.eye(4**num_qubits_in_default) + saved[cache_ind] = _np.eye(4**num_qubits_in_default) # NOTE: unclear when (if ever) this should be a noisy idle gate. else: - saved[cind] = cumulative_term + saved[cache_ind] = cumulative_term if __debug__: # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. for key in self.cache: @@ -744,7 +749,20 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} - return saved, self.circuit_to_save_location + return saved, self.circuit_to_save_location + + def combine_for_visualization(self, val, visited): + + if not isinstance(val, int): + return [val] + elif val in visited: + return visited[val] + else: + tmp = [] + for child in self.cache[val]: + tmp.append(self.combine_for_visualization(child, visited)) + visited[val] = tmp + return tmp def handle_results_cache_lookup_and_product(self, cumulative_term: None | _np.ndarray, @@ -768,9 +786,11 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ + if isinstance(term_to_extend_with, int): + assert term_to_extend_with in results_cache + return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) if term_to_extend_with in results_cache: return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) - else: val = 1 qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] @@ -912,17 +932,7 @@ def reconstruct_full_matrices(self) -> 
Optional[List[KronStructured]]: for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] lblkey = cir._line_labels - if lblkey == ("*",): - # We are gettting a noisy idle line and so need to check the size we are expecting here. - ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - print(cir.num_lines) - # lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - - # - if cir.layertup not in self.sub_cir_to_ind_in_results[lblkey]: - print(lblkey) - print(cir) - breakpoint() + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] lane_circuits.append(self.saved_results[lblkey][ind_in_results]) output.append(KronStructured(lane_circuits)) @@ -947,7 +957,7 @@ def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: boo subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] qubit_list = (*qubit_list, len(subcir._line_labels)) qubit_list = list(qubit_list) - total_num = np.sum(qubit_list) + total_num = _np.sum(qubit_list) tensor_cost += cost_to_compute_tensor_matvec_without_reordering(qubit_list, total_num) @@ -1104,152 +1114,6 @@ def is_2d_square(arg): return arg.shape[0] == arg.shape[1] -class InvTriangular(RealLinOp): - """ - NOTE: can avoid relying on sparla.LinearOperator since we can implement matmul and rmatmul directly. 
- """ - - def __init__(self, A : np.ndarray, lower: bool, adjoint=None): - assert is_2d_square(A) - self.lower = lower - self.A = A - self._size = A.shape[0]**2 - self._shape = A.shape - self._dtype = A.dtype - self._adjoint = InvTriangular(A.T, not self.lower, self) if adjoint is None else adjoint - - def item(self): - return 1 / self.A.item() - - def __matmul__(self, other): - return la.solve_triangular(self.A, other, trans=0, lower=self.lower, check_finite=False) - - def __rmatmul__(self, other): - return la.solve_triangular(self.A, other.T, trans=1, lower=self.lower, check_finite=False).T - - -class InvPosDef(RealLinOp): - """ - NOTE: can avoid relying on sparla.LinearOperator since we can implement matmul and rmatmul directly. - """ - - def __init__(self, A: np.ndarray): - assert is_2d_square(A) - self.A = A - self._size = A.shape[0]**2 - self._shape = A.shape - self._dtype = A.dtype - self._chol = la.cho_factor(self.A) - - @property - def T(self): - # override the default implementation, since we're self-adjoint. - return self - - def item(self): - return 1 / self.A.item() - - def __matmul__(self, other): - return la.cho_solve(self._chol, other, check_finite=False) - - def __rmatmul__(self, other): - temp = self.__matmul__(other.T) - out = temp.T - return out - - -class InvUpdatedKronPosDef(RealLinOp): - """ - A representation of a positive definite linear operator - - M = inv( K + U U' ), - - where K is a positive definite matrix with known Kronecker product - structure and U is a tall-and-thin matrix. - - This linear operator's action is implemented by precomputing some - intermediate quantities at construction time and then using those - quantifies in the Woodbury matrix identity. Specifically, we precompute - - 1. an implicit representation of L = cho_factor(K, lower=True), - 2. an explicit representation of V = inv(L) @ U, - 3. a factored representation of W = I + V'V, - - and then we use the formula - - M = inv(L') (I - V @ inv(W) @ V') @ inv(L). 
- - The essence of this method can be preserved with different factorizations - for K. For example, instead of computing L = cho_factor(K, lower=True), - we could compute P = pinv(sqrtm(K)) and substitute P wherever inv(L) or - inv(L') were used. - """ - - def verify(self): - """ - If P = LL' + U U', then this operator is supposed to represent M = inv(P). - This function checks if self @ P is nearly the identity matrix. - """ - explicit_K = np.eye(1) - for kf in self.kron_factors: - explicit_K = np.kron(explicit_K, kf) - explicit_P = explicit_K + self.U @ self.U.T - expect_I = self @ explicit_P - nrmP = la.norm(explicit_P) - I = np.eye(self.shape[0]) - rel_tol = np.finfo(self.dtype).eps * nrmP - abs_tol = np.finfo(self.dtype).eps ** 0.5 - tol = max(rel_tol, abs_tol) - assert la.norm(I - expect_I) <= tol - - - def __init__(self, kron_factors : List[np.ndarray], U: np.ndarray, verify=False): - K_cho_factors = [] - dim = 1 - for kf in kron_factors: - K_cho_factors.append(la.cho_factor(kf, lower=True)[0]) - dim *= kf.shape[0] - assert dim == U.shape[0] - self.K_cho_factors = K_cho_factors - invL_kron_factors = [InvTriangular(lf, lower=True) for lf in K_cho_factors] - self.invL = KronStructured(invL_kron_factors) - - dim_update = U.shape[1] - self.V = self.invL @ U - self.W = np.eye(dim_update) + self.V.T @ self.V - self.chol_W = la.cho_factor(self.W) - self._size = dim * dim - self._shape = (dim, dim) - self._dtype = self.invL.dtype - if verify: - self.kron_factors = kron_factors - self.U = U - self.verify() - else: - self.U = None - self.kron_factors = None - self.verified = verify - pass - - @property - def T(self): - return self - - def __matmul__(self, other): - temp1 = self.invL @ other - temp2 = self.V.T @ temp1 - temp3 = la.cho_solve(self.chol_W, temp2) - temp4 = self.V @ temp3 - out = self.invL.T @ (temp1 - temp4) - return out - - def __rmatmul__(self, other): - # use the fact that we're self-adjoint. 
- temp = self @ other.T - out = temp.T - return out - - class DyadicKronStructed(RealLinOp): def __init__(self, A, B, adjoint=None): @@ -1278,8 +1142,8 @@ def matvec(self, other): return self.A.item() * (self.B @ other) if self._B_is_trivial: return self.B.item() * (self.A @ other) - out = self.B @ np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T - out = np.reshape(out, inshape, order='F') + out = self.B @ _np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T + out = _np.reshape(out, inshape, order='F') return out def rmatvec(self, other): @@ -1289,8 +1153,8 @@ def rmatvec(self, other): return self.A.item() * (self.B.T @ other) if self._B_is_trivial: return self.B.item() * (self.A.T @ other) - out = self.B.T @ np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A - out = np.reshape(out, inshape, order='F') + out = self.B.T @ _np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A + out = _np.reshape(out, inshape, order='F') return out @staticmethod @@ -1309,8 +1173,8 @@ class KronStructured(RealLinOp): def __init__(self, kron_operands): self.kron_operands = kron_operands assert all([op.ndim == 2 for op in kron_operands]) - self.shapes = np.array([op.shape for op in kron_operands]) - self._shape = tuple(int(i) for i in np.prod(self.shapes, axis=0)) + self.shapes = _np.array([op.shape for op in kron_operands]) + self._shape = tuple(int(i) for i in _np.prod(self.shapes, axis=0)) forward = DyadicKronStructed.build_polyadic(self.kron_operands) self._linop = forward._linop self._adjoint = forward.T @@ -1319,32 +1183,26 @@ def __init__(self, kron_operands): def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int): - assert np.sum(qubit_list) == total_num_qubits + assert _np.sum(qubit_list) == total_num_qubits if len(qubit_list) == 1: # Basic matvec. 
- cost = 2 * (4**qubit_list[0]**2) return cost elif len(qubit_list) == 2: - # vec((A \tensor B) u) = vec(B U A.T) - term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. - term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T - return term1 + term2 else: - # Just pop off the last term - # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T - right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) * 4**(np.sum(qubit_list[1:])) - - left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], total_num_qubits - qubit_list[0]) * 4**(qubit_list[0]) - + right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) + right *= 4**(_np.sum(qubit_list[1:])) + left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], + total_num_qubits - qubit_list[0]) + left *= 4**(qubit_list[0]) return left + right \ No newline at end of file diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 5d50c5944..9c859cab1 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,4 +1,4 @@ -from typing import Sequence, Any, List, Literal, Tuple, MutableSequence +from typing import Sequence, Any, List, Literal, Tuple, MutableSequence, Optional import numpy as _np from tqdm import tqdm @@ -43,18 +43,20 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSequence[Any]], table_data_and_sequences, internal_tables_and_sequences, starting_cache_num, - cache_struct): + cache_struct, sequence_ind_to_cache_ind: Optional[dict[int, int]] = None): """ Simplify the set of sequences by contracting the set of longest common subsequences. Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. 
""" + if not sequence_ind_to_cache_ind: + sequence_ind_to_cache_ind = {i: i for i in range(len(sequences))} if table_data_and_sequences: table, external_sequences = table_data_and_sequences else: table_cache = _np.zeros((len(sequences), len(sequences))) table, external_sequences = _compute_lcs_for_every_pair_of_sequences(sequences, table_cache, - None, set(_np.arange(len(sequences)))) + None, set(_np.arange(len(sequences))), "Unknown") if internal_tables_and_sequences: internal_subtable, internal_subsequences = internal_tables_and_sequences @@ -118,7 +120,7 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe sp += 1 updated_sequences[cir_ind] = my_cir - cache_struct[cir_ind] = updated_sequences[cir_ind] + cache_struct[sequence_ind_to_cache_ind[cir_ind]] = updated_sequences[cir_ind] if update_made: # There may have been multiple overlapping subsequences in the same sequence. @@ -126,16 +128,21 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe updated_sequences.append(list(seq)) cache_struct[cache_num] = updated_sequences[cache_num] + # This is a new sequence index which will need to be updated. + dirty_inds.add(cache_num) + sequence_ind_to_cache_ind[cache_num] = cache_num cache_num += 1 + assert cache_num >= old_cache_num + assert old_cache_num >=0 sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num - dirty_inds = dirty_inds.union(set(sequences_introduced_in_this_round)) return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round, table, external_sequences, dirty_inds def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence[Any]], - internal_tables_and_sequences, starting_cache_num, cache_struct): + internal_tables_and_sequences, starting_cache_num, cache_struct, + seq_ind_to_cache_ind: Optional[dict[int, int]]): """ Simplify the set of sequences by contracting the set of longest common subsequences. 
@@ -143,6 +150,8 @@ def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence Cache number will decrement so ensure that cache_struct can handle positives and negatives. """ + if not seq_ind_to_cache_ind: + seq_ind_to_cache_ind = {i: i for i in range(len(sequences))} if internal_tables_and_sequences: internal_subtable, internal_subsequences = internal_tables_and_sequences @@ -183,17 +192,24 @@ def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence sp += 1 updated_sequences[cir_ind] = my_cir - cache_struct[cir_ind] = updated_sequences[cir_ind] + cache_struct[seq_ind_to_cache_ind[cir_ind]] = updated_sequences[cir_ind] if update_made: # There may have been multiple overlapping subsequences in the same sequence. # (e.g. QWEQWEQWERQWE has QWE, WEQ, and EQW all happen and all are length 3 subsequences.) updated_sequences.append(list(seq)) - cache_struct[cache_num] = updated_sequences[cache_num] + cache_struct[cache_num] = list(seq) # Add the new sequence to the cache. + + # Add a new mapping from sequences to cache index. 
+ seq_ind_to_cache_ind[len(updated_sequences)-1] = cache_num cache_num += -1 - sequences_introduced_in_this_round = _np.arange(cache_num - old_cache_num) + old_cache_num + # Cache num and old_cache_num < 0 + assert cache_num < 0 + assert old_cache_num < 0 + assert old_cache_num > cache_num + sequences_introduced_in_this_round = _np.arange(_np.abs(cache_num - old_cache_num))*-1 + old_cache_num return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py index f658358d2..501123f82 100644 --- a/test/unit/tools/test_sequencetools.py +++ b/test/unit/tools/test_sequencetools.py @@ -1,12 +1,13 @@ import numpy as np from pygsti.tools.sequencetools import _compute_lcs_for_every_pair_of_sequences, create_tables_for_internal_LCS from pygsti.tools.sequencetools import conduct_one_round_of_lcs_simplification +from pygsti.tools.sequencetools import simplify_internal_first_one_round def test_external_matches(): my_strings = ["ABAARCR12LIO", "QWERTYASDFGH", "QWEELLKJAT"] - tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings, None, None, set(0,1,2), 3) + tables, sequences = _compute_lcs_for_every_pair_of_sequences(my_strings, None, None, set([0,1,2]), 3) assert np.max(tables) == 3 @@ -47,10 +48,10 @@ def test_one_round_update_collecting_tables_first(): ('Q', 'W', 'E', 'R', 'T', 'Y', 'Q', 'W', 'E', 'Q', 'W', 'E', 'Q', 'W', 'E')] example = [list(x) for x in example] internal = create_tables_for_internal_LCS(example) - external = _compute_lcs_for_every_pair_of_sequences(example, None, None, set(0,1,2), 3) + external = _compute_lcs_for_every_pair_of_sequences(example, None, None, set([0,1,2]), 3) cache = {i: s for i,s in enumerate(example)} - updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, external, internal, len(example), cache) + updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = 
conduct_one_round_of_lcs_simplification(example, external, internal, len(example), cache) assert len(updated) == 4 assert "".join(updated[3]) == "AAAA" @@ -70,7 +71,7 @@ def test_one_round_update_without_collecting_tables_first(): cache = {i: s for i,s in enumerate(example)} - updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) assert len(updated) == 4 assert "".join(updated[3]) == "AAAA" @@ -89,9 +90,9 @@ def test_update_only_adds_those_strings_which_are_actually_used(): cache = {i: s for i,s in enumerate(example)} - updated, num, cache, seq_intro = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) + updated, num, cache, seq_intro, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(example, None, None, len(example), cache) - r2, num, c2, s2 = conduct_one_round_of_lcs_simplification(updated, None, None, num, cache) + r2, num, c2, s2, ext_table, ext_seq, ext_dirty = conduct_one_round_of_lcs_simplification(updated, None, None, num, cache) assert len(r2) == num @@ -101,3 +102,23 @@ def test_update_only_adds_those_strings_which_are_actually_used(): assert len(c2[4]) == 3 +def test_multiple_successive_internal_updates_first(): + + strings_list = [list("IIIIIIAIIIIII")] + + cache = {} + updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(strings_list, None, -1, cache) + + assert -1 in cache.keys() + + assert -2 == cache_num + assert len(updated_string_list) == 2 + + updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(updated_string_list, None, cache_num, cache) + + breakpoint() + assert -2 in cache.keys() + assert -3 == cache_num + + assert len(updated_string_list) == 3 + assert np.allclose(seq_intro, [-3]) \ No newline at end of file From 
d73e251bb7cf5150001c75eb85e314f649c53c92 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 29 Jul 2025 18:28:43 -0700 Subject: [PATCH 113/141] Update LCS tree to track which cache indexes need to get recomputed if a specific gate changes. --- pygsti/layouts/evaltree.py | 147 ++++++++++++++---- .../test_forwardsim_on_implicitop_model.py | 30 ---- test/unit/tools/test_sequencetools.py | 16 +- 3 files changed, 125 insertions(+), 68 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 548f6c9d3..484b34452 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -626,10 +626,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) None, set(_np.arange(len(new_circuit_list))), max([len(cir) for cir in new_circuit_list])-1) - - if self.internal_first: - breakpoint() best_external_match = _np.max(external_matches[0]) @@ -637,14 +634,20 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) i = 0 cache_pos = len(new_circuit_list) while max_rounds > 1: - tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, external_matches, - internal_matches, cache_pos, cache, seq_ind_to_cache_index) + tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, + external_matches, + internal_matches, + cache_pos, + cache, + seq_ind_to_cache_index) new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp i += 1 + dirty_inds = set(_np.arange(len(new_circuit_list))) # TODO: fix to only correct those which are actually dirty. external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, ext_table, external_sequences, - dirty_inds, max_rounds) + dirty_inds, + max_rounds) if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: # We are not going to get a better internal match. 
@@ -665,6 +668,39 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) from pygsti.modelmembers.operations import StaticStandardOp self.swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense().round(16) + + self.cache_ind_to_alphabet_vals_referenced: dict[int, set[LabelTupTup]] = {} + + + # Useful for repeated calculations seen in a derivative calculation. + for key in self.cache: + self.compute_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) + + alphabet_val_to_cache_inds_to_update: dict[LabelTupTup, set[int]] = {} + + for cache_ind, vals in self.cache_ind_to_alphabet_vals_referenced.items(): + for val in vals: + if val in alphabet_val_to_cache_inds_to_update: + alphabet_val_to_cache_inds_to_update[val].add(cache_ind) + else: + alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) + + self.results: dict[int | LabelTupTup, _np.ndarray] = {} + + self.alphabet_val_to_sorted_cache_inds: dict[LabelTupTup, list[int]] = {} + + for val, cache_inds in alphabet_val_to_cache_inds_to_update.items(): + rnd_nums = {} + for cache_ind in cache_inds: + for rnd_num in self.sequence_intro: + if cache_ind in self.sequence_intro[rnd_num]: + rnd_nums[cache_ind] = rnd_num + break + + sorted_inds = sorted(cache_inds, key =lambda x : rnd_nums[x])[::-1] # We want to iterate large to small. + + self.alphabet_val_to_sorted_cache_inds[val] = sorted_inds + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ @@ -707,50 +743,89 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q updated[new_cir] = loc self.circuit_to_save_location = updated - def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int): + def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, alphabet_piece_changing: Optional[LabelTupTup] = None): """ Compute the total product cache. 
Note that this may still have a tensor product structure that the operator needs to combine again if they want to have the full 'dense' matrix. """ + if alphabet_piece_changing is not None: - round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - saved: dict[int | LabelTupTup, _np.ndarray] = {} - - if self.internal_first: - - round_keys = _np.unique(list(self.sequence_intro.keys())) - - pos_inds = _np.where(round_keys >0) - pos_keys = round_keys[pos_inds] - pos_keys = sorted(pos_keys)[::-1] - - neg_inds = _np.where(round_keys < 0) - neg_keys = round_keys[neg_inds] - neg_keys = sorted(neg_keys) + if alphabet_piece_changing not in self.alphabet_val_to_sorted_cache_inds: + # Nothing needs to change here. + return self.results, self.circuit_to_save_location + + cache_inds = self.alphabet_val_to_sorted_cache_inds[alphabet_piece_changing] - round_keys = pos_keys + neg_keys + _np.array([0]) - - for key in round_keys: - for cache_ind in self.sequence_intro[key]: + local_changes = {k: v.copy() for k, v in self.results.items()} + for cache_ind in cache_inds: cumulative_term = None for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, saved, num_qubits_in_default) - + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, num_qubits_in_default) + + # Do not overwrite the results cache + # so that we can use it again on a different derivative. if cumulative_term is None: - saved[cache_ind] = _np.eye(4**num_qubits_in_default) + local_changes[cache_ind] = _np.eye(4**num_qubits_in_default) # NOTE: unclear when (if ever) this should be a noisy idle gate. 
else: - saved[cache_ind] = cumulative_term + local_changes[cache_ind] = cumulative_term + return local_changes, self.circuit_to_save_location + + + else: + round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + # saved: dict[int | LabelTupTup, _np.ndarray] = {} + + if self.internal_first: + + round_keys = _np.unique(list(self.sequence_intro.keys())) + + pos_inds = _np.where(round_keys >0) + pos_keys = round_keys[pos_inds] + pos_keys = sorted(pos_keys)[::-1] + + neg_inds = _np.where(round_keys < 0) + neg_keys = round_keys[neg_inds] + neg_keys = sorted(neg_keys) + + round_keys = pos_keys + neg_keys + _np.array([0]) + + for key in round_keys: + for cache_ind in self.sequence_intro[key]: + cumulative_term = None + for term in self.cache[cache_ind]: + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, num_qubits_in_default) + + if cumulative_term is None: + self.results[cache_ind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. + else: + self.results[cache_ind] = cumulative_term if __debug__: # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
for key in self.cache: - assert key in saved + assert key in self.results # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} - return saved, self.circuit_to_save_location + return self.results, self.circuit_to_save_location + def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: + + if not isinstance(val, int): + return set([val]) + elif val in visited: + return visited[val] + else: + tmp = set() + for child in self.cache[val]: + ret_val = self.compute_depends_on(child, visited) + tmp = tmp.union(ret_val) + visited[val] = tmp + return tmp + + def combine_for_visualization(self, val, visited): if not isinstance(val, int): @@ -886,7 +961,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.trees[key] = sample else: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0]) - + endtime = time.time() print(" Time to compute all the evaluation orders (s): ", endtime - starttime) @@ -901,7 +976,11 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.saved_results = {} self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} - def collapse_circuits_to_process_matrices(self, model): + def collapse_circuits_to_process_matrices(self, model, alphabet_piece_changing: Optional[LabelTupTup] = None): + """ + Collapse all circuits to their process matrices. If alphabet_piece_changing is not None, then + we assume we have already collapsed this system once before and so only need to update part of the eval tree. + """ # Just collapse all of them. @@ -909,7 +988,7 @@ def collapse_circuits_to_process_matrices(self, model): for key in self.trees: num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. 
tree = self.trees[key] - out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits) + out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, alphabet_piece_changing) # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) self.saved_results[key] = out1 self.sub_cir_to_ind_in_results[key] = out2 diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index cb74cec35..57ba7ea6b 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -319,20 +319,6 @@ def test_tensor_product_single_unitaries_random_collection_of_xyz(): assert_probability_densities_are_equal(probs, exp, circuit100) -def test_tensor_product_arbitrarily_random_rotations(): - - for qb in range(2, 6): - - under_test, expected_model = build_models_for_testing(qb) - - circuit = build_circuit_with_arbitrarily_random_single_qubit_gates(qb, 10) - - probs = under_test.probabilities(circuit) - exp = expected_model.probabilities(circuit) - - assert_probability_densities_are_equal(probs, exp, circuit) - - def test_tensor_product_two_qubit_gates(): num_qubits = 4 @@ -384,22 +370,6 @@ def test_tensor_product_gates_with_implicit_idles(): assert_probability_densities_are_equal(probs, exp, cir) -def test_tensor_product_multi_qubit_gates_arbitrarily_random_rotations(): - - gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, - 'Gcustom': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} - for qb in range(3, 6): - - under_test, expected_model = build_models_for_testing(qb) - - circuit = build_circuit_with_multiple_qubit_gates(qb, 100, gates_to_qubits_used=gates_to_used_qubits) - - probs = under_test.probabilities(circuit) - exp = expected_model.probabilities(circuit) - - assert_probability_densities_are_equal(probs, exp, circuit) - - def 
test_tensor_product_multi_qubit_gates_with_structured_lanes(): gates_to_used_qubits = {'Gxpi2': 1, 'Gypi2': 1, 'Gzpi2': 1, 'Gi': 1, 'Gswap': 2, 'Gcnot': 2, 'Gecr': 2} diff --git a/test/unit/tools/test_sequencetools.py b/test/unit/tools/test_sequencetools.py index 501123f82..ab8d7dd4c 100644 --- a/test/unit/tools/test_sequencetools.py +++ b/test/unit/tools/test_sequencetools.py @@ -107,18 +107,26 @@ def test_multiple_successive_internal_updates_first(): strings_list = [list("IIIIIIAIIIIII")] cache = {} - updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(strings_list, None, -1, cache) + seq_ind_to_cache_ind = {} + updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(strings_list, + None, + -1, + cache, + seq_ind_to_cache_ind) assert -1 in cache.keys() assert -2 == cache_num assert len(updated_string_list) == 2 - updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(updated_string_list, None, cache_num, cache) + updated_string_list, cache_num, cache, seq_intro = simplify_internal_first_one_round(updated_string_list, + None, + cache_num, + cache, + seq_ind_to_cache_ind) - breakpoint() assert -2 in cache.keys() assert -3 == cache_num assert len(updated_string_list) == 3 - assert np.allclose(seq_intro, [-3]) \ No newline at end of file + assert np.allclose(seq_intro, [-2]) \ No newline at end of file From a23360209f063f6f5441695532c2177e468d11d2 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 1 Aug 2025 17:33:38 -0700 Subject: [PATCH 114/141] Caching so that you only need to compute probs if something will change the result when working with dprobs. 
--- pygsti/layouts/evaltree.py | 242 +++++++----------- .../modelmembers/operations/dyadickronop.py | 115 +++++++++ .../test_forwardsim_on_implicitop_model.py | 32 ++- 3 files changed, 236 insertions(+), 153 deletions(-) create mode 100644 pygsti/modelmembers/operations/dyadickronop.py diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 484b34452..66b926ac6 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -30,6 +30,8 @@ simplify_internal_first_one_round ) +from pygsti.modelmembers.operations.dyadickronop import KronStructured + from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits import time import scipy.linalg as la @@ -547,7 +549,7 @@ def setup_circuit_list_for_LCS_computations( def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: """ - Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_i+1, qu_i). + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). """ if op.num_qubits == 2: # We may need to do swaps. @@ -556,14 +558,19 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # type: ignore # This is in the wrong order. - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - op_term = swap_dense @ (op_term) @ swap_dense + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() + op_term = swap_dense @ (op_term) @ swap_dense.T saved[op] = op_term # Save so we only need to this operation once. 
else: - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() return op_term - return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() +def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray): + """ + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). + """ + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) def matrix_matrix_cost_estimate(matrix_size: tuple[int, int]) -> int: """ @@ -676,18 +683,30 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) for key in self.cache: self.compute_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) - alphabet_val_to_cache_inds_to_update: dict[LabelTupTup, set[int]] = {} + alphabet_val_to_cache_inds_to_update: dict[LabelTup, set[int]] = {} for cache_ind, vals in self.cache_ind_to_alphabet_vals_referenced.items(): for val in vals: - if val in alphabet_val_to_cache_inds_to_update: - alphabet_val_to_cache_inds_to_update[val].add(cache_ind) + if isinstance(val, LabelTupTup): + for ind_gate in val: + if ind_gate in alphabet_val_to_cache_inds_to_update: + alphabet_val_to_cache_inds_to_update[ind_gate].add(cache_ind) + else: + alphabet_val_to_cache_inds_to_update[ind_gate] = set([cache_ind]) else: - alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) + if val in alphabet_val_to_cache_inds_to_update: + alphabet_val_to_cache_inds_to_update[val].add(cache_ind) + else: + alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) self.results: dict[int | LabelTupTup, _np.ndarray] = {} - self.alphabet_val_to_sorted_cache_inds: dict[LabelTupTup, 
list[int]] = {} + self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {} + + self.gpindex_to_cache_vals: dict[int, list[int]] = {} + # This will be filled later by _gpindex_to_cache_inds_needed_to_recompute when we have access to the model. + # Warning that changing the model paramvec will result in this cache becoming invalidated. + # The user is currently in charge of resetting this cache. for val, cache_inds in alphabet_val_to_cache_inds_to_update.items(): rnd_nums = {} @@ -702,6 +721,26 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.alphabet_val_to_sorted_cache_inds[val] = sorted_inds + def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: int) -> list[int]: + """ + Given that I change the representation of a gate by modifying this index, gp_index_changing, + what cache indices do I need to recompute and in what order. + """ + if gp_index_changing in self.gpindex_to_cache_vals: + return self.gpindex_to_cache_vals[gp_index_changing] + + cache_inds = [] + for lbl in self.alphabet_val_to_sorted_cache_inds.keys(): + my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) + if gp_index_changing in my_op.gpindices_as_array(): + cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] + my_arr = my_op.gpindices_as_array() + for ind in my_arr: # Save off all the values we know about. + self.gpindex_to_cache_vals[ind] = cache_inds + return cache_inds + return cache_inds + + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ Construct a tree from another tree. 
@@ -743,35 +782,41 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q updated[new_cir] = loc self.circuit_to_save_location = updated - def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, alphabet_piece_changing: Optional[LabelTupTup] = None): + def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, gp_index_changing: Optional[int] = None): """ Compute the total product cache. Note that this may still have a tensor product structure that the operator needs to combine again if they want to have the full 'dense' matrix. + + If gp_index_changing is not None then we have already computed the results once and we only need to update + those terms which depend on the specific gp_index. """ - if alphabet_piece_changing is not None: + if gp_index_changing is not None: - if alphabet_piece_changing not in self.alphabet_val_to_sorted_cache_inds: - # Nothing needs to change here. - return self.results, self.circuit_to_save_location - - cache_inds = self.alphabet_val_to_sorted_cache_inds[alphabet_piece_changing] + # Dig through the tree to see if we have a matching - local_changes = {k: v.copy() for k, v in self.results.items()} - for cache_ind in cache_inds: - cumulative_term = None - for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, num_qubits_in_default) - - # Do not overwrite the results cache - # so that we can use it again on a different derivative. - if cumulative_term is None: - local_changes[cache_ind] = _np.eye(4**num_qubits_in_default) - # NOTE: unclear when (if ever) this should be a noisy idle gate. - else: - local_changes[cache_ind] = cumulative_term - return local_changes, self.circuit_to_save_location + cache_inds = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + if cache_inds: + # Invalidate all gate labels that we saved just in case. 
+ # Invalidate every index in the which we know to be influenced by my_op. + local_changes = {k: v.copy() for k, v in self.results.items() \ + if (k not in cache_inds and not isinstance(k, LabelTupTup))} + + for cache_ind in cache_inds: + cumulative_term = None + for term in self.cache[cache_ind]: + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, num_qubits_in_default) + # Save locally. + if cumulative_term is None: + local_changes[cache_ind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. + else: + local_changes[cache_ind] = cumulative_term + return local_changes, self.circuit_to_save_location + + return self.results, self.circuit_to_save_location else: round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] @@ -976,19 +1021,33 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.saved_results = {} self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} - def collapse_circuits_to_process_matrices(self, model, alphabet_piece_changing: Optional[LabelTupTup] = None): + def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: + + for key in self.trees: + inds = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + if len(inds) > 0: + return True + return False + + + def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Optional[int] = None): """ Collapse all circuits to their process matrices. If alphabet_piece_changing is not None, then we assume we have already collapsed this system once before and so only need to update part of the eval tree. """ # Just collapse all of them. + if gp_index_changing is not None: + # We may not need to check all of the lanes. 
+ pass - self.saved_results = {} + else: + self.saved_results = {} + for key in self.trees: - num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. + num_qubits = len(key) tree = self.trees[key] - out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, alphabet_piece_changing) + out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, gp_index_changing) # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) self.saved_results[key] = out1 self.sub_cir_to_ind_in_results[key] = out2 @@ -1146,119 +1205,6 @@ def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[ -class RealLinOp: - - # Function implementations below are merely defaults. - # Don't hesitate to override them if need be. - - __array_priority__ = 100 - - @property - def ndim(self): - return 2 - - @property - def size(self): - return self._size - - @property - def shape(self): - return self._shape - - @property - def dtype(self): - return self._dtype - - @property - def T(self): - return self._adjoint - - def item(self): - # If self.size == 1, return a scalar representation of this linear operator. - # Otherwise, error. 
- raise NotImplementedError() - - def __matmul__(self, other): - return self._linop @ other - - def __rmatmul__(self, other): - return other @ self._linop - - -def is_2d_square(arg): - if not hasattr(arg, 'shape'): - return False - if len(arg.shape) != 2: - return False - return arg.shape[0] == arg.shape[1] - - -class DyadicKronStructed(RealLinOp): - - def __init__(self, A, B, adjoint=None): - assert A.ndim == 2 - assert B.ndim == 2 - self.A = A - self.B = B - self._A_is_trivial = A.size == 1 - self._B_is_trivial = B.size == 1 - self._shape = ( A.shape[0]*B.shape[0], A.shape[1]*B.shape[1] ) - self._size = self.shape[0] * self.shape[1] - self._fwd_matvec_core_shape = (B.shape[1], A.shape[1]) - self._adj_matvec_core_shape = (B.shape[0], A.shape[0]) - self._dtype = A.dtype - self._linop = sparla.LinearOperator(dtype=self.dtype, shape=self.shape, matvec=self.matvec, rmatvec=self.rmatvec) - self._adjoint = DyadicKronStructed(A.T, B.T, adjoint=self) if adjoint is None else adjoint - - def item(self): - # This will raise a ValueError if self.size > 1. 
- return self.A.item() * self.B.item() - - def matvec(self, other): - inshape = other.shape - assert other.size == self.shape[1] - if self._A_is_trivial: - return self.A.item() * (self.B @ other) - if self._B_is_trivial: - return self.B.item() * (self.A @ other) - out = self.B @ _np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T - out = _np.reshape(out, inshape, order='F') - return out - - def rmatvec(self, other): - inshape = other.shape - assert other.size == self.shape[0] - if self._A_is_trivial: - return self.A.item() * (self.B.T @ other) - if self._B_is_trivial: - return self.B.item() * (self.A.T @ other) - out = self.B.T @ _np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A - out = _np.reshape(out, inshape, order='F') - return out - - @staticmethod - def build_polyadic(kron_operands): - if len(kron_operands) == 2: - out = DyadicKronStructed(kron_operands[0], kron_operands[1]) - return out - # else, recurse - arg = DyadicKronStructed.build_polyadic(kron_operands[1:]) - out = DyadicKronStructed(kron_operands[0], arg) - return out - - -class KronStructured(RealLinOp): - - def __init__(self, kron_operands): - self.kron_operands = kron_operands - assert all([op.ndim == 2 for op in kron_operands]) - self.shapes = _np.array([op.shape for op in kron_operands]) - self._shape = tuple(int(i) for i in _np.prod(self.shapes, axis=0)) - forward = DyadicKronStructed.build_polyadic(self.kron_operands) - self._linop = forward._linop - self._adjoint = forward.T - self._dtype = self.kron_operands[0].dtype - def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int): diff --git a/pygsti/modelmembers/operations/dyadickronop.py b/pygsti/modelmembers/operations/dyadickronop.py new file mode 100644 index 000000000..b5798a47c --- /dev/null +++ b/pygsti/modelmembers/operations/dyadickronop.py @@ -0,0 +1,115 @@ +import numpy as np +import scipy.sparse.linalg as sparla + +class RealLinOp: + + # Function 
implementations below are merely defaults. + # Don't hesitate to override them if need be. + + __array_priority__ = 100 + + @property + def ndim(self): + return 2 + + @property + def size(self): + return self._size + + @property + def shape(self): + return self._shape + + @property + def dtype(self): + return self._dtype + + @property + def T(self): + return self._adjoint + + def item(self): + # If self.size == 1, return a scalar representation of this linear operator. + # Otherwise, error. + raise NotImplementedError() + + def __matmul__(self, other): + return self._linop @ other + + def __rmatmul__(self, other): + return other @ self._linop + + +def is_2d_square(arg): + if not hasattr(arg, 'shape'): + return False + if len(arg.shape) != 2: + return False + return arg.shape[0] == arg.shape[1] + + +class DyadicKronStructed(RealLinOp): + + def __init__(self, A, B, adjoint=None): + assert A.ndim == 2 + assert B.ndim == 2 + self.A = A + self.B = B + self._A_is_trivial = A.size == 1 + self._B_is_trivial = B.size == 1 + self._shape = ( A.shape[0]*B.shape[0], A.shape[1]*B.shape[1] ) + self._size = self.shape[0] * self.shape[1] + self._fwd_matvec_core_shape = (B.shape[1], A.shape[1]) + self._adj_matvec_core_shape = (B.shape[0], A.shape[0]) + self._dtype = A.dtype + self._linop = sparla.LinearOperator(dtype=self.dtype, shape=self.shape, matvec=self.matvec, rmatvec=self.rmatvec) + self._adjoint = DyadicKronStructed(A.T, B.T, adjoint=self) if adjoint is None else adjoint + + def item(self): + # This will raise a ValueError if self.size > 1. 
+ return self.A.item() * self.B.item() + + def matvec(self, other): + inshape = other.shape + assert other.size == self.shape[1] + if self._A_is_trivial: + return self.A.item() * (self.B @ other) + if self._B_is_trivial: + return self.B.item() * (self.A @ other) + out = self.B @ np.reshape(other, self._fwd_matvec_core_shape, order='F') @ self.A.T + out = np.reshape(out, inshape, order='F') + return out + + def rmatvec(self, other): + inshape = other.shape + assert other.size == self.shape[0] + if self._A_is_trivial: + return self.A.item() * (self.B.T @ other) + if self._B_is_trivial: + return self.B.item() * (self.A.T @ other) + out = self.B.T @ np.reshape(other, self._adj_matvec_core_shape, order='F') @ self.A + out = np.reshape(out, inshape, order='F') + return out + + @staticmethod + def build_polyadic(kron_operands): + if len(kron_operands) == 2: + out = DyadicKronStructed(kron_operands[0], kron_operands[1]) + return out + # else, recurse + arg = DyadicKronStructed.build_polyadic(kron_operands[1:]) + out = DyadicKronStructed(kron_operands[0], arg) + return out + + +class KronStructured(RealLinOp): + + def __init__(self, kron_operands): + self.kron_operands = kron_operands + assert all([op.ndim == 2 for op in kron_operands]) + self.shapes = np.array([op.shape for op in kron_operands]) + self._shape = tuple(int(i) for i in np.prod(self.shapes, axis=0)) + forward = DyadicKronStructed.build_polyadic(self.kron_operands) + self._linop = forward._linop + self._adjoint = forward.T + self._dtype = self.kron_operands[0].dtype \ No newline at end of file diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index 57ba7ea6b..161d6fc08 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -1,4 +1,6 @@ import numpy as np +from tqdm import tqdm + from pygsti.baseobjs import qubitgraph as _qgraph from pygsti.baseobjs 
import QubitSpace @@ -15,7 +17,7 @@ from pygsti.modelmembers import operations as op from pygsti.baseobjs import UnitaryGateFunction from pygsti.forwardsims.matrixforwardsim import LCSEvalTreeMatrixForwardSimulator -from pygsti.forwardsims import MapForwardSimulator +from pygsti.forwardsims import MapForwardSimulator, MatrixForwardSimulator def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: Circuit): @@ -450,13 +452,13 @@ def test_tensor_product_single_unitaries_random_collection_of_xyz_dprobs(): def test_tensor_product_gates_with_implicit_idles_dprobs(): - num_qubits = 5 + num_qubits = 2 under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) gatenames = ["Gxpi2", "Gypi2", "Gzpi2", "Gi"] - for gate in gatenames: - for i in range(num_qubits): + for gate in tqdm(gatenames, "Gate: "): + for i in tqdm(range(num_qubits), "Qubit Location: "): cir = Circuit([[(gate, i)]], num_lines=num_qubits) probs = under_test.sim.dprobs(cir) @@ -467,6 +469,7 @@ def test_tensor_product_gates_with_implicit_idles_dprobs(): # gatenames = ["Gecr", "Gcnot"] gatenames = ["Gecr"] + gatenames = [] for gate in gatenames: for i in range(num_qubits - 1): cir = Circuit([[(gate, i, i + 1)]], num_lines=num_qubits) @@ -503,5 +506,24 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): assert_probability_densities_are_equal(probs, exp, circuit) - +# test_tensor_product_gates_with_implicit_idles_dprobs() #endregion Derivative of Probabilities consistencies. 
+ + + +def test_dprobs_matrices_are_close(): + + num_qubits = 3 + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, + simplify_for_dprobs=True) + + cir = Circuit([[("Gxpi2", 1)]], num_lines=num_qubits) + + expected_model.sim = MatrixForwardSimulator() + + expected_dproduct = expected_model.sim.bulk_dproduct([cir]) + actual_dproduct = under_test.sim.bulk_dproduct([cir]) + + assert np.allclose(actual_dproduct, expected_dproduct) + +test_dprobs_matrices_are_close() \ No newline at end of file From ec00225980465c13cd0617ead94aa68224b8ed22 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 1 Aug 2025 19:30:21 -0700 Subject: [PATCH 115/141] Cache collapse is missing something. --- pygsti/forwardsims/matrixforwardsim.py | 528 ++++++++++++++++-- pygsti/layouts/evaltree.py | 13 +- pygsti/models/localnoisemodel.py | 28 - .../test_forwardsim_on_implicitop_model.py | 11 +- 4 files changed, 498 insertions(+), 82 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 2c436a868..f02ac03b8 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -15,6 +15,7 @@ import time as _time import warnings as _warnings +from tqdm import tqdm import numpy as _np import numpy.linalg as _nla @@ -2205,7 +2206,7 @@ def bulk_product(self, circuits, scale=False, resource_alloc=None): full_tree = CollectionOfLCSEvalTrees(my_data[2], my_data[1], my_data[0]) - full_tree.collapse_circuits_to_process_matrices(self.model) + full_tree.collapse_circuits_to_process_matrices(self.model, None) Gs = full_tree.reconstruct_full_matrices() return Gs @@ -2216,7 +2217,7 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto dim = self.model.evotype.minimal_dim(self.model.state_space) # resource_alloc.check_can_allocate_memory(len(layout_atom.tree.cache) * dim**2) # prod cache - layout_atom.tree.collapse_circuits_to_process_matrices(self.model) 
+ layout_atom.tree.collapse_circuits_to_process_matrices(self.model, None) Gs = layout_atom.tree.reconstruct_full_matrices() @@ -2250,12 +2251,43 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): for i in range(len(gs)): out[i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) out = out.reshape((len(gs)*len(e)), order="C") - print(out) return out return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. return super()._probs_from_rho_e(rho, e, gs, scale_vals) + def _layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(self, + layout_atom: _MatrixCOPALayoutAtomWithLCS): + + """ + Assumes one state prep and many measurements. + + We assume that there will be only one set of tree indices used throughout. + Also, we assume that we can find a slice + """ + + sp_val = None + povm_vals: list[Label] = [] + elm_inds: list[slice] = [] # This will get collapsed since we are assuming that each povm appears once. + tree_inds: list[slice] = [] # I am assuming that this will + for spam_tuple, (element_indices, tree_indices) in layout_atom.indices_by_spamtuple.items(): + if spam_tuple[0] != sp_val and sp_val is not None: + raise ValueError("More than one state prep is being used.") + else: + sp_val = spam_tuple[0] + + + povm_vals.append(spam_tuple[1]) + elm_inds.append(element_indices) + tree_inds.append(tree_indices) + + sp_val = self.model.circuit_layer_operator(sp_val, "prep") + + povm_vals = [self.model.circuit_layer_operator(elabel, 'povm') for elabel in povm_vals] + + tree_inds = _np.unique(tree_inds)[0] + return sp_val, povm_vals, elm_inds, tree_inds + def _rho_es_elm_inds_and_tree_inds_from_spam_tuples(self, layout_atom: _MatrixCOPALayoutAtomWithLCS) -> tuple[_np.ndarray, _np.ndarray]: """ @@ -2285,7 +2317,7 @@ def _rho_es_elm_inds_and_tree_inds_from_spam_tuples(self, tree_inds = _np.unique(tree_inds)[0] return sp_val, povm_vals, elm_inds, tree_inds - def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: 
_MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + def _bulk_fill_dprobs_atom_saved(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): eps = 1e-7 # hardcoded? @@ -2303,51 +2335,459 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ probs = _np.empty(layout_atom.num_elements, 'd') self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + original_Gs = layout_atom.tree.reconstruct_full_matrices() + probs2 = _np.empty(layout_atom.num_elements, 'd') orig_vec = self.model.to_vector().copy() + + sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) - # if len(param_indices)>0: - # probs2[:] = probs[:] # Could recompute only some of the tree. - # first_param_idx = param_indices[0] - # iFinal = iParamToFinal[first_param_idx] - # self.model.set_parameter_value(first_param_idx, orig_vec[first_param_idx]+eps) - # self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - # array_to_fill[:, iFinal] = (probs2 - probs) / eps + orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. + dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) + if len(rho_gpindices)>0: + probs2[:] = probs[:] # Could recompute only some of the tree. + + iFinal = iParamToFinal[rho_gpindices[0]] + self.model.set_parameter_value(rho_gpindices[0], orig_vec[rho_gpindices[0]]+eps) + + dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] + probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) - for i in range(len(param_indices)): - # probs2[:] = probs[:] # Could recompute only some of the tree. 
- - iFinal = iParamToFinal[param_indices[i]] - # self.model.set_parameter_values([param_indices[i-1], param_indices[i]], - # [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) - vec = orig_vec.copy() - vec[param_indices[i]] += eps - self.model.from_vector(vec) - # vec = self.model.to_vector() - # assert _np.allclose(_np.where(vec != 0), [i]) - self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) array_to_fill[:, iFinal] = (probs2 - probs) / eps - print(iFinal) + + + for i in range(1, len(rho_gpindices)): + iFinal = iParamToFinal[rho_gpindices[i]] + self.model.set_parameter_values([rho_gpindices[i-1], rho_gpindices[i]], + [orig_vec[rho_gpindices[i-1]], orig_vec[rho_gpindices[i]]+eps]) + + dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] + probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + self.model.from_vector(orig_vec) # Reset in case there were rho_gpindices. + + _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) + if len(povm_gpindices) >0: + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[povm_gpindices[0]] + self.model.set_parameter_value(povm_gpindices[0], orig_vec[povm_gpindices[0]]+eps) + + # We are only varying the POVMs. + effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + for i in range(1, len(povm_gpindices)): + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[povm_gpindices[0]] + self.model.set_parameter_values([povm_gpindices[i-1], povm_gpindices[i]], + [orig_vec[povm_gpindices[i-1]], orig_vec[povm_gpindices[i]]+eps]) + + # We are only varying the POVMs. 
+ effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + self.model.from_vector(orig_vec) # Reset in case there were povm_gpindices. + + remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) + + print(remaining_param_inds) + + if len(remaining_param_inds) > 0: + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[remaining_param_inds[0]] + recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, remaining_param_inds[0]) + recompute_if_change_i = True + if recompute_if_change_i: + + self.model.set_parameter_value(remaining_param_inds[0], orig_vec[remaining_param_inds[0]]+eps) + + # The representation of a gate has changed so we need to recompute some of the sequences. + layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[0]) + Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + else: + array_to_fill[:, iFinal] = 0 # Derivative must be zero in this direction. + + for ind in tqdm(range(1, len(remaining_param_inds)), "Dx: "): + probs2[:] = probs[:] # Could recompute only some of the tree. + recompute_if_change_i_minus_1 = recompute_if_change_i + # recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, remaining_param_inds[ind]) + + inds_to_reset = [] + if remaining_param_inds[ind] == 7: + breakpoint() + vals = [] + if recompute_if_change_i and recompute_if_change_i_minus_1: + # We need to modify both. 
+ inds_to_reset = [remaining_param_inds[ind-1], remaining_param_inds[ind]] + vals = [orig_vec[remaining_param_inds[ind-1]], orig_vec[remaining_param_inds[ind]] + eps] + elif recompute_if_change_i: + inds_to_reset = [remaining_param_inds[ind]] + vals = [orig_vec[remaining_param_inds[ind]] + eps] + elif recompute_if_change_i_minus_1: + # only reset. + inds_to_reset = [remaining_param_inds[ind-1]] + vals = [orig_vec[remaining_param_inds[ind-1]]] + + if inds_to_reset: + self.model.set_parameter_values(inds_to_reset, vals) + + # The representation of a gate has changed so we need to recompute some of the sequences. + layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[ind]) + Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + else: + # Derivative must be zero in this direction. + array_to_fill[:, iFinal] = 0 + + self.model.from_vector(orig_vec) # reset to the original model. 
+ # array_to_fill = array_to_fill / eps # Divide once + + + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + CALL_BASIC = False + CALL_BASIC_SET_PARAM_VALS = False + CALL_SPLIT_OFF_POVM_AND_SPAM_REDO_ALL_OF_TREE = False + CALL_SPLIT_OFF_POVM_SPAM_CACHE_TREE = True + + if CALL_BASIC: + return self._bulk_fill_dprobs_atom_using_from_vector(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) + elif CALL_BASIC_SET_PARAM_VALS: + return self._bulk_fill_dprobs_atom_using_set_param_vals_no_cache(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) + elif CALL_SPLIT_OFF_POVM_AND_SPAM_REDO_ALL_OF_TREE: + return self._bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) + elif CALL_SPLIT_OFF_POVM_SPAM_CACHE_TREE: + return self._bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) + + eps = 1e-7 # hardcoded? + avoiding_repeated_dividing_eps = 1 / eps + if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + + original_Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + + if len(param_indices) > 0: + probs2[:] = probs[:] # Could recompute only some of the tree. 
+ iFinal = iParamToFinal[param_indices[0]] + recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, param_indices[0]) + recompute_if_change_i = True + if recompute_if_change_i: + + self.model.set_parameter_value(param_indices[0], orig_vec[param_indices[0]]+eps) + + # The representation of a gate has changed so we need to recompute some of the sequences. + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + else: + array_to_fill[:, iFinal] = 0 # Derivative must be zero in this direction. + + for ind in tqdm(range(1, len(param_indices)), "Dx: "): + probs2[:] = probs[:] # Could recompute only some of the tree. + recompute_if_change_i_minus_1 = recompute_if_change_i + # recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, param_indices[ind]) + + inds_to_reset = [] + if param_indices[ind] == 7: + breakpoint() + vals = [] + if recompute_if_change_i and recompute_if_change_i_minus_1: + # We need to modify both. + inds_to_reset = [param_indices[ind-1], param_indices[ind]] + vals = [orig_vec[param_indices[ind-1]], orig_vec[param_indices[ind]] + eps] + elif recompute_if_change_i: + inds_to_reset = [param_indices[ind]] + vals = [orig_vec[param_indices[ind]] + eps] + elif recompute_if_change_i_minus_1: + # only reset. + inds_to_reset = [param_indices[ind-1]] + vals = [orig_vec[param_indices[ind-1]]] + + if inds_to_reset: + self.model.set_parameter_values(inds_to_reset, vals) + + # The representation of a gate has changed so we need to recompute some of the sequences. + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + else: + # Derivative must be zero in this direction. + array_to_fill[:, iFinal] = 0 + + self.model.from_vector(orig_vec) # reset to the original model. 
# array_to_fill = array_to_fill / eps # Divide once def create_layout(self, circuits : Sequence[_Circuit] | _CircuitList, dataset=None, resource_alloc=None, array_types=('E', ), derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): - # replace implicit idles. - # from pygsti.layouts.evaltree import _add_in_idle_gates_to_circuit - # model_idle_key = Label(()) # not true in general. - sanitized_circuits = [] - for i, c in enumerate(circuits): - if len(c) > 0: - # # Attempt 1: Broken - # c = c.copy(True) - # c.replace_gatename_inplace([], model_idle_key) - # c.replace_gatename_inplace(Label(()), model_idle_key) - # c.done_editing() - # # Attempt 2: Broken - # c = _add_in_idle_gates_to_circuit(c, model_idle_key) - # TODO: try yet another thing. - # IDEA: define a function in the ExplicitModel class that parses a circuit - # and returns one with suitably substituted explicit idles. This seems - # like something that only a model can be expected to resolve. - pass - sanitized_circuits.append(c) - return super().create_layout(sanitized_circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False) + return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False) + + + + def _bulk_fill_dprobs_atom_using_from_vector(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + + eps = 1e-7 # hardcoded? 
+ avoiding_repeated_dividing_eps = 1 / eps + if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + for i in range(len(param_indices)): + + new_vec = orig_vec.copy() + new_vec[param_indices[i]] += eps + self.model.from_vector(new_vec) + + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + + array_to_fill[:, iParamToFinal[param_indices[i]]] = (probs2 - probs) / eps + + self.model.from_vector(orig_vec) + + def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + # THIS METHOD FAILS TO PRODUCE THE CORRECT RESULTS! THIS IS BECAUSE THE SET_PARAMETER_VALUES call still does not work. + + eps = 1e-7 # hardcoded? 
+ avoiding_repeated_dividing_eps = 1 / eps + if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + if len(param_indices) > 0: + + first_ind = param_indices[0] + new_vec = orig_vec.copy() + new_vec[first_ind] += eps + + self.model.set_parameter_value(first_ind, orig_vec[first_ind] + eps) + + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + + array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - probs) / eps + + for i in range(1, len(param_indices)): + self.model.set_parameter_values([param_indices[i - 1], param_indices[i]], + [orig_vec[i-1], orig_vec[i] + eps]) + new_vec = self.model.to_vector() + assert new_vec[i] == (orig_vec[i] + eps) + assert _np.allclose(new_vec[:i], orig_vec[:i] ) + assert _np.allclose(new_vec[i+1:], orig_vec[i+1:]) + + self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) + + array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - probs) / eps + + + self.model.from_vector(orig_vec) + + def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + eps = 1e-7 # hardcoded? 
+ avoiding_repeated_dividing_eps = 1 / eps + if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + + original_Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) + + orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. + dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) + for i in range(len(rho_gpindices)): + probs2[:] = probs[:] # Could recompute only some of the tree. + + iFinal = iParamToFinal[rho_gpindices[i]] + + new_vec = orig_vec.copy() + new_vec[rho_gpindices[i]] += eps + self.model.from_vector(new_vec) + + dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] + probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + + _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) + for i in range(len(povm_gpindices)): + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[povm_gpindices[i]] + new_vec = orig_vec.copy() + new_vec[povm_gpindices[i]] += eps + self.model.from_vector(new_vec) + + # We are only varying the POVMs. 
+ effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + + remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) + + + for i in range(len(remaining_param_inds)): + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[remaining_param_inds[i]] + + new_vec = orig_vec.copy() + new_vec[remaining_param_inds[i]] += eps + self.model.from_vector(new_vec) + + layout_atom.tree.collapse_circuits_to_process_matrices(self.model, None) + Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + self.model.from_vector(orig_vec) # reset to the original model. + + def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + + eps = 1e-7 # hardcoded? 
+ avoiding_repeated_dividing_eps = 1 / eps + if param_slice is None: + param_slice = slice(0, self.model.num_params) + param_indices = _slct.to_array(param_slice) + + if dest_param_slice is None: + dest_param_slice = slice(0, len(param_indices)) + dest_param_indices = _slct.to_array(dest_param_slice) + + iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + + probs = _np.empty(layout_atom.num_elements, 'd') + self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + + original_Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = _np.empty(layout_atom.num_elements, 'd') + orig_vec = self.model.to_vector().copy() + + sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) + + orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. + dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) + for i in range(len(rho_gpindices)): + probs2[:] = probs[:] # Could recompute only some of the tree. + + iFinal = iParamToFinal[rho_gpindices[i]] + + new_vec = orig_vec.copy() + new_vec[rho_gpindices[i]] += eps + self.model.from_vector(new_vec) + + dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] + probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + + _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) + for i in range(len(povm_gpindices)): + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[povm_gpindices[i]] + new_vec = orig_vec.copy() + new_vec[povm_gpindices[i]] += eps + self.model.from_vector(new_vec) + + # We are only varying the POVMs. 
+ effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + + probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + + remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) + + + for i in range(len(remaining_param_inds)): + probs2[:] = probs[:] # Could recompute only some of the tree. + iFinal = iParamToFinal[remaining_param_inds[i]] + + new_vec = orig_vec.copy() + new_vec[remaining_param_inds[i]] += eps + self.model.from_vector(new_vec) + + layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[i]) + Gs = layout_atom.tree.reconstruct_full_matrices() + + probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) + + array_to_fill[:, iFinal] = (probs2 - probs) / eps + + self.model.from_vector(orig_vec) # reset to the original model. diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 66b926ac6..ad226a00f 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -558,13 +558,13 @@ def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup op_term = saved[op] elif op.qubits[1] < op.qubits[0]: # type: ignore # This is in the wrong order. - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) op_term = swap_dense @ (op_term) @ swap_dense.T saved[op] = op_term # Save so we only need to this operation once. 
else: - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) return op_term - return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op).to_dense() + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray): """ @@ -731,7 +731,11 @@ def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: i cache_inds = [] for lbl in self.alphabet_val_to_sorted_cache_inds.keys(): - my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) + # my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) + try: + my_op = model.circuit_layer_operator(lbl, "op") # Assumes that layers have the same gpindices as the gates themselves. + except KeyError: + return cache_inds if gp_index_changing in my_op.gpindices_as_array(): cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] my_arr = my_op.gpindices_as_array() @@ -819,6 +823,7 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in return self.results, self.circuit_to_save_location else: + self.results = {} # We are asking to reset all the calculations. round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] # saved: dict[int | LabelTupTup, _np.ndarray] = {} diff --git a/pygsti/models/localnoisemodel.py b/pygsti/models/localnoisemodel.py index 6b4ad778e..9be357b23 100644 --- a/pygsti/models/localnoisemodel.py +++ b/pygsti/models/localnoisemodel.py @@ -652,31 +652,3 @@ def get_dense_process_matrix_represention_for_gate(self, model: _ImplicitOpModel else: # Assume a perfect idle q-qubit gate. 
return _np.eye(4**len(lbl.qubits)) - - - - -class LocalNoiseModelWithEquivalentClassesForSingleQubits(LocalNoiseModel): - - def __init__(self, processor_spec, gatedict, prep_layers=None, povm_layers=None, evotype="default", - simulator="auto", on_construction_error='raise', - independent_gates=False, ensure_composed_gates=False, implicit_idle_mode="none", equiv_qubits_classes=None): - - super().__init__(processor_spec, gatedict, prep_layers, povm_layers, evotype, simulator, - on_construction_error, independent_gates, ensure_composed_gates, implicit_idle_mode) - - self.equiv_qubit_classes = equiv_qubit_classes - - for key in self.operation_blks: - for labels in self.operation_blks[key]: - qubit_used = labels.qubits - if len(qubits_used) == 1: - # We may be able to replace this. - new_qubit = self.equiv_qubit_classes[int(qubits_used[0])] - if new_qubit not in qubits_used: - # Need to replace. - new_label = labels[0] + (new_qubit,) - self.operation_blks[key][labels] = self.operation_blks[key][new_label] - # This assumes no circular updates. 
- - \ No newline at end of file diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index 161d6fc08..e64efe480 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -417,10 +417,11 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): num_qubits = 2 - under_test, expected_model = build_models_for_testing(num_qubits) + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) circuitNone = Circuit([], num_lines=num_qubits) - circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + single_layer = tuple([("Gxpi2", i) for i in range(num_qubits)]) + circuitX = Circuit([single_layer], num_lines=num_qubits) circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) @@ -428,7 +429,7 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle] for cir in circuits: probs = under_test.sim.dprobs(cir) - expected_model.sim.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_generic") + # expected_model.sim.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_generic") exp = expected_model.sim.dprobs(cir) @@ -511,7 +512,7 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): -def test_dprobs_matrices_are_close(): +def test_matrices_are_close(): num_qubits = 3 under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, @@ -525,5 +526,3 @@ def test_dprobs_matrices_are_close(): actual_dproduct = under_test.sim.bulk_dproduct([cir]) assert 
np.allclose(actual_dproduct, expected_dproduct) - -test_dprobs_matrices_are_close() \ No newline at end of file From 656bff371be2c6a8f8a730db3548164ead82dc7f Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 6 Aug 2025 13:53:37 -0700 Subject: [PATCH 116/141] Invalidate the cache for all labels and those numerical values which are present in the list of indices to recompute. --- pygsti/layouts/evaltree.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index ad226a00f..6c0f56417 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -805,7 +805,7 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in # Invalidate all gate labels that we saved just in case. # Invalidate every index in the which we know to be influenced by my_op. local_changes = {k: v.copy() for k, v in self.results.items() \ - if (k not in cache_inds and not isinstance(k, LabelTupTup))} + if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. for cache_ind in cache_inds: cumulative_term = None @@ -912,7 +912,9 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ if isinstance(term_to_extend_with, int): - assert term_to_extend_with in results_cache + if term_to_extend_with not in results_cache: + breakpoint() + assert term_to_extend_with in results_cache, f"Term {term_to_extend_with} not in cache: {results_cache.keys()}" return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) if term_to_extend_with in results_cache: return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) From 661aa376deace3342bc7b3ebdc0398ff60a2f6fb Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 7 Aug 2025 15:52:28 -0700 Subject: [PATCH 117/141] Caching for dprobs passes. 
--- pygsti/forwardsims/matrixforwardsim.py | 49 +++++++++++++++++-- pygsti/layouts/evaltree.py | 13 ++--- .../modelmembers/operations/dyadickronop.py | 13 ++++- 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index f02ac03b8..bc8a82d1d 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2210,6 +2210,18 @@ def bulk_product(self, circuits, scale=False, resource_alloc=None): Gs = full_tree.reconstruct_full_matrices() return Gs + + def bulk_dproduct(self, circuits, flat=False, return_prods=False, scale=False, resource_alloc=None, wrt_filter=None): + + layout = self.create_layout(circuits, resource_alloc=resource_alloc, array_types=('ep')) + + # I will assume that we are working with only one rank here. + + the_atom = layout.atoms[0] + + + + return super().bulk_dproduct(circuits, flat, return_prods, scale, resource_alloc, wrt_filter) def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, resource_alloc): @@ -2634,7 +2646,7 @@ def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, de self.model.from_vector(orig_vec) - def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc, actually_just_sequence_matrices: bool = False): eps = 1e-7 # hardcoded? avoiding_repeated_dividing_eps = 1 / eps @@ -2661,6 +2673,9 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. 
dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + if actually_just_sequence_matrices: + output = [] + _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) for i in range(len(rho_gpindices)): probs2[:] = probs[:] # Could recompute only some of the tree. @@ -2676,6 +2691,8 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(original_Gs) _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) for i in range(len(povm_gpindices)): @@ -2692,8 +2709,10 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(original_Gs) - remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) + remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices)))) for i in range(len(remaining_param_inds)): @@ -2711,9 +2730,16 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(Gs) + self.model.from_vector(orig_vec) # reset to the original model. 
- def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): + if actually_just_sequence_matrices: + return output + + + def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc, return_prob_blocks: bool = False, actually_just_sequence_matrices: bool = False): eps = 1e-7 # hardcoded? avoiding_repeated_dividing_eps = 1 / eps @@ -2739,6 +2765,8 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) + if actually_just_sequence_matrices: + output = [] _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) for i in range(len(rho_gpindices)): @@ -2755,6 +2783,8 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(original_Gs) _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) for i in range(len(povm_gpindices)): @@ -2771,8 +2801,10 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(original_Gs) - remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) + remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices)))) for i in range(len(remaining_param_inds)): @@ -2790,4 +2822,13 @@ def 
_bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f array_to_fill[:, iFinal] = (probs2 - probs) / eps + if actually_just_sequence_matrices: + output.append(Gs) + self.model.from_vector(orig_vec) # reset to the original model. + + # if not return_prob_blocks: + # array_to_fill = (array_to_fill - probs[:, None]) / eps + + if actually_just_sequence_matrices: + return output \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 6c0f56417..b1a5c151e 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -36,7 +36,7 @@ import time import scipy.linalg as la import scipy.sparse.linalg as sparla -from typing import List, Optional, Iterable +from typing import List, Optional, Iterable, Union, TYPE_CHECKING from pygsti.tools.tqdm import our_tqdm @@ -735,11 +735,12 @@ def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: i try: my_op = model.circuit_layer_operator(lbl, "op") # Assumes that layers have the same gpindices as the gates themselves. except KeyError: - return cache_inds - if gp_index_changing in my_op.gpindices_as_array(): + # Skip to the next lbl to check. Do not immediately return None! + continue + op_inds = my_op.gpindices_as_array() + if gp_index_changing in op_inds: cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] - my_arr = my_op.gpindices_as_array() - for ind in my_arr: # Save off all the values we know about. + for ind in op_inds: # Save off all the values we know about. 
self.gpindex_to_cache_vals[ind] = cache_inds return cache_inds return cache_inds @@ -1025,7 +1026,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.cir_id_to_tensor_order: dict[int, list[list[int], int]] = {} self.compute_tensor_orders() - self.saved_results = {} + self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {} self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: diff --git a/pygsti/modelmembers/operations/dyadickronop.py b/pygsti/modelmembers/operations/dyadickronop.py index b5798a47c..d10bb739b 100644 --- a/pygsti/modelmembers/operations/dyadickronop.py +++ b/pygsti/modelmembers/operations/dyadickronop.py @@ -112,4 +112,15 @@ def __init__(self, kron_operands): forward = DyadicKronStructed.build_polyadic(self.kron_operands) self._linop = forward._linop self._adjoint = forward.T - self._dtype = self.kron_operands[0].dtype \ No newline at end of file + self._dtype = self.kron_operands[0].dtype + + def to_full_array(self) -> np.ndarray: + """ + Return the full dense matrix. Do not use this method in a performance sensitive routine + as you will not be utilizing the structure of the matrix to its full + potential. This is mainly used as a debugging tool. + """ + output = 1 + for i in range(len(self.kron_operands)): + output = np.kron(self.kron_operands[i], output) + return output \ No newline at end of file From 2fa4ad59719c20ba05a595c21a065159cb3b1cd0 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Fri, 8 Aug 2025 17:02:41 -0700 Subject: [PATCH 118/141] Working on storing only the updates we need to compute when doing the one directional step. 
--- pygsti/forwardsims/matrixforwardsim.py | 421 ++++-------------- pygsti/layouts/evaltree.py | 119 +++-- .../test_forwardsim_on_implicitop_model.py | 41 +- 3 files changed, 191 insertions(+), 390 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index bc8a82d1d..3c5d5486a 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -40,7 +40,7 @@ from pygsti.tools.optools import unitary_to_superop from pygsti.baseobjs.label import LabelTup, LabelTupTup, Label -from typing import Sequence +from typing import Sequence, Optional _dummy_profiler = _DummyProfiler() @@ -2210,18 +2210,6 @@ def bulk_product(self, circuits, scale=False, resource_alloc=None): Gs = full_tree.reconstruct_full_matrices() return Gs - - def bulk_dproduct(self, circuits, flat=False, return_prods=False, scale=False, resource_alloc=None, wrt_filter=None): - - layout = self.create_layout(circuits, resource_alloc=resource_alloc, array_types=('ep')) - - # I will assume that we are working with only one rank here. 
- - the_atom = layout.atoms[0] - - - - return super().bulk_dproduct(circuits, flat, return_prods, scale, resource_alloc, wrt_filter) def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, resource_alloc): @@ -2231,7 +2219,7 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto layout_atom.tree.collapse_circuits_to_process_matrices(self.model, None) - Gs = layout_atom.tree.reconstruct_full_matrices() + Gs, all_circuits = layout_atom.tree.reconstruct_full_matrices() sp_val, povm_vals, element_indices, tree_indices = self._rho_es_elm_inds_and_tree_inds_from_spam_tuples(layout_atom) old_err = _np.seterr(over='ignore') @@ -2252,7 +2240,7 @@ def _bulk_fill_probs_atom(self, array_to_fill, layout_atom: _MatrixCOPALayoutAto array_to_fill[:] = probs _np.seterr(**old_err) - def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): + def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1, return_two_D: bool = False): """ Compute the probabilities from rho, a set of povms, the circuits defined by gs, and then scale appropriately. """ @@ -2262,7 +2250,9 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1): out = _np.zeros((len(gs), len(e))) for i in range(len(gs)): out[i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) - out = out.reshape((len(gs)*len(e)), order="C") + + if not return_two_D: + out = out.reshape((len(gs)*len(e)), order="C") return out return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. @@ -2328,169 +2318,10 @@ def _rho_es_elm_inds_and_tree_inds_from_spam_tuples(self, povm_vals = _np.vstack(povm_vals) tree_inds = _np.unique(tree_inds)[0] return sp_val, povm_vals, elm_inds, tree_inds - - def _bulk_fill_dprobs_atom_saved(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): - - - eps = 1e-7 # hardcoded? 
- avoiding_repeated_dividing_eps = 1 / eps - if param_slice is None: - param_slice = slice(0, self.model.num_params) - param_indices = _slct.to_array(param_slice) - - if dest_param_slice is None: - dest_param_slice = slice(0, len(param_indices)) - dest_param_indices = _slct.to_array(dest_param_slice) - - iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} - - probs = _np.empty(layout_atom.num_elements, 'd') - self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) - - original_Gs = layout_atom.tree.reconstruct_full_matrices() - - probs2 = _np.empty(layout_atom.num_elements, 'd') - orig_vec = self.model.to_vector().copy() - - sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) - - orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. - dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) - - _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) - if len(rho_gpindices)>0: - probs2[:] = probs[:] # Could recompute only some of the tree. - - iFinal = iParamToFinal[rho_gpindices[0]] - self.model.set_parameter_value(rho_gpindices[0], orig_vec[rho_gpindices[0]]+eps) - - dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] - probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - for i in range(1, len(rho_gpindices)): - iFinal = iParamToFinal[rho_gpindices[i]] - self.model.set_parameter_values([rho_gpindices[i-1], rho_gpindices[i]], - [orig_vec[rho_gpindices[i-1]], orig_vec[rho_gpindices[i]]+eps]) - - dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] - probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - self.model.from_vector(orig_vec) # Reset in case there were rho_gpindices. 
- - _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) - if len(povm_gpindices) >0: - probs2[:] = probs[:] # Could recompute only some of the tree. - iFinal = iParamToFinal[povm_gpindices[0]] - self.model.set_parameter_value(povm_gpindices[0], orig_vec[povm_gpindices[0]]+eps) - - # We are only varying the POVMs. - effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) - - probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - for i in range(1, len(povm_gpindices)): - probs2[:] = probs[:] # Could recompute only some of the tree. - iFinal = iParamToFinal[povm_gpindices[0]] - self.model.set_parameter_values([povm_gpindices[i-1], povm_gpindices[i]], - [orig_vec[povm_gpindices[i-1]], orig_vec[povm_gpindices[i]]+eps]) - - # We are only varying the POVMs. - effects = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) - - probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - self.model.from_vector(orig_vec) # Reset in case there were povm_gpindices. - - remaining_param_inds = list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices))) - - print(remaining_param_inds) - - if len(remaining_param_inds) > 0: - probs2[:] = probs[:] # Could recompute only some of the tree. 
- iFinal = iParamToFinal[remaining_param_inds[0]] - recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, remaining_param_inds[0]) - recompute_if_change_i = True - if recompute_if_change_i: - - self.model.set_parameter_value(remaining_param_inds[0], orig_vec[remaining_param_inds[0]]+eps) - - # The representation of a gate has changed so we need to recompute some of the sequences. - layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[0]) - Gs = layout_atom.tree.reconstruct_full_matrices() - - probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - else: - array_to_fill[:, iFinal] = 0 # Derivative must be zero in this direction. - - for ind in tqdm(range(1, len(remaining_param_inds)), "Dx: "): - probs2[:] = probs[:] # Could recompute only some of the tree. - recompute_if_change_i_minus_1 = recompute_if_change_i - # recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, remaining_param_inds[ind]) - - inds_to_reset = [] - if remaining_param_inds[ind] == 7: - breakpoint() - vals = [] - if recompute_if_change_i and recompute_if_change_i_minus_1: - # We need to modify both. - inds_to_reset = [remaining_param_inds[ind-1], remaining_param_inds[ind]] - vals = [orig_vec[remaining_param_inds[ind-1]], orig_vec[remaining_param_inds[ind]] + eps] - elif recompute_if_change_i: - inds_to_reset = [remaining_param_inds[ind]] - vals = [orig_vec[remaining_param_inds[ind]] + eps] - elif recompute_if_change_i_minus_1: - # only reset. - inds_to_reset = [remaining_param_inds[ind-1]] - vals = [orig_vec[remaining_param_inds[ind-1]]] - - if inds_to_reset: - self.model.set_parameter_values(inds_to_reset, vals) - - # The representation of a gate has changed so we need to recompute some of the sequences. 
- layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[ind]) - Gs = layout_atom.tree.reconstruct_full_matrices() - - probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - else: - # Derivative must be zero in this direction. - array_to_fill[:, iFinal] = 0 - - self.model.from_vector(orig_vec) # reset to the original model. - # array_to_fill = array_to_fill / eps # Divide once - - - def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): - - CALL_BASIC = False - CALL_BASIC_SET_PARAM_VALS = False - CALL_SPLIT_OFF_POVM_AND_SPAM_REDO_ALL_OF_TREE = False - CALL_SPLIT_OFF_POVM_SPAM_CACHE_TREE = True - - if CALL_BASIC: - return self._bulk_fill_dprobs_atom_using_from_vector(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) - elif CALL_BASIC_SET_PARAM_VALS: - return self._bulk_fill_dprobs_atom_using_set_param_vals_no_cache(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) - elif CALL_SPLIT_OFF_POVM_AND_SPAM_REDO_ALL_OF_TREE: - return self._bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) - elif CALL_SPLIT_OFF_POVM_SPAM_CACHE_TREE: - return self._bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(array_to_fill, dest_param_slice, layout_atom, param_slice, resource_alloc) - - eps = 1e-7 # hardcoded? 
- avoiding_repeated_dividing_eps = 1 / eps + def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice: Optional[slice], + layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice: Optional[slice], resource_alloc): + eps = 1e-7 if param_slice is None: param_slice = slice(0, self.model.num_params) param_indices = _slct.to_array(param_slice) @@ -2503,88 +2334,34 @@ def _bulk_fill_dprobs_atom(self, array_to_fill, dest_param_slice, layout_atom: _ probs = _np.empty(layout_atom.num_elements, 'd') self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) - - original_Gs = layout_atom.tree.reconstruct_full_matrices() - - probs2 = _np.empty(layout_atom.num_elements, 'd') orig_vec = self.model.to_vector().copy() - - - if len(param_indices) > 0: - probs2[:] = probs[:] # Could recompute only some of the tree. - iFinal = iParamToFinal[param_indices[0]] - recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, param_indices[0]) - recompute_if_change_i = True - if recompute_if_change_i: - - self.model.set_parameter_value(param_indices[0], orig_vec[param_indices[0]]+eps) - - # The representation of a gate has changed so we need to recompute some of the sequences. - self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - - array_to_fill[:, iFinal] = (probs2 - probs) / eps - else: - array_to_fill[:, iFinal] = 0 # Derivative must be zero in this direction. - - for ind in tqdm(range(1, len(param_indices)), "Dx: "): - probs2[:] = probs[:] # Could recompute only some of the tree. - recompute_if_change_i_minus_1 = recompute_if_change_i - # recompute_if_change_i = layout_atom.tree.do_I_need_to_recompute_portions_if_I_change_this_index(self.model, param_indices[ind]) - - inds_to_reset = [] - if param_indices[ind] == 7: - breakpoint() - vals = [] - if recompute_if_change_i and recompute_if_change_i_minus_1: - # We need to modify both. 
- inds_to_reset = [param_indices[ind-1], param_indices[ind]] - vals = [orig_vec[param_indices[ind-1]], orig_vec[param_indices[ind]] + eps] - elif recompute_if_change_i: - inds_to_reset = [param_indices[ind]] - vals = [orig_vec[param_indices[ind]] + eps] - elif recompute_if_change_i_minus_1: - # only reset. - inds_to_reset = [param_indices[ind-1]] - vals = [orig_vec[param_indices[ind-1]]] - - if inds_to_reset: - self.model.set_parameter_values(inds_to_reset, vals) - # The representation of a gate has changed so we need to recompute some of the sequences. - self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iFinal] = (probs2 - probs) / eps - - else: - # Derivative must be zero in this direction. - array_to_fill[:, iFinal] = 0 + # CALL_BASIC = False + # CALL_BASIC_SET_PARAM_VALS = False + # CALL_SPLIT_OFF_SPAM_REDO_ALL_OF_TREE = False + # CALL_SPLIT_OFF_SPAM_PARTIAL_CACHE_TREE = True - self.model.from_vector(orig_vec) # reset to the original model. - # array_to_fill = array_to_fill / eps # Divide once + # if CALL_BASIC: + # return self._bulk_fill_dprobs_atom_using_from_vector(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps) + # elif CALL_BASIC_SET_PARAM_VALS: + # return self._bulk_fill_dprobs_atom_using_set_param_vals_no_cache(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps) + # elif CALL_SPLIT_OFF_SPAM_REDO_ALL_OF_TREE: + # return self._bulk_fill_dprobs_atom_with_from_vector_split_SPAM_off(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps) + # elif CALL_SPLIT_OFF_SPAM_PARTIAL_CACHE_TREE: + return self._bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(array_to_fill, layout_atom, param_indices, resource_alloc, iParamToFinal, probs, orig_vec, eps) def create_layout(self, circuits : Sequence[_Circuit] | _CircuitList, dataset=None, resource_alloc=None, array_types=('E', ), 
derivative_dimensions=None, verbosity=0, layout_creation_circuit_cache=None): return super().create_layout(circuits, dataset, resource_alloc, array_types, derivative_dimensions, verbosity, layout_creation_circuit_cache, use_old_tree_style=False) - def _bulk_fill_dprobs_atom_using_from_vector(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): - - - eps = 1e-7 # hardcoded? - avoiding_repeated_dividing_eps = 1 / eps - if param_slice is None: - param_slice = slice(0, self.model.num_params) - param_indices = _slct.to_array(param_slice) - - if dest_param_slice is None: - dest_param_slice = slice(0, len(param_indices)) - dest_param_indices = _slct.to_array(dest_param_slice) - - iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} + def _bulk_fill_dprobs_atom_using_from_vector(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_indices, + resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray, orig_vec: _np.ndarray, eps: float = 1e-7): + """ + Specifically use the from_vector method to update the model before the finite difference scheme. 
+ """ - probs = _np.empty(layout_atom.num_elements, 'd') - self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) probs2 = _np.empty(layout_atom.num_elements, 'd') - orig_vec = self.model.to_vector().copy() for i in range(len(param_indices)): @@ -2594,31 +2371,20 @@ def _bulk_fill_dprobs_atom_using_from_vector(self, array_to_fill, dest_param_sli self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iParamToFinal[param_indices[i]]] = (probs2 - probs) / eps + array_to_fill[:, iParamToFinal[param_indices[i]]] = (probs2 - base_probs) / eps self.model.from_vector(orig_vec) - def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc): - + def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_indices, + resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray, orig_vec: _np.ndarray, eps: float = 1e-7): + """ + Specifically use the set_parameter_values method to update the model before the finite difference scheme which will recompute the whole LCS tree. + """ + # THIS METHOD FAILS TO PRODUCE THE CORRECT RESULTS! THIS IS BECAUSE THE SET_PARAMETER_VALUES call still does not work. - eps = 1e-7 # hardcoded? 
- avoiding_repeated_dividing_eps = 1 / eps - if param_slice is None: - param_slice = slice(0, self.model.num_params) - param_indices = _slct.to_array(param_slice) - - if dest_param_slice is None: - dest_param_slice = slice(0, len(param_indices)) - dest_param_indices = _slct.to_array(dest_param_slice) - - iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} - - probs = _np.empty(layout_atom.num_elements, 'd') - self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) probs2 = _np.empty(layout_atom.num_elements, 'd') - orig_vec = self.model.to_vector().copy() - + if len(param_indices) > 0: first_ind = param_indices[0] @@ -2629,7 +2395,7 @@ def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, de self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - probs) / eps + array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - base_probs) / eps for i in range(1, len(param_indices)): self.model.set_parameter_values([param_indices[i - 1], param_indices[i]], @@ -2641,45 +2407,30 @@ def _bulk_fill_dprobs_atom_using_set_param_vals_no_cache(self, array_to_fill, de self._bulk_fill_probs_atom(probs2, layout_atom, resource_alloc) - array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - probs) / eps - + array_to_fill[:, iParamToFinal[first_ind]] = (probs2 - base_probs) / eps self.model.from_vector(orig_vec) - def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc, actually_just_sequence_matrices: bool = False): - - eps = 1e-7 # hardcoded? 
- avoiding_repeated_dividing_eps = 1 / eps - if param_slice is None: - param_slice = slice(0, self.model.num_params) - param_indices = _slct.to_array(param_slice) - - if dest_param_slice is None: - dest_param_slice = slice(0, len(param_indices)) - dest_param_indices = _slct.to_array(dest_param_slice) - - iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} - - probs = _np.empty(layout_atom.num_elements, 'd') - self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + def _bulk_fill_dprobs_atom_with_from_vector_split_SPAM_off(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, + param_indices: _np.ndarray, resource_alloc, + iParamToFinal: dict[int, int], base_probs: _np.ndarray, + orig_vec: _np.ndarray, eps: float = 1e-7, + return_sequence_matrices: bool = False, recompute_whole_tree: bool = False): original_Gs = layout_atom.tree.reconstruct_full_matrices() probs2 = _np.empty(layout_atom.num_elements, 'd') - orig_vec = self.model.to_vector().copy() sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) - if actually_just_sequence_matrices: + if return_sequence_matrices: output = [] - _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) + _, rho_gpindices = self._process_wrt_filter(param_indices, sp_obj) for i in range(len(rho_gpindices)): - probs2[:] = probs[:] # Could recompute only some of the tree. 
- iFinal = iParamToFinal[rho_gpindices[i]] new_vec = orig_vec.copy() @@ -2689,14 +2440,13 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps - if actually_just_sequence_matrices: + if return_sequence_matrices: output.append(original_Gs) - _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) + _, povm_gpindices = self._process_wrt_filter(param_indices, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) for i in range(len(povm_gpindices)): - probs2[:] = probs[:] # Could recompute only some of the tree. iFinal = iParamToFinal[povm_gpindices[i]] new_vec = orig_vec.copy() new_vec[povm_gpindices[i]] += eps @@ -2707,16 +2457,14 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps - if actually_just_sequence_matrices: + if return_sequence_matrices: output.append(original_Gs) remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices)))) - for i in range(len(remaining_param_inds)): - probs2[:] = probs[:] # Could recompute only some of the tree. 
iFinal = iParamToFinal[remaining_param_inds[i]] new_vec = orig_vec.copy() @@ -2728,50 +2476,37 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_split_out_collapse_if_possible( probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps - if actually_just_sequence_matrices: + if return_sequence_matrices: output.append(Gs) self.model.from_vector(orig_vec) # reset to the original model. - if actually_just_sequence_matrices: + if return_sequence_matrices: return output - def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, dest_param_slice, layout_atom: _MatrixCOPALayoutAtomWithLCS, param_slice, resource_alloc, return_prob_blocks: bool = False, actually_just_sequence_matrices: bool = False): - - eps = 1e-7 # hardcoded? - avoiding_repeated_dividing_eps = 1 / eps - if param_slice is None: - param_slice = slice(0, self.model.num_params) - param_indices = _slct.to_array(param_slice) - - if dest_param_slice is None: - dest_param_slice = slice(0, len(param_indices)) - dest_param_indices = _slct.to_array(dest_param_slice) - - iParamToFinal = {i: dest_param_indices[ii] for ii, i in enumerate(param_indices)} - - probs = _np.empty(layout_atom.num_elements, 'd') - self._bulk_fill_probs_atom(probs, layout_atom, resource_alloc) + def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_fill, layout_atom: _MatrixCOPALayoutAtomWithLCS, + param_indices: _np.ndarray, resource_alloc, iParamToFinal: dict[int, int], base_probs: _np.ndarray, + orig_vec: _np.ndarray, eps: float = 1e-7, return_sequence_matrices: bool = False): + """ + Use from_vector to update the model. Then, recompute only the part of the tree which you need to recompute. 
+ """ - original_Gs = layout_atom.tree.reconstruct_full_matrices() + original_Gs, all_cirs = layout_atom.tree.reconstruct_full_matrices() probs2 = _np.empty(layout_atom.num_elements, 'd') - orig_vec = self.model.to_vector().copy() sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) - if actually_just_sequence_matrices: + if return_sequence_matrices: output = [] - _, rho_gpindices = self._process_wrt_filter(param_slice, sp_obj) + _, rho_gpindices = self._process_wrt_filter(param_indices, sp_obj) for i in range(len(rho_gpindices)): - probs2[:] = probs[:] # Could recompute only some of the tree. - iFinal = iParamToFinal[rho_gpindices[i]] new_vec = orig_vec.copy() @@ -2781,14 +2516,13 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f dense_sp = sp_obj.to_dense(on_space="minimal")[:, None] probs2 = self._probs_from_rho_e(dense_sp, dense_povms, original_Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps - if actually_just_sequence_matrices: + if return_sequence_matrices: output.append(original_Gs) - _, povm_gpindices = self._process_wrt_filter(param_slice, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) + _, povm_gpindices = self._process_wrt_filter(param_indices, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) for i in range(len(povm_gpindices)): - probs2[:] = probs[:] # Could recompute only some of the tree. 
iFinal = iParamToFinal[povm_gpindices[i]] new_vec = orig_vec.copy() new_vec[povm_gpindices[i]] += eps @@ -2799,16 +2533,17 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f probs2 = self._probs_from_rho_e(orig_dense_sp, effects, original_Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps - if actually_just_sequence_matrices: + if return_sequence_matrices: output.append(original_Gs) remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices)))) + tVals = _slct.indices(tree_indices) for i in range(len(remaining_param_inds)): - probs2[:] = probs[:] # Could recompute only some of the tree. + probs2[:] = base_probs[:] # Copy off the data iFinal = iParamToFinal[remaining_param_inds[i]] new_vec = orig_vec.copy() @@ -2816,19 +2551,25 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f self.model.from_vector(new_vec) layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[i]) - Gs = layout_atom.tree.reconstruct_full_matrices() + Gs, inds_to_update = layout_atom.tree.reconstruct_full_matrices(self.model, remaining_param_inds[i]) - probs2 = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs[tree_indices]) - array_to_fill[:, iFinal] = (probs2 - probs) / eps + probs2 = probs2.reshape((layout_atom.num_elements // len(tVals), len(tVals)), order="C") + probs2 = probs2.T + + tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True) + if len(inds_to_update) > 0: + breakpoint() + probs2[inds_to_update] = tmp + probs2 = probs2.T + probs2 = probs2.reshape(layout_atom.num_elements, order="C") - if actually_just_sequence_matrices: + array_to_fill[:, iFinal] = (probs2 - base_probs) / eps + + if return_sequence_matrices: output.append(Gs) self.model.from_vector(orig_vec) # reset to the original model. 
- # if not return_prob_blocks: - # array_to_fill = (array_to_fill - probs[:, None]) / eps - - if actually_just_sequence_matrices: + if return_sequence_matrices: return output \ No newline at end of file diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index b1a5c151e..80c8c415f 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -36,7 +36,7 @@ import time import scipy.linalg as la import scipy.sparse.linalg as sparla -from typing import List, Optional, Iterable, Union, TYPE_CHECKING +from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple from pygsti.tools.tqdm import our_tqdm @@ -739,12 +739,22 @@ def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: i continue op_inds = my_op.gpindices_as_array() if gp_index_changing in op_inds: - cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] + cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] + [lbl] # We also invalidate the lbl. for ind in op_inds: # Save off all the values we know about. self.gpindex_to_cache_vals[ind] = cache_inds return cache_inds return cache_inds + def _which_full_circuits_will_change_due_to_gpindex_changing(self, model, gp_index_changing: int) -> list[int]: + + cache_inds = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + if len(cache_inds) == 0: + return [] + + answer = [ind for ind in range(self.num_circuits) if ind in cache_inds] + return answer + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): """ @@ -805,13 +815,20 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in if cache_inds: # Invalidate all gate labels that we saved just in case. # Invalidate every index in the which we know to be influenced by my_op. 
- local_changes = {k: v.copy() for k, v in self.results.items() \ - if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. + # local_changes = {k: v for k, v in self.results.items() \ + # if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. + + # Iterating over all the cache will take too long. + # So we need to handle the invalidness of certain cache inds when we encounter them. + local_changes = {} - for cache_ind in cache_inds: + # Ignore the last index which is the Label that matched the gpindex. + # We assume that only one will match. + for cache_ind in cache_inds[:-1]: cumulative_term = None for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, num_qubits_in_default) + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, + num_qubits_in_default, cache_inds) # Save locally. 
if cumulative_term is None: @@ -842,11 +859,12 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in round_keys = pos_keys + neg_keys + _np.array([0]) + empty = [] for key in round_keys: for cache_ind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, num_qubits_in_default) + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, num_qubits_in_default, empty) if cumulative_term is None: self.results[cache_ind] = _np.eye(4**num_qubits_in_default) @@ -859,7 +877,6 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in assert key in self.results # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} - return self.results, self.circuit_to_save_location def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: @@ -893,18 +910,39 @@ def combine_for_visualization(self, val, visited): def handle_results_cache_lookup_and_product(self, cumulative_term: None | _np.ndarray, term_to_extend_with: int | LabelTupTup, - results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: + results_cache: dict[int | LabelTupTup, _np.ndarray], + globally_invalid_cache_inds: list[Union[int, Label]]) -> _np.ndarray: + if isinstance(term_to_extend_with, int): + if term_to_extend_with in globally_invalid_cache_inds[:-1]: + # look up the result in the local results cache. + # This is just for that derivative step. + if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + else: + if term_to_extend_with in globally_invalid_cache_inds[-1:]: + # Only one label gets invalidated and that is stored at the end of the list. 
+ + # look up the result in the local results cache. + # This is just for that derivative step. + if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + + # We should use the cache for all the probs calculation. if cumulative_term is None: # look up result. - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term + return self.results[term_to_extend_with] + return self.results[term_to_extend_with] @ cumulative_term def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, term_to_extend_with: int | LabelTupTup, - results_cache: dict[int | LabelTupTup, _np.ndarray], - num_qubits_in_default: int) -> _np.ndarray: + local_results_cache: dict[int | LabelTupTup, _np.ndarray], + num_qubits_in_default: int, + globally_invalid_cache_inds: list[Union[int, LabelTupTup]] + ) -> _np.ndarray: """ Reduce a cache line to a single process matrix. @@ -912,18 +950,15 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ - if isinstance(term_to_extend_with, int): - if term_to_extend_with not in results_cache: - breakpoint() - assert term_to_extend_with in results_cache, f"Term {term_to_extend_with} not in cache: {results_cache.keys()}" - return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) - if term_to_extend_with in results_cache: - return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, results_cache) + if (term_to_extend_with in local_results_cache) or (term_to_extend_with in self.results): + # It is in one of the caches. 
+ return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, + local_results_cache, globally_invalid_cache_inds) else: val = 1 qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, - results_cache, self.swap_gate) for op in term_to_extend_with} + local_results_cache, self.swap_gate) for op in term_to_extend_with} qubit_used = [] for key in matrix_reps.keys(): qubit_used.extend(key) @@ -933,7 +968,7 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. - results_cache, self.swap_gate) for qu in unused_qubits} + local_results_cache, self.swap_gate) for qu in unused_qubits} while qubits_available: @@ -948,11 +983,11 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, qubits_available = qubits_available[len(gatekey):] - results_cache[term_to_extend_with] = val + local_results_cache[term_to_extend_with] = val if cumulative_term is None: return val # Cache if off. 
- return results_cache[term_to_extend_with] @ cumulative_term + return local_results_cache[term_to_extend_with] @ cumulative_term def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: @@ -1060,7 +1095,10 @@ def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Option self.saved_results[key] = out1 self.sub_cir_to_ind_in_results[key] = out2 - def reconstruct_full_matrices(self) -> Optional[List[KronStructured]]: + def reconstruct_full_matrices(self, + model = None, + gp_index_changing: Optional[int] = None) -> \ + Optional[Tuple[List[Union[KronStructured, _np.ndarray]], List[int]]]: """ Construct a tensor product structure for each individual circuit """ @@ -1068,21 +1106,40 @@ def reconstruct_full_matrices(self) -> Optional[List[KronStructured]]: if len(self.saved_results) == 0: return - # Now we can do the combination. - num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cir_inds = _np.arange(num_cirs, dtype=_np.int32) + if (gp_index_changing is not None) and (model is not None): + cir_changes = set() + + for key in self.trees: + cir_changes = cir_changes.union(self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, gp_index_changing)) + + cir_inds = _np.array(sorted(list(cir_changes)), dtype=_np.int32) output = [] - for icir in range(num_cirs): + + # Now we can do the combination. + + for icir in cir_inds: lane_circuits = [] for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] lblkey = cir._line_labels ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - output.append(KronStructured(lane_circuits)) - return output + if ind_in_results not in self.saved_results[lblkey]: + # We have only the local changes. + # This will be stored in the results file of the subtree. 
+ lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + else: + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + if len(lane_circuits) > 1: + output.append(KronStructured(lane_circuits)) + elif len(lane_circuits) == 1: + output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). + else: + raise ValueError() + return output, cir_inds def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False): diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index e64efe480..6b83e3193 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -24,8 +24,11 @@ def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: C for key, val in op_dict.items(): assert key in exp_dict - assert np.allclose(exp_dict[key], val), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" - + try: + assert np.allclose(exp_dict[key], val), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" + except: + breakpoint() + raise AssertionError() #region Model Construction def construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta): @@ -410,6 +413,23 @@ def test_tensor_product_two_qubit_gates_dprobs(): assert_probability_densities_are_equal(probs, exp, cir) +def test_tensor_product_two_qubit_gates_dprobs_bulk(): + + num_qubits = 4 + + under_test, expected_model = build_models_for_testing(num_qubits, simplify_for_dprobs=True) + + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) + + circ_list = [circuitECR01, circuitECR10] + dprobs = under_test.sim.bulk_dprobs(circ_list) + exp = expected_model.sim.bulk_dprobs(circ_list) + + + for cir in circ_list: + assert_probability_densities_are_equal(dprobs[cir], 
exp[cir], cir) + def test_tensor_product_single_unitaries_yield_right_results_dprobs(): @@ -507,22 +527,5 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): assert_probability_densities_are_equal(probs, exp, circuit) -# test_tensor_product_gates_with_implicit_idles_dprobs() #endregion Derivative of Probabilities consistencies. - - -def test_matrices_are_close(): - - num_qubits = 3 - under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, - simplify_for_dprobs=True) - - cir = Circuit([[("Gxpi2", 1)]], num_lines=num_qubits) - - expected_model.sim = MatrixForwardSimulator() - - expected_dproduct = expected_model.sim.bulk_dproduct([cir]) - actual_dproduct = under_test.sim.bulk_dproduct([cir]) - - assert np.allclose(actual_dproduct, expected_dproduct) From 25af0e7593e7f5cce1e09ddca1630482efbf9c81 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 13 Aug 2025 13:37:49 -0700 Subject: [PATCH 119/141] Works if not testing in bulk. --- pygsti/forwardsims/matrixforwardsim.py | 33 +++++++++++-------- pygsti/layouts/evaltree.py | 29 +++++++++++++--- .../modelmembers/operations/dyadickronop.py | 2 +- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 3c5d5486a..e489fcc61 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2247,12 +2247,12 @@ def _probs_from_rho_e(self, rho, e: _np.ndarray, gs, scale_vals = 1, return_two_ assert e.ndim == 2 ## WHY ??? assert e[0] > 1 - out = _np.zeros((len(gs), len(e))) + out = _np.zeros((len(e), len(gs))) for i in range(len(gs)): - out[i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) + out[:, i] = _np.squeeze(e @ (gs[i] @ rho), axis=(1)) if not return_two_D: - out = out.reshape((len(gs)*len(e)), order="C") + out = out.reshape((len(gs)*len(e))) return out return _np.squeeze(e @ (gs @ rho), axis=(2)) # only one rho. 
@@ -2498,7 +2498,9 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f probs2 = _np.empty(layout_atom.num_elements, 'd') - sp_obj, povm_objs, _, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) + sp_obj, povm_objs, elm_indices, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) + + elm_indices = [_slct.indices(elm) for elm in elm_indices] orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) @@ -2553,16 +2555,21 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f layout_atom.tree.collapse_circuits_to_process_matrices(self.model, remaining_param_inds[i]) Gs, inds_to_update = layout_atom.tree.reconstruct_full_matrices(self.model, remaining_param_inds[i]) - - probs2 = probs2.reshape((layout_atom.num_elements // len(tVals), len(tVals)), order="C") - probs2 = probs2.T - tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True) - if len(inds_to_update) > 0: - breakpoint() - probs2[inds_to_update] = tmp - probs2 = probs2.T - probs2 = probs2.reshape(layout_atom.num_elements, order="C") + + for j in range(len(inds_to_update)): + for k in range(len(elm_indices)): + probs2[elm_indices[k][inds_to_update[j]]] = tmp[k, inds_to_update[j]] + + + # if len(Gs) > 0: + # probs2 = probs2.reshape((layout_atom.num_elements // len(tVals), len(tVals)), order="F") + + # tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True) + + # if len(tmp) > 0: + # probs2[:, inds_to_update] = tmp + # probs2 = probs2.reshape(layout_atom.num_elements, order="F") array_to_fill[:, iFinal] = (probs2 - base_probs) / eps diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 80c8c415f..07b34f897 100644 --- a/pygsti/layouts/evaltree.py +++ 
b/pygsti/layouts/evaltree.py @@ -617,7 +617,6 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) cache_pos = -1 while max_rounds > 1: - breakpoint() tmp = simplify_internal_first_one_round(new_circuit_list, internal_matches, cache_pos, @@ -836,6 +835,7 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in # NOTE: unclear when (if ever) this should be a noisy idle gate. else: local_changes[cache_ind] = cumulative_term + return local_changes, self.circuit_to_save_location return self.results, self.circuit_to_save_location @@ -913,6 +913,10 @@ def handle_results_cache_lookup_and_product(self, results_cache: dict[int | LabelTupTup, _np.ndarray], globally_invalid_cache_inds: list[Union[int, Label]]) -> _np.ndarray: + if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + if isinstance(term_to_extend_with, int): if term_to_extend_with in globally_invalid_cache_inds[:-1]: # look up the result in the local results cache. @@ -950,10 +954,27 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ - if (term_to_extend_with in local_results_cache) or (term_to_extend_with in self.results): - # It is in one of the caches. 
+ if (term_to_extend_with in local_results_cache): + return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, + local_results_cache, globally_invalid_cache_inds) + elif isinstance(term_to_extend_with, int) and \ + (term_to_extend_with not in globally_invalid_cache_inds[:-1]) and \ + (term_to_extend_with in self.results): + + return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, + self.results, globally_invalid_cache_inds) + elif isinstance(term_to_extend_with, LabelTupTup) and \ + not (any([t in globally_invalid_cache_inds[-1:] for t in term_to_extend_with])) \ + and (term_to_extend_with in self.results): return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - local_results_cache, globally_invalid_cache_inds) + self.results, globally_invalid_cache_inds) + + # elif isinstance(term_to_extend_with, LabelTup) and \ + # (term_to_extend_with not in globally_invalid_cache_inds[-1:]) \ + # and (term_to_extend_with in self.results): + # return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, + # local_results_cache, globally_invalid_cache_inds) + else: val = 1 qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] diff --git a/pygsti/modelmembers/operations/dyadickronop.py b/pygsti/modelmembers/operations/dyadickronop.py index d10bb739b..08a6c80cb 100644 --- a/pygsti/modelmembers/operations/dyadickronop.py +++ b/pygsti/modelmembers/operations/dyadickronop.py @@ -122,5 +122,5 @@ def to_full_array(self) -> np.ndarray: """ output = 1 for i in range(len(self.kron_operands)): - output = np.kron(self.kron_operands[i], output) + output = np.kron(output, self.kron_operands[i]) return output \ No newline at end of file From 282cb5979911291cf1212cb7a2e5a03d3c039c85 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Wed, 13 Aug 2025 15:03:49 -0700 Subject: [PATCH 120/141] bulk works with independent gates --- 
pygsti/forwardsims/matrixforwardsim.py | 4 +- .../test_forwardsim_on_implicitop_model.py | 205 +++++++++++++++++- 2 files changed, 201 insertions(+), 8 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index e489fcc61..4481e32d1 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2509,6 +2509,7 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f _, rho_gpindices = self._process_wrt_filter(param_indices, sp_obj) for i in range(len(rho_gpindices)): + probs2[:] = base_probs[:] iFinal = iParamToFinal[rho_gpindices[i]] new_vec = orig_vec.copy() @@ -2525,6 +2526,7 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f _, povm_gpindices = self._process_wrt_filter(param_indices, self.model.circuit_layer_operator(self.model.primitive_povm_labels[0], "povm")) for i in range(len(povm_gpindices)): + probs2[:] = base_probs[:] iFinal = iParamToFinal[povm_gpindices[i]] new_vec = orig_vec.copy() new_vec[povm_gpindices[i]] += eps @@ -2559,7 +2561,7 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f for j in range(len(inds_to_update)): for k in range(len(elm_indices)): - probs2[elm_indices[k][inds_to_update[j]]] = tmp[k, inds_to_update[j]] + probs2[elm_indices[k][inds_to_update[j]]] = tmp[k, j] # if len(Gs) > 0: diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index 6b83e3193..b6a6108d0 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -24,11 +24,23 @@ def assert_probability_densities_are_equal(op_dict: dict, exp_dict: dict, cir: C for key, val in op_dict.items(): assert key in exp_dict - try: - assert np.allclose(exp_dict[key], val), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" - except: - 
breakpoint() - raise AssertionError() + assert np.allclose(val, exp_dict[key]), f"Circuit {cir}, Outcome {key}, Expected: {exp_dict[key]}, Got: {val}" + # try: + # except: + # breakpoint() + # raise AssertionError() + +def convert_dict_of_dist_to_array(my_dictionary: dict) -> np.ndarray: + + out = [[] for _ in my_dictionary.keys()] + + for i, key in enumerate(sorted(my_dictionary.keys())): + val = my_dictionary[key] + if isinstance(val, dict): + val = convert_dict_of_dist_to_array(val) + out[i] = val + + return np.array(out) #region Model Construction def construct_arbitrary_single_qubit_unitary(alpha, beta, gamma, delta): @@ -324,6 +336,24 @@ def test_tensor_product_single_unitaries_random_collection_of_xyz(): assert_probability_densities_are_equal(probs, exp, circuit100) +def test_tensor_product_two_qubit_gates_bulk(): + + num_qubits = 4 + + under_test, expected_model = build_models_for_testing(num_qubits) + + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) + + circ_list = [circuitECR01, circuitECR10] + + probs = under_test.sim.bulk_probs(circ_list) + exp = expected_model.sim.bulk_probs(circ_list) + + for cir in circ_list: + assert_probability_densities_are_equal(probs[cir], exp[cir], cir) + + def test_tensor_product_two_qubit_gates(): num_qubits = 4 @@ -417,19 +447,22 @@ def test_tensor_product_two_qubit_gates_dprobs_bulk(): num_qubits = 4 - under_test, expected_model = build_models_for_testing(num_qubits, simplify_for_dprobs=True) + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) circ_list = [circuitECR01, circuitECR10] - dprobs = under_test.sim.bulk_dprobs(circ_list) exp = expected_model.sim.bulk_dprobs(circ_list) + breakpoint() + dprobs = 
under_test.sim.bulk_dprobs(circ_list) + for cir in circ_list: assert_probability_densities_are_equal(dprobs[cir], exp[cir], cir) +test_tensor_product_two_qubit_gates_dprobs_bulk() def test_tensor_product_single_unitaries_yield_right_results_dprobs(): @@ -455,6 +488,31 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): assert_probability_densities_are_equal(probs, exp, cir) +def test_tensor_product_single_unitaries_yield_right_results_dprobs_bulk(): + + import importlib as _importlib + + num_qubits = 2 + + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) + + circuitNone = Circuit([], num_lines=num_qubits) + single_layer = tuple([("Gxpi2", i) for i in range(num_qubits)]) + circuitX = Circuit([single_layer], num_lines=num_qubits) + circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) + + circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle] + + probs = under_test.sim.bulk_dprobs(circuits) + exp = expected_model.sim.bulk_dprobs(circuits) + + + for cir in circuits: + assert_probability_densities_are_equal(probs[cir], exp[cir], cir) + +# test_tensor_product_single_unitaries_yield_right_results_dprobs_bulk() def test_tensor_product_single_unitaries_random_collection_of_xyz_dprobs(): @@ -529,3 +587,136 @@ def test_tensor_product_multi_qubit_gates_with_structured_lanes_dprobs(): #endregion Derivative of Probabilities consistencies. 
+ + + +def test_reconstruct_full_matrices_returns_in_correct_order(): + + num_qubits = 4 + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True) + + circ_list = [] + circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) + + circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01] + + layout = under_test.sim.create_layout(circ_list, array_types='ep') + + atom = layout.atoms[0] + atom.tree.collapse_circuits_to_process_matrices(under_test, None) + + Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None) + + expected_vals = [] + for cir in circ_list: + val = np.eye(4**num_qubits) + for i in range(len(cir)): + term = expected_model.circuit_layer_operator(cir[i]).to_dense() + val = term @ val + + expected_vals.append(val) + + + for i, cir in enumerate(circ_list): + + kron_version = Gs[i].to_full_array() + assert np.allclose(kron_version, expected_vals[i]) + + print() + +def test_application_is_equivalent(): + + num_qubits = 4 + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True) + + circ_list = [] + circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), 
("Gi", 2), ("Gzpi2", 3)]]) + + circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01] + + layout = under_test.sim.create_layout(circ_list, array_types='ep') + + atom = layout.atoms[0] + atom.tree.collapse_circuits_to_process_matrices(under_test, None) + + Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None) + + rhs = np.eye(4**num_qubits) + for i, cir in enumerate(circ_list): + + kron_version = Gs[i].to_full_array() + + full = kron_version @ rhs + the_trick = Gs[i] @ rhs + + assert np.allclose(the_trick, full) + + print() + +def test_reconstructed_vals_update_with_dprobs(): + + num_qubits = 4 + under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True) + + circ_list = [] + circuitX = Circuit([("Gxpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitY = Circuit([("Gypi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitZ = Circuit([("Gzpi2", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitIdle = Circuit([("Gi", i) for i in range(num_qubits)], num_lines=num_qubits) + circuitECR01 = Circuit([[("Gecr", 0, 1), ("Gi", 2), ("Gzpi2", 3)]]) + circuitECR10 = Circuit([[("Gecr", 1, 0), ("Gi", 2), ("Gzpi2", 3)]]) + + circ_list = [circuitX, circuitY, circuitZ, circuitIdle, circuitECR10, circuitECR01] + circ_list = [circuitX, circuitY, circuitIdle, circuitECR01, circuitECR10] + + layout = under_test.sim.create_layout(circ_list, array_types='ep') + + atom = layout.atoms[0] + atom.tree.collapse_circuits_to_process_matrices(under_test, None) + + original_Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None) + + eps = 1e-7 + + orig_vec = under_test.to_vector().copy() + for i in range(under_test.num_params): + + new_vec = orig_vec.copy() + new_vec[i] += eps + + under_test.from_vector(new_vec) + expected_model.from_vector(new_vec) + + atom.tree.collapse_circuits_to_process_matrices(under_test, i) + + new_Gs, _ = atom.tree.reconstruct_full_matrices(under_test, None) 
# Return all of them + + expected_vals = [] + for cir in circ_list: + val = np.eye(4**num_qubits) + for j in range(len(cir)): + term = expected_model.circuit_layer_operator(cir[j]).to_dense() + val = term @ val + + expected_vals.append(val) + + + for j, cir in enumerate(circ_list): + + kron_version = new_Gs[j].to_full_array() + assert np.allclose(kron_version, expected_vals[j]) + + print() + + +test_reconstructed_vals_update_with_dprobs() \ No newline at end of file From 74b6433d9585006925bff3e85ad896b3672471f9 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 14 Aug 2025 09:58:09 -0700 Subject: [PATCH 121/141] Attempt to handle identical gates. --- pygsti/layouts/evaltree.py | 60 +++++++++++-------- .../test_forwardsim_on_implicitop_model.py | 8 --- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 07b34f897..e81129079 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -702,7 +702,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {} - self.gpindex_to_cache_vals: dict[int, list[int]] = {} + self.gpindex_to_cache_vals: dict[int, tuple[list[int], list[Label]]] = {} # This will be filled later by _gpindex_to_cache_inds_needed_to_recompute when we have access to the model. # Warning that changing the model paramvec will result in this cache becoming invalidated. # The user is currently in charge of resetting this cache. 
@@ -728,7 +728,9 @@ def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: i if gp_index_changing in self.gpindex_to_cache_vals: return self.gpindex_to_cache_vals[gp_index_changing] - cache_inds = [] + cache_inds: list[int] = [] + all_op_inds: list[int] = [] + invalid_lbls: list[Label] = [] for lbl in self.alphabet_val_to_sorted_cache_inds.keys(): # my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) try: @@ -738,15 +740,17 @@ def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: i continue op_inds = my_op.gpindices_as_array() if gp_index_changing in op_inds: - cache_inds = self.alphabet_val_to_sorted_cache_inds[lbl] + [lbl] # We also invalidate the lbl. - for ind in op_inds: # Save off all the values we know about. - self.gpindex_to_cache_vals[ind] = cache_inds - return cache_inds - return cache_inds + cache_inds.extend(self.alphabet_val_to_sorted_cache_inds[lbl]) + invalid_lbls.append(lbl) # We also invalidate the lbl. + all_op_inds.extend(op_inds) + + for ind in all_op_inds: + self.gpindex_to_cache_vals[ind] = (cache_inds, invalid_lbls) + return cache_inds, invalid_lbls def _which_full_circuits_will_change_due_to_gpindex_changing(self, model, gp_index_changing: int) -> list[int]: - cache_inds = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + cache_inds, _ = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) if len(cache_inds) == 0: return [] @@ -809,7 +813,7 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in # Dig through the tree to see if we have a matching - cache_inds = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) if cache_inds: # Invalidate all gate labels that we saved just in case. 
@@ -823,11 +827,11 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in # Ignore the last index which is the Label that matched the gpindex. # We assume that only one will match. - for cache_ind in cache_inds[:-1]: + for cache_ind in cache_inds: cumulative_term = None for term in self.cache[cache_ind]: cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, - num_qubits_in_default, cache_inds) + num_qubits_in_default, cache_inds, invalid_lbls) # Save locally. if cumulative_term is None: @@ -864,7 +868,8 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in for cache_ind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, num_qubits_in_default, empty) + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, + num_qubits_in_default, empty, empty) if cumulative_term is None: self.results[cache_ind] = _np.eye(4**num_qubits_in_default) @@ -910,8 +915,7 @@ def combine_for_visualization(self, val, visited): def handle_results_cache_lookup_and_product(self, cumulative_term: None | _np.ndarray, term_to_extend_with: int | LabelTupTup, - results_cache: dict[int | LabelTupTup, _np.ndarray], - globally_invalid_cache_inds: list[Union[int, Label]]) -> _np.ndarray: + results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: if cumulative_term is None: return results_cache[term_to_extend_with] @@ -945,7 +949,8 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, term_to_extend_with: int | LabelTupTup, local_results_cache: dict[int | LabelTupTup, _np.ndarray], num_qubits_in_default: int, - globally_invalid_cache_inds: list[Union[int, LabelTupTup]] + globally_invalid_cache_inds: Optional[list[int]] = None, + globally_invalid_labels: Optional[list[LabelTupTup]] = None ) -> _np.ndarray: """ Reduce a 
cache line to a single process matrix. @@ -955,19 +960,24 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, """ if (term_to_extend_with in local_results_cache): - return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - local_results_cache, globally_invalid_cache_inds) + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + local_results_cache) elif isinstance(term_to_extend_with, int) and \ - (term_to_extend_with not in globally_invalid_cache_inds[:-1]) and \ + (globally_invalid_cache_inds is not None) and \ + (term_to_extend_with not in globally_invalid_cache_inds) and \ (term_to_extend_with in self.results): - return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - self.results, globally_invalid_cache_inds) + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + self.results) elif isinstance(term_to_extend_with, LabelTupTup) and \ - not (any([t in globally_invalid_cache_inds[-1:] for t in term_to_extend_with])) \ + (globally_invalid_labels is not None) and \ + not (any([t in globally_invalid_labels for t in term_to_extend_with])) \ and (term_to_extend_with in self.results): - return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - self.results, globally_invalid_cache_inds) + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + self.results) # elif isinstance(term_to_extend_with, LabelTup) and \ # (term_to_extend_with not in globally_invalid_cache_inds[-1:]) \ @@ -978,6 +988,8 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, else: val = 1 qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] + if isinstance(term_to_extend_with, int): + breakpoint() matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, 
local_results_cache, self.swap_gate) for op in term_to_extend_with} qubit_used = [] @@ -1088,7 +1100,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: for key in self.trees: - inds = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + inds, lbls = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) if len(inds) > 0: return True return False diff --git a/test/unit/objects/test_forwardsim_on_implicitop_model.py b/test/unit/objects/test_forwardsim_on_implicitop_model.py index b6a6108d0..9b811c696 100644 --- a/test/unit/objects/test_forwardsim_on_implicitop_model.py +++ b/test/unit/objects/test_forwardsim_on_implicitop_model.py @@ -455,7 +455,6 @@ def test_tensor_product_two_qubit_gates_dprobs_bulk(): circ_list = [circuitECR01, circuitECR10] exp = expected_model.sim.bulk_dprobs(circ_list) - breakpoint() dprobs = under_test.sim.bulk_dprobs(circ_list) @@ -465,9 +464,6 @@ def test_tensor_product_two_qubit_gates_dprobs_bulk(): test_tensor_product_two_qubit_gates_dprobs_bulk() def test_tensor_product_single_unitaries_yield_right_results_dprobs(): - - import importlib as _importlib - num_qubits = 2 under_test, expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) @@ -482,16 +478,12 @@ def test_tensor_product_single_unitaries_yield_right_results_dprobs(): circuits = [circuitNone, circuitX, circuitY, circuitZ, circuitIdle] for cir in circuits: probs = under_test.sim.dprobs(cir) - # expected_model.sim.calclib = _importlib.import_module("pygsti.forwardsims.mapforwardsim_calc_generic") - exp = expected_model.sim.dprobs(cir) assert_probability_densities_are_equal(probs, exp, cir) def test_tensor_product_single_unitaries_yield_right_results_dprobs_bulk(): - import importlib as _importlib - num_qubits = 2 under_test, 
expected_model = build_models_for_testing(num_qubits, independent_gates=True, simplify_for_dprobs=True) From 821e5327613a68d9365b7a99bb1d68d67e1fe657 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Thu, 14 Aug 2025 17:11:31 -0700 Subject: [PATCH 122/141] Attempting to speed up by only replacing part of the KronStructured data structure each time. This currently fails. I believe due to the sparse linear op not updating as well when you update the A and B matrices it uses. --- pygsti/forwardsims/matrixforwardsim.py | 20 +--- pygsti/layouts/evaltree.py | 94 ++++++++++++++----- .../modelmembers/operations/dyadickronop.py | 60 +++++++++--- 3 files changed, 122 insertions(+), 52 deletions(-) diff --git a/pygsti/forwardsims/matrixforwardsim.py b/pygsti/forwardsims/matrixforwardsim.py index 4481e32d1..6aab332cc 100644 --- a/pygsti/forwardsims/matrixforwardsim.py +++ b/pygsti/forwardsims/matrixforwardsim.py @@ -2500,7 +2500,7 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f sp_obj, povm_objs, elm_indices, tree_indices = self._layout_atom_to_rho_es_elm_inds_and_tree_inds_objs(layout_atom) - elm_indices = [_slct.indices(elm) for elm in elm_indices] + elm_indices = _np.array([_slct.indices(elm) for elm in elm_indices]) orig_dense_sp = sp_obj.to_dense(on_space="minimal")[:,None] # To maintain expected shape. 
dense_povms = _np.vstack([_np.conjugate(_np.transpose(povm.to_dense(on_space='minimal')[:, None])) for povm in povm_objs]) @@ -2544,8 +2544,8 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f remaining_param_inds = sorted(list(set(param_indices) - set(_slct.to_array(povm_gpindices)) - set(_slct.to_array(rho_gpindices)))) + dirty_circuits = layout_atom.tree.determine_which_circuits_will_update_for_what_gpindices(self.model) - tVals = _slct.indices(tree_indices) for i in range(len(remaining_param_inds)): probs2[:] = base_probs[:] # Copy off the data iFinal = iParamToFinal[remaining_param_inds[i]] @@ -2559,22 +2559,12 @@ def _bulk_fill_dprobs_atom_using_from_vector_but_cache_collapse(self, array_to_f tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True) - for j in range(len(inds_to_update)): - for k in range(len(elm_indices)): - probs2[elm_indices[k][inds_to_update[j]]] = tmp[k, j] - - - # if len(Gs) > 0: - # probs2 = probs2.reshape((layout_atom.num_elements // len(tVals), len(tVals)), order="F") - - # tmp = self._probs_from_rho_e(orig_dense_sp, dense_povms, Gs, return_two_D=True) - - # if len(tmp) > 0: - # probs2[:, inds_to_update] = tmp - # probs2 = probs2.reshape(layout_atom.num_elements, order="F") + probs2[elm_indices[:, inds_to_update]] = tmp array_to_fill[:, iFinal] = (probs2 - base_probs) / eps + layout_atom.tree.reset_full_matrices_to_base_probs_version() + if return_sequence_matrices: output.append(Gs) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index e81129079..dbe78e976 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -1096,6 +1096,9 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {} self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} + self.original_matrices: dict[int, dict[int, _np.ndarray]] = {} + self.full_matrices: 
list[KronStructured] = [] + self.process_matrices_which_will_need_to_update_for_index: _np.ndarray = [] def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: @@ -1128,6 +1131,29 @@ def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Option self.saved_results[key] = out1 self.sub_cir_to_ind_in_results[key] = out2 + def determine_which_circuits_will_update_for_what_gpindices(self, model): + + dirty_circuits = _np.zeros((model.num_params, len(self.trees), len(self.cir_id_and_lane_id_to_sub_cir))) + for ind in range(model.num_params): + + for ikey, key in enumerate(self.trees): + dirty_circuits[ind, ikey, self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, ind)] = 1 + + self.process_matrices_which_will_need_to_update_for_index = dirty_circuits + return dirty_circuits + + def reset_full_matrices_to_base_probs_version(self) -> None: + """ + Any matrix which was updated previously reset to the original version. + """ + + for icir in self.original_matrices: + for lane_in_cir in self.original_matrices[icir]: + self.full_matrices[icir].update_operand(lane_in_cir, self.original_matrices[icir][lane_in_cir]) + self.original_matrices = {} + return + + def reconstruct_full_matrices(self, model = None, gp_index_changing: Optional[int] = None) -> \ @@ -1142,36 +1168,58 @@ def reconstruct_full_matrices(self, num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) cir_inds = _np.arange(num_cirs, dtype=_np.int32) if (gp_index_changing is not None) and (model is not None): - cir_changes = set() + + cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
+ + lane_key_to_ind: dict[tuple[int, ...], int] = {key: ikey for ikey, key in enumerate(self.trees)} - for key in self.trees: - cir_changes = cir_changes.union(self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, gp_index_changing)) + output = [] + if len(cir_inds) > 0: + self.original_matrices = {} # Reset the cache of updated process matrices. - cir_inds = _np.array(sorted(list(cir_changes)), dtype=_np.int32) + for icir in cir_inds: + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir: _Circuit = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels - output = [] + lane_ind = lane_key_to_ind[lblkey] + if self.process_matrices_which_will_need_to_update_for_index[gp_index_changing, lane_ind, icir]: + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + if icir in self.original_matrices: + self.original_matrices[icir][i] = self.full_matrices[icir].kron_operands[i] + else: + self.original_matrices[icir] = {lane_ind: self.full_matrices[icir].kron_operands[i]} + self.full_matrices[icir].update_operand(i, self.trees[lblkey].results[ind_in_results]) + output.append(self.full_matrices[icir]) - # Now we can do the combination. + return output, cir_inds + + else: + output = [] - for icir in cir_inds: - lane_circuits = [] - for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): - cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] - lblkey = cir._line_labels + # Now we can do the combination. - ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - if ind_in_results not in self.saved_results[lblkey]: - # We have only the local changes. - # This will be stored in the results file of the subtree. 
- lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + for icir in cir_inds: + lane_circuits = [] + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels + + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + if ind_in_results not in self.saved_results[lblkey]: + # We have only the local changes. + # This will be stored in the results file of the subtree. + lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + else: + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + if len(lane_circuits) > 1: + output.append(KronStructured(lane_circuits)) + elif len(lane_circuits) == 1: + output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). else: - lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - if len(lane_circuits) > 1: - output.append(KronStructured(lane_circuits)) - elif len(lane_circuits) == 1: - output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). 
- else: - raise ValueError() + raise ValueError() + + self.full_matrices = output return output, cir_inds def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False): diff --git a/pygsti/modelmembers/operations/dyadickronop.py b/pygsti/modelmembers/operations/dyadickronop.py index 08a6c80cb..8916a1856 100644 --- a/pygsti/modelmembers/operations/dyadickronop.py +++ b/pygsti/modelmembers/operations/dyadickronop.py @@ -1,3 +1,4 @@ +from __future__ import annotations import numpy as np import scipy.sparse.linalg as sparla @@ -40,14 +41,6 @@ def __rmatmul__(self, other): return other @ self._linop -def is_2d_square(arg): - if not hasattr(arg, 'shape'): - return False - if len(arg.shape) != 2: - return False - return arg.shape[0] == arg.shape[1] - - class DyadicKronStructed(RealLinOp): def __init__(self, A, B, adjoint=None): @@ -92,7 +85,7 @@ def rmatvec(self, other): return out @staticmethod - def build_polyadic(kron_operands): + def build_polyadic(kron_operands) -> DyadicKronStructed: if len(kron_operands) == 2: out = DyadicKronStructed(kron_operands[0], kron_operands[1]) return out @@ -106,12 +99,12 @@ class KronStructured(RealLinOp): def __init__(self, kron_operands): self.kron_operands = kron_operands - assert all([op.ndim == 2 for op in kron_operands]) + # assert all([op.ndim == 2 for op in kron_operands]) self.shapes = np.array([op.shape for op in kron_operands]) self._shape = tuple(int(i) for i in np.prod(self.shapes, axis=0)) - forward = DyadicKronStructed.build_polyadic(self.kron_operands) - self._linop = forward._linop - self._adjoint = forward.T + self.dyadic_struct = DyadicKronStructed.build_polyadic(self.kron_operands) + self._linop = self.dyadic_struct._linop + self._adjoint = self.dyadic_struct.T self._dtype = self.kron_operands[0].dtype def to_full_array(self) -> np.ndarray: @@ -123,4 +116,43 @@ def to_full_array(self) -> np.ndarray: output = 1 for i in range(len(self.kron_operands)): output = np.kron(output, 
self.kron_operands[i]) - return output \ No newline at end of file + return output + + def update_operand(self, lane: int, matrix: np.ndarray) -> None: + """ + Replace a specific matrix with a new matrix of the same size and layout. + """ + + assert lane >= 0 and lane < len(self.kron_operands) + + # Iterate through the structure and replace the A matrix of the appropriate index with the new matrix. + + def walk_forward(curr_loc: int, dydadic_struct: DyadicKronStructed, mat: np.ndarray): + if curr_loc == lane: + dydadic_struct.A = mat + breakpoint() + return + elif lane == len(self.kron_operands) - 1 and (curr_loc == lane -1): + # We have hit the leaf node. + dydadic_struct.B = mat + return + return walk_forward(curr_loc + 1, dydadic_struct.B, mat) + + walk_forward(0, self.dyadic_struct, matrix) + walk_forward(0, self._adjoint, matrix.T) + + + +class KronStructuredPath: + """ + This class is for making a path for computing a matvec and adjoint of a kron structured matrix. + + When those functions are called you need to specify the actual dense operands being used. + This way we can just swap out one the matrices each time we update the representation of a gate. + """ + + + def __init__(self, shapes: list[tuple[int, int]]): + + + pass \ No newline at end of file From 9bb7a5961e445723f2bbb072f77c2a79cdd1dba1 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 10:32:59 -0700 Subject: [PATCH 123/141] Add in a flops estimate for dprobs LCS tree. 
--- pygsti/layouts/evaltree.py | 70 +++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index dbe78e976..8387f407e 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -36,6 +36,7 @@ import time import scipy.linalg as la import scipy.sparse.linalg as sparla +from scipy.sparse import kron as sparse_kron from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple from pygsti.tools.tqdm import our_tqdm @@ -1037,7 +1038,7 @@ def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: return list(output) - def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int]): + def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = None, gp_index_changing: Optional[int] = None) -> int: """ We assume that each matrix matrix multiply is the same size. """ @@ -1045,7 +1046,14 @@ def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int]): assert matrix_size[0] == matrix_size[1] total_flop_cost = 0 - for cache_ind in self.cache: + if (model is not None) and (gp_index_changing is not None): + + cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + else: + cache_inds = list(self.cache.keys()) + + for cache_ind in cache_inds: num_mm_on_this_cache_line = len(self.cache[cache_ind]) - 1 total_flop_cost += (matrix_matrix_cost_estimate(matrix_size)) * num_mm_on_this_cache_line @@ -1178,25 +1186,32 @@ def reconstruct_full_matrices(self, self.original_matrices = {} # Reset the cache of updated process matrices. 
for icir in cir_inds: + lane_circuits = [] for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): - cir: _Circuit = self.cir_id_and_lane_id_to_sub_cir[icir][i] + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] lblkey = cir._line_labels - lane_ind = lane_key_to_ind[lblkey] - if self.process_matrices_which_will_need_to_update_for_index[gp_index_changing, lane_ind, icir]: - ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - if icir in self.original_matrices: - self.original_matrices[icir][i] = self.full_matrices[icir].kron_operands[i] - else: - self.original_matrices[icir] = {lane_ind: self.full_matrices[icir].kron_operands[i]} - self.full_matrices[icir].update_operand(i, self.trees[lblkey].results[ind_in_results]) - output.append(self.full_matrices[icir]) - + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + if ind_in_results not in self.saved_results[lblkey]: + # We have only the local changes. + # This will be stored in the results file of the subtree. + lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + else: + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + if len(lane_circuits) > 1: + output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) + # output.append(KronStructured(lane_circuits)) + elif len(lane_circuits) == 1: + output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). + else: + raise ValueError() + return output, cir_inds else: output = [] + # Now we can do the combination. 
for icir in cir_inds: @@ -1213,7 +1228,8 @@ def reconstruct_full_matrices(self, else: lane_circuits.append(self.saved_results[lblkey][ind_in_results]) if len(lane_circuits) > 1: - output.append(KronStructured(lane_circuits)) + output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) + # output.append(KronStructured(lane_circuits)) elif len(lane_circuits) == 1: output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). else: @@ -1221,21 +1237,35 @@ def reconstruct_full_matrices(self, self.full_matrices = output return output, cir_inds - - def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False): + + + def recurse_to_build_sparse_kron_matrix(self, operands: list[_np.ndarray]): + if len(operands) == 1: + return operands[0] + return sparse_kron(operands[0], self.recurse_to_build_sparse_kron_matrix(operands[1:])) + + + def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False, model = None, gp_index_changing: Optional[int] = None): cost_collapse = 0 for key in self.trees: num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. tree = self.trees[key] - cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits])) + cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits]), model, gp_index_changing) tensor_cost = 0 num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cir_inds = _np.arange(num_cirs, dtype=_np.int32) - for cir_id in range(num_cirs): + if (model is not None) and (gp_index_changing is not None): + + dirty_circuits = self.determine_which_circuits_will_update_for_what_gpindices(model) + cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
+ + + for cir_id in cir_inds: qubit_list = () for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] @@ -1249,6 +1279,8 @@ def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: boo return tensor_cost + cost_collapse, cost_collapse elif return_tensor_matvec: return tensor_cost + cost_collapse, tensor_cost + elif gp_index_changing is not None: + return tensor_cost + cost_collapse, len(cir_inds) # Since you are not updating all of the representations we do not need to update the state props either for those. return tensor_cost + cost_collapse @@ -1352,7 +1384,7 @@ def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[ -def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int): +def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int: assert _np.sum(qubit_list) == total_num_qubits From 7992a6b3df5f57e2ee838e865ed42e39281cf7be Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 11:49:05 -0700 Subject: [PATCH 124/141] Merge made memo set twice. 
--- pygsti/models/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pygsti/models/model.py b/pygsti/models/model.py index dc97e4e13..a6db8a942 100644 --- a/pygsti/models/model.py +++ b/pygsti/models/model.py @@ -988,7 +988,6 @@ def _rebuild_paramvec(self): if num_new_params > 0: memo = set() # If so, before allocating anything, make the necessary space in the parameter arrays: - memo = set() for _, o in self._iter_parameterized_objs(): o.shift_gpindices(insertion_point, num_new_params, self, memo) w = _np.insert(w, insertion_point, _np.empty(num_new_params, 'd')) From 82260793ba5d52b992125b4a6de66ab3faedc737 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 19 Aug 2025 12:41:38 -0700 Subject: [PATCH 125/141] finish actually changing Circuit.from_cirq from a classmethod to a static method --- pygsti/circuits/circuit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 0755af4b4..b4d92c70f 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -4277,9 +4277,9 @@ def from_cirq(cls, circuit: Circuit, qubit_conversion=None, cirq_gate_conversion #labels to include all of the qubits appearing in the cirq circuit, otherwise #we'll let the Circuit constructor figure this out. if seen_global_idle: - return cls(circuit_layers, line_labels = tuple(sorted([qubit_conversion[qubit] for qubit in all_cirq_qubits]))) + return Circuit(circuit_layers, line_labels = tuple(sorted([qubit_conversion[qubit] for qubit in all_cirq_qubits]))) else: - return cls(circuit_layers) + return Circuit(circuit_layers) def convert_to_quil(self, num_qubits=None, From 5a29f4b4f0b4889586b9afd9521e927cc075e685 Mon Sep 17 00:00:00 2001 From: Riley Murray Date: Tue, 19 Aug 2025 12:48:26 -0700 Subject: [PATCH 126/141] Revert "finish actually changing Circuit.from_cirq from a classmethod to a static method" This reverts commit 82260793ba5d52b992125b4a6de66ab3faedc737. 
--- pygsti/circuits/circuit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index b4d92c70f..0755af4b4 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -4277,9 +4277,9 @@ def from_cirq(cls, circuit: Circuit, qubit_conversion=None, cirq_gate_conversion #labels to include all of the qubits appearing in the cirq circuit, otherwise #we'll let the Circuit constructor figure this out. if seen_global_idle: - return Circuit(circuit_layers, line_labels = tuple(sorted([qubit_conversion[qubit] for qubit in all_cirq_qubits]))) + return cls(circuit_layers, line_labels = tuple(sorted([qubit_conversion[qubit] for qubit in all_cirq_qubits]))) else: - return Circuit(circuit_layers) + return cls(circuit_layers) def convert_to_quil(self, num_qubits=None, From d4e5db69ea522b239180603ce4038b5b9645f750 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:20:28 -0700 Subject: [PATCH 127/141] Move daydic kron op to tools. --- .../operations => tools}/dyadickronop.py | 52 +++++-------------- 1 file changed, 13 insertions(+), 39 deletions(-) rename pygsti/{modelmembers/operations => tools}/dyadickronop.py (74%) diff --git a/pygsti/modelmembers/operations/dyadickronop.py b/pygsti/tools/dyadickronop.py similarity index 74% rename from pygsti/modelmembers/operations/dyadickronop.py rename to pygsti/tools/dyadickronop.py index 8916a1856..687d22c76 100644 --- a/pygsti/modelmembers/operations/dyadickronop.py +++ b/pygsti/tools/dyadickronop.py @@ -1,3 +1,16 @@ +""" +Tools for working with Kronecker Products especially the Dyadic forms. +""" +#*************************************************************************************************** +# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights +# in this software. 
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. +#*************************************************************************************************** + + from __future__ import annotations import numpy as np import scipy.sparse.linalg as sparla @@ -117,42 +130,3 @@ def to_full_array(self) -> np.ndarray: for i in range(len(self.kron_operands)): output = np.kron(output, self.kron_operands[i]) return output - - def update_operand(self, lane: int, matrix: np.ndarray) -> None: - """ - Replace a specific matrix with a new matrix of the same size and layout. - """ - - assert lane >= 0 and lane < len(self.kron_operands) - - # Iterate through the structure and replace the A matrix of the appropriate index with the new matrix. - - def walk_forward(curr_loc: int, dydadic_struct: DyadicKronStructed, mat: np.ndarray): - if curr_loc == lane: - dydadic_struct.A = mat - breakpoint() - return - elif lane == len(self.kron_operands) - 1 and (curr_loc == lane -1): - # We have hit the leaf node. - dydadic_struct.B = mat - return - return walk_forward(curr_loc + 1, dydadic_struct.B, mat) - - walk_forward(0, self.dyadic_struct, matrix) - walk_forward(0, self._adjoint, matrix.T) - - - -class KronStructuredPath: - """ - This class is for making a path for computing a matvec and adjoint of a kron structured matrix. - - When those functions are called you need to specify the actual dense operands being used. - This way we can just swap out one the matrices each time we update the representation of a gate. 
- """ - - - def __init__(self, shapes: list[tuple[int, int]]): - - - pass \ No newline at end of file From b3a226ce2dd94674c020d0d5fc3993fb33eb051a Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:21:25 -0700 Subject: [PATCH 128/141] Remove permutation op. --- .../modelmembers/operations/permutationop.py | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 pygsti/modelmembers/operations/permutationop.py diff --git a/pygsti/modelmembers/operations/permutationop.py b/pygsti/modelmembers/operations/permutationop.py deleted file mode 100644 index f19d25c5b..000000000 --- a/pygsti/modelmembers/operations/permutationop.py +++ /dev/null @@ -1,68 +0,0 @@ -from pygsti.modelmembers.operations import DenseOperator -from pygsti.baseobjs.basisconstructors import pp_labels -import numpy as _np - -class PermutationOperator(DenseOperator): - - def __init__(self, perm: _np.ndarray): - dim = perm.size - mx = _np.eye(dim) - mx = mx[perm,:] - super().__init__(mx, 'pp', 'densitymx') - self._perm = perm - - @property - def num_params(self): - return 0 - - def to_vector(self): - return _np.array([]) - - def from_vector(self, v, close=False, dirty_value=True): - if v.size > 0: - raise ValueError() - return - - def transform(self, S): - raise NotImplementedError("PermutationOperator cannot be transformed!") - - def inverse_operator(self): - iperm = PermutationOperator.inv_perm(self._perm) - return PermutationOperator(iperm) - - @staticmethod - def inv_perm(perm): - iperm = perm.copy() - iperm[iperm] = _np.arange(iperm.size) - return iperm - - @staticmethod - def perm_from_mx(mx): - perm = _np.array([_np.where(row == 1)[0][0] for row in mx]) - return perm - - ## We need to implement this in order to deserialize. 
- @classmethod - def _from_memoized_dict(cls, mm_dict, serial_memo): - mx = cls._decodemx(mm_dict['dense_matrix']) - mx = mx.squeeze() - # state_space = _statespace.StateSpace.from_nice_serialization(mm_dict['state_space']) - # basis = _Basis.from_nice_serialization(mm_dict['basis']) if (mm_dict['basis'] is not None) else None - # return cls(m, basis, mm_dict['evotype'], state_space) - perm = PermutationOperator.perm_from_mx(mx) - return PermutationOperator(perm) - - @staticmethod - def pp_braiding_operators(subsystem_perm): - subsystem_perm = _np.atleast_1d(subsystem_perm).copy() - n_qubits = subsystem_perm.size - labels = _np.array(pp_labels(2**n_qubits)) - braid_labels = _np.array([''.join([ell[i] for i in subsystem_perm]) for ell in labels]) - braid_perm = [] - for bl in braid_labels: - loc = _np.where(labels == bl)[0].item() - braid_perm.append(loc) - braid_perm = _np.array(braid_perm) - pop = PermutationOperator(braid_perm) - ipop = pop.inverse_operator() - return pop, ipop From 72303c3b05b2332697ce9080b86d9903c0e521f0 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:23:51 -0700 Subject: [PATCH 129/141] Remove unused_xfgst.py --- pygsti/protocols/unused_xfgst.py | 66 -------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 pygsti/protocols/unused_xfgst.py diff --git a/pygsti/protocols/unused_xfgst.py b/pygsti/protocols/unused_xfgst.py deleted file mode 100644 index 5a4963372..000000000 --- a/pygsti/protocols/unused_xfgst.py +++ /dev/null @@ -1,66 +0,0 @@ -# # NOTE: Ignore this function. I'm pretty sure it is not needed. -# def generate_edge_colorings(vertices: list, edges: list) -> list: -# """ -# Generate a set of edge colorings for a graph until all edges are colored. - -# This function takes an edge set of a simple undirected graph and repeatedly -# applies the Misra & Gries edge coloring algorithm until every edge is -# contained in some edge coloring. 
It returns a dictionary mapping colors -# to the edges colored with that color. - -# Parameters: -# vertices (list): A list of vertices in the graph. -# edges (list): A list of edges represented as tuples (u, v) where u and v -# are vertices in the graph. - -# Returns: -# list: A list of edge colorings (dictionaries whose keys are colors and items are lists colored edges) -# """ -# list_of_edge_colorings = [] -# uncolored_edges = set(edges) - -# while uncolored_edges: -# # Determine which vertices are neighbors in a graph with only uncolored edges -# # Could call find_neighbors here... -# updated_neighbors = {v: [] for v in vertices} -# for u, v in uncolored_edges: -# updated_neighbors[u].append(v) - -# # Calculate the maximum degree of the graph -# deg = max(len(updated_neighbors[v]) for v in vertices) - -# # Find an edge coloring -# new_color_patches = find_edge_coloring(deg, vertices, list(uncolored_edges), updated_neighbors) - -# # Update color patches and remove newly colored edges from uncolored_edges -# list_of_edge_colorings.append(new_color_patches) -# for _, edge_list in new_color_patches.items(): -# uncolored_edges.difference_update(edge_list) -# uncolored_edges.difference_update([(v,u) for u, v in edge_list]) # need to symmetrize - -# return list_of_edge_colorings - -# # NOTE: This class is superfluous. Keeping it around in case I realize that it isn't - Daniel H. 
-# class CrosstalkFreeCombinedExperimentDesign(CombinedExperimentDesign, HasProcessorSpec): -# def __init__(self, processor_spec, oneq_gstdesign, twoq_gstdesign, seed = None, interleave = False): - -# HasProcessorSpec.__init__(self, processor_spec) - -# randstate = np.random.RandomState(seed) -# self.interleave = interleave -# self.oneq_gstdesign = oneq_gstdesign -# self.twoq_gstdesign = twoq_gstdesign -# self.vertices = self.processor_spec.qubit_labels -# self.edges = self.processor_spec.compute_2Q_connectivity().edges() -# self.neighbors = find_neighbors(self.vertices, self.edges) -# self.deg = max([len(self.neighbors[v]) for v in self.vertices]) - - -# # Generate the sub-experiment designs -# self.edge_colorings = generate_edge_colorings(self.vertices, self.edges) -# self.sub_designs = [CrosstalkFreeSubExperimentDesign(self.processor_spec, -# self.oneq_gstdesign, -# self.twoq_gstdesign, -# edge_coloring, -# randstate) for edge_coloring in self.edge_colorings] -# CombinedExperimentDesign.__init__(self, sub_designs = self.sub_designs, qubit_labels = self.vertices, interleave = self.interleave) From 19d43d512040e013b3bce8475f0faa219aa39552 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:28:14 -0700 Subject: [PATCH 130/141] Reactivate test_errgenproptools.py --- ...eactivated_test_errgenproptools.py => test_errgenproptools.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/tools/{tempdeactivated_test_errgenproptools.py => test_errgenproptools.py} (100%) diff --git a/test/unit/tools/tempdeactivated_test_errgenproptools.py b/test/unit/tools/test_errgenproptools.py similarity index 100% rename from test/unit/tools/tempdeactivated_test_errgenproptools.py rename to test/unit/tools/test_errgenproptools.py From 525373e38ad5f6f80809fa572c72d93db189ce5c Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:32:05 -0700 Subject: [PATCH 131/141] Remove unused code and commented out code. 
--- pygsti/report/factory.py | 15 --------------- pygsti/tools/optools.py | 2 -- 2 files changed, 17 deletions(-) diff --git a/pygsti/report/factory.py b/pygsti/report/factory.py index a0d585c41..8be472401 100644 --- a/pygsti/report/factory.py +++ b/pygsti/report/factory.py @@ -84,21 +84,6 @@ def _add_lbl(lst, lbl): return running_lbls -#def _robust_estimate_has_same_models(estimates, est_lbl): -# lbl_robust = est_lbl+ROBUST_SUFFIX -# if lbl_robust not in estimates: return False #no robust estimate -# -# for mdl_lbl in list(estimates[est_lbl].goparameters.keys()) \ -# + ['final iteration estimate']: -# if mdl_lbl not in estimates[lbl_robust].models: -# return False #robust estimate is missing mdl_lbl! -# -# mdl = estimates[lbl_robust].models[mdl_lbl] -# if estimates[est_lbl].models[mdl_lbl].frobeniusdist(mdl) > 1e-8: -# return False #model mismatch! -# -# return True - def _get_viewable_crf(est, est_lbl, mdl_lbl, verbosity=0): printer = _VerbosityPrinter.create_printer(verbosity) diff --git a/pygsti/tools/optools.py b/pygsti/tools/optools.py index 6309b0f99..0355fd7da 100644 --- a/pygsti/tools/optools.py +++ b/pygsti/tools/optools.py @@ -27,10 +27,8 @@ from pygsti.baseobjs.label import Label as _Label from pygsti.baseobjs.errorgenlabel import LocalElementaryErrorgenLabel as _LocalElementaryErrorgenLabel from pygsti.tools.legacytools import deprecate as _deprecated_fn -from pygsti import SpaceT IMAG_TOL = 1e-7 # tolerance for imaginary part being considered zero -DIAMOND_NORM_SOLVE_VERBOSE = False def _flat_mut_blks(i, j, block_dims): From bb78983d793993713a713b6ac9f3cfa577e132f0 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:44:19 -0700 Subject: [PATCH 132/141] Revert changes to mapforward_calc_generic --- pygsti/forwardsims/mapforwardsim_calc_generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pygsti/forwardsims/mapforwardsim_calc_generic.py b/pygsti/forwardsims/mapforwardsim_calc_generic.py index 
08b104721..6262e8810 100644 --- a/pygsti/forwardsims/mapforwardsim_calc_generic.py +++ b/pygsti/forwardsims/mapforwardsim_calc_generic.py @@ -184,8 +184,7 @@ def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices, la iFinal = iParamToFinal[param_indices[i]] fwdsim.model.set_parameter_values([param_indices[i-1], param_indices[i]], [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) - vec = fwdsim.model.to_vector() - assert _np.allclose(_np.where(vec != 0), [i]) + #mapfill_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, resource_alloc) cond_update_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, param_indices[i], resource_alloc) #assert _np.linalg.norm(probs2_test-probs2) < 1e-10 From d0152d1e8754487c67477641161074009a48cda8 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:45:05 -0700 Subject: [PATCH 133/141] No actually though. --- pygsti/forwardsims/mapforwardsim_calc_generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pygsti/forwardsims/mapforwardsim_calc_generic.py b/pygsti/forwardsims/mapforwardsim_calc_generic.py index 6262e8810..89e39087d 100644 --- a/pygsti/forwardsims/mapforwardsim_calc_generic.py +++ b/pygsti/forwardsims/mapforwardsim_calc_generic.py @@ -184,7 +184,6 @@ def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices, la iFinal = iParamToFinal[param_indices[i]] fwdsim.model.set_parameter_values([param_indices[i-1], param_indices[i]], [orig_vec[param_indices[i-1]], orig_vec[param_indices[i]]+eps]) - #mapfill_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, resource_alloc) cond_update_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, param_indices[i], resource_alloc) #assert _np.linalg.norm(probs2_test-probs2) < 1e-10 From 58d97e04e0de0924fa4aa058cb56c726299a2e16 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 13:48:27 -0700 Subject: [PATCH 134/141] Matrix layout update. 
--- pygsti/layouts/matrixlayout.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pygsti/layouts/matrixlayout.py b/pygsti/layouts/matrixlayout.py index 09bcb346d..eac3df439 100644 --- a/pygsti/layouts/matrixlayout.py +++ b/pygsti/layouts/matrixlayout.py @@ -10,16 +10,16 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. #*************************************************************************************************** -import collections as _collections import numpy as _np from pygsti.layouts.distlayout import DistributableCOPALayout as _DistributableCOPALayout from pygsti.layouts.distlayout import _DistributableAtom -from pygsti.layouts.evaltree import CollectionOfLCSEvalTrees as _CollectionOfLCSEvalTrees -from pygsti.layouts.evaltree import EvalTreeBasedUponLongestCommonSubstring as _EvalTreeLCS -from pygsti.layouts.evaltree import EvalTree as _EvalTree -from pygsti.layouts.evaltree import setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations +from pygsti.layouts.evaltree import ( + CollectionOfLCSEvalTrees as _CollectionOfLCSEvalTrees, + EvalTree as _EvalTree, + setup_circuit_list_for_LCS_computations as _setup_circuit_list_for_LCS_computations, +) from pygsti.circuits.circuitlist import CircuitList as _CircuitList from pygsti.tools import listtools as _lt from pygsti.tools import slicetools as _slct @@ -102,7 +102,7 @@ def add_expanded_circuits(indices, add_to_this_dict): #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to - # keep track of a list of unique_i indices for each circut and spam tuple below. + # keep track of a list of unique_i indices for each circuit and spam tuple below. 
if exp_nospam_c not in _expanded_nospam_circuit_outcomes: _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)} else: @@ -304,7 +304,7 @@ def add_expanded_circuits(indices, add_to_this_dict): #Now add these outcomes to `expanded_nospam_circuit_outcomes` - note that multiple "unique_i"'s # may exist for the same expanded & without-spam circuit (exp_nospam_c) and so we need to - # keep track of a list of unique_i indices for each circut and spam tuple below. + # keep track of a list of unique_i indices for each circuit and spam tuple below. if exp_nospam_c not in _expanded_nospam_circuit_outcomes: _expanded_nospam_circuit_outcomes[exp_nospam_c] = {st:(outcome, [unique_i]) for st, outcome in zip(spam_tuples, outcomes)} else: @@ -479,7 +479,7 @@ class MatrixCOPALayout(_DistributableCOPALayout): A 1- or 2-tuple of integers specifying how many parameter-block processors are used when dividing the physical processors into a grid. The first and second elements correspond to counts for the first and second parameter dimensions, - respecively. + respectively. param_dimensions : tuple, optional The number of parameters along each parameter dimension. Can be an @@ -510,7 +510,7 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p layout_creation_circuit_cache = None, use_old_tree_style: bool = True): if not use_old_tree_style: - # NOTE: ERrror out if we are useing new tree and have an explicit op model. Explain why this is bad. + # NOTE: Error out if we are using new tree and have an explicit op model. Explain why this is bad. from pygsti.models import ExplicitOpModel, ImplicitOpModel if isinstance(model, ExplicitOpModel): raise ValueError(f"Model: {model.__class__} does not support creation of embedded op process matrices." 
+ @@ -525,7 +525,7 @@ def __init__(self, circuits, model, dataset=None, num_sub_trees=None, num_tree_p # - heuristically find groups of circuits that meet criteria # 3. separately create a tree of no-spam expanded circuits originating from each group => self.atoms # 4. assign "cache" and element indices so that a) all elements of a tree are contiguous - # and b) elements with the same spam-tuple are continguous. + # and b) elements with the same spam-tuple are contiguous. # 5. initialize base class with given per-original-circuit element indices. unique_circuits, to_unique = self._compute_unique_circuits(circuits) From 30ee59fa159c5a3895e68a5321817528c60a0315 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 14:12:31 -0700 Subject: [PATCH 135/141] Types for sequencetools --- pygsti/tools/sequencetools.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index 9c859cab1..a4d0e210c 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,4 +1,4 @@ -from typing import Sequence, Any, List, Literal, Tuple, MutableSequence, Optional +from typing import Sequence, Any, List, Tuple, MutableSequence, Optional import numpy as _np from tqdm import tqdm @@ -40,10 +40,12 @@ def _lcs_dp_version(A: Sequence, B: Sequence) -> _np.ndarray: return table -def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSequence[Any]], table_data_and_sequences, - internal_tables_and_sequences, - starting_cache_num, - cache_struct, sequence_ind_to_cache_ind: Optional[dict[int, int]] = None): +def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSequence[Any]], + table_data_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]], + internal_tables_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]], + starting_cache_num: int, + cache_struct: dict[int, Any], + 
sequence_ind_to_cache_ind: Optional[dict[int, int]] = None): """ Simplify the set of sequences by contracting the set of longest common subsequences. @@ -141,10 +143,14 @@ def conduct_one_round_of_lcs_simplification(sequences: MutableSequence[MutableSe return updated_sequences, cache_num, cache_struct, sequences_introduced_in_this_round, table, external_sequences, dirty_inds def simplify_internal_first_one_round(sequences: MutableSequence[MutableSequence[Any]], - internal_tables_and_sequences, starting_cache_num, cache_struct, + internal_tables_and_sequences: tuple[_np.ndarray, dict[tuple[int, int], Sequence[Any]]], + starting_cache_num: int, + cache_struct: dict[int, Any], seq_ind_to_cache_ind: Optional[dict[int, int]]): """ - Simplify the set of sequences by contracting the set of longest common subsequences. + Simplify the set of sequences by contracting the set of longest common subsequences internal subsequences. + + e.g. ["AAAA"] will be replaced with cache_num cache_num. But ["BAR", "BAC"] will not update here because "BA" is split between 2 sequences. Will update the list of sequences and the cache struct to hold the longest common subsequences as new sequences. From dd402cf5d29163400ba5b2475b2c6ecf8114ea90 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 14:16:34 -0700 Subject: [PATCH 136/141] Add licensing info. --- pygsti/tools/sequencetools.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pygsti/tools/sequencetools.py b/pygsti/tools/sequencetools.py index a4d0e210c..1fc6950dc 100644 --- a/pygsti/tools/sequencetools.py +++ b/pygsti/tools/sequencetools.py @@ -1,8 +1,19 @@ +""" +Tools for finding and using the longest common substrings in order to cache and evaluation order. +""" +#*************************************************************************************************** +# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS). 
+# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights +# in this software. +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. +#*************************************************************************************************** + from typing import Sequence, Any, List, Tuple, MutableSequence, Optional import numpy as _np from tqdm import tqdm -#region Longest Common Subsequence def len_lcp(A: Sequence, B: Sequence) -> int: """ @@ -349,5 +360,3 @@ def create_tables_for_internal_LCS( the_table[i], seq_table[i] = _longest_common_internal_subsequence(sequences[i]) curr_best = max(curr_best, the_table[i]) return the_table, seq_table - -#endregion Longest Common Subsequence From 343173073e41fb16ec5cd9ee1542425b034391cb Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 14:28:04 -0700 Subject: [PATCH 137/141] Remove code not needed for the PR. --- pygsti/circuits/circuit.py | 40 -------------------------------------- 1 file changed, 40 deletions(-) diff --git a/pygsti/circuits/circuit.py b/pygsti/circuits/circuit.py index 0755af4b4..45bb87cd9 100644 --- a/pygsti/circuits/circuit.py +++ b/pygsti/circuits/circuit.py @@ -2549,46 +2549,6 @@ def replace_layer_with_circuit(self, circuit: Circuit, j): if self._static: cpy.done_editing() return cpy - def replace_spatially_equivalent_qubits(self, old_single_qubit, equiv_qubit_in_model): - """ - Changes the *name* of a gate throughout this Circuit. - - Note that the name is only a part of the label identifying each - gate, and doesn't include the lines (qubits) a gate acts upon. For - example, the "Gx:0" and "Gx:1" labels both have the same name but - act on different qubits. - - Parameters - ---------- - old_single_qubit : int - The qubit to replace. 
- - equiv_qubit_in_model : int - The qubit to replace `equiv_qubit_in_model` with. - - Returns - ------- - None - """ - assert(not self._static), "Cannot edit a read-only circuit!" - - def replace(obj): # obj is either a simple label or a list - if isinstance(obj, _Label): - if len(obj.qubits) == 1: - if obj.qubits[0] == old_single_qubit: - newobj = _Label(obj.name, - (equiv_qubit_in_model,)) - else: - newobj = obj - else: - newobj = obj - else: - newobj = [replace(sub) for sub in obj] - return newobj - - self._labels = replace(self._labels) - - def replace_gatename_inplace(self, old_gatename, new_gatename): """ Changes the *name* of a gate throughout this Circuit. From b4f7044fc55d1182be7b6619a5a6710ca7c50bd0 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 14:55:26 -0700 Subject: [PATCH 138/141] Extract eval tree to new file. --- pygsti/layouts/evaltree.py | 955 +-------------------- pygsti/layouts/longest_common_evaltree.py | 971 ++++++++++++++++++++++ 2 files changed, 976 insertions(+), 950 deletions(-) create mode 100644 pygsti/layouts/longest_common_evaltree.py diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 8387f407e..81d750277 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -20,8 +20,6 @@ from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter from pygsti.baseobjs.label import LabelTupTup, Label, LabelTup -from pygsti.modelmembers.operations import create_from_superop_mx -from pygsti.modelmembers.operations import LinearOperator as _LinearOperator import itertools from pygsti.tools.sequencetools import ( conduct_one_round_of_lcs_simplification, @@ -29,11 +27,12 @@ create_tables_for_internal_LCS, simplify_internal_first_one_round ) +from pygsti.tools.dyadickronop import KronStructured +from pygsti.circuits.split_circuits_into_lanes import ( + 
compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, + compute_subcircuits +) -from pygsti.modelmembers.operations.dyadickronop import KronStructured - -from pygsti.circuits.split_circuits_into_lanes import compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, compute_subcircuits -import time import scipy.linalg as la import scipy.sparse.linalg as sparla from scipy.sparse import kron as sparse_kron @@ -465,947 +464,3 @@ def _get_start_indices(max_intersect): assert(sum(map(len, disjointLists)) == num_elements), "sub-tree sets are not disjoint!" return disjointLists, helpfulScratchLists - - - - -#region Split circuit list into lists of subcircuits - -def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: str|Label = 'I') -> _Circuit: - """ - Add in explicit idles to the labels for each layer. - """ - - tmp = circuit.copy(editable=True) - num_layers = circuit.num_layers - - for i in range(num_layers): - tmp[i] = Label(tmp.layer_label_with_idles(i, idle_gate_name)) - - if tmp._static: - tmp.done_editing() - return tmp - - - - - -def setup_circuit_list_for_LCS_computations( - circuit_list: list[_Circuit], - implicit_idle_gate_name: str|Label = 'I' - ) -> tuple[ - dict[int, dict[int, _Circuit]], - dict[LayerTupLike, list[tuple[int, int]]], - dict[tuple[int, ...],list[LayerTupLike]] - ]: - """ - Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. - - Also return a sequence detailing the number of lanes in each circuit. - Then, a sequence detailing the number of qubits in each lane for a circuit. - """ - - # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. - # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id. 
- - cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} - sub_cir_to_cir_id_and_lane_id: dict[LayerTupLike, list[tuple[int, int]]] = {} - line_labels_to_layertup_lists: dict[tuple[int, ...], list[LayerTupLike]] = {} - - for i, cir in enumerate(circuit_list): - - if implicit_idle_gate_name: - cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) - - qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir) - sub_cirs = compute_subcircuits(cir, qubit_to_lane, lane_to_qubits) - - if not implicit_idle_gate_name: - if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): - raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. Please add in idle gates explicitly to the circuit.") - - assert len(sub_cirs) == len(lane_to_qubits) - for j in range(len(sub_cirs)): - sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),) - lbls = sc._line_labels - if lbls in line_labels_to_layertup_lists: - line_labels_to_layertup_lists[lbls].append(sc.layertup) - else: - line_labels_to_layertup_lists[lbls] = [sc.layertup] - if sc.layertup in sub_cir_to_cir_id_and_lane_id: - sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j)) - else: - sub_cir_to_cir_id_and_lane_id[sc.layertup] = [(i,j)] - if i in cir_ind_and_lane_id_to_sub_cir: - cir_ind_and_lane_id_to_sub_cir[i][j] = sc - else: - cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} - - return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_layertup_lists - -#endregion Split Circuits by lanes helpers - - -#region Lane Collapsing Helpers - -def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: - """ - Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). 
- """ - if op.num_qubits == 2: - # We may need to do swaps. - op_term : _np.ndarray = _np.array([1.]) - if op in saved: - op_term = saved[op] - elif op.qubits[1] < op.qubits[0]: # type: ignore - # This is in the wrong order. - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - op_term = swap_dense @ (op_term) @ swap_dense.T - saved[op] = op_term # Save so we only need to this operation once. - else: - op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - return op_term - return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - -def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray): - """ - Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). - """ - return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - -def matrix_matrix_cost_estimate(matrix_size: tuple[int, int]) -> int: - """ - Estimate cost of A @ B when both are square and dense. - """ - n = matrix_size[0] - return 2 * n**3 - - -#endregion Lane Collapsing Helpers - - -class EvalTreeBasedUponLongestCommonSubstring(): - - def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): - """ - Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. 
- """ - - self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} - - self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} - self.qubit_start_point = qubit_starting_loc - - - internal_matches = create_tables_for_internal_LCS(circuit_list) - best_internal_match = _np.max(internal_matches[0]) - - max_rounds = best_internal_match - - C = len(circuit_list) - sequence_intro = {0: _np.arange(C)} - - cache_pos = C - cache = {i: circuit_list[i] for i in range(len(circuit_list))} - - new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. - - # Let's try simplifying internally first. - self.internal_first = False - seq_ind_to_cache_index = {i: i for i in range(C)} - if self.internal_first: - i = 0 - cache_pos = -1 - while max_rounds > 1: - - tmp = simplify_internal_first_one_round(new_circuit_list, - internal_matches, - cache_pos, - cache, - seq_ind_to_cache_index) - new_circuit_list, cache_pos, cache, sequence_intro[i-1] = tmp - i -= 1 - internal_matches = create_tables_for_internal_LCS(new_circuit_list) - - max_rounds = _np.max(internal_matches[0]) - external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, - None, - None, - set(_np.arange(len(new_circuit_list))), - max([len(cir) for cir in new_circuit_list])-1) - - - best_external_match = _np.max(external_matches[0]) - - max_rounds = int(max(best_external_match,best_internal_match)) - i = 0 - cache_pos = len(new_circuit_list) - while max_rounds > 1: - tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, - external_matches, - internal_matches, - cache_pos, - cache, - seq_ind_to_cache_index) - new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp - i += 1 - dirty_inds = set(_np.arange(len(new_circuit_list))) # TODO: fix to only correct those which are actually dirty. 
- external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, - ext_table, - external_sequences, - dirty_inds, - max_rounds) - - if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: - # We are not going to get a better internal match. - pass - elif not self.internal_first: - internal_matches = create_tables_for_internal_LCS(new_circuit_list) - - best_external_match = _np.max(external_matches[0]) - best_internal_match = _np.max(internal_matches[0]) - - max_rounds = int(max(best_external_match,best_internal_match)) - - self.cache = cache - self.num_circuits = C - self.from_other = False - - self.sequence_intro = sequence_intro - - from pygsti.modelmembers.operations import StaticStandardOp - self.swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense().round(16) - - self.cache_ind_to_alphabet_vals_referenced: dict[int, set[LabelTupTup]] = {} - - - # Useful for repeated calculations seen in a derivative calculation. - for key in self.cache: - self.compute_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) - - alphabet_val_to_cache_inds_to_update: dict[LabelTup, set[int]] = {} - - for cache_ind, vals in self.cache_ind_to_alphabet_vals_referenced.items(): - for val in vals: - if isinstance(val, LabelTupTup): - for ind_gate in val: - if ind_gate in alphabet_val_to_cache_inds_to_update: - alphabet_val_to_cache_inds_to_update[ind_gate].add(cache_ind) - else: - alphabet_val_to_cache_inds_to_update[ind_gate] = set([cache_ind]) - else: - if val in alphabet_val_to_cache_inds_to_update: - alphabet_val_to_cache_inds_to_update[val].add(cache_ind) - else: - alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) - - self.results: dict[int | LabelTupTup, _np.ndarray] = {} - - self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {} - - self.gpindex_to_cache_vals: dict[int, tuple[list[int], list[Label]]] = {} - # This will be filled later by _gpindex_to_cache_inds_needed_to_recompute when we 
have access to the model. - # Warning that changing the model paramvec will result in this cache becoming invalidated. - # The user is currently in charge of resetting this cache. - - for val, cache_inds in alphabet_val_to_cache_inds_to_update.items(): - rnd_nums = {} - for cache_ind in cache_inds: - for rnd_num in self.sequence_intro: - if cache_ind in self.sequence_intro[rnd_num]: - rnd_nums[cache_ind] = rnd_num - break - - sorted_inds = sorted(cache_inds, key =lambda x : rnd_nums[x])[::-1] # We want to iterate large to small. - - self.alphabet_val_to_sorted_cache_inds[val] = sorted_inds - - - def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: int) -> list[int]: - """ - Given that I change the representation of a gate by modifying this index, gp_index_changing, - what cache indices do I need to recompute and in what order. - """ - if gp_index_changing in self.gpindex_to_cache_vals: - return self.gpindex_to_cache_vals[gp_index_changing] - - cache_inds: list[int] = [] - all_op_inds: list[int] = [] - invalid_lbls: list[Label] = [] - for lbl in self.alphabet_val_to_sorted_cache_inds.keys(): - # my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) - try: - my_op = model.circuit_layer_operator(lbl, "op") # Assumes that layers have the same gpindices as the gates themselves. - except KeyError: - # Skip to the next lbl to check. Do not immediately return None! - continue - op_inds = my_op.gpindices_as_array() - if gp_index_changing in op_inds: - cache_inds.extend(self.alphabet_val_to_sorted_cache_inds[lbl]) - invalid_lbls.append(lbl) # We also invalidate the lbl. 
- all_op_inds.extend(op_inds) - - for ind in all_op_inds: - self.gpindex_to_cache_vals[ind] = (cache_inds, invalid_lbls) - return cache_inds, invalid_lbls - - def _which_full_circuits_will_change_due_to_gpindex_changing(self, model, gp_index_changing: int) -> list[int]: - - cache_inds, _ = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) - - if len(cache_inds) == 0: - return [] - - answer = [ind for ind in range(self.num_circuits) if ind in cache_inds] - return answer - - - def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): - """ - Construct a tree from another tree. - """ - - self.cache = other.cache - self.num_circuits = other.num_circuits - self.sequence_intro = other.sequence_intro - self.swap_gate = other.swap_gate - self.orig_circuit_list = other.orig_circuit_list - self.circuit_to_save_location = other.circuit_to_save_location - self.from_other = other - - for ind in self.cache: - for i, term in enumerate(self.cache[ind]): - if isinstance(term, int): - pass # The tree will stay the same. - elif isinstance(term, LabelTupTup): - new_term = () - for op in term: - new_qu = (qubit_label_exchange[qu] for qu in op.qubits) - new_op = (op.name, *new_qu) - new_term = (*new_term, new_op) - self.cache[ind][i] = Label(new_term) - - - for icir in range(len(self.orig_circuit_list)): - self.orig_circuit_list[icir] = self.trace_through_cache_to_build_circuit(icir) - - updated = {} - for cir, loc in self.circuit_to_save_location.items(): - new_cir = () - for layer in cir: - new_layer = () - for op in layer: - new_op = (op[0], *(qubit_label_exchange[qu] for qu in op[1:])) - new_layer = (*new_layer, new_op) - new_cir = (*new_cir, new_layer) - updated[new_cir] = loc - self.circuit_to_save_location = updated - - def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, gp_index_changing: Optional[int] = None): - """ - Compute the total product cache. 
Note that this may still have a tensor product - structure that the operator needs to combine again if they want to have the full 'dense' matrix. - - If gp_index_changing is not None then we have already computed the results once and we only need to update - those terms which depend on the specific gp_index. - """ - - if gp_index_changing is not None: - - # Dig through the tree to see if we have a matching - - cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) - - if cache_inds: - # Invalidate all gate labels that we saved just in case. - # Invalidate every index in the which we know to be influenced by my_op. - # local_changes = {k: v for k, v in self.results.items() \ - # if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. - - # Iterating over all the cache will take too long. - # So we need to handle the invalidness of certain cache inds when we encounter them. - local_changes = {} - - # Ignore the last index which is the Label that matched the gpindex. - # We assume that only one will match. - for cache_ind in cache_inds: - cumulative_term = None - for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, - num_qubits_in_default, cache_inds, invalid_lbls) - - # Save locally. - if cumulative_term is None: - local_changes[cache_ind] = _np.eye(4**num_qubits_in_default) - # NOTE: unclear when (if ever) this should be a noisy idle gate. - else: - local_changes[cache_ind] = cumulative_term - - return local_changes, self.circuit_to_save_location - - return self.results, self.circuit_to_save_location - - else: - self.results = {} # We are asking to reset all the calculations. 
- round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - # saved: dict[int | LabelTupTup, _np.ndarray] = {} - - if self.internal_first: - - round_keys = _np.unique(list(self.sequence_intro.keys())) - - pos_inds = _np.where(round_keys >0) - pos_keys = round_keys[pos_inds] - pos_keys = sorted(pos_keys)[::-1] - - neg_inds = _np.where(round_keys < 0) - neg_keys = round_keys[neg_inds] - neg_keys = sorted(neg_keys) - - round_keys = pos_keys + neg_keys + _np.array([0]) - - empty = [] - for key in round_keys: - for cache_ind in self.sequence_intro[key]: - cumulative_term = None - for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, - num_qubits_in_default, empty, empty) - - if cumulative_term is None: - self.results[cache_ind] = _np.eye(4**num_qubits_in_default) - # NOTE: unclear when (if ever) this should be a noisy idle gate. - else: - self.results[cache_ind] = cumulative_term - if __debug__: - # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
- for key in self.cache: - assert key in self.results - - # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} - return self.results, self.circuit_to_save_location - - def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: - - if not isinstance(val, int): - return set([val]) - elif val in visited: - return visited[val] - else: - tmp = set() - for child in self.cache[val]: - ret_val = self.compute_depends_on(child, visited) - tmp = tmp.union(ret_val) - visited[val] = tmp - return tmp - - - def combine_for_visualization(self, val, visited): - - if not isinstance(val, int): - return [val] - elif val in visited: - return visited[val] - else: - tmp = [] - for child in self.cache[val]: - tmp.append(self.combine_for_visualization(child, visited)) - visited[val] = tmp - return tmp - - def handle_results_cache_lookup_and_product(self, - cumulative_term: None | _np.ndarray, - term_to_extend_with: int | LabelTupTup, - results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: - - if cumulative_term is None: - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term - - if isinstance(term_to_extend_with, int): - if term_to_extend_with in globally_invalid_cache_inds[:-1]: - # look up the result in the local results cache. - # This is just for that derivative step. - if cumulative_term is None: - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term - else: - if term_to_extend_with in globally_invalid_cache_inds[-1:]: - # Only one label gets invalidated and that is stored at the end of the list. - - # look up the result in the local results cache. - # This is just for that derivative step. 
- if cumulative_term is None: - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term - - # We should use the cache for all the probs calculation. - if cumulative_term is None: - # look up result. - return self.results[term_to_extend_with] - return self.results[term_to_extend_with] @ cumulative_term - - - def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, - term_to_extend_with: int | LabelTupTup, - local_results_cache: dict[int | LabelTupTup, _np.ndarray], - num_qubits_in_default: int, - globally_invalid_cache_inds: Optional[list[int]] = None, - globally_invalid_labels: Optional[list[LabelTupTup]] = None - ) -> _np.ndarray: - """ - Reduce a cache line to a single process matrix. - - This should really only be called from collapse_circuits_to_process_matrices. - - """ - - if (term_to_extend_with in local_results_cache): - return self.handle_results_cache_lookup_and_product(cumulative_term, - term_to_extend_with, - local_results_cache) - elif isinstance(term_to_extend_with, int) and \ - (globally_invalid_cache_inds is not None) and \ - (term_to_extend_with not in globally_invalid_cache_inds) and \ - (term_to_extend_with in self.results): - - return self.handle_results_cache_lookup_and_product(cumulative_term, - term_to_extend_with, - self.results) - elif isinstance(term_to_extend_with, LabelTupTup) and \ - (globally_invalid_labels is not None) and \ - not (any([t in globally_invalid_labels for t in term_to_extend_with])) \ - and (term_to_extend_with in self.results): - return self.handle_results_cache_lookup_and_product(cumulative_term, - term_to_extend_with, - self.results) - - # elif isinstance(term_to_extend_with, LabelTup) and \ - # (term_to_extend_with not in globally_invalid_cache_inds[-1:]) \ - # and (term_to_extend_with in self.results): - # return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - # local_results_cache, globally_invalid_cache_inds) - - 
else: - val = 1 - qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] - if isinstance(term_to_extend_with, int): - breakpoint() - matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, - local_results_cache, self.swap_gate) for op in term_to_extend_with} - qubit_used = [] - for key in matrix_reps.keys(): - qubit_used.extend(key) - - assert len(qubit_used) == len(set(qubit_used)) - unused_qubits = set(qubits_available) - set(qubit_used) - - implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, - Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. - local_results_cache, self.swap_gate) for qu in unused_qubits} - - while qubits_available: - - qu = qubits_available[0] - if qu in unused_qubits: - val = _np.kron(val, implicit_idle_reps[(qu,)]) - qubits_available = qubits_available[1:] - else: - # It must be a part of a non-trivial gate. - gatekey = [key for key in matrix_reps if qu in key][0] - val = _np.kron(val, matrix_reps[gatekey]) - - qubits_available = qubits_available[len(gatekey):] - - local_results_cache[term_to_extend_with] = val - if cumulative_term is None: - return val - # Cache if off. - return local_results_cache[term_to_extend_with] @ cumulative_term - - - def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: - - output = () - for term in self.cache[cache_ind]: - - if isinstance(term, Label): - output = (*output, term) - elif isinstance(term, int): - # Recurse down. - next_term = self.trace_through_cache_to_build_circuit(term) - output = (*output, *next_term) - - return list(output) - - def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = None, gp_index_changing: Optional[int] = None) -> int: - """ - We assume that each matrix matrix multiply is the same size. 
- """ - - assert matrix_size[0] == matrix_size[1] - - total_flop_cost = 0 - if (model is not None) and (gp_index_changing is not None): - - cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) - - else: - cache_inds = list(self.cache.keys()) - - for cache_ind in cache_inds: - num_mm_on_this_cache_line = len(self.cache[cache_ind]) - 1 - total_flop_cost += (matrix_matrix_cost_estimate(matrix_size)) * num_mm_on_this_cache_line - - return total_flop_cost - - -class CollectionOfLCSEvalTrees(): - - def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTupTup]], - sub_cir_to_full_cir_id_and_lane_id, - cir_id_and_lane_id_to_sub_cir): - - self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} - - ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE = False - - size_to_tree: dict[int, tuple[int, ...]] = {} - - self.line_lbls_to_cir_list = line_lbls_to_circuit_list - - starttime = time.time() - for key, vals in our_tqdm(line_lbls_to_circuit_list.items(), " Building Longest Common Substring Caches"): - sub_cirs = [] - for cir in vals: - sub_cirs.append(list(cir)) - if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: - if len(key) not in size_to_tree: - self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) - size_to_tree[len(key)] = key - else: - sample = EvalTreeBasedUponLongestCommonSubstring(sub_cirs[:2]) # Build a small version to be corrected later. 
- other_key = size_to_tree[len(key)] - sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))}) - self.trees[key] = sample - else: - self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0]) - - endtime = time.time() - - print(" Time to compute all the evaluation orders (s): ", endtime - starttime) - - - self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id - self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir - - self.cir_id_to_tensor_order: dict[int, list[list[int], int]] = {} - self.compute_tensor_orders() - - self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {} - self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} - self.original_matrices: dict[int, dict[int, _np.ndarray]] = {} - self.full_matrices: list[KronStructured] = [] - self.process_matrices_which_will_need_to_update_for_index: _np.ndarray = [] - - def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: - - for key in self.trees: - inds, lbls = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) - if len(inds) > 0: - return True - return False - - - def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Optional[int] = None): - """ - Collapse all circuits to their process matrices. If alphabet_piece_changing is not None, then - we assume we have already collapsed this system once before and so only need to update part of the eval tree. - """ - # Just collapse all of them. - - if gp_index_changing is not None: - # We may not need to check all of the lanes. 
- pass - - else: - self.saved_results = {} - - for key in self.trees: - num_qubits = len(key) - tree = self.trees[key] - out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, gp_index_changing) - # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) - self.saved_results[key] = out1 - self.sub_cir_to_ind_in_results[key] = out2 - - def determine_which_circuits_will_update_for_what_gpindices(self, model): - - dirty_circuits = _np.zeros((model.num_params, len(self.trees), len(self.cir_id_and_lane_id_to_sub_cir))) - for ind in range(model.num_params): - - for ikey, key in enumerate(self.trees): - dirty_circuits[ind, ikey, self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, ind)] = 1 - - self.process_matrices_which_will_need_to_update_for_index = dirty_circuits - return dirty_circuits - - def reset_full_matrices_to_base_probs_version(self) -> None: - """ - Any matrix which was updated previously reset to the original version. - """ - - for icir in self.original_matrices: - for lane_in_cir in self.original_matrices[icir]: - self.full_matrices[icir].update_operand(lane_in_cir, self.original_matrices[icir][lane_in_cir]) - self.original_matrices = {} - return - - - def reconstruct_full_matrices(self, - model = None, - gp_index_changing: Optional[int] = None) -> \ - Optional[Tuple[List[Union[KronStructured, _np.ndarray]], List[int]]]: - """ - Construct a tensor product structure for each individual circuit - """ - - if len(self.saved_results) == 0: - return - - num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) - cir_inds = _np.arange(num_cirs, dtype=_np.int32) - if (gp_index_changing is not None) and (model is not None): - - cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
- - lane_key_to_ind: dict[tuple[int, ...], int] = {key: ikey for ikey, key in enumerate(self.trees)} - - output = [] - if len(cir_inds) > 0: - self.original_matrices = {} # Reset the cache of updated process matrices. - - for icir in cir_inds: - lane_circuits = [] - for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): - cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] - lblkey = cir._line_labels - - ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - if ind_in_results not in self.saved_results[lblkey]: - # We have only the local changes. - # This will be stored in the results file of the subtree. - lane_circuits.append(self.trees[lblkey].results[ind_in_results]) - else: - lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - if len(lane_circuits) > 1: - output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) - # output.append(KronStructured(lane_circuits)) - elif len(lane_circuits) == 1: - output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). - else: - raise ValueError() - - return output, cir_inds - - else: - output = [] - - - # Now we can do the combination. - - for icir in cir_inds: - lane_circuits = [] - for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): - cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] - lblkey = cir._line_labels - - ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] - if ind_in_results not in self.saved_results[lblkey]: - # We have only the local changes. - # This will be stored in the results file of the subtree. 
- lane_circuits.append(self.trees[lblkey].results[ind_in_results]) - else: - lane_circuits.append(self.saved_results[lblkey][ind_in_results]) - if len(lane_circuits) > 1: - output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) - # output.append(KronStructured(lane_circuits)) - elif len(lane_circuits) == 1: - output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). - else: - raise ValueError() - - self.full_matrices = output - return output, cir_inds - - - def recurse_to_build_sparse_kron_matrix(self, operands: list[_np.ndarray]): - if len(operands) == 1: - return operands[0] - return sparse_kron(operands[0], self.recurse_to_build_sparse_kron_matrix(operands[1:])) - - - def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False, model = None, gp_index_changing: Optional[int] = None): - - - cost_collapse = 0 - for key in self.trees: - num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. - tree = self.trees[key] - cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits]), model, gp_index_changing) - - - tensor_cost = 0 - num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) - cir_inds = _np.arange(num_cirs, dtype=_np.int32) - - if (model is not None) and (gp_index_changing is not None): - - dirty_circuits = self.determine_which_circuits_will_update_for_what_gpindices(model) - cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
- - - for cir_id in cir_inds: - qubit_list = () - for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): - subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] - qubit_list = (*qubit_list, len(subcir._line_labels)) - qubit_list = list(qubit_list) - total_num = _np.sum(qubit_list) - - tensor_cost += cost_to_compute_tensor_matvec_without_reordering(qubit_list, total_num) - - if return_collapse: - return tensor_cost + cost_collapse, cost_collapse - elif return_tensor_matvec: - return tensor_cost + cost_collapse, tensor_cost - elif gp_index_changing is not None: - return tensor_cost + cost_collapse, len(cir_inds) # Since you are not updating all of the representations we do not need to update the state props either for those. - - return tensor_cost + cost_collapse - - def compute_tensor_orders(self): - - num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) - - cache_struct = {} - - for cir_id in range(num_cirs): - qubit_list = () - for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])): - subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id] - qubit_list = (*qubit_list, len(subcir._line_labels)) - self.cir_id_to_tensor_order[cir_id] = self.best_order_for_tensor_contraction(qubit_list, cache_struct) - - return - - def best_order_for_tensor_contraction(self, - qubit_list: tuple[int, ...], - cache: dict[tuple[int, ...], tuple[list[int], int]]) -> tuple[list[int], int]: - """ - Find the tensor contraction order that minizes the cost of contracting to a dense system with - a total number of qubits equal to the len(qubit_list) - """ - - - if qubit_list in cache: - return cache[qubit_list] - - best_cost = _np.inf - best_order = [] - - for order in itertools.permutations(range(len(qubit_list)-1), len(qubit_list)-1): - - my_list = [qb for qb in qubit_list] # force deep copy. 
- my_starting_points = [sp for sp in order] - cost = 0 - early_exit = False - while my_starting_points and not early_exit: - sp = my_starting_points.pop(0) - - cost += self._tensor_cost_model(my_list[sp], my_list[sp+1]) - if cost <= best_cost: - # modify sp for future. - tmp = [] - for new_val in my_starting_points: - tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) - my_starting_points = tmp - - q2 = my_list.pop(sp+1) - my_list[sp] += q2 - else: - early_exit = True # This round is done because the partial sum was too big. - - if cost < best_cost and not early_exit: - best_cost = cost - best_order = list(order) - - # Store off the information. - cache[qubit_list] = best_order, best_cost - - return best_order, best_cost - - def _tensor_cost_model(self, num_qubits1, num_qubits2): - """ - Assumes kronecker product of 2 square matrices. - """ - - return (4**num_qubits1)**2 * (4**num_qubits2)**2 - - def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[int, list[int], list[int]]: - """ - Compute the number of flops needed to collapse each circuit into a single process matrix. - - Returns: - --------- - cost - int total cost to collapse and reform - collapse_lane_cost - list[int] cost to collapse a lane - tensor_cost - list[int] cost to recombine a circuit into its full size. 
- """ - - - num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) - - collapse_lane_cost = [] - - for lbl_key, my_tree in self.trees.items(): - collapse_lane_cost.append(my_tree.flop_cost_of_evaluating_tree([4**len(lbl_key), 4**len(lbl_key)])) - - tensor_cost = [] - for icir in range(num_cirs): - - _order, cost = self.cir_id_to_tensor_order[icir] - tensor_cost.append(cost) - - return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost - - - - - - -def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int: - - assert _np.sum(qubit_list) == total_num_qubits - - if len(qubit_list) == 1: - # Basic matvec. - cost = 2 * (4**qubit_list[0]**2) - return cost - - elif len(qubit_list) == 2: - # vec((A \tensor B) u) = vec(B U A.T) - term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. - term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T - return term1 + term2 - - else: - # Just pop off the last term - # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T - - right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) - right *= 4**(_np.sum(qubit_list[1:])) - left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], - total_num_qubits - qubit_list[0]) - left *= 4**(qubit_list[0]) - return left + right \ No newline at end of file diff --git a/pygsti/layouts/longest_common_evaltree.py b/pygsti/layouts/longest_common_evaltree.py new file mode 100644 index 000000000..e9ccc4f96 --- /dev/null +++ b/pygsti/layouts/longest_common_evaltree.py @@ -0,0 +1,971 @@ +""" +Defines the EvalTree class. +""" +#*************************************************************************************************** +# Copyright 2015, 2019, 2025 National Technology & Engineering Solutions of Sandia, LLC (NTESS). +# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights +# in this software. 
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. +#*************************************************************************************************** + +from __future__ import annotations +import time as _time # DEBUG TIMERS + +import numpy as _np + +from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike +from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter +from pygsti.baseobjs.label import LabelTupTup, Label, LabelTup +import itertools +from pygsti.tools.sequencetools import ( + conduct_one_round_of_lcs_simplification, + _compute_lcs_for_every_pair_of_sequences, + create_tables_for_internal_LCS, + simplify_internal_first_one_round +) +from pygsti.tools.dyadickronop import KronStructured +from pygsti.circuits.split_circuits_into_lanes import ( + compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, + compute_subcircuits +) + +from scipy.sparse import kron as sparse_kron +from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple +from pygsti.tools.tqdm import our_tqdm + +#region Split circuit list into lists of subcircuits + +def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: Union[str, Label] = 'I') -> _Circuit: + """ + Add in explicit idles to the labels for each layer. 
+ """ + + tmp = circuit.copy(editable=True) + num_layers = circuit.num_layers + + for i in range(num_layers): + tmp[i] = Label(tmp.layer_label_with_idles(i, idle_gate_name)) + + if tmp._static: + tmp.done_editing() + return tmp + + + + + +def setup_circuit_list_for_LCS_computations( + circuit_list: list[_Circuit], + implicit_idle_gate_name: Union[str, Label] = 'I' + ) -> tuple[ + dict[int, dict[int, _Circuit]], + dict[LayerTupLike, list[tuple[int, int]]], + dict[tuple[int, ...],list[LayerTupLike]] + ]: + """ + Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. + + Also return a sequence detailing the number of lanes in each circuit. + Then, a sequence detailing the number of qubits in each lane for a circuit. + """ + + # We want to split the circuit list into a dictionary of subcircuits where each sub_cir in the dict[key] act exclusively on the same qubits. + # I need a mapping from subcircuit to actual circuit. This is uniquely defined by circuit_id and then lane id. + + cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} + sub_cir_to_cir_id_and_lane_id: dict[LayerTupLike, list[tuple[int, int]]] = {} + line_labels_to_layertup_lists: dict[tuple[int, ...], list[LayerTupLike]] = {} + + for i, cir in enumerate(circuit_list): + + if implicit_idle_gate_name: + cir = _add_in_idle_gates_to_circuit(cir, implicit_idle_gate_name) + + qubit_to_lane, lane_to_qubits = compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit(cir) + sub_cirs = compute_subcircuits(cir, qubit_to_lane, lane_to_qubits) + + if not implicit_idle_gate_name: + if not all([len(sc) == len(sub_cirs[0]) for sc in sub_cirs]): + raise ValueError("Each lane does not have the same number of layers. Therefore, a lane has an implicit idle gate. 
Please add in idle gates explicitly to the circuit.") + + assert len(sub_cirs) == len(lane_to_qubits) + for j in range(len(sub_cirs)): + sc = _Circuit(sub_cirs[j],line_labels=tuple(lane_to_qubits[j]),) + lbls = sc._line_labels + if lbls in line_labels_to_layertup_lists: + line_labels_to_layertup_lists[lbls].append(sc.layertup) + else: + line_labels_to_layertup_lists[lbls] = [sc.layertup] + if sc.layertup in sub_cir_to_cir_id_and_lane_id: + sub_cir_to_cir_id_and_lane_id[sc.layertup].append((i,j)) + else: + sub_cir_to_cir_id_and_lane_id[sc.layertup] = [(i,j)] + if i in cir_ind_and_lane_id_to_sub_cir: + cir_ind_and_lane_id_to_sub_cir[i][j] = sc + else: + cir_ind_and_lane_id_to_sub_cir[i] = {j: sc} + + return cir_ind_and_lane_id_to_sub_cir, sub_cir_to_cir_id_and_lane_id, line_labels_to_layertup_lists + +#endregion Split Circuits by lanes helpers + + +#region Lane Collapsing Helpers + +def get_dense_representation_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray) -> _np.ndarray: + """ + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). + """ + if op.num_qubits == 2: + # We may need to do swaps. + op_term : _np.ndarray = _np.array([1.]) + if op in saved: + op_term = saved[op] + elif op.qubits[1] < op.qubits[0]: # type: ignore + # This is in the wrong order. + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + op_term = swap_dense @ (op_term) @ swap_dense.T + saved[op] = op_term # Save so we only need to this operation once. 
+ else: + op_term = model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + return op_term + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + +def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dict[int | LabelTup | LabelTupTup, _np.ndarray], swap_dense: _np.ndarray): + """ + Assumes that a gate which operates on 2 qubits does not have the right orientation if label is (qu_{i+1}, qu_i). + """ + return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) + + +#endregion Lane Collapsing Helpers + + +class EvalTreeBasedUponLongestCommonSubstring(): + + def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): + """ + Construct an evaluation order tree for a circuit list that minimizes the number of rounds of computation. + """ + + self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} + + self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} + self.qubit_start_point = qubit_starting_loc + + + internal_matches = create_tables_for_internal_LCS(circuit_list) + best_internal_match = _np.max(internal_matches[0]) + + max_rounds = best_internal_match + + C = len(circuit_list) + sequence_intro = {0: _np.arange(C)} + + cache_pos = C + cache = {i: circuit_list[i] for i in range(len(circuit_list))} + + new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. + + # Let's try simplifying internally first. + self.internal_first = False # TODO: Fix. 
+ seq_ind_to_cache_index = {i: i for i in range(C)} + if self.internal_first: + i = 0 + cache_pos = -1 + while max_rounds > 1: + + tmp = simplify_internal_first_one_round(new_circuit_list, + internal_matches, + cache_pos, + cache, + seq_ind_to_cache_index) + new_circuit_list, cache_pos, cache, sequence_intro[i-1] = tmp + i -= 1 + internal_matches = create_tables_for_internal_LCS(new_circuit_list) + + max_rounds = _np.max(internal_matches[0]) + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, + None, + None, + set(_np.arange(len(new_circuit_list))), + max([len(cir) for cir in new_circuit_list])-1) + + + best_external_match = _np.max(external_matches[0]) + + max_rounds = int(max(best_external_match,best_internal_match)) + i = 0 + cache_pos = len(new_circuit_list) + while max_rounds > 1: + tmp = conduct_one_round_of_lcs_simplification(new_circuit_list, + external_matches, + internal_matches, + cache_pos, + cache, + seq_ind_to_cache_index) + new_circuit_list, cache_pos, cache, sequence_intro[i+1], ext_table, external_sequences, dirty_inds = tmp + i += 1 + dirty_inds = set(_np.arange(len(new_circuit_list))) # TODO: fix to only correct those which are actually dirty. + external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, + ext_table, + external_sequences, + dirty_inds, + max_rounds) + + if best_internal_match < best_external_match and best_external_match < 2 * best_internal_match: + # We are not going to get a better internal match. 
+ pass + elif not self.internal_first: + internal_matches = create_tables_for_internal_LCS(new_circuit_list) + + best_external_match = _np.max(external_matches[0]) + best_internal_match = _np.max(internal_matches[0]) + + max_rounds = int(max(best_external_match,best_internal_match)) + + self.cache = cache + self.num_circuits = C + self.from_other = False + + self.sequence_intro = sequence_intro + + from pygsti.modelmembers.operations import StaticStandardOp + self.swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense().round(16) + + self.cache_ind_to_alphabet_vals_referenced: dict[int, set[LabelTupTup]] = {} + + + # Useful for repeated calculations seen in a derivative calculation. + for key in self.cache: + self.compute_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) + + alphabet_val_to_cache_inds_to_update: dict[LabelTup, set[int]] = {} + + for cache_ind, vals in self.cache_ind_to_alphabet_vals_referenced.items(): + for val in vals: + if isinstance(val, LabelTupTup): + for ind_gate in val: + if ind_gate in alphabet_val_to_cache_inds_to_update: + alphabet_val_to_cache_inds_to_update[ind_gate].add(cache_ind) + else: + alphabet_val_to_cache_inds_to_update[ind_gate] = set([cache_ind]) + else: + if val in alphabet_val_to_cache_inds_to_update: + alphabet_val_to_cache_inds_to_update[val].add(cache_ind) + else: + alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) + + self.results: dict[int | LabelTupTup, _np.ndarray] = {} + + self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {} + + self.gpindex_to_cache_vals: dict[int, tuple[list[int], list[Label]]] = {} + # This will be filled later by _gpindex_to_cache_inds_needed_to_recompute when we have access to the model. + # Warning that changing the model paramvec will result in this cache becoming invalidated. + # The user is currently in charge of resetting this cache. 
+ + for val, cache_inds in alphabet_val_to_cache_inds_to_update.items(): + rnd_nums = {} + for cache_ind in cache_inds: + for rnd_num in self.sequence_intro: + if cache_ind in self.sequence_intro[rnd_num]: + rnd_nums[cache_ind] = rnd_num + break + + sorted_inds = sorted(cache_inds, key =lambda x : rnd_nums[x])[::-1] # We want to iterate large to small. + + self.alphabet_val_to_sorted_cache_inds[val] = sorted_inds + + + def _gpindex_to_cache_inds_needed_to_recompute(self, model, gp_index_changing: int) -> list[int]: + """ + Given that I change the representation of a gate by modifying this index, gp_index_changing, + what cache indices do I need to recompute and in what order. + """ + if gp_index_changing in self.gpindex_to_cache_vals: + return self.gpindex_to_cache_vals[gp_index_changing] + + cache_inds: list[int] = [] + all_op_inds: list[int] = [] + invalid_lbls: list[Label] = [] + for lbl in self.alphabet_val_to_sorted_cache_inds.keys(): + # my_op = get_dense_op_of_gate_with_perfect_swap_gates(model, lbl, None, None) + try: + my_op = model.circuit_layer_operator(lbl, "op") # Assumes that layers have the same gpindices as the gates themselves. + except KeyError: + # Skip to the next lbl to check. Do not immediately return None! + continue + op_inds = my_op.gpindices_as_array() + if gp_index_changing in op_inds: + cache_inds.extend(self.alphabet_val_to_sorted_cache_inds[lbl]) + invalid_lbls.append(lbl) # We also invalidate the lbl. 
+ all_op_inds.extend(op_inds) + + for ind in all_op_inds: + self.gpindex_to_cache_vals[ind] = (cache_inds, invalid_lbls) + return cache_inds, invalid_lbls + + def _which_full_circuits_will_change_due_to_gpindex_changing(self, model, gp_index_changing: int) -> list[int]: + + cache_inds, _ = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + if len(cache_inds) == 0: + return [] + + answer = [ind for ind in range(self.num_circuits) if ind in cache_inds] + return answer + + + def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, qubit_label_exchange: dict[int, int]): + """ + Construct a tree from another tree. + """ + + self.cache = other.cache + self.num_circuits = other.num_circuits + self.sequence_intro = other.sequence_intro + self.swap_gate = other.swap_gate + self.orig_circuit_list = other.orig_circuit_list + self.circuit_to_save_location = other.circuit_to_save_location + self.from_other = other + + for ind in self.cache: + for i, term in enumerate(self.cache[ind]): + if isinstance(term, int): + pass # The tree will stay the same. + elif isinstance(term, LabelTupTup): + new_term = () + for op in term: + new_qu = (qubit_label_exchange[qu] for qu in op.qubits) + new_op = (op.name, *new_qu) + new_term = (*new_term, new_op) + self.cache[ind][i] = Label(new_term) + + + for icir in range(len(self.orig_circuit_list)): + self.orig_circuit_list[icir] = self.trace_through_cache_to_build_circuit(icir) + + updated = {} + for cir, loc in self.circuit_to_save_location.items(): + new_cir = () + for layer in cir: + new_layer = () + for op in layer: + new_op = (op[0], *(qubit_label_exchange[qu] for qu in op[1:])) + new_layer = (*new_layer, new_op) + new_cir = (*new_cir, new_layer) + updated[new_cir] = loc + self.circuit_to_save_location = updated + + def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: int, gp_index_changing: Optional[int] = None): + """ + Compute the total product cache. 
Note that this may still have a tensor product + structure that the operator needs to combine again if they want to have the full 'dense' matrix. + + If gp_index_changing is not None then we have already computed the results once and we only need to update + those terms which depend on the specific gp_index. + """ + + if gp_index_changing is not None: + + # Dig through the tree to see if we have a matching + + cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + if cache_inds: + # Invalidate all gate labels that we saved just in case. + # Invalidate every index in the which we know to be influenced by my_op. + # local_changes = {k: v for k, v in self.results.items() \ + # if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. + + # Iterating over all the cache will take too long. + # So we need to handle the invalidness of certain cache inds when we encounter them. + local_changes = {} + + # Ignore the last index which is the Label that matched the gpindex. + # We assume that only one will match. + for cache_ind in cache_inds: + cumulative_term = None + for term in self.cache[cache_ind]: + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, local_changes, + num_qubits_in_default, cache_inds, invalid_lbls) + + # Save locally. + if cumulative_term is None: + local_changes[cache_ind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. + else: + local_changes[cache_ind] = cumulative_term + + return local_changes, self.circuit_to_save_location + + return self.results, self.circuit_to_save_location + + else: + self.results = {} # We are asking to reset all the calculations. 
+ round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] + # saved: dict[int | LabelTupTup, _np.ndarray] = {} + + if self.internal_first: + + round_keys = _np.unique(list(self.sequence_intro.keys())) + + pos_inds = _np.where(round_keys >0) + pos_keys = round_keys[pos_inds] + pos_keys = sorted(pos_keys)[::-1] + + neg_inds = _np.where(round_keys < 0) + neg_keys = round_keys[neg_inds] + neg_keys = sorted(neg_keys) + + round_keys = pos_keys + neg_keys + _np.array([0]) + + empty = [] + for key in round_keys: + for cache_ind in self.sequence_intro[key]: + cumulative_term = None + for term in self.cache[cache_ind]: + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, + num_qubits_in_default, empty, empty) + + if cumulative_term is None: + self.results[cache_ind] = _np.eye(4**num_qubits_in_default) + # NOTE: unclear when (if ever) this should be a noisy idle gate. + else: + self.results[cache_ind] = cumulative_term + if __debug__: + # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
+ for key in self.cache: + assert key in self.results + + # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} + return self.results, self.circuit_to_save_location + + def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: + + if not isinstance(val, int): + return set([val]) + elif val in visited: + return visited[val] + else: + tmp = set() + for child in self.cache[val]: + ret_val = self.compute_depends_on(child, visited) + tmp = tmp.union(ret_val) + visited[val] = tmp + return tmp + + + def combine_for_visualization(self, val, visited): + + if not isinstance(val, int): + return [val] + elif val in visited: + return visited[val] + else: + tmp = [] + for child in self.cache[val]: + tmp.append(self.combine_for_visualization(child, visited)) + visited[val] = tmp + return tmp + + def handle_results_cache_lookup_and_product(self, + cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: + + if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + + if isinstance(term_to_extend_with, int): + if term_to_extend_with in globally_invalid_cache_inds[:-1]: + # look up the result in the local results cache. + # This is just for that derivative step. + if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + else: + if term_to_extend_with in globally_invalid_cache_inds[-1:]: + # Only one label gets invalidated and that is stored at the end of the list. + + # look up the result in the local results cache. + # This is just for that derivative step. 
+ if cumulative_term is None: + return results_cache[term_to_extend_with] + return results_cache[term_to_extend_with] @ cumulative_term + + # We should use the cache for all the probs calculation. + if cumulative_term is None: + # look up result. + return self.results[term_to_extend_with] + return self.results[term_to_extend_with] @ cumulative_term + + + def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, + term_to_extend_with: int | LabelTupTup, + local_results_cache: dict[int | LabelTupTup, _np.ndarray], + num_qubits_in_default: int, + globally_invalid_cache_inds: Optional[list[int]] = None, + globally_invalid_labels: Optional[list[LabelTupTup]] = None + ) -> _np.ndarray: + """ + Reduce a cache line to a single process matrix. + + This should really only be called from collapse_circuits_to_process_matrices. + + """ + + if (term_to_extend_with in local_results_cache): + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + local_results_cache) + elif isinstance(term_to_extend_with, int) and \ + (globally_invalid_cache_inds is not None) and \ + (term_to_extend_with not in globally_invalid_cache_inds) and \ + (term_to_extend_with in self.results): + + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + self.results) + elif isinstance(term_to_extend_with, LabelTupTup) and \ + (globally_invalid_labels is not None) and \ + not (any([t in globally_invalid_labels for t in term_to_extend_with])) \ + and (term_to_extend_with in self.results): + return self.handle_results_cache_lookup_and_product(cumulative_term, + term_to_extend_with, + self.results) + + # elif isinstance(term_to_extend_with, LabelTup) and \ + # (term_to_extend_with not in globally_invalid_cache_inds[-1:]) \ + # and (term_to_extend_with in self.results): + # return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, + # local_results_cache, globally_invalid_cache_inds) + + 
else: + val = 1 + qubits_available = [i + self.qubit_start_point for i in range(num_qubits_in_default)] + if isinstance(term_to_extend_with, int): + breakpoint() + matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, + local_results_cache, self.swap_gate) for op in term_to_extend_with} + qubit_used = [] + for key in matrix_reps.keys(): + qubit_used.extend(key) + + assert len(qubit_used) == len(set(qubit_used)) + unused_qubits = set(qubits_available) - set(qubit_used) + + implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, + Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. + local_results_cache, self.swap_gate) for qu in unused_qubits} + + while qubits_available: + + qu = qubits_available[0] + if qu in unused_qubits: + val = _np.kron(val, implicit_idle_reps[(qu,)]) + qubits_available = qubits_available[1:] + else: + # It must be a part of a non-trivial gate. + gatekey = [key for key in matrix_reps if qu in key][0] + val = _np.kron(val, matrix_reps[gatekey]) + + qubits_available = qubits_available[len(gatekey):] + + local_results_cache[term_to_extend_with] = val + if cumulative_term is None: + return val + # Cache if off. + return local_results_cache[term_to_extend_with] @ cumulative_term + + + def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: + + output = () + for term in self.cache[cache_ind]: + + if isinstance(term, Label): + output = (*output, term) + elif isinstance(term, int): + # Recurse down. + next_term = self.trace_through_cache_to_build_circuit(term) + output = (*output, *next_term) + + return list(output) + + def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = None, gp_index_changing: Optional[int] = None) -> int: + """ + We assume that each matrix matrix multiply is the same size. 
+ """ + + assert matrix_size[0] == matrix_size[1] + + total_flop_cost = 0 + if (model is not None) and (gp_index_changing is not None): + + cache_inds, invalid_lbls = self._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) + + else: + cache_inds = list(self.cache.keys()) + + for cache_ind in cache_inds: + num_mm_on_this_cache_line = len(self.cache[cache_ind]) - 1 + total_flop_cost += (2* (matrix_size[0])**3) * num_mm_on_this_cache_line + + return total_flop_cost + + +class CollectionOfLCSEvalTrees(): + + def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTupTup]], + sub_cir_to_full_cir_id_and_lane_id, + cir_id_and_lane_id_to_sub_cir): + + self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} + + ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE = False + + size_to_tree: dict[int, tuple[int, ...]] = {} + + self.line_lbls_to_cir_list = line_lbls_to_circuit_list + + starttime = _time.time() + for key, vals in our_tqdm(line_lbls_to_circuit_list.items(), " Building Longest Common Substring Caches"): + sub_cirs = [] + for cir in vals: + sub_cirs.append(list(cir)) + if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: + if len(key) not in size_to_tree: + self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) + size_to_tree[len(key)] = key + else: + sample = EvalTreeBasedUponLongestCommonSubstring(sub_cirs[:2]) # Build a small version to be corrected later. 
+                other_key = size_to_tree[len(key)]
+                sample.from_other_eval_tree(self.trees[other_key], {other_key[i]: key[i] for i in range(len(key))})
+                self.trees[key] = sample
+            else:
+                self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs, sorted(key)[0])
+
+        endtime = _time.time()
+
+        print("    Time to compute all the evaluation orders (s): ", endtime - starttime)
+
+
+        self.sub_cir_to_full_cir_id_and_lane_id = sub_cir_to_full_cir_id_and_lane_id
+        self.cir_id_and_lane_id_to_sub_cir = cir_id_and_lane_id_to_sub_cir
+
+        self.cir_id_to_tensor_order: dict[int, list[list[int], int]] = {}
+        self.compute_tensor_orders()
+
+        self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {}
+        self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {}
+        self.original_matrices: dict[int, dict[int, _np.ndarray]] = {}
+        self.full_matrices: list[KronStructured] = []
+        self.process_matrices_which_will_need_to_update_for_index: _np.ndarray = []
+
+    def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool:
+
+        for key in self.trees:
+            inds, lbls = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing)
+            if len(inds) > 0:
+                return True
+        return False
+
+
+    def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Optional[int] = None):
+        """
+        Collapse all circuits to their process matrices. If gp_index_changing is not None, then
+        we assume we have already collapsed this system once before and so only need to update part of the eval tree.
+        """
+        # Just collapse all of them.
+
+        if gp_index_changing is not None:
+            # We may not need to check all of the lanes. 
+ pass + + else: + self.saved_results = {} + + for key in self.trees: + num_qubits = len(key) + tree = self.trees[key] + out1, out2 = tree.collapse_circuits_to_process_matrices(model, num_qubits, gp_index_changing) + # self.saved_results[key], self.sub_cir_to_ind_in_results[key] = self.trees[key].collapse_circuits_to_process_matrices(model, len(key)) + self.saved_results[key] = out1 + self.sub_cir_to_ind_in_results[key] = out2 + + def determine_which_circuits_will_update_for_what_gpindices(self, model): + + dirty_circuits = _np.zeros((model.num_params, len(self.trees), len(self.cir_id_and_lane_id_to_sub_cir))) + for ind in range(model.num_params): + + for ikey, key in enumerate(self.trees): + dirty_circuits[ind, ikey, self.trees[key]._which_full_circuits_will_change_due_to_gpindex_changing(model, ind)] = 1 + + self.process_matrices_which_will_need_to_update_for_index = dirty_circuits + return dirty_circuits + + def reset_full_matrices_to_base_probs_version(self) -> None: + """ + Any matrix which was updated previously reset to the original version. + """ + + for icir in self.original_matrices: + for lane_in_cir in self.original_matrices[icir]: + self.full_matrices[icir].update_operand(lane_in_cir, self.original_matrices[icir][lane_in_cir]) + self.original_matrices = {} + return + + + def reconstruct_full_matrices(self, + model = None, + gp_index_changing: Optional[int] = None) -> \ + Optional[Tuple[List[Union[KronStructured, _np.ndarray]], List[int]]]: + """ + Construct a tensor product structure for each individual circuit + """ + + if len(self.saved_results) == 0: + return + + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cir_inds = _np.arange(num_cirs, dtype=_np.int32) + if (gp_index_changing is not None) and (model is not None): + + cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
+ + lane_key_to_ind: dict[tuple[int, ...], int] = {key: ikey for ikey, key in enumerate(self.trees)} + + output = [] + if len(cir_inds) > 0: + self.original_matrices = {} # Reset the cache of updated process matrices. + + for icir in cir_inds: + lane_circuits = [] + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels + + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + if ind_in_results not in self.saved_results[lblkey]: + # We have only the local changes. + # This will be stored in the results file of the subtree. + lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + else: + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + if len(lane_circuits) > 1: + output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) + # output.append(KronStructured(lane_circuits)) + elif len(lane_circuits) == 1: + output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). + else: + raise ValueError() + + return output, cir_inds + + else: + output = [] + + + # Now we can do the combination. + + for icir in cir_inds: + lane_circuits = [] + for i in range(len(self.cir_id_and_lane_id_to_sub_cir[icir])): + cir = self.cir_id_and_lane_id_to_sub_cir[icir][i] + lblkey = cir._line_labels + + ind_in_results = self.sub_cir_to_ind_in_results[lblkey][cir.layertup] + if ind_in_results not in self.saved_results[lblkey]: + # We have only the local changes. + # This will be stored in the results file of the subtree. 
+ lane_circuits.append(self.trees[lblkey].results[ind_in_results]) + else: + lane_circuits.append(self.saved_results[lblkey][ind_in_results]) + if len(lane_circuits) > 1: + output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) + # output.append(KronStructured(lane_circuits)) + elif len(lane_circuits) == 1: + output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). + else: + raise ValueError() + + self.full_matrices = output + return output, cir_inds + + + def recurse_to_build_sparse_kron_matrix(self, operands: list[_np.ndarray]): + if len(operands) == 1: + return operands[0] + return sparse_kron(operands[0], self.recurse_to_build_sparse_kron_matrix(operands[1:])) + + + def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False, model = None, gp_index_changing: Optional[int] = None): + + + cost_collapse = 0 + for key in self.trees: + num_qubits = len(key) if key[0] != ('*',) else key[1] # Stored in the data structure. + tree = self.trees[key] + cost_collapse += tree.flop_cost_of_evaluating_tree(tuple([4**num_qubits, 4**num_qubits]), model, gp_index_changing) + + + tensor_cost = 0 + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + cir_inds = _np.arange(num_cirs, dtype=_np.int32) + + if (model is not None) and (gp_index_changing is not None): + + dirty_circuits = self.determine_which_circuits_will_update_for_what_gpindices(model) + cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. 
+
+
+        for cir_id in cir_inds:
+            qubit_list = ()
+            for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])):
+                subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id]
+                qubit_list = (*qubit_list, len(subcir._line_labels))
+            qubit_list = list(qubit_list)
+            total_num = _np.sum(qubit_list)
+
+            tensor_cost += cost_to_compute_tensor_matvec_without_reordering(qubit_list, total_num)
+
+        if return_collapse:
+            return tensor_cost + cost_collapse, cost_collapse
+        elif return_tensor_matvec:
+            return tensor_cost + cost_collapse, tensor_cost
+        elif gp_index_changing is not None:
+            return tensor_cost + cost_collapse, len(cir_inds) # Since you are not updating all of the representations we do not need to update the state props either for those.
+
+        return tensor_cost + cost_collapse
+
+    def compute_tensor_orders(self):
+
+        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)
+
+        cache_struct = {}
+
+        for cir_id in range(num_cirs):
+            qubit_list = ()
+            for lane_id in range(len(self.cir_id_and_lane_id_to_sub_cir[cir_id])):
+                subcir = self.cir_id_and_lane_id_to_sub_cir[cir_id][lane_id]
+                qubit_list = (*qubit_list, len(subcir._line_labels))
+            self.cir_id_to_tensor_order[cir_id] = self.best_order_for_tensor_contraction(qubit_list, cache_struct)
+
+        return
+
+    def best_order_for_tensor_contraction(self,
+                                          qubit_list: tuple[int, ...],
+                                          cache: dict[tuple[int, ...], tuple[list[int], int]]) -> tuple[list[int], int]:
+        """
+        Find the tensor contraction order that minimizes the cost of contracting to a dense system with
+        a total number of qubits equal to the len(qubit_list)
+        """
+
+
+        if qubit_list in cache:
+            return cache[qubit_list]
+
+        best_cost = _np.inf
+        best_order = []
+
+        for order in itertools.permutations(range(len(qubit_list)-1), len(qubit_list)-1):
+
+            my_list = [qb for qb in qubit_list] # force deep copy. 
+ my_starting_points = [sp for sp in order] + cost = 0 + early_exit = False + while my_starting_points and not early_exit: + sp = my_starting_points.pop(0) + + cost += self._tensor_cost_model(my_list[sp], my_list[sp+1]) + if cost <= best_cost: + # modify sp for future. + tmp = [] + for new_val in my_starting_points: + tmp.append((new_val - 1)*(new_val > sp) + (new_val) * (new_val < sp)) + my_starting_points = tmp + + q2 = my_list.pop(sp+1) + my_list[sp] += q2 + else: + early_exit = True # This round is done because the partial sum was too big. + + if cost < best_cost and not early_exit: + best_cost = cost + best_order = list(order) + + # Store off the information. + cache[qubit_list] = best_order, best_cost + + return best_order, best_cost + + def _tensor_cost_model(self, num_qubits1, num_qubits2): + """ + Assumes kronecker product of 2 square matrices. + """ + + return (4**num_qubits1)**2 * (4**num_qubits2)**2 + + def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[int, list[int], list[int]]: + """ + Compute the number of flops needed to collapse each circuit into a single process matrix. + + Returns: + --------- + cost - int total cost to collapse and reform + collapse_lane_cost - list[int] cost to collapse a lane + tensor_cost - list[int] cost to recombine a circuit into its full size. 
+ """ + + + num_cirs = len(self.cir_id_and_lane_id_to_sub_cir) + + collapse_lane_cost = [] + + for lbl_key, my_tree in self.trees.items(): + collapse_lane_cost.append(my_tree.flop_cost_of_evaluating_tree([4**len(lbl_key), 4**len(lbl_key)])) + + tensor_cost = [] + for icir in range(num_cirs): + + _order, cost = self.cir_id_to_tensor_order[icir] + tensor_cost.append(cost) + + return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost + + + + + + +def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int: + + assert _np.sum(qubit_list) == total_num_qubits + + if len(qubit_list) == 1: + # Basic matvec. + cost = 2 * (4**qubit_list[0]**2) + return cost + + elif len(qubit_list) == 2: + # vec((A \tensor B) u) = vec(B U A.T) + term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. + term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T + return term1 + term2 + + else: + # Just pop off the last term + # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T + + right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) + right *= 4**(_np.sum(qubit_list[1:])) + left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], + total_num_qubits - qubit_list[0]) + left *= 4**(qubit_list[0]) + return left + right + \ No newline at end of file From 413ac670944cb826d41566965e0818dec0a21624 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 15:00:57 -0700 Subject: [PATCH 139/141] Finish the split. --- pygsti/layouts/evaltree.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/pygsti/layouts/evaltree.py b/pygsti/layouts/evaltree.py index 81d750277..d8aa24f85 100644 --- a/pygsti/layouts/evaltree.py +++ b/pygsti/layouts/evaltree.py @@ -10,34 +10,14 @@ # http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory. 
#*************************************************************************************************** -from __future__ import annotations import bisect as _bisect import time as _time # DEBUG TIMERS import warnings as _warnings import numpy as _np -from pygsti.circuits.circuit import Circuit as _Circuit, LayerTupLike +from pygsti.circuits.circuit import Circuit as _Circuit from pygsti.baseobjs.verbosityprinter import VerbosityPrinter as _VerbosityPrinter -from pygsti.baseobjs.label import LabelTupTup, Label, LabelTup -import itertools -from pygsti.tools.sequencetools import ( - conduct_one_round_of_lcs_simplification, - _compute_lcs_for_every_pair_of_sequences, - create_tables_for_internal_LCS, - simplify_internal_first_one_round -) -from pygsti.tools.dyadickronop import KronStructured -from pygsti.circuits.split_circuits_into_lanes import ( - compute_qubit_to_lane_and_lane_to_qubits_mappings_for_circuit, - compute_subcircuits -) - -import scipy.linalg as la -import scipy.sparse.linalg as sparla -from scipy.sparse import kron as sparse_kron -from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple -from pygsti.tools.tqdm import our_tqdm def _walk_subtree(treedict, indx, running_inds): @@ -360,7 +340,7 @@ def _get_start_indices(max_intersect): (_time.time() - tm)); tm = _time.time() #merge_method = "fast" - #Another possible algorithm (but slower) + #Another possible algorith (but slower) #if merge_method == "best": # while len(indicesLeft) > 0: # iToMergeInto,_ = min(enumerate(map(len,subTreeSetList)), From 3bd478e00d5a0c43c3f9fe356c96a9eab7a37270 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 16:16:40 -0700 Subject: [PATCH 140/141] Add a description for EvalTreeBasedUponLongestCommonSubstring --- pygsti/layouts/longest_common_evaltree.py | 293 +++++++++++----------- 1 file changed, 147 insertions(+), 146 deletions(-) diff --git a/pygsti/layouts/longest_common_evaltree.py b/pygsti/layouts/longest_common_evaltree.py index 
e9ccc4f96..dccaa2e55 100644 --- a/pygsti/layouts/longest_common_evaltree.py +++ b/pygsti/layouts/longest_common_evaltree.py @@ -52,10 +52,6 @@ def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: Union[str, tmp.done_editing() return tmp - - - - def setup_circuit_list_for_LCS_computations( circuit_list: list[_Circuit], implicit_idle_gate_name: Union[str, Label] = 'I' @@ -111,6 +107,32 @@ def setup_circuit_list_for_LCS_computations( #endregion Split Circuits by lanes helpers +def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int: + + assert _np.sum(qubit_list) == total_num_qubits + + if len(qubit_list) == 1: + # Basic matvec. + cost = 2 * (4**qubit_list[0]**2) + return cost + + elif len(qubit_list) == 2: + # vec((A \tensor B) u) = vec(B U A.T) + term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. + term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T + return term1 + term2 + + else: + # Just pop off the last term + # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T + + right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) + right *= 4**(_np.sum(qubit_list[1:])) + left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], + total_num_qubits - qubit_list[0]) + left *= 4**(qubit_list[0]) + return left + right + #region Lane Collapsing Helpers @@ -139,11 +161,81 @@ def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dic """ return model._layer_rules.get_dense_process_matrix_represention_for_gate(model, op) - #endregion Lane Collapsing Helpers class EvalTreeBasedUponLongestCommonSubstring(): + """ + This class will convert a circuit list into an evaluation cache specifying the order in which to compute + the matrix matrix products. 
+
+    To build the tree we run D rounds of simplification, where D is the length of the longest common subsequence
+    between any pair of circuits or within a circuit.
+
+    In each round of the cache construction, we replace all of the longest common subsequences with a cache number.
+    Before continuing to the next round of the caching algorithm we add the newly chosen subsequences to the list of
+    circuits to cache. Note that cache construction each round can take O(C^2 * L^3) where C is the number of
+    circuits in that round and L is the length of the longest circuit in that round.
+
+    Alternate Constructors:
+        - from_other_eval_tree - Construct new tree from another where there is a mapping from the qubits
+          active in one to the qubits active in the other.
+
+    Properties:
+    - circuit_to_save_location: dict[tuple[LabelTupTup, ...], int] - maps an initial circuit to its starting location in the cache.
+    - qubit_start_point: int - If this tree is for a LANE of a bigger circuit, where does this lane start.
+    - sequence_intro: dict[int, list[int]] -
+        a list of indices introduced each round. Note that a cache index, i, which is introduced
+        in the same round as a different cache index, j, cannot depend on j. Each round the indices introduced correspond to
+        longest common sequences that round. Different i and j imply different sequences which by construction must be the same length.
+        Therefore, if you have computed all of the indices introduced in the previous rounds you can compute all of the indices in
+        the current round, concurrently.
+
+        This invariant is broken for round 0 if you initially pass 2 or more identical circuits.
+        We do not check if the circuits are identical.
+
+    - cache: dict[int, tuple[Union[LabelTupTup, int], ...]] - the actual tree which will be evaluated to compute the process matrices for each of the circuits. 
+ The original circuits are stored in their original order in the keys 0, 1, ..., num_circuits - 1 + - num_circuits: int - How many circuits are in the unaltered list. + + - _swap_gate: The two qubit representation of a swap gate which for example would swap the effective region from qubit 0 to qubit 1 in S(Gxpi2:0)S.T . + + - _results: dict[Union[int, LabelTup], _np.ndarray] - dictionary of the process matrices after evaluation with a specific model configuration. + + - cache_ind_to_alphabet_vals_referenced: dict[int, set(LabelTup)] - map from cache index to the gates used in that circuit. + TODO: Extract this method to circuit.py + + - alphabet_val_to_sorted_cache_inds: dict[LabelTup, sorted[tuple[int, ...]]] - + Which indices do you need to recompute if the model interpretation of the label L changed, but everything else stayed the same. + + - gpindex_to_cache_vals: dict[int, tuple[sorted[list[int]], list[Label]]] - once the cache has been evaluated with a model and you are changing only + one parameter at a time, which cache indices do you need to recompute and which labels do you need to reinterpret within the model. + + Methods: + + - collapse_circuits_to_process_matrices - PUBLIC. Evaluate the tree based upon the input model. + This will only recompute the portions necessary if the optional index term passed is not None. + This method will invalidate the results cache if called with the optional index term passed as None. + + - _collapse_cache_line - PRIVATE. A line in the cache is a subsequence that needs to be calculated. It may contain more than 2 terms, + if the specific subsequence is only referenced once. This method handles reducing the line to a single process matrix and storing + the result in the appropriate results cache. + + - _which_full_circuits_will_change_due_to_gpindex_changing - PRIVATE. This populates gpindex_to_cache_vals for a given model. + + - visualize_circuit - PUBLIC. 
Trace through the tree to see what the circuit is that starts at that location. + + - _handle_results_cache_lookup_and_product - Private. Look up the index in the results cache and multiply with current term. + + - _compute_which_labels_index_depends_on - Private. Used to fill the cache_ind_to_alphabet_vals_referenced dictionary. + - flop_cost_of_evaluating_tree - PUBLIC. Given a specific matrix size representation and a specific model, compute how many + flops it will take to evaluate the cache assuming that matrix multiplication is the only cost. + If you are doing a derivative calculation this will also compute the cost to evaluate the tree in that condition. + + + Options available during construction + """ + def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0): """ @@ -152,7 +244,6 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) self.circuit_to_save_location = {tuple(cir): i for i,cir in enumerate(circuit_list)} - self.orig_circuits = {i: circuit_list[i] for i in range(len(circuit_list))} self.qubit_start_point = qubit_starting_loc @@ -161,39 +252,18 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) max_rounds = best_internal_match - C = len(circuit_list) - sequence_intro = {0: _np.arange(C)} + self.num_circuits = len(circuit_list) + + sequence_intro = {0: _np.arange(self.num_circuits)} - cache_pos = C + cache_pos = self.num_circuits cache = {i: circuit_list[i] for i in range(len(circuit_list))} new_circuit_list = [cir for cir in circuit_list] # Get a deep copy since we will modify it here. # Let's try simplifying internally first. - self.internal_first = False # TODO: Fix. 
- seq_ind_to_cache_index = {i: i for i in range(C)} - if self.internal_first: - i = 0 - cache_pos = -1 - while max_rounds > 1: - - tmp = simplify_internal_first_one_round(new_circuit_list, - internal_matches, - cache_pos, - cache, - seq_ind_to_cache_index) - new_circuit_list, cache_pos, cache, sequence_intro[i-1] = tmp - i -= 1 - internal_matches = create_tables_for_internal_LCS(new_circuit_list) - - max_rounds = _np.max(internal_matches[0]) - external_matches = _compute_lcs_for_every_pair_of_sequences(new_circuit_list, - None, - None, - set(_np.arange(len(new_circuit_list))), - max([len(cir) for cir in new_circuit_list])-1) - - + seq_ind_to_cache_index = {i: i for i in range(self.num_circuits)} + best_external_match = _np.max(external_matches[0]) max_rounds = int(max(best_external_match,best_internal_match)) @@ -227,20 +297,19 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) max_rounds = int(max(best_external_match,best_internal_match)) self.cache = cache - self.num_circuits = C self.from_other = False self.sequence_intro = sequence_intro from pygsti.modelmembers.operations import StaticStandardOp - self.swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense().round(16) + self._swap_gate = StaticStandardOp('Gswap', basis='pp').to_dense() self.cache_ind_to_alphabet_vals_referenced: dict[int, set[LabelTupTup]] = {} # Useful for repeated calculations seen in a derivative calculation. 
for key in self.cache: - self.compute_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) + self._compute_which_labels_index_depends_on(key, self.cache_ind_to_alphabet_vals_referenced) alphabet_val_to_cache_inds_to_update: dict[LabelTup, set[int]] = {} @@ -258,7 +327,7 @@ def __init__(self, circuit_list: list[LabelTupTup], qubit_starting_loc: int = 0) else: alphabet_val_to_cache_inds_to_update[val] = set([cache_ind]) - self.results: dict[int | LabelTupTup, _np.ndarray] = {} + self._results: dict[Union[int, LabelTupTup], _np.ndarray] = {} self.alphabet_val_to_sorted_cache_inds: dict[LabelTup, list[int]] = {} @@ -327,8 +396,7 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q self.cache = other.cache self.num_circuits = other.num_circuits self.sequence_intro = other.sequence_intro - self.swap_gate = other.swap_gate - self.orig_circuit_list = other.orig_circuit_list + self._swap_gate = other._swap_gate self.circuit_to_save_location = other.circuit_to_save_location self.from_other = other @@ -344,10 +412,6 @@ def from_other_eval_tree(self, other: EvalTreeBasedUponLongestCommonSubstring, q new_term = (*new_term, new_op) self.cache[ind][i] = Label(new_term) - - for icir in range(len(self.orig_circuit_list)): - self.orig_circuit_list[icir] = self.trace_through_cache_to_build_circuit(icir) - updated = {} for cir, loc in self.circuit_to_save_location.items(): new_cir = () @@ -378,7 +442,7 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in if cache_inds: # Invalidate all gate labels that we saved just in case. # Invalidate every index in the which we know to be influenced by my_op. - # local_changes = {k: v for k, v in self.results.items() \ + # local_changes = {k: v for k, v in self._results.items() \ # if ((k not in cache_inds) and (not isinstance(k, Label)))} # Could just invalidate only the lbl with the index. # Iterating over all the cache will take too long. 
@@ -402,12 +466,12 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in return local_changes, self.circuit_to_save_location - return self.results, self.circuit_to_save_location + return self._results, self.circuit_to_save_location else: - self.results = {} # We are asking to reset all the calculations. + self._results = {} # We are asking to reset all the calculations. round_keys = sorted(_np.unique(list(self.sequence_intro.keys())))[::-1] - # saved: dict[int | LabelTupTup, _np.ndarray] = {} + # saved: dict[Union[int, LabelTupTup], _np.ndarray] = {} if self.internal_first: @@ -428,23 +492,24 @@ def collapse_circuits_to_process_matrices(self, model, num_qubits_in_default: in for cache_ind in self.sequence_intro[key]: cumulative_term = None for term in self.cache[cache_ind]: - cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self.results, + cumulative_term = self._collapse_cache_line(model, cumulative_term, term, self._results, num_qubits_in_default, empty, empty) if cumulative_term is None: - self.results[cache_ind] = _np.eye(4**num_qubits_in_default) + self._results[cache_ind] = _np.eye(4**num_qubits_in_default) # NOTE: unclear when (if ever) this should be a noisy idle gate. else: - self.results[cache_ind] = cumulative_term + self._results[cache_ind] = cumulative_term if __debug__: # We may store more in the cache in order to handle multi-qubit gates which are out of the normal order. 
for key in self.cache: - assert key in self.results - - # {tuple(self.trace_through_cache_to_build_circuit(icir)): icir for icir in range(len(self.orig_circuit_list)) if icir < self.num_circuits} - return self.results, self.circuit_to_save_location + assert key in self._results + return self._results, self.circuit_to_save_location - def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: + def _compute_which_labels_index_depends_on(self, val: Union[int, LabelTupTup], visited: dict[int, set[LabelTupTup]]) -> set[LabelTupTup]: + """ + Determine which labels the specific index in the cache depends on. + """ if not isinstance(val, int): return set([val]) @@ -453,14 +518,16 @@ def compute_depends_on(self, val: int | LabelTupTup, visited: dict[int, set[Labe else: tmp = set() for child in self.cache[val]: - ret_val = self.compute_depends_on(child, visited) + ret_val = self._compute_which_labels_index_depends_on(child, visited) tmp = tmp.union(ret_val) visited[val] = tmp return tmp - def combine_for_visualization(self, val, visited): - + def visualize_circuit(self, val: Union[int, LabelTupTup], visited) -> list[LabelTupTup]: + """ + Recombine circuit by traversing the cache. 
+ """ if not isinstance(val, int): return [val] elif val in visited: @@ -468,46 +535,26 @@ def combine_for_visualization(self, val, visited): else: tmp = [] for child in self.cache[val]: - tmp.append(self.combine_for_visualization(child, visited)) + tmp.append(self.visualize_circuit(child, visited)) visited[val] = tmp return tmp - def handle_results_cache_lookup_and_product(self, - cumulative_term: None | _np.ndarray, - term_to_extend_with: int | LabelTupTup, - results_cache: dict[int | LabelTupTup, _np.ndarray]) -> _np.ndarray: + def _handle_results_cache_lookup_and_product(self, + cumulative_term: Optional[_np.ndarray], + term_to_extend_with: Union[int, LabelTupTup], + results_cache: dict[Union[int, LabelTupTup], _np.ndarray]) -> _np.ndarray: + """ + When combining terms in cache for evaluation handle the lookup from the results cache. + """ if cumulative_term is None: return results_cache[term_to_extend_with] return results_cache[term_to_extend_with] @ cumulative_term - if isinstance(term_to_extend_with, int): - if term_to_extend_with in globally_invalid_cache_inds[:-1]: - # look up the result in the local results cache. - # This is just for that derivative step. - if cumulative_term is None: - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term - else: - if term_to_extend_with in globally_invalid_cache_inds[-1:]: - # Only one label gets invalidated and that is stored at the end of the list. - - # look up the result in the local results cache. - # This is just for that derivative step. - if cumulative_term is None: - return results_cache[term_to_extend_with] - return results_cache[term_to_extend_with] @ cumulative_term - - # We should use the cache for all the probs calculation. - if cumulative_term is None: - # look up result. 
- return self.results[term_to_extend_with] - return self.results[term_to_extend_with] @ cumulative_term - - def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, - term_to_extend_with: int | LabelTupTup, - local_results_cache: dict[int | LabelTupTup, _np.ndarray], + def _collapse_cache_line(self, model, cumulative_term: Optional[_np.ndarray], + term_to_extend_with: Union[int, LabelTupTup], + local_results_cache: dict[Union[int, LabelTupTup], _np.ndarray], num_qubits_in_default: int, globally_invalid_cache_inds: Optional[list[int]] = None, globally_invalid_labels: Optional[list[LabelTupTup]] = None @@ -516,34 +563,29 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, Reduce a cache line to a single process matrix. This should really only be called from collapse_circuits_to_process_matrices. - + The method will handle looking in the appropriate cache in the case of a derivative + approximation. """ if (term_to_extend_with in local_results_cache): - return self.handle_results_cache_lookup_and_product(cumulative_term, + return self._handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, local_results_cache) elif isinstance(term_to_extend_with, int) and \ (globally_invalid_cache_inds is not None) and \ (term_to_extend_with not in globally_invalid_cache_inds) and \ - (term_to_extend_with in self.results): + (term_to_extend_with in self._results): - return self.handle_results_cache_lookup_and_product(cumulative_term, + return self._handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - self.results) + self._results) elif isinstance(term_to_extend_with, LabelTupTup) and \ (globally_invalid_labels is not None) and \ not (any([t in globally_invalid_labels for t in term_to_extend_with])) \ - and (term_to_extend_with in self.results): - return self.handle_results_cache_lookup_and_product(cumulative_term, + and (term_to_extend_with in self._results): + return 
self._handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - self.results) - - # elif isinstance(term_to_extend_with, LabelTup) and \ - # (term_to_extend_with not in globally_invalid_cache_inds[-1:]) \ - # and (term_to_extend_with in self.results): - # return self.handle_results_cache_lookup_and_product(cumulative_term, term_to_extend_with, - # local_results_cache, globally_invalid_cache_inds) + self._results) else: val = 1 @@ -551,7 +593,7 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, if isinstance(term_to_extend_with, int): breakpoint() matrix_reps = {op.qubits: get_dense_representation_of_gate_with_perfect_swap_gates(model, op, - local_results_cache, self.swap_gate) for op in term_to_extend_with} + local_results_cache, self._swap_gate) for op in term_to_extend_with} qubit_used = [] for key in matrix_reps.keys(): qubit_used.extend(key) @@ -561,7 +603,7 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, implicit_idle_reps = {(qu,): get_dense_representation_of_gate_with_perfect_swap_gates(model, Label("Fake_Gate_To_Get_Tensor_Size_Right", qu), # A fake gate to look up and use the appropriate idle gate. - local_results_cache, self.swap_gate) for qu in unused_qubits} + local_results_cache, self._swap_gate) for qu in unused_qubits} while qubits_available: @@ -582,21 +624,6 @@ def _collapse_cache_line(self, model, cumulative_term: None | _np.ndarray, # Cache if off. return local_results_cache[term_to_extend_with] @ cumulative_term - - def trace_through_cache_to_build_circuit(self, cache_ind: int) -> list[tuple]: - - output = () - for term in self.cache[cache_ind]: - - if isinstance(term, Label): - output = (*output, term) - elif isinstance(term, int): - # Recurse down. 
- next_term = self.trace_through_cache_to_build_circuit(term) - output = (*output, *next_term) - - return list(output) - def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = None, gp_index_changing: Optional[int] = None) -> int: """ We assume that each matrix matrix multiply is the same size. @@ -943,29 +970,3 @@ def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[ -def cost_to_compute_tensor_matvec_without_reordering(qubit_list: list[int], total_num_qubits: int) -> int: - - assert _np.sum(qubit_list) == total_num_qubits - - if len(qubit_list) == 1: - # Basic matvec. - cost = 2 * (4**qubit_list[0]**2) - return cost - - elif len(qubit_list) == 2: - # vec((A \tensor B) u) = vec(B U A.T) - term1 = 2*(4**qubit_list[1]**2) * (4**qubit_list[0]) # MM of BU. - term2 = 2 * (4**qubit_list[0]**2) * (4**qubit_list[1]) # MM of U A.T - return term1 + term2 - - else: - # Just pop off the last term - # (B_1 \tensor B_2 ... \tensor B_n) u = (B_n \tensor B_n-1 ... \tensor B_2) U (B_1).T - - right = cost_to_compute_tensor_matvec_without_reordering(qubit_list[:1], qubit_list[0]) - right *= 4**(_np.sum(qubit_list[1:])) - left = cost_to_compute_tensor_matvec_without_reordering(qubit_list[1:], - total_num_qubits - qubit_list[0]) - left *= 4**(qubit_list[0]) - return left + right - \ No newline at end of file From 8f577788280b8000778fa66be2c9ee2a7390f822 Mon Sep 17 00:00:00 2001 From: Nick Koskelo Date: Tue, 19 Aug 2025 16:52:16 -0700 Subject: [PATCH 141/141] CollectionLCS Eval Tree documentation. 
--- pygsti/layouts/longest_common_evaltree.py | 129 ++++++++-------------- 1 file changed, 45 insertions(+), 84 deletions(-) diff --git a/pygsti/layouts/longest_common_evaltree.py b/pygsti/layouts/longest_common_evaltree.py index dccaa2e55..0cb349384 100644 --- a/pygsti/layouts/longest_common_evaltree.py +++ b/pygsti/layouts/longest_common_evaltree.py @@ -35,6 +35,8 @@ from typing import List, Optional, Iterable, Union, TYPE_CHECKING, Tuple from pygsti.tools.tqdm import our_tqdm +LANE = tuple[int, ...] + #region Split circuit list into lists of subcircuits def _add_in_idle_gates_to_circuit(circuit: _Circuit, idle_gate_name: Union[str, Label] = 'I') -> _Circuit: @@ -58,7 +60,7 @@ def setup_circuit_list_for_LCS_computations( ) -> tuple[ dict[int, dict[int, _Circuit]], dict[LayerTupLike, list[tuple[int, int]]], - dict[tuple[int, ...],list[LayerTupLike]] + dict[LANE,list[LayerTupLike]] ]: """ Split a circuit list into a list of subcircuits by lanes. These lanes are non-interacting partions of a circuit. @@ -72,7 +74,7 @@ def setup_circuit_list_for_LCS_computations( cir_ind_and_lane_id_to_sub_cir: dict[int, dict[int, _Circuit]] = {} sub_cir_to_cir_id_and_lane_id: dict[LayerTupLike, list[tuple[int, int]]] = {} - line_labels_to_layertup_lists: dict[tuple[int, ...], list[LayerTupLike]] = {} + line_labels_to_layertup_lists: dict[LANE, list[LayerTupLike]] = {} for i, cir in enumerate(circuit_list): @@ -167,7 +169,7 @@ def get_dense_op_of_gate_with_perfect_swap_gates(model, op: LabelTup, saved: dic class EvalTreeBasedUponLongestCommonSubstring(): """ This class will convert a circuit list into an evaluation cache specifying the order in which to compute - the matrix matrix products. + the matrix matrix products. We assume that each circuit operates on the same qubits. To build the tree we run D rounds of simplification, where D is the length of the longest common subsequence between any pair of circuits or within a circuit. 
@@ -205,7 +207,7 @@ class EvalTreeBasedUponLongestCommonSubstring():
     - cache_ind_to_alphabet_vals_referenced: dict[int, set(LabelTup)] - map from cache index to the gates used in that circuit.
         TODO: Extract this method to circuit.py
 
-    - alphabet_val_to_sorted_cache_inds: dict[LabelTup, sorted[tuple[int, ...]]] - 
+    - alphabet_val_to_sorted_cache_inds: dict[LabelTup, sorted[LANE]] -
         Which indices do you need to recompute if the model interpretation of the label L changed, but everything else stayed the same.
 
     - gpindex_to_cache_vals: dict[int, tuple[sorted[list[int]], list[Label]]] - once the cache has been evaluated with a model and you are changing only
@@ -647,16 +649,44 @@ def flop_cost_of_evaluating_tree(self, matrix_size: tuple[int, int], model = Non
 
 class CollectionOfLCSEvalTrees():
+    """
+    A collection of LCS Eval Trees is a group of trees designed to evaluate a list of circuits which have been split into lanes.
+    A lane is a set of qubits which do not interact with any qubits in another lane.
+
+    CollectionOfLCSEvalTrees assumes that you have already split the circuits into such lanes.
+
+    - Properties:
+
+        - trees - dict[lane, EvalTreeLCS] - mapping of lane to LCS evaluation tree.
+        - _assume_matching_tree_for_matching_num_qubits - bool - Are we assuming that if two lanes have the same size,
+            then they will have the same distribution of circuits to test. This can dramatically reduce the cost of computing the orderings.
+        - line_lbls_to_circuit_list - dict[LANE, list[LabelTupTup]] - lanes to circuits to evaluate.
+        - sub_cir_to_full_cir_id_and_lane_id - dict[tuple[LabelTupTup], tuple[int, int]] - map from sub circuit to which lane they are from and the whole circuit.
+        - cir_id_and_lane_id_to_sub_cir - reverse of sub_cir_to_full_cir_id_and_lane_id
+        - process_matrices_which_will_need_to_update_for_index - _np.ndarray - (params, trees, all_circuits)
+            - boolean map indicating whether that sub circuit needs to be updated if a param changes.
+ + + - Methods: + - collapse_circuits_to_process_matrices: PUBLIC - main method for evaluating the trees. + - reconstruct_full_matrices: PUBLIC - combine the lanes for each circuit into something which can be multiplied against full system SPAM layers. + - flop_estimate: PUBLIC - compute the number of flops needed to evaluate the tree fully or partially as in the case of a derivative in one direction. + - determine_which_circuits_will_update_for_what_gpindices: PUBLIC - learn the circuits which will get modified for a specific model and index change. + - compute_tensor_orders: Private - what would be the best tensor orders for all the circuits? + - best_order_for_tensor_contraction: Private - for a single circuit? + - _tensor_cost_model: Private - The cost to compute A \otimes B as full dense matrix. + """ + - def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTupTup]], + def __init__(self, line_lbls_to_circuit_list: dict[LANE, list[LabelTupTup]], sub_cir_to_full_cir_id_and_lane_id, cir_id_and_lane_id_to_sub_cir): - self.trees: dict[tuple[int, ...], EvalTreeBasedUponLongestCommonSubstring] = {} + self.trees: dict[LANE, EvalTreeBasedUponLongestCommonSubstring] = {} - ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE = False + self._assume_matching_tree_for_matching_num_qubits = False - size_to_tree: dict[int, tuple[int, ...]] = {} + size_to_tree: dict[int, LANE] = {} self.line_lbls_to_cir_list = line_lbls_to_circuit_list @@ -665,7 +695,7 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu sub_cirs = [] for cir in vals: sub_cirs.append(list(cir)) - if ASSUME_MATCHING_QUBIT_SIZE_MATCHING_TREE: + if self._assume_matching_tree_for_matching_num_qubits: if len(key) not in size_to_tree: self.trees[key] = EvalTreeBasedUponLongestCommonSubstring(sub_cirs) size_to_tree[len(key)] = key @@ -689,19 +719,9 @@ def __init__(self, line_lbls_to_circuit_list: dict[tuple[int, ...], list[LabelTu self.compute_tensor_orders() 
self.saved_results: dict[Union[LabelTupTup, int], _np.ndarray] = {} - self.sub_cir_to_ind_in_results: dict[tuple[int, ...], dict[_Circuit, int]] = {} - self.original_matrices: dict[int, dict[int, _np.ndarray]] = {} - self.full_matrices: list[KronStructured] = [] + self.sub_cir_to_ind_in_results: dict[LANE, dict[_Circuit, int]] = {} self.process_matrices_which_will_need_to_update_for_index: _np.ndarray = [] - def do_I_need_to_recompute_portions_if_I_change_this_index(self, model, gp_index_changing: int) -> bool: - - for key in self.trees: - inds, lbls = self.trees[key]._gpindex_to_cache_inds_needed_to_recompute(model, gp_index_changing) - if len(inds) > 0: - return True - return False - def collapse_circuits_to_process_matrices(self, model, gp_index_changing: Optional[int] = None): """ @@ -736,18 +756,6 @@ def determine_which_circuits_will_update_for_what_gpindices(self, model): self.process_matrices_which_will_need_to_update_for_index = dirty_circuits return dirty_circuits - def reset_full_matrices_to_base_probs_version(self) -> None: - """ - Any matrix which was updated previously reset to the original version. - """ - - for icir in self.original_matrices: - for lane_in_cir in self.original_matrices[icir]: - self.full_matrices[icir].update_operand(lane_in_cir, self.original_matrices[icir][lane_in_cir]) - self.original_matrices = {} - return - - def reconstruct_full_matrices(self, model = None, gp_index_changing: Optional[int] = None) -> \ @@ -765,11 +773,9 @@ def reconstruct_full_matrices(self, cir_inds = _np.where(_np.sum(self.process_matrices_which_will_need_to_update_for_index[gp_index_changing], axis=0) >= 1)[0] # At least one lane changed. - lane_key_to_ind: dict[tuple[int, ...], int] = {key: ikey for ikey, key in enumerate(self.trees)} + lane_key_to_ind: dict[LANE, int] = {key: ikey for ikey, key in enumerate(self.trees)} output = [] - if len(cir_inds) > 0: - self.original_matrices = {} # Reset the cache of updated process matrices. 
for icir in cir_inds: lane_circuits = [] @@ -785,8 +791,7 @@ def reconstruct_full_matrices(self, else: lane_circuits.append(self.saved_results[lblkey][ind_in_results]) if len(lane_circuits) > 1: - output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) - # output.append(KronStructured(lane_circuits)) + output.append(KronStructured(lane_circuits)) elif len(lane_circuits) == 1: output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). else: @@ -796,8 +801,6 @@ def reconstruct_full_matrices(self, else: output = [] - - # Now we can do the combination. for icir in cir_inds: @@ -814,23 +817,14 @@ def reconstruct_full_matrices(self, else: lane_circuits.append(self.saved_results[lblkey][ind_in_results]) if len(lane_circuits) > 1: - output.append(self.recurse_to_build_sparse_kron_matrix(lane_circuits)) - # output.append(KronStructured(lane_circuits)) + output.append(KronStructured(lane_circuits)) elif len(lane_circuits) == 1: output.append(lane_circuits[0]) # gate_sequence[i] @ rho needs to work for i in range(num_circs). 
                 else:
                     raise ValueError()
 
-        self.full_matrices = output
         return output, cir_inds
-
-    def recurse_to_build_sparse_kron_matrix(self, operands: list[_np.ndarray]):
-        if len(operands) == 1:
-            return operands[0]
-        return sparse_kron(operands[0], self.recurse_to_build_sparse_kron_matrix(operands[1:]))
-
-
     def flop_estimate(self, return_collapse: bool = False, return_tensor_matvec: bool = False,
                       model = None, gp_index_changing: Optional[int] = None):
@@ -886,8 +880,8 @@ def compute_tensor_orders(self):
         return
 
     def best_order_for_tensor_contraction(self,
-                                          qubit_list: tuple[int, ...],
-                                          cache: dict[tuple[int, ...], tuple[list[int], int]]) -> tuple[list[int], int]:
+                                          qubit_list: LANE,
+                                          cache: dict[LANE, tuple[list[int], int]]) -> tuple[list[int], int]:
         """
         Find the tensor contraction order that minimizes the cost of contracting to a dense system
         with a total number of qubits equal to the len(qubit_list)
@@ -937,36 +931,3 @@ def _tensor_cost_model(self, num_qubits1, num_qubits2):
         """
         return (4**num_qubits1)**2 * (4**num_qubits2)**2
-
-    def _flop_estimate_to_collapse_to_each_circuit_to_process_matrix(self) -> tuple[int, list[int], list[int]]:
-        """
-        Compute the number of flops needed to collapse each circuit into a single process matrix.
-
-        Returns:
-        ---------
-        cost - int total cost to collapse and reform
-        collapse_lane_cost - list[int] cost to collapse a lane
-        tensor_cost - list[int] cost to recombine a circuit into its full size.
-        """
-
-
-        num_cirs = len(self.cir_id_and_lane_id_to_sub_cir)
-
-        collapse_lane_cost = []
-
-        for lbl_key, my_tree in self.trees.items():
-            collapse_lane_cost.append(my_tree.flop_cost_of_evaluating_tree([4**len(lbl_key), 4**len(lbl_key)]))
-
-        tensor_cost = []
-        for icir in range(num_cirs):
-
-            _order, cost = self.cir_id_to_tensor_order[icir]
-            tensor_cost.append(cost)
-
-        return sum(tensor_cost) + sum(collapse_lane_cost), collapse_lane_cost, tensor_cost
-
-
-
-
-