From f6b7ef334bca59dfca48d7a5e395ce249ecd2830 Mon Sep 17 00:00:00 2001
From: Grace Sng <grace.sng75@gmail.com>
Date: Sat, 29 Mar 2025 20:54:05 -0500
Subject: [PATCH 1/4] Test classes for validation suggester and tuebingen model
 suggester.

Signed-off-by: Grace Sng <grace.sng75@gmail.com>
---
 .../suggesters/tuebingen_model_suggester.py   |   2 +-
 pywhyllm/suggesters/validation_suggester.py   | 315 ++++++------------
 ...tuebingen_model_suggester_data_provider.py |  25 ++
 .../validation_suggester_data_provider.py     |  56 ++++
 .../test_tuebingen_model_suggester.py         |  56 ++++
 .../test_validation_suggester.py              | 101 ++++++
 6 files changed, 349 insertions(+), 206 deletions(-)
 create mode 100644 pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
 create mode 100644 pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
 create mode 100644 pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
 create mode 100644 pywhyllm/tests/model_suggester/test_validation_suggester.py

diff --git a/pywhyllm/suggesters/tuebingen_model_suggester.py b/pywhyllm/suggesters/tuebingen_model_suggester.py
index 195879d..0e73c72 100644
--- a/pywhyllm/suggesters/tuebingen_model_suggester.py
+++ b/pywhyllm/suggesters/tuebingen_model_suggester.py
@@ -17,7 +17,7 @@ class Strategy(Enum):
 
 
 class TuebingenModelSuggester(ModelSuggester):
-    def __init__(self, llm):
+    def __init__(self, llm=None):
         super().__init__(llm)
 
     def suggest_description(
diff --git a/pywhyllm/suggesters/validation_suggester.py b/pywhyllm/suggesters/validation_suggester.py
index 0431429..7a7fa7e 100644
--- a/pywhyllm/suggesters/validation_suggester.py
+++ b/pywhyllm/suggesters/validation_suggester.py
@@ -13,9 +13,10 @@
 class ValidationSuggester(IdentifierProtocol):
     CONTEXT: str = """causal mechanisms"""
 
-    def __init__(self, llm):
-        if llm == 'gpt-4':
-            self.llm = guidance.models.OpenAI('gpt-4')
+    def __init__(self, llm=None):
+        self.llm = None  # always define the attribute so request_* methods never hit AttributeError
+        if llm == 'gpt-4':
+            self.llm = guidance.models.OpenAI('gpt-4')
 
     def suggest_negative_controls(
             self,
@@ -23,7 +24,7 @@ def suggest_negative_controls(
             outcome: str,
             factors_list: list(),
             expertise_list: list(),
-            analysis_context: list() = CONTEXT,
+            analysis_context=CONTEXT,
             stakeholders: list() = None
     ):
         expert_list: List[str] = list()
@@ -41,32 +42,14 @@ def suggest_negative_controls(
             if factors_list[i] != treatment and factors_list[i] != outcome:
                 edited_factors_list.append(factors_list[i])
 
-        if len(expert_list) > 1:
-            for expert in expert_list:
-                (
-                    negative_controls_counter,
-                    negative_controls_list,
-                ) = self.request_negative_controls(
-                    treatment=treatment,
-                    outcome=outcome,
-                    factors_list=edited_factors_list,
-                    negative_controls_counter=negative_controls_counter,
-                    domain_expertise=expert,
-                    analysis_context=analysis_context
-                )
-                for m in negative_controls_list:
-                    if m not in negative_controls:
-                        negative_controls.append(m)
-        else:
-            (
-                negative_controls_counter,
-                negative_controls_list,
-            ) = self.request_negative_controls(
+        for expert in expert_list:
+            (negative_controls_counter,  # parenthesised target: unpack the (counter, list) pair
+             negative_controls_list) = self.request_negative_controls(
                 treatment=treatment,
                 outcome=outcome,
                 factors_list=edited_factors_list,
                 negative_controls_counter=negative_controls_counter,
-                domain_expertise=expert_list[0],
+                domain_expertise=expert,
                 analysis_context=analysis_context
             )
             for m in negative_controls_list:
@@ -82,7 +65,7 @@ def request_negative_controls(
             factors_list: list(),
             negative_controls_counter: list(),
             domain_expertise: str,
-            analysis_context: list() = CONTEXT
+            analysis_context: str = CONTEXT
     ):
         negative_controls_list: List[str] = list()
 
@@ -93,29 +76,29 @@ def request_negative_controls(
 
                 with system():
                     lm += f"""You are an expert in the {domain_expertise} and are 
-        studying the {analysis_context}. You are using your domain knowledge to help understand the negative 
-        controls for a causal model that contains all the assumptions about the {analysis_context}. Where a causal 
-        model is a conceptual model that describes the causal mechanisms of a system. You will do this by answering 
-        questions about cause and effect using your domain knowledge in the {domain_expertise}."""
+            studying the {analysis_context}. You are using your domain knowledge to help understand the negative 
+            controls for a causal model that contains all the assumptions about the {analysis_context}. Where a causal 
+            model is a conceptual model that describes the causal mechanisms of a system. You will do this by answering 
+            questions about cause and effect using your domain knowledge in the {domain_expertise}."""
 
                 with user():
                     lm += f"""factor_names: {factors_list} From your
-                         perspective as an expert in the {domain_expertise}, what factor(s) from the list of factors, relevant to 
-                         the {analysis_context}, should see zero treatment effect when changing the {treatment}? Which factor(s) 
-                         from the list of factors, if any at all, relevant to the {analysis_context}, are negative controls on the 
-                         causal mechanisms that affect the {outcome} when changing {treatment}? Using your domain knowledge, 
-                         which factor(s) from the list of factors, if any at all, relevant to the {analysis_context}, 
-                         should we expect to be unaffected by any changes in {treatment}? Which factor(s) from the list of factors, 
-                         if any at all, would be surprising if affected by a change in {treatment}? Be concise and keep your 
-                         thoughts under two paragraphs. Then provide your step by step chain of thoughts within the tags 
-                         <thinking></thinking>. Once you have thought things through, wrap the name of the factor(s) from the list of 
-                         factors, that has/have a high likelihood of being negative controls on the causal mechanisms that affect {outcome}
-                         when changing {treatment}, within the tags <negative_control>factor_name</negative_control>. Wrap the name 
-                         of the factor(s) from the list of factors, that has/have a high likelihood of being unaffected when 
-                         changing {treatment}, within the tags <negative_control>factor_name</negative_control>. Where factor_name 
-                         is one of the items within the factor_names list. If a factor does not have a high likelihood of being a 
-                         negative control relevant to the {analysis_context}, then do not wrap the factor with any tags. Provide 
-                         your step by step answer as an expert in the {domain_expertise}:"""
+                             perspective as an expert in the {domain_expertise}, what factor(s) from the list of factors, relevant to 
+                             the {analysis_context}, should see zero treatment effect when changing the {treatment}? Which factor(s) 
+                             from the list of factors, if any at all, relevant to the {analysis_context}, are negative controls on the 
+                             causal mechanisms that affect the {outcome} when changing {treatment}? Using your domain knowledge, 
+                             which factor(s) from the list of factors, if any at all, relevant to the {analysis_context}, 
+                             should we expect to be unaffected by any changes in {treatment}? Which factor(s) from the list of factors, 
+                             if any at all, would be surprising if affected by a change in {treatment}? Be concise and keep your 
+                             thoughts under two paragraphs. Then provide your step by step chain of thoughts within the tags 
+                             <thinking></thinking>. Once you have thought things through, wrap the name of the factor(s) from the list of 
+                             factors, that has/have a high likelihood of being negative controls on the causal mechanisms that affect {outcome}
+                             when changing {treatment}, within the tags <negative_control>factor_name</negative_control>. Wrap the name 
+                             of the factor(s) from the list of factors, that has/have a high likelihood of being unaffected when 
+                             changing {treatment}, within the tags <negative_control>factor_name</negative_control>. Where factor_name 
+                             is one of the items within the factor_names list. If a factor does not have a high likelihood of being a 
+                             negative control relevant to the {analysis_context}, then do not wrap the factor with any tags. Provide 
+                             your step by step answer as an expert in the {domain_expertise}:"""
 
                 with assistant():
                     lm += gen("output")
@@ -132,9 +115,7 @@ def request_negative_controls(
                                 and factor not in negative_controls_list
                         ):
                             negative_controls_list.append(factor)
-                    success = True
-                else:
-                    success = False
+                success = True
 
             except KeyError:
                 success = False
@@ -152,7 +133,7 @@ def suggest_latent_confounders(
             treatment: str,
             outcome: str,
             expertise_list: list(),
-            analysis_context: list() = CONTEXT,
+            analysis_context=CONTEXT,
             stakeholders: list() = None
     ):
         expert_list: List[str] = list()
@@ -165,31 +146,15 @@ def suggest_latent_confounders(
         latent_confounders_counter: Dict[str, int] = dict()
         latent_confounders: List[str, str] = list()
 
-        if len(expert_list) > 1:
-            for expert in expert_list:
-                (
-                    latent_confounders_counter,
-                    latent_confounders_list,
-                ) = self.request_latent_confounders(
-                    treatment=treatment,
-                    outcome=outcome,
-                    latent_confounders_counter=latent_confounders_counter,
-                    domain_expertise=expert,
-                    analysis_context=analysis_context,
-                )
-                for m in latent_confounders_list:
-                    if m not in latent_confounders:
-                        latent_confounders.append(m)
-        else:
-            (
-                latent_confounders_counter,
-                latent_confounders_list,
-            ) = self.request_latent_confounders(
+        for expert in expert_list:
+            (latent_confounders_counter,  # parenthesised target: unpack the (counter, list) pair
+             latent_confounders_list) = self.request_latent_confounders(
                 treatment=treatment,
                 outcome=outcome,
                 latent_confounders_counter=latent_confounders_counter,
-                domain_expertise=expert_list[0],
-                analysis_context=analysis_context)
+                domain_expertise=expert,
+                analysis_context=analysis_context,
+            )
             for m in latent_confounders_list:
                 if m not in latent_confounders:
                     latent_confounders.append(m)
@@ -202,7 +167,7 @@ def request_latent_confounders(
             outcome: str,
             latent_confounders_counter: list(),
             domain_expertise: str,
-            analysis_context: list() = CONTEXT
+            analysis_context=CONTEXT
     ):
         latent_confounders_list: List[str] = list()
 
@@ -212,26 +177,26 @@ def request_latent_confounders(
                 lm = self.llm
                 with system():
                     lm += f"""You are an expert in the {domain_expertise} and are 
-                        studying the {analysis_context}. You are using your knowledge to help build a causal model that contains 
-                        all the assumptions about the {domain_expertise}. Where a causal model is a conceptual model that describes 
-                        the causal mechanisms of a system. You will do this by by answering questions about cause and effect and 
-                        using your domain knowledge in the {domain_expertise}."""
+                                studying the {analysis_context}. You are using your knowledge to help build a causal model that contains 
+                                all the assumptions about the {domain_expertise}. Where a causal model is a conceptual model that describes 
+                                the causal mechanisms of a system. You will do this by by answering questions about cause and effect and 
+                                using your domain knowledge in the {domain_expertise}."""
                 with user():
                     lm += f"""(1) From your perspective as 
-                         an expert in the {domain_expertise}, think step by step as you consider the factors that may interact 
-                         between the {treatment} and the {outcome}. Use your knowledge as an expert in the {domain_expertise} to 
-                         describe the confounders, if there are any at all, between the {treatment} and the {outcome}. Be concise 
-                         and keep your thinking within two paragraphs. Then provide your step by step chain of thoughts within the 
-                         tags <thinking></thinking>. (2) From your perspective as an expert in the {domain_expertise}, which factor(
-                         s), if any at all, has/have a high likelihood of directly influencing and causing both the assignment of the 
-                         {treatment} and the {outcome}? Which factor(s), if any at all, have a causal chain that links the {treatment}
-                         to the {outcome}? Which factor(s), if any at all, are a confounder to the causal relationship 
-                         between the {treatment} and the {outcome}? Be concise and keep your thinking within two paragraphs. Then 
-                         provide your step by step chain of thoughts within the tags <thinking></thinking>. Wrap the name of the 
-                         factor(s), if any at all, that has/have a high likelihood of directly influencing and causing both the 
-                         {treatment} and the {outcome}, within the tags <confounding_factor>factor_name</confounding_factor>. If a 
-                         factor does not have a high likelihood of directly confounding, then do not wrap the factor with any tags. 
-                         Your step by step answer as an expert in the {domain_expertise}:"""
+                                 an expert in the {domain_expertise}, think step by step as you consider the factors that may interact 
+                                 between the {treatment} and the {outcome}. Use your knowledge as an expert in the {domain_expertise} to 
+                                 describe the confounders, if there are any at all, between the {treatment} and the {outcome}. Be concise 
+                                 and keep your thinking within two paragraphs. Then provide your step by step chain of thoughts within the 
+                                 tags <thinking></thinking>. (2) From your perspective as an expert in the {domain_expertise}, which factor(
+                                 s), if any at all, has/have a high likelihood of directly influencing and causing both the assignment of the 
+                                 {treatment} and the {outcome}? Which factor(s), if any at all, have a causal chain that links the {treatment}
+                                 to the {outcome}? Which factor(s), if any at all, are a confounder to the causal relationship 
+                                 between the {treatment} and the {outcome}? Be concise and keep your thinking within two paragraphs. Then 
+                                 provide your step by step chain of thoughts within the tags <thinking></thinking>. Wrap the name of the 
+                                 factor(s), if any at all, that has/have a high likelihood of directly influencing and causing both the 
+                                 {treatment} and the {outcome}, within the tags <confounding_factor>factor_name</confounding_factor>. If a 
+                                 factor does not have a high likelihood of directly confounding, then do not wrap the factor with any tags. 
+                                 Your step by step answer as an expert in the {domain_expertise}:"""
 
                 with assistant():
                     lm += gen("output")
@@ -244,9 +209,7 @@ def request_latent_confounders(
                 if latent_confounders:
                     for factor in latent_confounders:
                         latent_confounders_list.append(factor)
-                    success = True
-                else:
-                    success = False
+                success = True
 
             except KeyError:
                 success = False
@@ -261,12 +224,11 @@ def request_latent_confounders(
 
     def request_parent_critique(
             self,
-            analysis_context,
             factor,
             factors_list,
-            domain_expertise
+            domain_expertise,
+            analysis_context=CONTEXT
     ):
-
         edited_factors_list: List[str] = []
 
         for i in range(len(factors_list)):
@@ -282,16 +244,16 @@ def request_parent_critique(
                 lm = self.llm
                 with system():
                     lm += f"""You are a helpful causal assistant and expert in {domain_expertise}, 
-                        studying {analysis_context}. Task: identify factors causing {factor}."""
+                                studying {analysis_context}. Task: identify factors causing {factor}."""
                 with user():
                     lm += f"""Steps: (1) 
-                        Analyze potential factors [{factors_list}] for factors directly influencing/causing/affecting {
+                                Analyze potential factors [{factors_list}] for factors directly influencing/causing/affecting {
                     factor}. Is relationship direct? Ignore feedback mechanisms/factors not in list. Keep thoughts within 
-                        <thinking></thinking> tags. (2) Use prior thoughts to answer: how {factor} influenced/caused/affected by  [
-                        {factors_list}]? Is relationship direct? Ignore feedback mechanisms/factors not in list. Wrap 
-                        factors highly likely directly influencing/causing/affecting {factor} in 
-                        <influencing_factor></influencing_factor> tags. No tags for low likelihood factors. Ignore feedback 
-                        mechanisms/factors not in list. Answer as {domain_expertise} expert."""
+                                <thinking></thinking> tags. (2) Use prior thoughts to answer: how {factor} influenced/caused/affected by  [
+                                {factors_list}]? Is relationship direct? Ignore feedback mechanisms/factors not in list. Wrap 
+                                factors highly likely directly influencing/causing/affecting {factor} in 
+                                <influencing_factor></influencing_factor> tags. No tags for low likelihood factors. Ignore feedback 
+                                mechanisms/factors not in list. Answer as {domain_expertise} expert."""
                 with assistant():
                     lm += gen("output")
 
@@ -303,9 +265,7 @@ def request_parent_critique(
                     for factor in influencing_factors:
                         if factor in edited_factors_list and factor not in parents:
                             parents.append(factor)
-                    success = True
-                else:
-                    success = False
+                success = True
 
             except KeyError:
                 success = False
@@ -315,12 +275,11 @@ def request_parent_critique(
 
     def request_children_critique(
             self,
-            analysis_context,
             factor,
             factors_list,
-            domain_expertise
+            domain_expertise,
+            analysis_context=CONTEXT
     ):
-
         edited_factors_list: List[str] = []
 
         for i in range(len(factors_list)):
@@ -337,33 +296,30 @@ def request_children_critique(
 
                 with system():
                     lm += f"""You are a helpful causal assistant and expert in {domain_expertise}, 
-                        studying {analysis_context}. Task: identify factors caused by {factor}."""
+                                studying {analysis_context}. Task: identify factors caused by {factor}."""
 
                 with user():
                     lm += f"""Steps: (
-                        1) Analyze potential factors [{factors_list}] for factors directly influenced/caused/affected by 
-                        {factor}. Is relationship direct? Ignore feedback mechanisms/factors not in list. Keep thoughts within 
-                        <thinking></thinking> tags. (2) Use prior thoughts to answer: how {factor} influences/causes/affects [{
+                                1) Analyze potential factors [{factors_list}] for factors directly influenced/caused/affected by 
+                                {factor}. Is relationship direct? Ignore feedback mechanisms/factors not in list. Keep thoughts within 
+                                <thinking></thinking> tags. (2) Use prior thoughts to answer: how {factor} influences/causes/affects [{
                     factors_list}]? Is relationship direct? Ignore feedback mechanisms/factors not in list. Wrap 
-                        factors highly likely directly influenced/caused/affected by {factor} in 
-                        <influenced_factor></influenced_factor> tags. No tags for low likelihood factors. Ignore feedback 
-                        mechanisms/factors not in list. Answer as {domain_expertise} expert."""
+                                factors highly likely directly influenced/caused/affected by {factor} in 
+                                <influenced_factor></influenced_factor> tags. No tags for low likelihood factors. Ignore feedback 
+                                mechanisms/factors not in list. Answer as {domain_expertise} expert."""
 
                 with assistant():
                     lm += gen("output")
 
                 output = lm["output"]
-                influencing_factors = re.findall(
+                influenced_factors = re.findall(
                     r"<influenced_factor>(.*?)</influenced_factor>", output)
 
-                if influencing_factors:
-                    for factor in influencing_factors:
+                if influenced_factors:
+                    for factor in influenced_factors:
                         if factor in edited_factors_list and factor not in children:
                             children.append(factor)
-
-                    success = True
-                else:
-                    success = False
+                success = True
 
             except KeyError:
                 success = False
@@ -378,7 +334,6 @@ def request_pairwise_critique(
             factor_b: str,
             analysis_context: str = CONTEXT
     ):
-
         success: bool = False
 
         while not success:
@@ -387,15 +342,15 @@ def request_pairwise_critique(
 
                 with system():
                     lm += f"""You are a helpful causal assistant, expert in {domain_expertise}, 
-                        studying {analysis_context}. Task: identify relationship between {factor_a} and {factor_b}."""
+                                studying {analysis_context}. Task: identify relationship between {factor_a} and {factor_b}."""
 
                 with user():
                     lm += f"""Steps: (1) Does {factor_a} influence/cause/affect {factor_b}? Is relationship direct? Does {factor_b} influence/cause/affect 
-                        {factor_a}? Is relationship direct? Ignore feedback mechanisms/factors not in list. Keep thoughts within 
-                        <thinking></thinking> tags. (2) Use prior thoughts to select likely answer: (A) {factor_a} influences {factor_b} (B) {
+                                {factor_a}? Is relationship direct? Ignore feedback mechanisms/factors not in list. Keep thoughts within 
+                                <thinking></thinking> tags. (2) Use prior thoughts to select likely answer: (A) {factor_a} influences {factor_b} (B) {
                     factor_b} influences {factor_a} (C) Neither. Wrap answer in <answer></answer>. e.g. <answer>A</answer>, 
-                        <answer>B</answer>, <answer>C</answer>. No tags for low likelihood factors. Ignore feedback 
-                        mechanisms/factors not in list. Answer as {domain_expertise} expert."""
+                                <answer>B</answer>, <answer>C</answer>. No tags for low likelihood factors. Ignore feedback 
+                                mechanisms/factors not in list. Answer as {domain_expertise} expert."""
 
                 with assistant():
                     lm += gen("output")
@@ -429,9 +384,9 @@ def critique_graph(
             factors_list: List[str],
             edges: Dict[Tuple[str, str], int],
             experts: list(),
+            relationship_strategy: RelationshipStrategy = RelationshipStrategy.Parent,
             analysis_context: str = CONTEXT,
             stakeholders: list() = None,
-            relationship_strategy: RelationshipStrategy = RelationshipStrategy.Parent,
     ):
         expert_list: List[str] = list()
         for elements in experts:
@@ -446,32 +401,18 @@ def critique_graph(
             parent_edges: Dict[Tuple[str, str], int] = dict()
 
             for factor in factors_list:
-                if len(expert_list) > 1:
-                    for expert in expert_list:
-                        suggested_parent = self.request_parent_critique(
-                            analysis_context=analysis_context,
-                            factor=factor,
-                            factors_list=factors_list,
-                            domain_expertise=expert
-                        )
-                        for element in suggested_parent:
-                            if (
-                                    element,
-                                    factor,
-                            ) in parent_edges and element in factors_list:
-                                parent_edges[(element, factor)] += 1
-                            else:
-                                parent_edges[(element, factor)] = 1
-                else:
+                for expert in expert_list:
                     suggested_parent = self.request_parent_critique(
                         analysis_context=analysis_context,
                         factor=factor,
                         factors_list=factors_list,
-                        domain_expertise=expert_list[0]
+                        domain_expertise=expert
                     )
-
                     for element in suggested_parent:
-                        if (element, factor) in parent_edges:
+                        if (
+                                element,
+                                factor,
+                        ) in parent_edges and element in factors_list:
                             parent_edges[(element, factor)] += 1
                         else:
                             parent_edges[(element, factor)] = 1
@@ -484,36 +425,15 @@ def critique_graph(
             critiqued_children_edges: Dict[Tuple[str, str], int] = dict()
 
             for factor in factors_list:
-                if len(expert_list) > 1:
-                    for expert in expert_list:
-                        suggested_children = self.request_children_critique(
-                            analysis_context=analysis_context,
-                            factor=factor,
-                            factors_list=factors_list,
-                            domain_expertise=expert
-                        )
-                        for element in suggested_children:
-                            if (
-                                    (
-                                            element,
-                                            factor,
-                                    )
-                                    in critiqued_children_edges
-                                    and element in factors_list
-                            ):
-                                critiqued_children_edges[(element, factor)] += 1
-                            else:
-                                critiqued_children_edges[(element, factor)] = 1
-                else:
+                for expert in expert_list:
                     suggested_children = self.request_children_critique(
-                        analysis_context=analysis_context,
                         factor=factor,
                         factors_list=factors_list,
-                        domain_expertise=expert_list[0]
+                        domain_expertise=expert,
+                        analysis_context=analysis_context
                     )
-
                     for element in suggested_children:
-                        if (element, factor) in critiqued_children_edges:
+                        if (element, factor) in critiqued_children_edges and element in factors_list:
                             critiqued_children_edges[(element, factor)] += 1
                         else:
                             critiqued_children_edges[(element, factor)] = 1
@@ -526,33 +446,18 @@ def critique_graph(
             critiqued_pairwise_edges: Dict[Tuple[str, str], int] = dict()
 
             for (factor_a, factor_b) in itertools.combinations(factors_list, 2):
-                if factor_a != factor_b:
-                    if len(expert_list) > 1:
-                        for expert in expert_list:
-                            suggested_edge = self.request_pairwise_critique(
-                                analysis_context=analysis_context,
-                                factor_a=factor_a,
-                                factor_b=factor_b,
-                                domain_expertise=expert
-                            )
-
-                            if suggested_edge is not None:
-                                if suggested_edge in critiqued_pairwise_edges:
-                                    critiqued_pairwise_edges[suggested_edge] += 1
-                                else:
-                                    critiqued_pairwise_edges[suggested_edge] = 1
-                    else:
-                        suggested_edge = self.request_pairwise_critique(
-                            analysis_context=analysis_context,
-                            factor_a=factor_a,
-                            factor_b=factor_b,
-                            domain_expertise=expert_list[0]
-                        )
-
-                        if suggested_edge is not None:
-                            if suggested_edge in critiqued_pairwise_edges:
-                                critiqued_pairwise_edges[suggested_edge] += 1
-                            else:
-                                critiqued_pairwise_edges[suggested_edge] = 1
+                for expert in expert_list:
+                    suggested_edge = self.request_pairwise_critique(
+                        factor_a=factor_a,
+                        factor_b=factor_b,
+                        domain_expertise=expert,
+                        analysis_context=analysis_context
+                    )
+
+                    if suggested_edge is not None:
+                        if suggested_edge in critiqued_pairwise_edges:
+                            critiqued_pairwise_edges[suggested_edge] += 1
+                        else:
+                            critiqued_pairwise_edges[suggested_edge] = 1
 
             return edges, critiqued_pairwise_edges
diff --git a/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py b/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
new file mode 100644
index 0000000..7d77171
--- /dev/null
+++ b/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
@@ -0,0 +1,25 @@
+# TESTS
+variable = "water"
+variable_a = "water intake"
+description_a = "the amount of water a person drinks per day"
+variable_b = "hydration level"
+description_b = "the level of hydration in the body"
+domain = "biology"
+
+# MOCK_RESPONSES
+test_suggest_description_expected_response = "<description>Water is a transparent, tasteless, odorless, nearly colorless liquid that is essential for all life forms and covers approximately 71% of Earth's surface, also existing in solid (ice) and gas (vapor) states.</description>"
+test_suggest_onesided_relationship_expected_response = "<answer>A</answer>"
+test_suggest_relationship_expected_response = "<answer>Yes</answer> <reference>Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. \"Water, hydration and health.\" Nutrition reviews 68.8 (2010): 439-458.</reference>"
+# ASSERTIONS
+test_suggest_description_expected_result = [
+    "Water is a transparent, tasteless, odorless, nearly colorless liquid that is essential for all life forms and covers approximately 71% of Earth's surface, also existing in solid (ice) and gas (vapor) states."]
+test_suggest_onesided_relationship_expected_result = 1
+test__build_description_program_expected_result = {
+    'system': 'You are a helpful assistant for writing concise and peer-reviewed descriptions. Your goal \n            is to provide factual and succinct description of the given concept.',
+    'user': " Describe the concept of water.\n                    In one sentence, provide a factual and succinct description of water\n                        Let's think step-by-step to make sure that we have a proper and clear description. Then provide \n                        your final answer within the tags, <description></description>."}
+test_suggest_relationship_expected_result = (1,
+                                             [
+                                                 'Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. "Water, hydration and health." Nutrition reviews 68.8 (2010): 439-458.'])
+test__build_relationship_program_expected_result = {
+    'system': 'You are a helpful assistant on causal reasoning and biology. Your goal is to answer \n            questions about cause and effect in a factual and concise way.',
+    'user': "can changing water intake change hydration level? Answer Yes or No.At each step, each expert include a reference to a research paper that supports \n                    their argument. They will provide a one sentence summary of the paper and how it supports their argument. \n                        Then they will answer whether a change in water intake changes hydration level. Answer Yes or No.\n                        When consensus is reached, thinking carefully and factually, explain the council's answer. Provide \n                        the answer within the tags, <answer>Yes/No</answer>, and the most influential reference within \n                        the tags <reference>Author, Title, Year of publication</reference>.\n                        \n\n\n----------------\n\n\n<answer>Yes</answer>\n<reference>Author, Title, Year of \n                        publication</reference>\n\n\n----------------\n\n\n<answer>No</answer> {~/user}"}
diff --git a/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py b/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
new file mode 100644
index 0000000..b8638d1
--- /dev/null
+++ b/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
@@ -0,0 +1,56 @@
+# TESTS
+test_vars = ["smoking", "lung cancer", "exercise habits", "air pollution exposure"]
+domain_expertises = ['Epidemiology']
+
+# MOCK RESPONSES
+test_latent_confounders_expected_response = "<confounding_factor>socio-economic status</confounding_factor> <confounding_factor>mental health</confounding_factor>"
+test_negative_controls_expected_response = "<negative_control>exercise habits</negative_control>"
+test_parent_critique_expected_response = "None"
+test_children_critique_expected_response = "<influenced_factor>lung cancer</influenced_factor>"
+test_pairwise_critique_expected_response = "The answer is <answer>A</answer>"
+test_critique_graph_parent_expected_response = ["None",
+                                                "<influencing_factor>smoking</influencing_factor> <influencing_factor>air pollution exposure</influencing_factor>",
+                                                "<influencing_factor>air pollution exposure</influencing_factor>",
+                                                "None"]
+test_critique_graph_children_expected_response = ["<influenced_factor>lung cancer</influenced_factor>",
+                                                  "<influenced_factor>exercise habits</influenced_factor>",
+                                                  "<influenced_factor>lung cancer</influenced_factor>",
+                                                  "<influenced_factor>lung cancer</influenced_factor> <influenced_factor>exercise habits</influenced_factor>"]
+test_critique_graph_pairwise_expected_response = ["<answer>A</answer>", "<answer>A</answer>", "<answer>C</answer>",
+                                                  "<answer>B</answer>", "<answer>B</answer>", "<answer>B</answer>"]
+
+# ASSERTIONS
+test_latent_confounders_expected_results = ({'mental health': 1, 'socio-economic status': 1},
+                                            ['socio-economic status', 'mental health'])
+test_negative_controls_expected_results = ({'exercise habits': 1}, ['exercise habits'])
+test_parent_critique_expected_results = []
+test_children_critique_expected_results = ['lung cancer']
+test_pairwise_critique_expected_results = ('smoking', 'lung cancer')
+test_critique_graph_parent_expected_results = ({('air pollution exposure', 'exercise habits'): 1,
+                                                ('air pollution exposure', 'lung cancer'): 1,
+                                                ('air pollution exposure', 'smoking'): 1,
+                                                ('smoking', 'lung cancer'): 1},
+                                               {('air pollution exposure', 'exercise habits'): 1,
+                                                ('air pollution exposure', 'lung cancer'): 1,
+                                                ('smoking', 'lung cancer'): 1})
+test_critique_graph_children_expected_results = ({('air pollution exposure', 'smoking'): 1,
+                                                  ('exercise habits', 'air pollution exposure'): 1,
+                                                  ('exercise habits', 'smoking'): 1,
+                                                  ('lung cancer', 'air pollution exposure'): 1,
+                                                  ('lung cancer', 'exercise habits'): 1,
+                                                  ('lung cancer', 'smoking'): 1},
+                                                 {('exercise habits', 'air pollution exposure'): 1,
+                                                  ('exercise habits', 'lung cancer'): 1,
+                                                  ('lung cancer', 'air pollution exposure'): 1,
+                                                  ('lung cancer', 'exercise habits'): 1,
+                                                  ('lung cancer', 'smoking'): 1})
+test_critique_graph_pairwise_expected_results = ({('air pollution exposure', 'exercise habits'): 1,
+                                                  ('exercise habits', 'lung cancer'): 1,
+                                                  ('smoking', 'air pollution exposure'): 1,
+                                                  ('smoking', 'exercise habits'): 1,
+                                                  ('smoking', 'lung cancer'): 1},
+                                                 {('smoking', 'lung cancer'): 1,
+                                                  ('smoking', 'exercise habits'): 1,
+                                                  ('exercise habits', 'lung cancer'): 1,
+                                                  ('air pollution exposure', 'lung cancer'): 1,
+                                                  ('air pollution exposure', 'exercise habits'): 1})
diff --git a/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py b/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
new file mode 100644
index 0000000..f1bbf96
--- /dev/null
+++ b/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
@@ -0,0 +1,56 @@
+import unittest
+from unittest.mock import MagicMock
+from guidance.models._openai import OpenAI
+
+from pywhyllm.suggesters.tuebingen_model_suggester import TuebingenModelSuggester, Strategy
+from pywhyllm.tests.model_suggester.data_providers.tuebingen_model_suggester_data_provider import *
+
+
+class TestTuebingenModelSuggester(unittest.TestCase):
+    def test_suggest_description(self):
+        modeler = TuebingenModelSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_description_expected_response)
+        result = modeler.suggest_description(variable)
+        assert result == test_suggest_description_expected_result
+
+    def test_suggest_onesided_relationship(self):
+        modeler = TuebingenModelSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_onesided_relationship_expected_response)
+        result = modeler.suggest_onesided_relationship(variable_a, description_a, variable_b, description_b)
+        assert result == test_suggest_onesided_relationship_expected_result
+
+    def test__build_description_program(self):
+        modeler = TuebingenModelSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        result = modeler._build_description_program(variable)
+        assert result == test__build_description_program_expected_result
+
+    def test_suggest_relationship(self):
+        modeler = TuebingenModelSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_relationship_expected_response)
+        result = modeler.suggest_relationship(variable_a, variable_b, description_a, description_b, domain,
+                                              strategy=Strategy.ToT_Single, ask_reference=True)
+        assert result == test_suggest_relationship_expected_result
+
+    def test__build_relationship_program(self):
+        modeler = TuebingenModelSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        result = modeler._build_relationship_program(variable_a, description_a, variable_b, description_b, domain,
+                                                     use_description=False, ask_reference=True)
+        assert result == test__build_relationship_program_expected_result
diff --git a/pywhyllm/tests/model_suggester/test_validation_suggester.py b/pywhyllm/tests/model_suggester/test_validation_suggester.py
new file mode 100644
index 0000000..7b99b3d
--- /dev/null
+++ b/pywhyllm/tests/model_suggester/test_validation_suggester.py
@@ -0,0 +1,101 @@
+import unittest
+from typing import Dict
+from unittest.mock import MagicMock
+from guidance.models._openai import OpenAI
+
+from pywhyllm.suggesters.validation_suggester import ValidationSuggester
+from pywhyllm.tests.model_suggester.data_providers.validation_suggester_data_provider import *
+from pywhyllm.tests.model_suggester.data_providers.model_suggester_data_provider import *
+from pywhyllm.helpers import RelationshipStrategy
+
+
+class TestValidationSuggester(unittest.TestCase):
+    def test_request_latent_confounders_expected_response(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_latent_confounders_expected_response)
+
+        latent_confounders_counter: Dict[str, int] = dict()
+        result = modeler.request_latent_confounders(test_vars[0], test_vars[1], latent_confounders_counter,
+                                                    domain_expertises[0])
+
+        assert result == test_latent_confounders_expected_results
+
+    def test_request_negative_controls_expected_response(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_negative_controls_expected_response)
+
+        negative_controls_counter: Dict[str, int] = dict()
+        result = modeler.request_negative_controls(test_vars[0], test_vars[1], test_vars, negative_controls_counter,
+                                                   domain_expertises[0])
+
+        assert result == test_negative_controls_expected_results
+
+    def test_request_parent_critique_expected_response(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_parent_critique_expected_response)
+
+        result = modeler.request_parent_critique(test_vars[0], test_vars, domain_expertises[0])
+
+        assert result == test_parent_critique_expected_results
+
+    def test_request_children_critique_expected_response(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_children_critique_expected_response)
+
+        result = modeler.request_children_critique(test_vars[0], test_vars, domain_expertises[0])
+
+        assert result == test_children_critique_expected_results
+
+    def test_pairwise_critique_expected_response(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+        mock_llm.__getitem__ = MagicMock(return_value=test_pairwise_critique_expected_response)
+        result = modeler.request_pairwise_critique(domain_expertises[0], test_vars[0], test_vars[1])
+        assert result == test_pairwise_critique_expected_results
+
+    def test_critique_graph(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+        # parent
+        mock_llm.__getitem__ = MagicMock(side_effect=test_critique_graph_parent_expected_response)
+        result = modeler.critique_graph(test_vars, test_suggest_relationships_parent_expected_results,
+                                        domain_expertises, RelationshipStrategy.Parent)
+
+        assert result == test_critique_graph_parent_expected_results
+
+        mock_llm.__getitem__ = MagicMock(side_effect=test_critique_graph_children_expected_response)
+        result = modeler.critique_graph(test_vars, test_suggest_relationships_child_expected_results,
+                                        domain_expertises, RelationshipStrategy.Child)
+
+        assert result == test_critique_graph_children_expected_results
+
+        mock_llm.__getitem__ = MagicMock(side_effect=test_critique_graph_pairwise_expected_response)
+        result = modeler.critique_graph(test_vars, test_suggest_relationships_pairwise_expected_results,
+                                        domain_expertises, RelationshipStrategy.Pairwise)
+        assert result == test_critique_graph_pairwise_expected_results

From 0a620f78be81ba4d3b9eaf847fdd09abc2e8f416 Mon Sep 17 00:00:00 2001
From: Grace Sng <grace.sng75@gmail.com>
Date: Sun, 30 Mar 2025 23:44:46 -0500
Subject: [PATCH 2/4] Expand suggester tests; drop stray {{~/user}} prompt tags
 and the unused suggest_description context parameter.

Signed-off-by: Grace Sng <grace.sng75@gmail.com>
---
 .../suggesters/tuebingen_model_suggester.py   |  6 +-
 pywhyllm/suggesters/validation_suggester.py   |  2 +-
 ...tuebingen_model_suggester_data_provider.py | 59 +++++++++++++++----
 .../validation_suggester_data_provider.py     | 11 +++-
 .../test_identification_suggester.py          | 11 +++-
 .../test_tuebingen_model_suggester.py         | 39 +++++++++---
 .../test_validation_suggester.py              | 40 ++++++++++---
 7 files changed, 132 insertions(+), 36 deletions(-)

diff --git a/pywhyllm/suggesters/tuebingen_model_suggester.py b/pywhyllm/suggesters/tuebingen_model_suggester.py
index 0e73c72..528d938 100644
--- a/pywhyllm/suggesters/tuebingen_model_suggester.py
+++ b/pywhyllm/suggesters/tuebingen_model_suggester.py
@@ -21,7 +21,7 @@ def __init__(self, llm=None):
         super().__init__(llm)
 
     def suggest_description(
-            self, variable, context=None, ask_reference=False
+            self, variable, ask_reference=False
     ):
         generate_description = self._build_description_program(variable)
 
@@ -255,11 +255,11 @@ def _build_relationship_program(
                         the answer within the tags, <answer>Yes/No</answer>, and the most influential reference within 
                         the tags <reference>Author, Title, Year of publication</reference>.
                         \n\n\n----------------\n\n\n<answer>Yes</answer>\n<reference>Author, Title, Year of 
-                        publication</reference>\n\n\n----------------\n\n\n<answer>No</answer> {{~/user}}"""
+                        publication</reference>\n\n\n----------------\n\n\n<answer>No</answer>"""
                 else:
                     query["user"] += """When consensus is reached, thinking carefully and factually, explain the council's answer. 
                     Provide the answer within the tags, <answer>Yes/No</answer>.
-                        \n\n\n----------------\n\n\n<answer>Yes</answer>\n\n\n----------------\n\n\n<answer>No</answer> {{~/user}}"""
+                        \n\n\n----------------\n\n\n<answer>Yes</answer>\n\n\n----------------\n\n\n<answer>No</answer>"""
 
             elif use_strategy == Strategy.CoT:
                 if use_description:
diff --git a/pywhyllm/suggesters/validation_suggester.py b/pywhyllm/suggesters/validation_suggester.py
index 7a7fa7e..3c07225 100644
--- a/pywhyllm/suggesters/validation_suggester.py
+++ b/pywhyllm/suggesters/validation_suggester.py
@@ -65,7 +65,7 @@ def request_negative_controls(
             factors_list: list(),
             negative_controls_counter: list(),
             domain_expertise: str,
-            analysis_context: list = CONTEXT
+            analysis_context = CONTEXT
     ):
         negative_controls_list: List[str] = list()
 
diff --git a/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py b/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
index 7d77171..2008ba4 100644
--- a/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
+++ b/pywhyllm/tests/model_suggester/data_providers/tuebingen_model_suggester_data_provider.py
@@ -8,18 +8,55 @@
 
 # MOCK_RESPONSES
 test_suggest_description_expected_response = "<description>Water is a transparent, tasteless, odorless, nearly colorless liquid that is essential for all life forms and covers approximately 71% of Earth's surface, also existing in solid (ice) and gas (vapor) states.</description>"
-test_suggest_onesided_relationship_expected_response = "<answer>A</answer>"
-test_suggest_relationship_expected_response = "<answer>Yes</answer> <reference>Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. \"Water, hydration and health.\" Nutrition reviews 68.8 (2010): 439-458.</reference>"
+test_suggest_onesided_relationship_a_cause_b_expected_response = "<answer>A</answer>"
+test_suggest_onesided_relationship_a_not_cause_b_expected_response = "<answer>B</answer>"
+test_suggest_relationship_a_cause_b_expected_response = "<answer>Yes</answer> <reference>Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. \"Water, hydration and health.\" Nutrition reviews 68.8 (2010): 439-458.</reference>"
+test_suggest_relationship_a_not_cause_b_expected_response = "<answer>No</answer> <reference>Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. \"Water, hydration and health.\" Nutrition reviews 68.8 (2010): 439-458.</reference>"
+
 # ASSERTIONS
-test_suggest_description_expected_result = [
-    "Water is a transparent, tasteless, odorless, nearly colorless liquid that is essential for all life forms and covers approximately 71% of Earth's surface, also existing in solid (ice) and gas (vapor) states."]
-test_suggest_onesided_relationship_expected_result = 1
-test__build_description_program_expected_result = {
+test_suggest_description_expected_result = ([
+                                                "Water is a transparent, tasteless, odorless, nearly colorless liquid that is essential for all life forms and covers approximately 71% of Earth's surface, also existing in solid (ice) and gas (vapor) states."],
+                                            [])
+test_suggest_onesided_relationship_a_cause_b_expected_result = 1
+test_suggest_onesided_relationship_a_not_cause_b_expected_result = 0
+test__build_description_program_no_context_no_reference_expected_result = {
     'system': 'You are a helpful assistant for writing concise and peer-reviewed descriptions. Your goal \n            is to provide factual and succinct description of the given concept.',
     'user': " Describe the concept of water.\n                    In one sentence, provide a factual and succinct description of water\n                        Let's think step-by-step to make sure that we have a proper and clear description. Then provide \n                        your final answer within the tags, <description></description>."}
-test_suggest_relationship_expected_result = (1,
-                                             [
-                                                 'Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. "Water, hydration and health." Nutrition reviews 68.8 (2010): 439-458.'])
+test__build_description_program_no_context_with_reference_expected_result = {
+    'system': 'You are a helpful assistant for writing concise and peer-reviewed descriptions. Your goal \n            is to provide factual and succinct description of the given concept.',
+    'user': ' Describe the concept of water.\n                    In one sentence, provide a factual and succinct description of water"\n                        Then provide two research papers that support your description.\n                        Let\'s think step-by-step to make sure that we have a proper and clear description. Then provide \n                        your final answer within the tags, <description></description>, and each research paper within the \n                        tags <paper></paper>.'}
+test__build_description_program_with_context_with_reference_expected_result = {
+    'system': 'You are a helpful assistant for writing concise and peer-reviewed descriptions. Your goal is \n            to provide factual and succinct descriptions related to the given concept and context.',
+    'user': "Using this context about the particular variable, describe the concept of water.\n            In one sentence, provide a factual and succinct description of waterThen provide two research papers that support your description.\n                Let's think step-by-step to make sure that we have a proper and clear description. Then provide your final \n                answer within the tags, <description></description>, and each research paper within the tags <reference></reference>."}
+test__build_description_program_with_context_no_reference_expected_result = {
+    'system': 'You are a helpful assistant for writing concise and peer-reviewed descriptions. Your goal is \n            to provide factual and succinct descriptions related to the given concept and context.',
+    'user': "Using this context about the particular variable, describe the concept of water.\n            In one sentence, provide a factual and succinct description of water\n                    Let's think step-by-step to make sure that we have a proper and clear description. Then provide your final \n                    answer within the tags, <description></description>."}
+test_suggest_relationship_a_cause_b_expected_result = (1,
+                                                       [
+                                                           'Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. "Water, hydration and health." Nutrition reviews 68.8 (2010): 439-458.'])
+test_suggest_relationship_a_not_cause_b_expected_result = (0,
+                                                           [
+                                                               'Popkin, Barry M., Kristen E. D\'Anci, and Irwin H. Rosenberg. "Water, hydration and health." Nutrition reviews 68.8 (2010): 439-458.'])
 test__build_relationship_program_expected_result = {
-    'system': 'You are a helpful assistant on causal reasoning and biology. Your goal is to answer \n            questions about cause and effect in a factual and concise way.',
-    'user': "can changing water intake change hydration level? Answer Yes or No.At each step, each expert include a reference to a research paper that supports \n                    their argument. They will provide a one sentence summary of the paper and how it supports their argument. \n                        Then they will answer whether a change in water intake changes hydration level. Answer Yes or No.\n                        When consensus is reached, thinking carefully and factually, explain the council's answer. Provide \n                        the answer within the tags, <answer>Yes/No</answer>, and the most influential reference within \n                        the tags <reference>Author, Title, Year of publication</reference>.\n                        \n\n\n----------------\n\n\n<answer>Yes</answer>\n<reference>Author, Title, Year of \n                        publication</reference>\n\n\n----------------\n\n\n<answer>No</answer> {~/user}"}
+    'system': 'You are a helpful assistant on causal reasoning and biology. Your '
+              'goal is to answer \n'
+              '            questions about cause and effect in a factual and '
+              'concise way.',
+    'user': 'can changing water intake change hydration level? Answer Yes or '
+            'No.When consensus is reached, thinking carefully and factually, '
+            "explain the council's answer. \n"
+            '                    Provide the answer within the tags, '
+            '<answer>Yes/No</answer>.\n'
+            '                        \n'
+            '\n'
+            '\n'
+            '----------------\n'
+            '\n'
+            '\n'
+            '<answer>Yes</answer>\n'
+            '\n'
+            '\n'
+            '----------------\n'
+            '\n'
+            '\n'
+            '<answer>No</answer>'}
diff --git a/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py b/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
index b8638d1..0780157 100644
--- a/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
+++ b/pywhyllm/tests/model_suggester/data_providers/validation_suggester_data_provider.py
@@ -20,9 +20,14 @@
                                                   "<answer>B</answer>", "<answer>B</answer>", "<answer>B</answer>"]
 
 # ASSERTIONS
-test_latent_confounders_expected_results = ({'mental health': 1, 'socio-economic status': 1},
-                                            ['socio-economic status', 'mental health'])
-test_negative_controls_expected_results = ({'exercise habits': 1}, ['exercise habits'])
+test_suggest_latent_confounders_expected_results = ({'mental health': 1, 'socio-economic status': 1},
+                                                    [{'mental health': 1, 'socio-economic status': 1},
+                                                     ['socio-economic status', 'mental health']])
+test_request_latent_confounders_expected_results = ({'mental health': 1, 'socio-economic status': 1},
+                                                    ['socio-economic status', 'mental health'])
+test_suggest_negative_controls_expected_results = (
+{'exercise habits': 1}, [{'exercise habits': 1}, ['exercise habits']])
+test_request_negative_controls_expected_results = ({'exercise habits': 1}, ['exercise habits'])
 test_parent_critique_expected_results = []
 test_children_critique_expected_results = ['lung cancer']
 test_pairwise_critique_expected_results = ('smoking', 'lung cancer')
diff --git a/pywhyllm/tests/model_suggester/test_identification_suggester.py b/pywhyllm/tests/model_suggester/test_identification_suggester.py
index 1040452..5515db4 100644
--- a/pywhyllm/tests/model_suggester/test_identification_suggester.py
+++ b/pywhyllm/tests/model_suggester/test_identification_suggester.py
@@ -3,13 +3,20 @@
 from guidance.models._openai import OpenAI
 
 from pywhyllm.suggesters.identification_suggester import IdentificationSuggester
+from pywhyllm.suggesters.model_suggester import ModelSuggester
 from pywhyllm.tests.model_suggester.data_providers.model_suggester_data_provider import *
 from pywhyllm.tests.model_suggester.data_providers.identification_suggester_data_provider import *
-from pywhyllm.tests.model_suggester.test_model_suggester import TestModelSuggester
 
 class TestIdentificationSuggester(unittest.TestCase):
     def test_suggest_backdoor(self):
-        return TestModelSuggester().test_suggest_confounders()
+        modeler = IdentificationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+        mock_model_suggester = MagicMock(spec=ModelSuggester)
+        modeler.model_suggester = mock_model_suggester
+        mock_model_suggester.suggest_confounders = MagicMock(return_value=test_suggest_confounders_expected_results)
+        result = modeler.suggest_backdoor(test_vars[0], test_vars[1], test_vars, test_domain_expertises_expected_result)
+        assert result == test_suggest_confounders_expected_results
 
     def test_suggest_mediators(self):
         modeler = IdentificationSuggester()
diff --git a/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py b/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
index f1bbf96..c90e308 100644
--- a/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
+++ b/pywhyllm/tests/model_suggester/test_tuebingen_model_suggester.py
@@ -14,7 +14,7 @@ def test_suggest_description(self):
 
         mock_llm.__add__ = MagicMock(return_value=mock_llm)
         mock_llm.__getitem__ = MagicMock(return_value=test_suggest_description_expected_response)
-        result = modeler.suggest_description(variable)
+        result = modeler.suggest_description(variable, True)
         assert result == test_suggest_description_expected_result
 
     def test_suggest_onesided_relationship(self):
@@ -23,17 +23,32 @@ def test_suggest_onesided_relationship(self):
         modeler.llm = mock_llm
 
         mock_llm.__add__ = MagicMock(return_value=mock_llm)
-        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_onesided_relationship_expected_response)
+        # Given the two variables and their descriptions, variable a causes variable b
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_onesided_relationship_a_cause_b_expected_response)
         result = modeler.suggest_onesided_relationship(variable_a, description_a, variable_b, description_b)
-        assert result == test_suggest_onesided_relationship_expected_result
+        assert result == test_suggest_onesided_relationship_a_cause_b_expected_result
+
+        # Given the two variables and their descriptions, variable a does not cause variable b
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_onesided_relationship_a_not_cause_b_expected_response)
+        result = modeler.suggest_onesided_relationship(variable_a, description_a, variable_b, description_b)
+        assert result == test_suggest_onesided_relationship_a_not_cause_b_expected_result
 
     def test__build_description_program(self):
         modeler = TuebingenModelSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm
-
-        result = modeler._build_description_program(variable)
-        assert result == test__build_description_program_expected_result
+        # Test no context, no reference
+        result = modeler._build_description_program(variable, False, False)
+        assert result == test__build_description_program_no_context_no_reference_expected_result
+        # Test no context, with reference
+        result = modeler._build_description_program(variable, False, True)
+        assert result == test__build_description_program_no_context_with_reference_expected_result
+        # Test with context, no reference
+        result = modeler._build_description_program(variable, True, False)
+        assert result == test__build_description_program_with_context_no_reference_expected_result
+        # Test with context, with reference
+        result = modeler._build_description_program(variable, True, True)
+        assert result == test__build_description_program_with_context_with_reference_expected_result
 
     def test_suggest_relationship(self):
         modeler = TuebingenModelSuggester()
@@ -41,10 +56,16 @@ def test_suggest_relationship(self):
         modeler.llm = mock_llm
 
         mock_llm.__add__ = MagicMock(return_value=mock_llm)
-        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_relationship_expected_response)
+        # Given the two variables and their descriptions, variable a causes variable b
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_relationship_a_cause_b_expected_response)
+        result = modeler.suggest_relationship(variable_a, variable_b, description_a, description_b, domain,
+                                              strategy=Strategy.ToT_Single, ask_reference=True)
+        assert result == test_suggest_relationship_a_cause_b_expected_result
+        # Given the two variables and their descriptions, variable a does not cause variable b
+        mock_llm.__getitem__ = MagicMock(return_value=test_suggest_relationship_a_not_cause_b_expected_response)
         result = modeler.suggest_relationship(variable_a, variable_b, description_a, description_b, domain,
                                               strategy=Strategy.ToT_Single, ask_reference=True)
-        assert result == test_suggest_relationship_expected_result
+        assert result == test_suggest_relationship_a_not_cause_b_expected_result
 
     def test__build_relationship_program(self):
         modeler = TuebingenModelSuggester()
@@ -52,5 +73,5 @@ def test__build_relationship_program(self):
         modeler.llm = mock_llm
 
         result = modeler._build_relationship_program(variable_a, description_a, variable_b, description_b, domain,
-                                                     use_description=False, ask_reference=True)
+                                                     use_description=False, ask_reference=False)
         assert result == test__build_relationship_program_expected_result
diff --git a/pywhyllm/tests/model_suggester/test_validation_suggester.py b/pywhyllm/tests/model_suggester/test_validation_suggester.py
index 7b99b3d..a7bef47 100644
--- a/pywhyllm/tests/model_suggester/test_validation_suggester.py
+++ b/pywhyllm/tests/model_suggester/test_validation_suggester.py
@@ -10,7 +10,20 @@
 
 
 class TestValidationSuggester(unittest.TestCase):
-    def test_request_latent_confounders_expected_response(self):
+    def test_suggest_latent_confounders(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_latent_confounders_expected_response)
+
+        result = modeler.suggest_latent_confounders(test_vars[0], test_vars[1], domain_expertises)
+
+        assert result == test_suggest_latent_confounders_expected_results
+
+    def test_request_latent_confounders(self):
         modeler = ValidationSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm
@@ -23,9 +36,22 @@ def test_request_latent_confounders_expected_response(self):
         result = modeler.request_latent_confounders(test_vars[0], test_vars[1], latent_confounders_counter,
                                                     domain_expertises[0])
 
-        assert result == test_latent_confounders_expected_results
+        assert result == test_request_latent_confounders_expected_results
+
+    def test_suggest_negative_controls(self):
+        modeler = ValidationSuggester()
+        mock_llm = MagicMock(spec=OpenAI)
+        modeler.llm = mock_llm
+
+        mock_llm.__add__ = MagicMock(return_value=mock_llm)
+
+        mock_llm.__getitem__ = MagicMock(return_value=test_negative_controls_expected_response)
+
+        result = modeler.suggest_negative_controls(test_vars[0], test_vars[1], test_vars, domain_expertises)
+
+        assert result == test_suggest_negative_controls_expected_results
 
-    def test_request_negative_controls_expected_response(self):
+    def test_request_negative_controls(self):
         modeler = ValidationSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm
@@ -38,9 +64,9 @@ def test_request_negative_controls_expected_response(self):
         result = modeler.request_negative_controls(test_vars[0], test_vars[1], test_vars, negative_controls_counter,
                                                    domain_expertises[0])
 
-        assert result == test_negative_controls_expected_results
+        assert result == test_request_negative_controls_expected_results
 
-    def test_request_parent_critique_expected_response(self):
+    def test_request_parent_critique(self):
         modeler = ValidationSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm
@@ -53,7 +79,7 @@ def test_request_parent_critique_expected_response(self):
 
         assert result == test_parent_critique_expected_results
 
-    def test_request_children_critique_expected_response(self):
+    def test_request_children_critique(self):
         modeler = ValidationSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm
@@ -66,7 +92,7 @@ def test_request_children_critique_expected_response(self):
 
         assert result == test_children_critique_expected_results
 
-    def test_pairwise_critique_expected_response(self):
+    def test_request_pairwise_critique(self):
         modeler = ValidationSuggester()
         mock_llm = MagicMock(spec=OpenAI)
         modeler.llm = mock_llm

From 4d9ee64b1cd79fea00aab92eb3269f54307c113d Mon Sep 17 00:00:00 2001
From: Grace Sng <grace.sng75@gmail.com>
Date: Sun, 30 Mar 2025 23:51:36 -0500
Subject: [PATCH 3/4] Add child/pairwise section comments to test_critique_graph.

Signed-off-by: Grace Sng <grace.sng75@gmail.com>
---
 pywhyllm/tests/model_suggester/test_validation_suggester.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pywhyllm/tests/model_suggester/test_validation_suggester.py b/pywhyllm/tests/model_suggester/test_validation_suggester.py
index a7bef47..dd3a014 100644
--- a/pywhyllm/tests/model_suggester/test_validation_suggester.py
+++ b/pywhyllm/tests/model_suggester/test_validation_suggester.py
@@ -114,13 +114,13 @@ def test_critique_graph(self):
                                         domain_expertises, RelationshipStrategy.Parent)
 
         assert result == test_critique_graph_parent_expected_results
-
+        # child
         mock_llm.__getitem__ = MagicMock(side_effect=test_critique_graph_children_expected_response)
         result = modeler.critique_graph(test_vars, test_suggest_relationships_child_expected_results,
                                         domain_expertises, RelationshipStrategy.Child)
 
         assert result == test_critique_graph_children_expected_results
-
+        # pairwise
         mock_llm.__getitem__ = MagicMock(side_effect=test_critique_graph_pairwise_expected_response)
         result = modeler.critique_graph(test_vars, test_suggest_relationships_pairwise_expected_results,
                                         domain_expertises, RelationshipStrategy.Pairwise)

From 0b268284edd165c2e520d129a31fc22281f734f5 Mon Sep 17 00:00:00 2001
From: Grace Sng <grace.sng75@gmail.com>
Date: Mon, 31 Mar 2025 18:00:18 -0500
Subject: [PATCH 4/4] Fix default-argument spacing and join split prompt lines.

Signed-off-by: Grace Sng <grace.sng75@gmail.com>
---
 pywhyllm/suggesters/validation_suggester.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pywhyllm/suggesters/validation_suggester.py b/pywhyllm/suggesters/validation_suggester.py
index 3c07225..57518ed 100644
--- a/pywhyllm/suggesters/validation_suggester.py
+++ b/pywhyllm/suggesters/validation_suggester.py
@@ -65,7 +65,7 @@ def request_negative_controls(
             factors_list: list(),
             negative_controls_counter: list(),
             domain_expertise: str,
-            analysis_context = CONTEXT
+            analysis_context=CONTEXT
     ):
         negative_controls_list: List[str] = list()
 
@@ -82,8 +82,7 @@ def request_negative_controls(
             questions about cause and effect using your domain knowledge in the {domain_expertise}."""
 
                 with user():
-                    lm += f"""factor_names: {factors_list} From your
-                             perspective as an expert in the {domain_expertise}, what factor(s) from the list of factors, relevant to 
+                    lm += f"""factor_names: {factors_list} From your perspective as an expert in the {domain_expertise}, what factor(s) from the list of factors, relevant to 
                              the {analysis_context}, should see zero treatment effect when changing the {treatment}? Which factor(s) 
                              from the list of factors, if any at all, relevant to the {analysis_context}, are negative controls on the 
                              causal mechanisms that affect the {outcome} when changing {treatment}? Using your domain knowledge, 
@@ -176,14 +175,12 @@ def request_latent_confounders(
             try:
                 lm = self.llm
                 with system():
-                    lm += f"""You are an expert in the {domain_expertise} and are 
-                                studying the {analysis_context}. You are using your knowledge to help build a causal model that contains 
+                    lm += f"""You are an expert in the {domain_expertise} and are studying the {analysis_context}. You are using your knowledge to help build a causal model that contains 
                                 all the assumptions about the {domain_expertise}. Where a causal model is a conceptual model that describes 
                                 the causal mechanisms of a system. You will do this by by answering questions about cause and effect and 
                                 using your domain knowledge in the {domain_expertise}."""
                 with user():
-                    lm += f"""(1) From your perspective as 
-                                 an expert in the {domain_expertise}, think step by step as you consider the factors that may interact 
+                    lm += f"""(1) From your perspective as an expert in the {domain_expertise}, think step by step as you consider the factors that may interact 
                                  between the {treatment} and the {outcome}. Use your knowledge as an expert in the {domain_expertise} to 
                                  describe the confounders, if there are any at all, between the {treatment} and the {outcome}. Be concise 
                                  and keep your thinking within two paragraphs. Then provide your step by step chain of thoughts within the