"""This sample demonstrates how to use the Azure AI Foundry SDK to evaluate models from the GitHub model catalog.

It uses your endpoint and key. The calls are synchronous.

+If you have Azure credentials, you can also run the risk and safety evaluators from Azure AI.
+
Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
"""
@@ -16,10 +18,13 @@


token = os.environ['GITHUB_TOKEN']
-inferencing_model_name = "gpt-4o-mini"
-evaluation_model_name = "gpt-4o-mini"
-api_version = "2024-08-01-preview"
-endpoint = "https://models.inference.ai.azure.com"
+
+# Target model is the model to be evaluated.
+target_model_name = "Mistral-small"
+target_model_endpoint = "https://models.inference.ai.azure.com"
+# Judge model is the model to evaluate the target model.
+judge_model_name = "gpt-4o-mini"
+judge_model_endpoint = "https://models.inference.ai.azure.com"

evaluation_name = "GitHub models evaluation"
eval_data_file = Path("./eval_data.jsonl")
@@ -39,7 +44,7 @@ def generate_eval_data():
    with eval_data_file.open("w") as f:
        for eval_data_query in eval_data_queries:
            client = ChatCompletionsClient(
-                endpoint=endpoint,
+                endpoint=target_model_endpoint,
                credential=AzureKeyCredential(token),
            )
@@ -49,7 +54,7 @@ def generate_eval_data():
                    SystemMessage(content=context),
                    UserMessage(content=eval_data_query["query"]),
                ],
-                model=inferencing_model_name,
+                model=target_model_name,
                temperature=1.,
                max_tokens=1000,
                top_p=1.
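
The hunk above ends inside the `client.complete(...)` call. For orientation, here is a minimal sketch of how the surrounding loop plausibly turns each response into a JSONL record; the field names `query`, `response`, and `context` are inferred from the evaluator calls later in this diff, and the exact record shape is an assumption, not part of the commit.

# Sketch (assumed, not from this commit): write one JSON line per query, with
# the fields the evaluators read back later (query / response / context).
response = client.complete(
    messages=[
        SystemMessage(content=context),
        UserMessage(content=eval_data_query["query"]),
    ],
    model=target_model_name,
    temperature=1.,
    max_tokens=1000,
    top_p=1.,
)
record = {
    "query": eval_data_query["query"],
    "context": context,
    "response": response.choices[0].message.content,
}
f.write(json.dumps(record) + "\n")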
@@ -69,10 +74,9 @@ def generate_eval_data():

def run_perf_and_quality_evaluators():
    model_config = {
-        "azure_endpoint": endpoint,
-        "azure_deployment": evaluation_model_name,
+        "azure_endpoint": judge_model_endpoint,
+        "azure_deployment": judge_model_name,
        "api_key": token,
-        "api_version": api_version,
    }

    evaluators = {
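
A minimal sketch of how a `model_config` like the one above is consumed by a quality evaluator from the azure-ai-evaluation package; the evaluator choice and sample inputs here are illustrative assumptions, not taken from the commit.

from azure.ai.evaluation import RelevanceEvaluator

# The judge model named in model_config scores the target model's output.
relevance_evaluator = RelevanceEvaluator(model_config=model_config)
score = relevance_evaluator(
    query="What is the capital of France?",      # illustrative input
    response="Paris is the capital of France.",  # illustrative output
)
print(score)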
@@ -118,6 +122,7 @@ def run_risk_and_safety_evaluators_with_azure():
        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
    }

+    risk_and_safety_result_dict = {}
    with eval_data_file.open("r") as f:
        for line in f:
            eval_data = json.loads(line)
@@ -127,6 +132,10 @@ def run_risk_and_safety_evaluators_with_azure():
            else:
                score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
            print(f"{name}: {score}")
+            risk_and_safety_result_dict[name] = score
+
+    with eval_result_file_risk_and_safety.open("w") as f:
+        f.write(json.dumps(risk_and_safety_result_dict, indent=4))


if __name__ == "__main__":
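
Note that `eval_result_file_risk_and_safety` is referenced in the added lines but defined outside the hunks shown here; presumably it mirrors `eval_data_file` near the top of the file, along the lines of this assumed sketch:

from pathlib import Path

# Assumed definition (not visible in this diff): where the risk and safety
# scores collected above are written as pretty-printed JSON.
eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")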
@@ -136,5 +145,5 @@ def run_risk_and_safety_evaluators_with_azure():
    # Run performance and quality evaluators with GitHub model catalog.
    run_perf_and_quality_evaluators()

-    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
    # run_risk_and_safety_evaluators_with_azure()
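
For readers who do uncomment `run_risk_and_safety_evaluators_with_azure()`: the `azure_ai_project` value it relies on (also defined outside the hunks shown) follows the shape the Azure Evaluation SDK documents for an Azure AI project reference; the placeholder values below are assumptions.

# Assumed shape of the Azure AI project reference used by the risk and safety
# evaluators; fill in your own identifiers.
azure_ai_project = {
    "subscription_id": "<your-subscription-id>",
    "resource_group_name": "<your-resource-group>",
    "project_name": "<your-ai-foundry-project>",
}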