From 2548c0799825b563dc988ed039be8ca74bac7641 Mon Sep 17 00:00:00 2001
From: David Wu
Date: Wed, 11 Dec 2024 14:59:33 -0800
Subject: [PATCH 1/2] Generate evaluation data and run evaluators

---
 .../python/azure_ai_evaluation/evaluation.py | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 samples/python/azure_ai_evaluation/evaluation.py

diff --git a/samples/python/azure_ai_evaluation/evaluation.py b/samples/python/azure_ai_evaluation/evaluation.py
new file mode 100644
index 0000000..cb51e41
--- /dev/null
+++ b/samples/python/azure_ai_evaluation/evaluation.py
@@ -0,0 +1,140 @@
+"""This sample demonstrates how to use Azure AI Foundry SDK to run GitHub model catalog with evaluation.
+It is leveraging your endpoint and key. The call is synchronous.
+
+Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
+"""
+
+import os
+import json
+from pathlib import Path
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from azure.ai import evaluation
+from azure.ai.evaluation import RougeType, evaluate
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import DefaultAzureCredential
+
+
+token = os.environ['GITHUB_TOKEN']
+inferencing_model_name = "gpt-4o-mini"
+evaluation_model_name = "gpt-4o-mini"
+api_version = "2024-08-01-preview"
+endpoint = "https://models.inference.ai.azure.com"
+
+evaluation_name = "GitHub models evaluation"
+eval_data_file = Path("./eval_data.jsonl")
+eval_result_file_perf_and_quality = Path("./eval_result_perf_and_quality.json")
+eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")
+
+
+def generate_eval_data():
+    eval_data_queries = [{
+        "query": "What is the capital of France?",
+        "ground_truth": "Paris",
+    }, {
+        "query": "Where is Wineglass Bay?",
+        "ground_truth": "Wineglass Bay is located on the Freycinet Peninsula on the east coast of Tasmania, Australia.",
+    }]
+
+    with eval_data_file.open("w") as f:
+        for eval_data_query in eval_data_queries:
+            client = ChatCompletionsClient(
+                endpoint=endpoint,
+                credential=AzureKeyCredential(token),
+            )
+
+            context = "You are a geography teacher."
+            response = client.complete(
+                messages=[
+                    SystemMessage(content=context),
+                    UserMessage(content=eval_data_query["query"]),
+                ],
+                model=inferencing_model_name,
+                temperature=1.,
+                max_tokens=1000,
+                top_p=1.
+            )
+            result = response.choices[0].message.content
+
+            eval_data = {
+                "id": "1",
+                "description": "Evaluate the model",
+                "query": eval_data_query["query"],
+                "context": context,
+                "response": result,
+                "ground_truth": eval_data_query["ground_truth"],
+            }
+            f.write(json.dumps(eval_data) + "\n")
+
+
+def run_perf_and_quality_evaluators():
+    model_config = {
+        "azure_endpoint": endpoint,
+        "azure_deployment": evaluation_model_name,
+        "api_key": token,
+        "api_version": api_version,
+    }
+
+    evaluators = {
+        "BleuScoreEvaluator": evaluation.BleuScoreEvaluator(),
+        "F1ScoreEvaluator": evaluation.F1ScoreEvaluator(),
+        "GleuScoreEvaluator": evaluation.GleuScoreEvaluator(),
+        "MeteorScoreEvaluator": evaluation.MeteorScoreEvaluator(),
+        "RougeScoreEvaluator": evaluation.RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
+        "CoherenceEvaluator": evaluation.CoherenceEvaluator(model_config=model_config),
+        "FluencyEvaluator": evaluation.FluencyEvaluator(model_config=model_config),
+        "GroundednessEvaluator": evaluation.GroundednessEvaluator(model_config=model_config),
+        "QAEvaluator": evaluation.QAEvaluator(model_config=model_config, _parallel=False),
+        "RelevanceEvaluator": evaluation.RelevanceEvaluator(model_config=model_config),
+        "RetrievalEvaluator": evaluation.RetrievalEvaluator(model_config=model_config),
+        "SimilarityEvaluator": evaluation.SimilarityEvaluator(model_config=model_config),
+    }
+
+    eval_results = evaluate(
+        data=eval_data_file,
+        evaluators=evaluators,
+        evaluation_name=evaluation_name,
+        target=None,
+        output_path=eval_result_file_perf_and_quality,
+    )
+    print(json.dumps(eval_results, indent=4))
+
+
+def run_risk_and_safety_evaluators_with_azure():
+    azure_ai_project = {
+        "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
+        "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP_NAME"),
+        "project_name": os.environ.get("AZURE_PROJECT_NAME"),
+    }
+    credential = DefaultAzureCredential()
+    evaluators = {
+        "ContentSafetyEvaluator": evaluation.ContentSafetyEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "HateUnfairnessEvaluator": evaluation.HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "SelfHarmEvaluator": evaluation.SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "SexualEvaluator": evaluation.SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "ViolenceEvaluator": evaluation.ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "ProtectedMaterialEvaluator": evaluation.ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "IndirectAttackEvaluator": evaluation.IndirectAttackEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+    }
+
+    with eval_data_file.open("r") as f:
+        for line in f:
+            eval_data = json.loads(line)
+            for name, evaluator in evaluators.items():
+                if name != "GroundednessProEvaluator":
+                    score = evaluator(query=eval_data["query"], response=eval_data["response"])
+                else:
+                    score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
+                print(f"{name}: {score}")
+
+
+if __name__ == "__main__":
+    # Generate evaluation data with GitHub model catalog and save it to a file.
+    generate_eval_data()
+
+    # Run performance and quality evaluators with GitHub model catalog.
+    run_perf_and_quality_evaluators()
+
+    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # run_risk_and_safety_evaluators_with_azure()

From fc35b7577a3146b15b2cf46087b52f48c1f383e2 Mon Sep 17 00:00:00 2001
From: David Wu
Date: Thu, 12 Dec 2024 14:56:02 -0800
Subject: [PATCH 2/2] Separate result JSON files as well as target/judge model parameters

---
 .../python/azure_ai_evaluation/evaluation.py | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/samples/python/azure_ai_evaluation/evaluation.py b/samples/python/azure_ai_evaluation/evaluation.py
index cb51e41..ed6a815 100644
--- a/samples/python/azure_ai_evaluation/evaluation.py
+++ b/samples/python/azure_ai_evaluation/evaluation.py
@@ -1,6 +1,8 @@
 """This sample demonstrates how to use Azure AI Foundry SDK to run GitHub model catalog with evaluation.
 It is leveraging your endpoint and key. The call is synchronous.
 
+For those who have Azure credentials, you can run the risk and safety evaluators from Azure AI.
+
 Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 """
 
@@ -16,10 +18,13 @@
 
 
 token = os.environ['GITHUB_TOKEN']
-inferencing_model_name = "gpt-4o-mini"
-evaluation_model_name = "gpt-4o-mini"
-api_version = "2024-08-01-preview"
-endpoint = "https://models.inference.ai.azure.com"
+
+# Target model is the model to be evaluated.
+target_model_name = "Mistral-small"
+target_model_endpoint = "https://models.inference.ai.azure.com"
+# Judge model is the model to evaluate the target model.
+judge_model_name = "gpt-4o-mini"
+judge_model_endpoint = "https://models.inference.ai.azure.com"
 
 evaluation_name = "GitHub models evaluation"
 eval_data_file = Path("./eval_data.jsonl")
@@ -39,7 +44,7 @@ def generate_eval_data():
     with eval_data_file.open("w") as f:
         for eval_data_query in eval_data_queries:
             client = ChatCompletionsClient(
-                endpoint=endpoint,
+                endpoint=target_model_endpoint,
                 credential=AzureKeyCredential(token),
             )
 
@@ -49,7 +54,7 @@ def generate_eval_data():
                     SystemMessage(content=context),
                     UserMessage(content=eval_data_query["query"]),
                 ],
-                model=inferencing_model_name,
+                model=target_model_name,
                 temperature=1.,
                 max_tokens=1000,
                 top_p=1.
@@ -69,10 +74,9 @@ def generate_eval_data():
 
 def run_perf_and_quality_evaluators():
     model_config = {
-        "azure_endpoint": endpoint,
-        "azure_deployment": evaluation_model_name,
+        "azure_endpoint": judge_model_endpoint,
+        "azure_deployment": judge_model_name,
         "api_key": token,
-        "api_version": api_version,
     }
 
     evaluators = {
@@ -118,6 +122,7 @@ def run_risk_and_safety_evaluators_with_azure():
         "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
     }
 
+    risk_and_safety_result_dict = {}
     with eval_data_file.open("r") as f:
         for line in f:
             eval_data = json.loads(line)
@@ -127,6 +132,10 @@ def run_risk_and_safety_evaluators_with_azure():
                 else:
                     score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
                 print(f"{name}: {score}")
+                risk_and_safety_result_dict[name] = score
+
+    with eval_result_file_risk_and_safety.open("w") as f:
+        f.write(json.dumps(risk_and_safety_result_dict, indent=4))
 
 
 if __name__ == "__main__":
@@ -136,5 +145,5 @@ def run_risk_and_safety_evaluators_with_azure():
     # Run performance and quality evaluators with GitHub model catalog.
     run_perf_and_quality_evaluators()
 
-    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
     # run_risk_and_safety_evaluators_with_azure()
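Reviewer note: a minimal smoke-test sketch for the sample after applying both patches. It is not part of the patch series; it assumes GITHUB_TOKEN is exported, the azure-ai-evaluation and azure-ai-inference packages are installed, and the working directory is samples/python/azure_ai_evaluation/. The file names come from the sample itself; the exact structure of the evaluate() output JSON is an assumption, so the snippet falls back to printing the whole payload.

    # Hypothetical smoke test -- not part of the patches above.
    # Assumes: GITHUB_TOKEN is set, azure-ai-evaluation and azure-ai-inference are installed,
    # and the working directory is samples/python/azure_ai_evaluation/.
    import json
    import subprocess
    from pathlib import Path

    # Runs generate_eval_data() and run_perf_and_quality_evaluators() from evaluation.py.
    subprocess.run(["python", "evaluation.py"], check=True)

    # eval_data.jsonl holds one JSON object per generated query/response pair.
    for line in Path("./eval_data.jsonl").read_text().splitlines():
        row = json.loads(line)
        print(row["query"], "->", row["response"][:80])

    # eval_result_perf_and_quality.json is written by evaluate() via output_path.
    results = json.loads(Path("./eval_result_perf_and_quality.json").read_text())
    print(json.dumps(results.get("metrics", results), indent=2))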
