From 2548c0799825b563dc988ed039be8ca74bac7641 Mon Sep 17 00:00:00 2001
From: David Wu
Date: Wed, 11 Dec 2024 14:59:33 -0800
Subject: [PATCH 1/2] Generate evaluation data and run evaluators

---
 .../python/azure_ai_evaluation/evaluation.py | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 samples/python/azure_ai_evaluation/evaluation.py

diff --git a/samples/python/azure_ai_evaluation/evaluation.py b/samples/python/azure_ai_evaluation/evaluation.py
new file mode 100644
index 0000000..cb51e41
--- /dev/null
+++ b/samples/python/azure_ai_evaluation/evaluation.py
@@ -0,0 +1,140 @@
+"""This sample demonstrates how to use Azure AI Foundry SDK to run GitHub model catalog with evaluation.
+It is leveraging your endpoint and key. The call is synchronous.
+
+Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
+"""
+
+import os
+import json
+from pathlib import Path
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from azure.ai import evaluation
+from azure.ai.evaluation import RougeType, evaluate
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import DefaultAzureCredential
+
+
+token = os.environ['GITHUB_TOKEN']
+inferencing_model_name = "gpt-4o-mini"
+evaluation_model_name = "gpt-4o-mini"
+api_version = "2024-08-01-preview"
+endpoint = "https://models.inference.ai.azure.com"
+
+evaluation_name = "GitHub models evaluation"
+eval_data_file = Path("./eval_data.jsonl")
+eval_result_file_perf_and_quality = Path("./eval_result_perf_and_quality.json")
+eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")
+
+
+def generate_eval_data():
+    eval_data_queries = [{
+        "query": "What is the capital of France?",
+        "ground_truth": "Paris",
+    }, {
+        "query": "Where is Wineglass Bay?",
+        "ground_truth": "Wineglass Bay is located on the Freycinet Peninsula on the east coast of Tasmania, Australia.",
+    }]
+
+    with eval_data_file.open("w") as f:
+        for eval_data_query in eval_data_queries:
+            client = ChatCompletionsClient(
+                endpoint=endpoint,
+                credential=AzureKeyCredential(token),
+            )
+
+            context = "You are a geography teacher."
+            response = client.complete(
+                messages=[
+                    SystemMessage(content=context),
+                    UserMessage(content=eval_data_query["query"]),
+                ],
+                model=inferencing_model_name,
+                temperature=1.,
+                max_tokens=1000,
+                top_p=1.
+            )
+            result = response.choices[0].message.content
+
+            eval_data = {
+                "id": "1",
+                "description": "Evaluate the model",
+                "query": eval_data_query["query"],
+                "context": context,
+                "response": result,
+                "ground_truth": eval_data_query["ground_truth"],
+            }
+            f.write(json.dumps(eval_data) + "\n")
+
+
+def run_perf_and_quality_evaluators():
+    model_config = {
+        "azure_endpoint": endpoint,
+        "azure_deployment": evaluation_model_name,
+        "api_key": token,
+        "api_version": api_version,
+    }
+
+    evaluators = {
+        "BleuScoreEvaluator": evaluation.BleuScoreEvaluator(),
+        "F1ScoreEvaluator": evaluation.F1ScoreEvaluator(),
+        "GleuScoreEvaluator": evaluation.GleuScoreEvaluator(),
+        "MeteorScoreEvaluator": evaluation.MeteorScoreEvaluator(),
+        "RougeScoreEvaluator": evaluation.RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
+        "CoherenceEvaluator": evaluation.CoherenceEvaluator(model_config=model_config),
+        "FluencyEvaluator": evaluation.FluencyEvaluator(model_config=model_config),
+        "GroundednessEvaluator": evaluation.GroundednessEvaluator(model_config=model_config),
+        "QAEvaluator": evaluation.QAEvaluator(model_config=model_config, _parallel=False),
+        "RelevanceEvaluator": evaluation.RelevanceEvaluator(model_config=model_config),
+        "RetrievalEvaluator": evaluation.RetrievalEvaluator(model_config=model_config),
+        "SimilarityEvaluator": evaluation.SimilarityEvaluator(model_config=model_config),
+    }
+
+    eval_results = evaluate(
+        data=eval_data_file,
+        evaluators=evaluators,
+        evaluation_name=evaluation_name,
+        target=None,
+        output_path=eval_result_file_perf_and_quality,
+    )
+    print(json.dumps(eval_results, indent=4))
+
+
+def run_risk_and_safety_evaluators_with_azure():
+    azure_ai_project = {
+        "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
+        "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP_NAME"),
+        "project_name": os.environ.get("AZURE_PROJECT_NAME"),
+    }
+    credential = DefaultAzureCredential()
+    evaluators = {
+        "ContentSafetyEvaluator": evaluation.ContentSafetyEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "HateUnfairnessEvaluator": evaluation.HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "SelfHarmEvaluator": evaluation.SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "SexualEvaluator": evaluation.SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "ViolenceEvaluator": evaluation.ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "ProtectedMaterialEvaluator": evaluation.ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "IndirectAttackEvaluator": evaluation.IndirectAttackEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
+    }
+
+    with eval_data_file.open("r") as f:
+        for line in f:
+            eval_data = json.loads(line)
+            for name, evaluator in evaluators.items():
+                if name != "GroundednessProEvaluator":
+                    score = evaluator(query=eval_data["query"], response=eval_data["response"])
+                else:
+                    score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
+                print(f"{name}: {score}")
+
+
+if __name__ == "__main__":
+    # Generate evaluation data with GitHub model catalog and save it to a file.
+    generate_eval_data()
+
+    # Run performance and quality evaluators with GitHub model catalog.
+    run_perf_and_quality_evaluators()
+
+    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # run_risk_and_safety_evaluators_with_azure()

From fc35b7577a3146b15b2cf46087b52f48c1f383e2 Mon Sep 17 00:00:00 2001
From: David Wu
Date: Thu, 12 Dec 2024 14:56:02 -0800
Subject: [PATCH 2/2] Separate result JSON files as well as target/judge model parameters

---
 .../python/azure_ai_evaluation/evaluation.py | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/samples/python/azure_ai_evaluation/evaluation.py b/samples/python/azure_ai_evaluation/evaluation.py
index cb51e41..ed6a815 100644
--- a/samples/python/azure_ai_evaluation/evaluation.py
+++ b/samples/python/azure_ai_evaluation/evaluation.py
@@ -1,6 +1,8 @@
 """This sample demonstrates how to use Azure AI Foundry SDK to run GitHub model catalog with evaluation.
 It is leveraging your endpoint and key. The call is synchronous.
 
+For those who have Azure credentials, you can run the risk and safety evaluators from Azure AI.
+
 Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
 """
 
@@ -16,10 +18,13 @@
 
 
 token = os.environ['GITHUB_TOKEN']
-inferencing_model_name = "gpt-4o-mini"
-evaluation_model_name = "gpt-4o-mini"
-api_version = "2024-08-01-preview"
-endpoint = "https://models.inference.ai.azure.com"
+
+# Target model is the model to be evaluated.
+target_model_name = "Mistral-small"
+target_model_endpoint = "https://models.inference.ai.azure.com"
+# Judge model is the model to evaluate the target model.
+judge_model_name = "gpt-4o-mini"
+judge_model_endpoint = "https://models.inference.ai.azure.com"
 
 evaluation_name = "GitHub models evaluation"
 eval_data_file = Path("./eval_data.jsonl")
@@ -39,7 +44,7 @@ def generate_eval_data():
     with eval_data_file.open("w") as f:
         for eval_data_query in eval_data_queries:
             client = ChatCompletionsClient(
-                endpoint=endpoint,
+                endpoint=target_model_endpoint,
                 credential=AzureKeyCredential(token),
             )
 
@@ -49,7 +54,7 @@ def generate_eval_data():
                     SystemMessage(content=context),
                     UserMessage(content=eval_data_query["query"]),
                 ],
-                model=inferencing_model_name,
+                model=target_model_name,
                 temperature=1.,
                 max_tokens=1000,
                 top_p=1.
@@ -69,10 +74,9 @@ def generate_eval_data():
 
 def run_perf_and_quality_evaluators():
     model_config = {
-        "azure_endpoint": endpoint,
-        "azure_deployment": evaluation_model_name,
+        "azure_endpoint": judge_model_endpoint,
+        "azure_deployment": judge_model_name,
         "api_key": token,
-        "api_version": api_version,
     }
 
     evaluators = {
@@ -118,6 +122,7 @@ def run_risk_and_safety_evaluators_with_azure():
         "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
     }
 
+    risk_and_safety_result_dict = {}
     with eval_data_file.open("r") as f:
         for line in f:
             eval_data = json.loads(line)
@@ -127,6 +132,10 @@ def run_risk_and_safety_evaluators_with_azure():
                 else:
                     score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
                 print(f"{name}: {score}")
+                risk_and_safety_result_dict[name] = score
+
+    with eval_result_file_risk_and_safety.open("w") as f:
+        f.write(json.dumps(risk_and_safety_result_dict, indent=4))
 
 
 if __name__ == "__main__":
@@ -136,5 +145,5 @@ def run_risk_and_safety_evaluators_with_azure():
     # Run performance and quality evaluators with GitHub model catalog.
     run_perf_and_quality_evaluators()
 
-    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
     # run_risk_and_safety_evaluators_with_azure()
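Reviewer note: a minimal smoke-test sketch for the sample after applying both patches. It is not part of the patch series; it assumes GITHUB_TOKEN is exported, the azure-ai-evaluation and azure-ai-inference packages are installed, and the working directory is samples/python/azure_ai_evaluation/. The file names come from the sample itself; the exact structure of the evaluate() output JSON is an assumption, so the snippet falls back to printing the whole payload.

    # Hypothetical smoke test -- not part of the patches above.
    # Assumes: GITHUB_TOKEN is set, azure-ai-evaluation and azure-ai-inference are installed,
    # and the working directory is samples/python/azure_ai_evaluation/.
    import json
    import subprocess
    from pathlib import Path

    # Runs generate_eval_data() and run_perf_and_quality_evaluators() from evaluation.py.
    subprocess.run(["python", "evaluation.py"], check=True)

    # eval_data.jsonl holds one JSON object per generated query/response pair.
    for line in Path("./eval_data.jsonl").read_text().splitlines():
        row = json.loads(line)
        print(row["query"], "->", row["response"][:80])

    # eval_result_perf_and_quality.json is written by evaluate() via output_path.
    results = json.loads(Path("./eval_result_perf_and_quality.json").read_text())
    print(json.dumps(results.get("metrics", results), indent=2))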
