Commit 1480f19

Merge pull request #18 from YusakuNo1/azure_ai_evaluation
Introduce Azure AI Evaluation SDK
2 parents 62261f9 + fc35b75 commit 1480f19

File tree

1 file changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
"""This sample demonstrates how to use the Azure AI Foundry SDK to run models from the GitHub model catalog
and evaluate them with the Azure AI Evaluation SDK. It uses your endpoint and key. The calls are synchronous.

If you have Azure credentials, you can also run the risk and safety evaluators from Azure AI.

Azure AI Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
"""

import os
import json
from pathlib import Path
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.ai import evaluation
from azure.ai.evaluation import RougeType, evaluate
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential


# A GitHub token authenticates calls to the GitHub model catalog endpoint.
token = os.environ["GITHUB_TOKEN"]

# Target model is the model to be evaluated.
target_model_name = "Mistral-small"
target_model_endpoint = "https://models.inference.ai.azure.com"
# Judge model is the model used to evaluate the target model's responses.
judge_model_name = "gpt-4o-mini"
judge_model_endpoint = "https://models.inference.ai.azure.com"

evaluation_name = "GitHub models evaluation"
eval_data_file = Path("./eval_data.jsonl")
eval_result_file_perf_and_quality = Path("./eval_result_perf_and_quality.json")
eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")


def generate_eval_data():
    """Query the target model and write one JSON line per query to eval_data_file."""
    eval_data_queries = [{
        "query": "What is the capital of France?",
        "ground_truth": "Paris",
    }, {
        "query": "Where is Wineglass Bay?",
        "ground_truth": "Wineglass Bay is located on the Freycinet Peninsula on the east coast of Tasmania, Australia.",
    }]

    client = ChatCompletionsClient(
        endpoint=target_model_endpoint,
        credential=AzureKeyCredential(token),
    )

    with eval_data_file.open("w") as f:
        for idx, eval_data_query in enumerate(eval_data_queries, start=1):
            context = "You are a geography teacher."
            response = client.complete(
                messages=[
                    SystemMessage(content=context),
                    UserMessage(content=eval_data_query["query"]),
                ],
                model=target_model_name,
                temperature=1.0,
                max_tokens=1000,
                top_p=1.0,
            )
            result = response.choices[0].message.content

            eval_data = {
                "id": str(idx),
                "description": "Evaluate the model",
                "query": eval_data_query["query"],
                "context": context,
                "response": result,
                "ground_truth": eval_data_query["ground_truth"],
            }
            f.write(json.dumps(eval_data) + "\n")


def run_perf_and_quality_evaluators():
    """Run the performance and quality evaluators against the generated data."""
    # The judge model is configured like an Azure OpenAI deployment; the GitHub
    # model catalog endpoint and token are passed through as endpoint and key.
    model_config = {
        "azure_endpoint": judge_model_endpoint,
        "azure_deployment": judge_model_name,
        "api_key": token,
    }

    # The first five evaluators are computed from the text alone; the rest are
    # LLM-assisted and use the judge model via model_config.
    evaluators = {
        "BleuScoreEvaluator": evaluation.BleuScoreEvaluator(),
        "F1ScoreEvaluator": evaluation.F1ScoreEvaluator(),
        "GleuScoreEvaluator": evaluation.GleuScoreEvaluator(),
        "MeteorScoreEvaluator": evaluation.MeteorScoreEvaluator(),
        "RougeScoreEvaluator": evaluation.RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
        "CoherenceEvaluator": evaluation.CoherenceEvaluator(model_config=model_config),
        "FluencyEvaluator": evaluation.FluencyEvaluator(model_config=model_config),
        "GroundednessEvaluator": evaluation.GroundednessEvaluator(model_config=model_config),
        "QAEvaluator": evaluation.QAEvaluator(model_config=model_config, _parallel=False),
        "RelevanceEvaluator": evaluation.RelevanceEvaluator(model_config=model_config),
        "RetrievalEvaluator": evaluation.RetrievalEvaluator(model_config=model_config),
        "SimilarityEvaluator": evaluation.SimilarityEvaluator(model_config=model_config),
    }

    eval_results = evaluate(
        data=eval_data_file,
        evaluators=evaluators,
        evaluation_name=evaluation_name,
        target=None,
        output_path=eval_result_file_perf_and_quality,
    )
    print(json.dumps(eval_results, indent=4))


def run_risk_and_safety_evaluators_with_azure():
    """Run the Azure AI risk and safety evaluators; requires an Azure AI project."""
    azure_ai_project = {
        "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
        "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP_NAME"),
        "project_name": os.environ.get("AZURE_PROJECT_NAME"),
    }
    credential = DefaultAzureCredential()
    evaluators = {
        "ContentSafetyEvaluator": evaluation.ContentSafetyEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "HateUnfairnessEvaluator": evaluation.HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "SelfHarmEvaluator": evaluation.SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "SexualEvaluator": evaluation.SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "ViolenceEvaluator": evaluation.ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "ProtectedMaterialEvaluator": evaluation.ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "IndirectAttackEvaluator": evaluation.IndirectAttackEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
    }

    # Collect scores per data record so earlier records are not overwritten.
    risk_and_safety_results = {}
    with eval_data_file.open("r") as f:
        for line in f:
            eval_data = json.loads(line)
            record_results = {}
            for name, evaluator in evaluators.items():
                if name != "GroundednessProEvaluator":
                    score = evaluator(query=eval_data["query"], response=eval_data["response"])
                else:
                    # GroundednessProEvaluator also needs the context the response was grounded in.
                    score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
                print(f"{name}: {score}")
                record_results[name] = score
            risk_and_safety_results[eval_data["id"]] = record_results

    with eval_result_file_risk_and_safety.open("w") as f:
        f.write(json.dumps(risk_and_safety_results, indent=4))


if __name__ == "__main__":
    # Generate evaluation data with the GitHub model catalog and save it to a file.
    generate_eval_data()

    # Run the performance and quality evaluators with the GitHub model catalog.
    run_perf_and_quality_evaluators()

    # If you have Azure credentials, uncomment the following line to run the
    # risk and safety evaluators from Azure AI.
    # run_risk_and_safety_evaluators_with_azure()
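Each evaluator passed to evaluate() above is also an ordinary callable, so a single row of eval_data.jsonl can be spot-checked before launching a full run. Below is a minimal sketch using only the math-based metrics (no judge model or Azure project required); the exact keys of the returned score dictionaries may vary between azure-ai-evaluation releases.

# Minimal sketch: spot-check the first generated record with the math-based evaluators.
# Assumes eval_data.jsonl was produced by generate_eval_data() in the sample above.
import json
from pathlib import Path

from azure.ai.evaluation import BleuScoreEvaluator, F1ScoreEvaluator

record = json.loads(Path("./eval_data.jsonl").read_text().splitlines()[0])

bleu = BleuScoreEvaluator()
f1 = F1ScoreEvaluator()

# Each evaluator takes the model response and ground truth for one record and
# returns a dictionary of scores (e.g. a BLEU score and an F1 score).
print(bleu(response=record["response"], ground_truth=record["ground_truth"]))
print(f1(response=record["response"], ground_truth=record["ground_truth"]))

The evaluate() call in run_perf_and_quality_evaluators applies the same callables to every row, aggregates the scores, and, because output_path is set, also writes the result to eval_result_perf_and_quality.json.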
