"""This sample demonstrates how to use the Azure AI Foundry SDK to evaluate models from the GitHub model catalog.

It uses your endpoint and key. The calls are synchronous.

+If you have Azure credentials, you can also run the risk and safety evaluators from Azure AI.
+
Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
"""
@@ -16,10 +18,13 @@


token = os.environ['GITHUB_TOKEN']
-inferencing_model_name = "gpt-4o-mini"
-evaluation_model_name = "gpt-4o-mini"
-api_version = "2024-08-01-preview"
-endpoint = "https://models.inference.ai.azure.com"
+
+# Target model is the model to be evaluated.
+target_model_name = "Mistral-small"
+target_model_endpoint = "https://models.inference.ai.azure.com"
+# Judge model is the model to evaluate the target model.
+judge_model_name = "gpt-4o-mini"
+judge_model_endpoint = "https://models.inference.ai.azure.com"

evaluation_name = "GitHub models evaluation"
eval_data_file = Path("./eval_data.jsonl")
@@ -39,7 +44,7 @@ def generate_eval_data():
    with eval_data_file.open("w") as f:
        for eval_data_query in eval_data_queries:
            client = ChatCompletionsClient(
-                endpoint=endpoint,
+                endpoint=target_model_endpoint,
                credential=AzureKeyCredential(token),
            )
@@ -49,7 +54,7 @@ def generate_eval_data():
                    SystemMessage(content=context),
                    UserMessage(content=eval_data_query["query"]),
                ],
-                model=inferencing_model_name,
+                model=target_model_name,
                temperature=1.,
                max_tokens=1000,
                top_p=1.
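
The hunk above ends inside the `client.complete(...)` call. For orientation, here is a minimal sketch of how the surrounding loop plausibly turns each response into a JSONL record; the field names `query`, `response`, and `context` are inferred from the evaluator calls later in this diff, and the exact record shape is an assumption, not part of the commit.

# Sketch (assumed, not from this commit): write one JSON line per query, with
# the fields the evaluators read back later (query / response / context).
response = client.complete(
    messages=[
        SystemMessage(content=context),
        UserMessage(content=eval_data_query["query"]),
    ],
    model=target_model_name,
    temperature=1.,
    max_tokens=1000,
    top_p=1.,
)
record = {
    "query": eval_data_query["query"],
    "context": context,
    "response": response.choices[0].message.content,
}
f.write(json.dumps(record) + "\n")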
@@ -69,10 +74,9 @@ def generate_eval_data():

def run_perf_and_quality_evaluators():
    model_config = {
-        "azure_endpoint": endpoint,
-        "azure_deployment": evaluation_model_name,
+        "azure_endpoint": judge_model_endpoint,
+        "azure_deployment": judge_model_name,
        "api_key": token,
-        "api_version": api_version,
    }

    evaluators = {
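
A minimal sketch of how a `model_config` like the one above is consumed by a quality evaluator from the azure-ai-evaluation package; the evaluator choice and sample inputs here are illustrative assumptions, not taken from the commit.

from azure.ai.evaluation import RelevanceEvaluator

# The judge model named in model_config scores the target model's output.
relevance_evaluator = RelevanceEvaluator(model_config=model_config)
score = relevance_evaluator(
    query="What is the capital of France?",      # illustrative input
    response="Paris is the capital of France.",  # illustrative output
)
print(score)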
@@ -118,6 +122,7 @@ def run_risk_and_safety_evaluators_with_azure():
        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
    }

+    risk_and_safety_result_dict = {}
    with eval_data_file.open("r") as f:
        for line in f:
            eval_data = json.loads(line)
@@ -127,6 +132,10 @@ def run_risk_and_safety_evaluators_with_azure():
            else:
                score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
            print(f"{name}: {score}")
+            risk_and_safety_result_dict[name] = score
+
+    with eval_result_file_risk_and_safety.open("w") as f:
+        f.write(json.dumps(risk_and_safety_result_dict, indent=4))


if __name__ == "__main__":
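
Note that `eval_result_file_risk_and_safety` is referenced in the added lines but defined outside the hunks shown here; presumably it mirrors `eval_data_file` near the top of the file, along the lines of this assumed sketch:

from pathlib import Path

# Assumed definition (not visible in this diff): where the risk and safety
# scores collected above are written as pretty-printed JSON.
eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")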
@@ -136,5 +145,5 @@ def run_risk_and_safety_evaluators_with_azure():
    # Run performance and quality evaluators with GitHub model catalog.
    run_perf_and_quality_evaluators()

-    # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
+    # # Uncomment the following code with Azure credentials, then we can run the risk and safety evaluators from Azure AI.
    # run_risk_and_safety_evaluators_with_azure()
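
For readers who do uncomment `run_risk_and_safety_evaluators_with_azure()`: the `azure_ai_project` value it relies on (also defined outside the hunks shown) follows the shape the Azure Evaluation SDK documents for an Azure AI project reference; the placeholder values below are assumptions.

# Assumed shape of the Azure AI project reference used by the risk and safety
# evaluators; fill in your own identifiers.
azure_ai_project = {
    "subscription_id": "<your-subscription-id>",
    "resource_group_name": "<your-resource-group>",
    "project_name": "<your-ai-foundry-project>",
}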