class EvaluatorType(str, Enum):
    """The types of the evaluators.

    Each member is also a ``str`` (the class mixes in ``str``), so a member
    compares equal to its plain string value and can be looked up from it,
    e.g. ``EvaluatorType("qa") is EvaluatorType.QA``.
    """

    QA = "qa"
    """Question answering evaluator, which grades answers to questions
    directly using an LLM."""
    COT_QA = "cot_qa"
    """Chain of thought question answering evaluator, which grades
    answers to questions using chain of thought 'reasoning'."""
    CONTEXT_QA = "context_qa"
    """Question answering evaluator that incorporates 'context' in the
    response."""
    PAIRWISE_STRING = "pairwise_string"
    """The pairwise string evaluator, which predicts the preferred
    prediction from between two models."""
    SCORE_STRING = "score_string"
    """The scored string evaluator, which gives a score between 1 and 10
    to a prediction."""
    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """The labeled pairwise string evaluator, which predicts the preferred
    prediction from between two models based on a ground truth reference
    label."""
    LABELED_SCORE_STRING = "labeled_score_string"
    """The labeled scored string evaluator, which gives a score between 1
    and 10 to a prediction based on a ground truth reference label."""
    AGENT_TRAJECTORY = "trajectory"
    """The agent trajectory evaluator, which grades the agent's
    intermediate steps."""
    CRITERIA = "criteria"
    """The criteria evaluator, which evaluates a model based on a custom
    set of criteria without any reference labels."""
    LABELED_CRITERIA = "labeled_criteria"
    """The labeled criteria evaluator, which evaluates a model based on a
    custom set of criteria, with a reference label."""
    STRING_DISTANCE = "string_distance"
    """Compare predictions to a reference answer using string edit
    distances."""
    EXACT_MATCH = "exact_match"
    """Compare predictions to a reference answer using exact matching."""
    REGEX_MATCH = "regex_match"
    """Compare predictions to a reference answer using regular
    expressions."""
    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
    """Compare predictions based on string edit distances."""
    EMBEDDING_DISTANCE = "embedding_distance"
    """Compare a prediction to a reference label using embedding
    distance."""
    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
    """Compare two predictions using embedding distance."""
    JSON_VALIDITY = "json_validity"
    """Check if a prediction is valid JSON."""
    JSON_EQUALITY = "json_equality"
    """Check if a prediction is equal to a reference JSON."""
    JSON_EDIT_DISTANCE = "json_edit_distance"
    """Compute the edit distance between two JSON strings after
    canonicalization."""
    JSON_SCHEMA_VALIDATION = "json_schema_validation"
    """Check if a prediction is valid JSON according to a JSON schema."""
# Grade one prediction against the question it is supposed to answer.
# NOTE(review): `evaluator` and `pprint` are not defined in this excerpt;
# presumably they are set up earlier in the document -- confirm.
eval_result = evaluator.evaluate_strings(
    prediction=(
        "What's 2+2? That's an elementary question. "
        "The answer you're looking for is that two and two is four."
    ),
    input="What's 2+2?",
)
# Show the evaluator's result in a readable, wrapped form.
pprint(eval_result)
输出结果如下:
{'reasoning': 'The criterion is conciseness, which means the submission should '
              'be brief and to the point. \n'
              '\n'
              'Looking at the submission, the answer to the question "What\'s '
              '2+2?" is given as "The answer you\'re looking for is that two '
              'and two is four." However, before providing the answer, the '
              'respondent adds an unnecessary comment:"That\'s an elementary '
              'question." \n'
              '\n'
              'This additional comment does not contribute to answering the '
              'question and therefore makes the response less concise. \n'
              '\n'
              'So, based on the criterion of conciseness, the submission does '
              'not meet the criterion.\n'
              '\n'
              'N',
 'score': 0,
 'value': 'N'}
# Grade a prediction against a ground-truth reference answer.
# NOTE(review): `evaluator` and `pprint` are not defined in this excerpt;
# presumably `evaluator` is a reference-aware (labeled) evaluator created
# earlier in the document -- confirm.
eval_result = evaluator.evaluate_strings(
    input="What is the capital city of the Jiangsu Province?",
    prediction="Suzhou",
    reference="The capital city of Jiangsu Province is Nanjing.",
)
# Show the evaluator's result in a readable, wrapped form.
pprint(eval_result)
{'reasoning': 'The criterion for this task is the correctness of the submitted '
              'answer. This means the answer should be accurate and factual.\n'
              '\n'
              'The input asks for the capital city of the Jiangsu Province.\n'
              '\n'
              'The submitted answer is Suzhou.\n'
              '\n'
              'The reference answer, however, states that the capital city of '
              'Jiangsu Province is Nanjing.\n'
              '\n'
              'Comparing the submitted answer with the reference answer, it is '
              'clear that the submitted answer is incorrect. Suzhou is not the '
              'capital of Jiangsu Province, Nanjing is.\n'
              '\n'
              'Therefore, the submission does not meet the criterion of '
              'correctness.\n'
              '\n'
              'N',
 'score': 0,
 'value': 'N'}