import argparse import random import time import json import requests import pandas as pd from datasets import load_dataset import os os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' os.environ['https_proxy'] = '' os.environ['http_proxy'] = '' hint = 'There is a single choice question. Answer the question by replying A, B, C, D, E, F, G, H, I, J. No other answers are accepted. Just the letter.' class DataEvaluator: def __init__(self): # self.template_prompt = template_prompt self.data = [] def load_data(self, file_path): """ Load data from a Parquet file into a list. Each record in the Parquet file should represent an individual record. """ # 读取 Parquet 文件 # dataset = load_dataset('parquet', data_files=file_path) ds = load_dataset("TIGER-Lab/MMLU-Pro") df = pd.DataFrame(ds['test']) # print(ds) # # ds_1 = ds['train'] # ds_2 = ds['validation'] # ds_3 = ds['test'] # # 将数据集转换为 Pandas DataFrame # df_test = pd.DataFrame(ds['test']) # df_val = pd.DataFrame(ds['validation']) # for _, row in df.iterrows(): # self.data.append(row.to_dict()) # df = pd.read_parquet(file_path) for _, row in df.iterrows(): self.data.append(row.to_dict()) def get_prompt(self, record): """ Combine fields from a record with the template prompt to create a full prompt. :param record: Dictionary containing fields to populate the template. :return: A formatted prompt string. """ # 查看ABCD。。。的选项 options_str = "\n".join([f"{chr(65+i)}. {opt}" for i, opt in enumerate(record['options'])]) prompt = hint + "\nQuestion: " + record['question'] + "\n" + options_str + "\nAnswer: '" return prompt def post_processing(self, text): """ Perform post-processing on the prediction string. :param text: The raw prediction string. :return: Processed prediction string. """ text = text.lstrip('\n').split('\n')[0] return text[:1] def score(self, pred, answers): """ Calculate scores between the prediction and the answer. Uses ROUGE scores as the evaluation metric. :param pred: The predicted string. :param answer: The reference answer string. :return: A dictionary containing ROUGE scores. """ for answer in answers: if pred == answer: return 1 return 0 # Function to generate text using API def generate_text(api_url, question, model_name, stream=False): headers = { 'accept': 'application/json', 'Content-Type': 'application/json', # 添加 API Key 'Authorization' : 'Bearer ' } data = { "messages": [{"content": question, "role": "user"}], "model": model_name, "stream": stream, # "temperature": 0.0 } print("POST data:", data) response = requests.post(api_url, headers=headers, json=data) if response.status_code == 200: result = response.json() return result.get('choices', [{}])[0].get('message', {}).get('content', '').strip() else: print(f"API Request failed with status code {response.status_code}") return None # Main function to handle multiple evaluations def main(concurrent_requests, data_evaluator: DataEvaluator, result_file, log_file, api_url, model_name): start_total_time = time.time() total_score = 0 results = [] # 设置随机数种子 random.seed(42) random.shuffle(data_evaluator.data) for i in range(min(concurrent_requests, len(data_evaluator.data))): # Randomly select a data item from data for each request data_item = data_evaluator.data[i] question = data_evaluator.get_prompt(data_item) # print(question) # Start the timer for this evaluation start_time = time.time() try: # Generate prediction using the API prediction = generate_text(api_url, question, model_name) if prediction is None: raise Exception(f"Failed to get prediction for {question}") answer = data_item['answer'] # Compute score score = data_evaluator.score(data_evaluator.post_processing(prediction), answer) # Calculate the time taken elapsed_time = time.time() - start_time # Collect the result data result_data = { "question_id": data_item['question_id'], "answer": answer, "prediction": data_evaluator.post_processing(prediction), "score": score, "time": elapsed_time } # Write results to result.json with each field on a new line with open(result_file, 'a', encoding='utf-8') as f: json.dump(result_data, f, ensure_ascii=False, indent=4) f.write("\n") # Ensure each JSON object is on a new line results.append(result_data) # Aggregate scores total_score += score except Exception as e: print(f"Error processing request {i}: {e}") # Calculate total time and throughput total_time = time.time() - start_total_time throughput = concurrent_requests / total_time # Log the total time, throughput, and average ROUGE scores with open(log_file, 'a', encoding='utf-8') as log_f: log_f.write(f"Total Time: {total_time:.2f} seconds\n") log_f.write(f"Throughput: {throughput:.2f} requests per second\n") log_f.write(f"Average Scores: {total_score / concurrent_requests}\n") log_f.write('-' * 40 + '\n') print(f"Results saved to {result_file}") print(f"Log saved to {log_file}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="API Generate Tester") parser.add_argument("--concurrent", type=int, default=1000, help="Number of concurrent evaluations") parser.add_argument("--file", type=str, default="TIGER-Lab/MMLU-Pro", help="Path to the mmlu.jsonl file") parser.add_argument("--result", type=str, default="./mmlu_pro.json", help="Path to save the result JSON file") parser.add_argument("--log", type=str, default="./mmlu_pro.log", help="Path to save the log file") parser.add_argument("--model", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model name or path") parser.add_argument("--api_url", type=str, default="http://localhost:15488/v1/chat/completions", help="API URL") # parser.add_argument("--api_url", type=str, default="https://api.siliconflow.cn/v1/chat/completions", help="API URL") args = parser.parse_args() # Load the data from the provided file # template_prompt = hint + "\nQuestion: {question}\nA. {options}\nB. {option_b}\nC. {option_c}\nD. {option_d}\nAnswer: '" # template_prompt_pro = hint + "\nQuestion: {question}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nF. {options[5]}\nG. \ # {options[6]}\nH. {options[7]}\nI. {options[8]}\nJ. {options[9]}\nAnswer: '" # Load the data from the provided file data_evaluator = DataEvaluator() data_evaluator.load_data(args.file) # Run the main function with the specified number of concurrent evaluations main(args.concurrent, data_evaluator, args.result, args.log, args.api_url, args.model)