```python
# data process
import json
import os
import time
from pprint import pprint
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim

# load dataset, get corpus, queries, relevant_docs
with open(os.path.join(project_dir, "data/doc_qa.json"), "r", encoding="utf-8") as f:
    content = json.loads(f.read())

corpus = content['corpus']
queries = content['queries']
relevant_docs = content['relevant_docs']
```
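For reference, `doc_qa.json` is assumed to follow the structure that `InformationRetrievalEvaluator` expects: `queries` and `corpus` map string ids to text, and `relevant_docs` maps each query id to the ids of its relevant passages. A minimal made-up example (ids and texts are purely illustrative):

```python
# Illustrative shape of data/doc_qa.json (not real data from the project)
doc_qa_example = {
    "corpus": {"d1": "向量检索通过比较文本向量的相似度来查找相关文档。"},
    "queries": {"q1": "什么是向量检索？"},
    "relevant_docs": {"q1": ["d1"]},
}
```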
```python
# Load a model
model_name = "bge-base-zh-v1.5"
# replace with the full path to your own model, or use a Hugging Face model id
model_path = os.path.join(project_dir, f"models/{model_name}")
model = SentenceTransformer(model_path, device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded")

s_time = time.time()

# Evaluate the model
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name=os.path.basename(model_path),
    score_functions={"cosine": cos_sim},
)

result = evaluator(model)
pprint(result)
print(f"Time cost: {time.time() - s_time:.2f}s")
```
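With sentence-transformers v3 the evaluator returns a dict of retrieval metrics (accuracy@k, precision@k, recall@k, MRR, NDCG, MAP) keyed by the evaluator name and score function. A small sketch for pulling out the headline metric, assuming that key naming (check the keys printed by `pprint` if your version differs):

```python
# Key naming assumed from sentence-transformers v3 conventions,
# e.g. "bge-base-zh-v1.5_cosine_ndcg@10"
ndcg_key = f"{os.path.basename(model_path)}_cosine_ndcg@10"
print(f"NDCG@10 before fine-tuning: {result[ndcg_key]:.4f}")
```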
The queries are generated from each document chunk with an LLM, using the following prompt template (the questions are required to be in Chinese):

```
Given the context information and not prior knowledge,
generate only questions based on the below query.

You are a Professor. Your task is to set up {num_questions_per_chunk} questions
for an upcoming quiz/examination in Chinese. The questions should be diverse in
nature across the document and written in Chinese. The questions should not
contain options and should not start with Q1/Q2. Restrict the questions to the
context information provided.
```
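The generation step itself is not shown here; below is a minimal sketch of how the template could be filled per chunk and the output collected into `data/doc_qa.json`. The `llm_generate` helper is a hypothetical stand-in for whichever LLM client is actually used, `chunks` is an assumed list of document passages, and the variable name `qa_generate_prompt_tmpl` is ours:

```python
import json
import uuid

qa_generate_prompt_tmpl = "..."  # the prompt template shown above

def llm_generate(prompt: str) -> str:
    """Hypothetical helper: call an LLM of your choice and return its raw text output."""
    raise NotImplementedError

corpus, queries, relevant_docs = {}, {}, {}
for chunk in chunks:  # `chunks`: assumed list of document passages
    doc_id = str(uuid.uuid4())
    corpus[doc_id] = chunk
    prompt = f"{chunk}\n\n" + qa_generate_prompt_tmpl.format(num_questions_per_chunk=2)
    for question in llm_generate(prompt).splitlines():
        question = question.strip()
        if question:
            q_id = str(uuid.uuid4())
            queries[q_id] = question
            relevant_docs[q_id] = [doc_id]

with open("data/doc_qa.json", "w", encoding="utf-8") as f:
    json.dump({"corpus": corpus, "queries": queries, "relevant_docs": relevant_docs},
              f, ensure_ascii=False, indent=2)
```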
With the data prepared, the fine-tuning script `ft_sentence_transformers_trainer.py` looks like this:

```python
# -*- coding: utf-8 -*-
# @file: ft_sentence_transformers_trainer.py
import os
import json
import time
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers import SentenceTransformerTrainer

# Load a model
model_name = 'bge-base-zh-v1.5'  # replace with the full path to your own model, or use a Hugging Face model id
model_path = os.path.join(project_dir, f"models/{model_name}")
model = SentenceTransformer(model_path, device="cuda:0" if torch.cuda.is_available() else "cpu")
print("Model loaded")

# Evaluate the model
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name=model_name,
    score_functions={"cosine": cos_sim},
)

train_loss = MultipleNegativesRankingLoss(model)
```
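The `train_dataset` used below is not constructed in this snippet; a minimal sketch that builds it from the same `queries`/`corpus`/`relevant_docs` dicts, pairing each query (anchor) with its relevant passage (positive):

```python
# Build (anchor, positive) pairs for MultipleNegativesRankingLoss;
# other queries' positives in the same batch serve as in-batch negatives.
anchors, positives = [], []
for q_id, doc_ids in relevant_docs.items():
    for doc_id in doc_ids:
        anchors.append(queries[q_id])
        positives.append(corpus[doc_id])

train_dataset = Dataset.from_dict({"anchor": anchors, "positive": positives})
```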
```python
# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir=f"ft_{model_name}",              # output directory and Hugging Face model ID
    num_train_epochs=5,                         # number of epochs
    per_device_train_batch_size=2,              # train batch size
    gradient_accumulation_steps=2,              # effective train batch size of 4 per device
    per_device_eval_batch_size=4,               # evaluation batch size
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                 # cosine learning rate scheduler
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    tf32=True,                                  # use tf32 precision (requires an Ampere or newer GPU)
    bf16=True,                                  # use bf16 precision (requires an Ampere or newer GPU)
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # avoid duplicate samples in a batch; MultipleNegativesRankingLoss benefits from this
    eval_strategy="epoch",                      # evaluate after each epoch
    save_strategy="epoch",                      # save after each epoch
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # keep only the last 3 checkpoints
    load_best_model_at_end=True,                # load the best model when training ends
    metric_for_best_model=f"eval_{model_name}_cosine_ndcg@10",  # optimize for the best ndcg@10 score
)

# train the model
trainer = SentenceTransformerTrainer(
    model=model,  # the model to train
    args=args,    # training arguments
    train_dataset=train_dataset.select_columns(
        ["positive", "anchor"]
    ),            # training dataset
    loss=train_loss,
    evaluator=evaluator,
)
```
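The snippet stops at constructing the trainer; the remaining steps are to launch training and save the fine-tuned weights. A short sketch (the output path is illustrative):

```python
trainer.train()  # fine-tunes the model, evaluating and checkpointing every epoch per `args`

# Save the final (best) model so it can be reloaded later with SentenceTransformer(...)
model.save(os.path.join(f"ft_{model_name}", "final"))
```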