from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
model_id = "./models/Qwen1.5-7B-Chat"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

message = [{"role": "user", "content": "How many subway lines does Shenyang have?"}]
conversion = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
print(conversion)  # Inspect the templated prompt.
encoding = tokenizer(conversion, return_tensors="pt")

# skip_prompt=True makes the streamer yield only newly generated text,
# so the prompt does not have to be stripped from each chunk by hand.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
generation_kwargs = dict(encoding, streamer=streamer, max_new_tokens=100, do_sample=True, temperature=0.2)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# Consume chunks as they are produced; accumulate them into the full reply.
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(new_text, end="", flush=True)
# -*- coding: utf-8 -*-
from threading import Thread

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
model_id = "./models/Qwen1.5-7B-Chat"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
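The Gradio script breaks off after loading the model. Below is a minimal sketch of one way it could continue, wiring the same threaded-streamer loop into a chat UI with gr.ChatInterface. The function name chat, the pair-format history handling, and the ChatInterface wiring are assumptions for illustration, not the original author's code; the generation parameters mirror the first script.

# A minimal sketch (not the original demo): stream tokens into a Gradio chat UI.
# The name `chat` and the pair-format `history` handling are assumptions.
def chat(message, history):
    # `history` arrives as [user, assistant] pairs from previous turns;
    # rebuild the full conversation so the model sees the whole dialogue.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    encoding = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Same non-blocking pattern as above: generate in a worker thread.
    generation_kwargs = dict(encoding, streamer=streamer, max_new_tokens=100, do_sample=True, temperature=0.2)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the growing reply; gr.ChatInterface renders each yield as a partial update.
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial


demo = gr.ChatInterface(fn=chat, title="Qwen1.5-7B-Chat")
demo.launch()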