Implementing a Stone-Picking Game with DeepSeek: A Deep Dive into the MCTS Algorithm

This article shows how to use the DeepSeek R1 reasoning model to implement a stone-picking game based on the MCTS algorithm, as a hands-on way to experience what R1 can do!

In the earlier article 《迈向智能决策:蒙特卡洛树搜索(MCTS)的探索之路》, I introduced the basic concepts of Monte Carlo Tree Search (MCTS), how the algorithm works, and how to use MCTS to implement a TicTacToe game.

Recently, the DeepSeek models have taken the internet by storm, setting off a worldwide "DeepSeek" wave. So in this post I will use the DeepSeek R1 model deployed in-house at my company to implement a stone-picking game based on MCTS (Monte Carlo Tree Search).

I only provided the game rules and my already-implemented MCTS class as input, then asked DeepSeek to write the rest of the code. After a few rounds of debugging, DeepSeek completed the task successfully.

Preparation

First, let me introduce the stone-picking game.

The rules of the stone-picking game are as follows:

  • There is a pile of N stones, and two players take turns removing stones.
  • The players decide who moves first, then alternate turns, taking 1 or 2 stones per turn.
  • The player who takes the last stone wins.

Whether a player can force a win depends on whether N is a multiple of 3.

  • If N is a multiple of 3, the second player has the advantage and a guaranteed winning strategy: whenever the first player takes 1 stone, the second player takes 2, and whenever the first player takes 2, the second player takes 1. As long as the second player keeps the remaining count a multiple of 3, the second player is guaranteed to win.
  • If N is not a multiple of 3, the first player has the advantage and a guaranteed winning strategy: first take 1 or 2 stones so that the remaining count is a multiple of 3, then follow the same pattern as above; the first player is guaranteed to win. (A minimal code sketch of this strategy follows right after this list.)
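
For reference, here is a minimal sketch of that closed-form strategy in Python (the `optimal_move` helper is my own illustration and is not part of the code discussed below); a well-run MCTS player should converge to the same choices without ever being told this rule:

```python
def optimal_move(stones_remaining: int) -> int:
    """Number of stones (1 or 2) to take so that the opponent
    is left with a multiple of 3 (the closed-form winning strategy)."""
    remainder = stones_remaining % 3
    if remainder == 0:
        # Already a multiple of 3: no winning move exists, so take 1
        # and hope the opponent makes a mistake.
        return 1
    return remainder  # take 1 or 2 to leave a multiple of 3


# With 10 stones, the first player takes 1, leaving 9 = 3 * 3.
assert optimal_move(10) == 1
assert optimal_move(8) == 2
```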

To keep this post self-contained, the MCTS class I implemented earlier is reproduced here:

```python
# -*- coding: utf-8 -*-

from dataclasses import dataclass, field
from collections import defaultdict
import math
from typing import Dict, Set, List, Optional
from abc import ABC, abstractmethod


class Node(ABC):
    """Abstract base class for game state nodes."""

    @abstractmethod
    def find_children(self) -> Set["Node"]:
        """Return all possible child nodes."""

    @abstractmethod
    def get_random_child(self) -> Optional["Node"]:
        """Return a random child node."""

    @abstractmethod
    def is_terminal(self) -> bool:
        """Check if the node is a terminal state."""

    @abstractmethod
    def reward(self) -> float:
        """Return the reward value for a terminal node."""

    @abstractmethod
    def __hash__(self) -> int:
        """Define a hash function for the node."""

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        """Define equality comparison for nodes."""


@dataclass
class MCTS:
    """Monte Carlo Tree Search implementation."""

    exploration_weight: float = 1.0
    total_rewards: Dict[Node, float] = field(default_factory=lambda: defaultdict(float))
    visit_counts: Dict[Node, int] = field(default_factory=lambda: defaultdict(int))
    children: Dict[Node, Set[Node]] = field(default_factory=lambda: defaultdict(set))

    def select_best_move(self, node: Node) -> Node:
        """Select the best move from the current node."""
        if node.is_terminal():
            raise ValueError(f"Cannot select move from terminal node: {node}")

        if node not in self.children:
            return node.get_random_child()

        return max(self.children[node], key=self._calculate_node_score)

    def simulate(self, node: Node) -> None:
        """Perform one iteration of the MCTS algorithm."""
        path = self._traverse_tree(node)
        leaf = path[-1]
        self._expand_node(leaf)
        reward = self._simulate_random_playout(leaf)
        self._backpropagate(path, reward)

    def _traverse_tree(self, node: Node) -> List[Node]:
        """Traverse the tree to find an unexplored node."""
        path = []
        while True:
            path.append(node)
            if node not in self.children or not self.children[node]:
                return path
            unexplored = self.children[node] - set(self.children.keys())
            if unexplored:
                path.append(unexplored.pop())
                return path
            node = self._select_uct(node)

    def _expand_node(self, node: Node) -> None:
        """Expand the node by adding its children to the tree."""
        if node in self.children:
            return
        self.children[node] = node.find_children()

    def _simulate_random_playout(self, node: Node) -> float:
        """Simulate a random playout from the given node."""
        invert_reward = True
        while True:
            if node.is_terminal():
                reward = node.reward()
                return 1 - reward if invert_reward else reward
            node = node.get_random_child()
            invert_reward = not invert_reward

    def _backpropagate(self, path: List[Node], reward: float) -> None:
        """Update the statistics for the nodes in the path."""
        for node in reversed(path):
            self.visit_counts[node] += 1
            self.total_rewards[node] += reward
            reward = 1 - reward

    def _select_uct(self, node: Node) -> Node:
        """Select a child node using the UCT formula."""
        assert all(n in self.children for n in self.children[node])
        log_n_parent = math.log(self.visit_counts[node])

        def uct(n: Node) -> float:
            return (
                self.total_rewards[n] / self.visit_counts[n]
                + self.exploration_weight * math.sqrt(log_n_parent / self.visit_counts[n])
            )

        return max(self.children[node], key=uct)

    def _calculate_node_score(self, node: Node) -> float:
        """Calculate the score for a node based on its average reward."""
        if self.visit_counts[node] == 0:
            return float("-inf")
        return self.total_rewards[node] / self.visit_counts[node]
```
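
The heart of the search above is `_select_uct`, which trades off exploitation against exploration using the standard UCT score (stated here for clarity; it is exactly what the code computes):

$$
\mathrm{UCT}(n) = \frac{Q(n)}{N(n)} + c\,\sqrt{\frac{\ln N(\mathrm{parent})}{N(n)}}
$$

where $Q(n)$ is `total_rewards[n]`, $N(n)$ is `visit_counts[n]`, and $c$ is `exploration_weight`. Rarely visited children receive a large exploration bonus and get tried; frequently visited children are judged mainly by their average reward. Note that `select_best_move` deliberately drops the exploration term and picks the child with the highest average reward, which is the conventional choice for the final move.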

The Stone-Picking Game

Let's see whether DeepSeek, without being told the winning strategy, can implement the MCTS algorithm, and whether the algorithm can learn the winning strategy on its own.

Below is the MCTS-based stone-picking game implemented by the DeepSeek R1 reasoning model. The AI moves first and the player moves second. The complete code is as follows:

```python
# -*- coding: utf-8 -*-
# @file: picking_stone.py

"""
Picking Stone Game
There are N stones in total. Two players take turns removing stones,
taking 1-2 per turn; whoever takes the last stone wins.
This module implements an AI player that uses MCTS to choose its moves.
X is the AI, O is the human player.
"""
import random
from typing import Set, Optional, Any

from mcts import Node, MCTS


# Game state for the stone-picking game
class StoneGameState(Node):
    def __init__(self, stones_remaining: int, current_player: str):
        self.stones_remaining = stones_remaining
        self.current_player = current_player  # "X" or "O"

    @property
    def opponent(self) -> str:
        return "O" if self.current_player == "X" else "X"

    def find_children(self) -> Set["StoneGameState"]:
        children = set()
        if self.stones_remaining >= 1:
            children.add(StoneGameState(self.stones_remaining - 1, self.opponent))
        if self.stones_remaining >= 2:
            children.add(StoneGameState(self.stones_remaining - 2, self.opponent))
        return children

    def get_random_child(self) -> Optional["StoneGameState"]:
        possible_moves = []
        if self.stones_remaining >= 1:
            possible_moves.append(1)
        if self.stones_remaining >= 2:
            possible_moves.append(2)
        if not possible_moves:
            return None
        move = random.choice(possible_moves)
        return StoneGameState(self.stones_remaining - move, self.opponent)

    def is_terminal(self) -> bool:
        return self.stones_remaining == 0

    def reward(self) -> float:
        if not self.is_terminal():
            raise RuntimeError("reward() called on non-terminal node")
        # The player to move has no stones left to take: the opponent took
        # the last stone, so this state is a loss (reward 0) for that player.
        return 0.0

    def __hash__(self) -> int:
        return hash((self.stones_remaining, self.current_player))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, StoneGameState):
            return False
        return (
            self.stones_remaining == other.stones_remaining
            and self.current_player == other.current_player
        )

    def __repr__(self) -> str:
        return f"Stones: {self.stones_remaining}, Player: {self.current_player}"


# Main game loop
def main():
    # Initial state: N stones, X (the AI) moves first
    stone_num = 10
    initial_state = StoneGameState(stones_remaining=stone_num, current_player="X")
    mcts = MCTS()

    state = initial_state
    while not state.is_terminal():
        print(f"\nStones remaining: {state.stones_remaining}")

        # AI (X) turn
        if state.current_player == "X":
            # Run 1000 MCTS simulations
            for _ in range(1000):
                mcts.simulate(state)
            best_move = mcts.select_best_move(state)
            # Number of stones the AI took
            ai_move = state.stones_remaining - best_move.stones_remaining
            print(f"AI (X) took {ai_move} stone(s)")
            state = best_move  # switch to O's turn

        # Player (O) turn
        else:
            while True:
                try:
                    move = int(input("How many stones will you take (1 or 2): "))
                    if move not in (1, 2):
                        print("You can only take 1 or 2 stones!")
                        continue
                    if move > state.stones_remaining:
                        print(f"Not enough stones; only {state.stones_remaining} left!")
                        continue
                    break
                except ValueError:
                    print("Please enter a valid number!")

            # Update the stone count and switch back to the AI
            new_stones = state.stones_remaining - move
            state = StoneGameState(stones_remaining=new_stones, current_player="X")

    # Game over: determine the winner
    print("\n===== Game over =====")
    print(f"Stones remaining: {state.stones_remaining}")
    # The side that can no longer move loses, so the other side wins
    winner = "O" if state.current_player == "X" else "X"
    print(f"Winner: {winner}")


if __name__ == "__main__":
    main()
```

Running the script produces the following results:

  • First run

9 stones, AI first, player second (the player follows the winning strategy; when the count is a multiple of 3, the second player is guaranteed to win).

Output:

```
Stones remaining: 9
AI (X) took 1 stone(s)

Stones remaining: 8
How many stones will you take (1 or 2): 2

Stones remaining: 6
AI (X) took 1 stone(s)

Stones remaining: 5
How many stones will you take (1 or 2): 2

Stones remaining: 3
AI (X) took 1 stone(s)

Stones remaining: 2
How many stones will you take (1 or 2): 2

===== Game over =====
Stones remaining: 0
Winner: O
```

  • Second run

9 stones, AI first, player second (the player moves randomly instead of following the winning strategy). In this case the AI usually wins.

Output:

```
Stones remaining: 9
AI (X) took 1 stone(s)

Stones remaining: 8
How many stones will you take (1 or 2): 1

Stones remaining: 7
AI (X) took 1 stone(s)

Stones remaining: 6
How many stones will you take (1 or 2): 1

Stones remaining: 5
AI (X) took 2 stone(s)

Stones remaining: 3
How many stones will you take (1 or 2): 1

Stones remaining: 2
AI (X) took 2 stone(s)

===== Game over =====
Stones remaining: 0
Winner: X
```

  • Third run

10 stones, AI first, player second (when the number of stones is not a multiple of 3, the first player is guaranteed to win).

Output of the third run:

```
Stones remaining: 10
AI (X) took 1 stone(s)

Stones remaining: 9
How many stones will you take (1 or 2): 2

Stones remaining: 7
AI (X) took 1 stone(s)

Stones remaining: 6
How many stones will you take (1 or 2): 1

Stones remaining: 5
AI (X) took 2 stone(s)

Stones remaining: 3
How many stones will you take (1 or 2): 2

Stones remaining: 1
AI (X) took 1 stone(s)

===== Game over =====
Stones remaining: 0
Winner: X
```

As these runs show, the code DeepSeek wrote successfully implements the MCTS algorithm, and the AI plays the winning strategy. This was the moment I truly felt the power of the DeepSeek R1 reasoning model!
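
Three manual games are anecdotal, of course. A quick way to stress-test the conclusion is to pit the MCTS player against a purely random opponent over many games and measure the win rate. Here is a minimal harness I sketched for this (the `play_one_game` function is my own illustration; it assumes the script above is saved as picking_stone.py so that `StoneGameState` can be imported):

```python
import random

from mcts import MCTS
from picking_stone import StoneGameState


def play_one_game(stone_num: int = 10, n_simulations: int = 1000) -> str:
    """Play one game: MCTS as X (moves first), a random player as O.

    Returns the winner, "X" or "O".
    """
    state = StoneGameState(stones_remaining=stone_num, current_player="X")
    mcts = MCTS()
    while not state.is_terminal():
        if state.current_player == "X":
            for _ in range(n_simulations):
                mcts.simulate(state)
            state = mcts.select_best_move(state)
        else:
            # Random opponent: take 1 or 2 stones, whatever is available
            move = random.choice([m for m in (1, 2) if m <= state.stones_remaining])
            state = StoneGameState(state.stones_remaining - move, "X")
    # The side to move at the terminal state lost (the other side took the last stone)
    return "O" if state.current_player == "X" else "X"


if __name__ == "__main__":
    wins = sum(play_one_game() == "X" for _ in range(100))
    print(f"MCTS (X) won {wins}/100 games against a random opponent")
```

Since 10 is not a multiple of 3, a perfect first player never loses, so the measured win rate should be at or near 100%.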

Next, let's have the DeepSeek R1 model implement a Gradio version of the stone-picking game. The code is as follows:

```python
# -*- coding: utf-8 -*-
# @file: picking_stone_gradio.py

import random
import gradio as gr
from mcts import MCTS, Node
from typing import Set, Optional, Any


class StoneGameState(Node):
    def __init__(self, stones_remaining: int, current_player: str):
        self.stones_remaining = stones_remaining
        self.current_player = current_player  # "X" or "O"

    @property
    def opponent(self) -> str:
        return "O" if self.current_player == "X" else "X"

    def find_children(self) -> Set["StoneGameState"]:
        children = set()
        if self.stones_remaining >= 1:
            children.add(StoneGameState(self.stones_remaining - 1, self.opponent))
        if self.stones_remaining >= 2:
            children.add(StoneGameState(self.stones_remaining - 2, self.opponent))
        return children

    def get_random_child(self) -> Optional["StoneGameState"]:
        possible_moves = []
        if self.stones_remaining >= 1:
            possible_moves.append(1)
        if self.stones_remaining >= 2:
            possible_moves.append(2)
        if not possible_moves:
            return None
        move = random.choice(possible_moves)
        return StoneGameState(self.stones_remaining - move, self.opponent)

    def is_terminal(self) -> bool:
        return self.stones_remaining == 0

    def reward(self) -> float:
        if not self.is_terminal():
            raise RuntimeError("reward() called on non-terminal node")
        # The player to move has lost: the opponent took the last stone
        return 0.0

    def __hash__(self) -> int:
        return hash((self.stones_remaining, self.current_player))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, StoneGameState):
            return False
        return (
            self.stones_remaining == other.stones_remaining
            and self.current_player == other.current_player
        )

    def __repr__(self) -> str:
        return f"Stones: {self.stones_remaining}, Player: {self.current_player}"


# Global game state
current_state = None
mcts = MCTS()
message_history = []


def start_game(stones_num):
    global current_state, mcts, message_history
    current_state = StoneGameState(int(stones_num), "X")
    mcts = MCTS()
    message_history = [f"🏁 Starting stones: {stones_num}🪨"]

    if current_state.stones_remaining > 0:
        for _ in range(1000):
            mcts.simulate(current_state)
        best_move = mcts.select_best_move(current_state)
        ai_move = current_state.stones_remaining - best_move.stones_remaining
        message_history.append(f"🖥️ took {ai_move}🪨 → {best_move.stones_remaining}🪨 left")
        current_state = best_move

    return update_ui()


def player_move(move: int):
    global current_state, mcts, message_history

    if current_state is None or current_state.is_terminal():
        return update_ui()

    # Guard against taking more stones than remain (otherwise the count
    # would go negative and the subsequent MCTS playouts would crash)
    if move > current_state.stones_remaining:
        message_history.append(f"⚠️ Only {current_state.stones_remaining}🪨 left; cannot take {move}")
        return update_ui()

    new_stones = current_state.stones_remaining - move
    message_history.append(f"👤 took {move}🪨 → {new_stones}🪨 left")
    current_state = StoneGameState(new_stones, "X")

    if not current_state.is_terminal():
        for _ in range(1000):
            mcts.simulate(current_state)
        best_move = mcts.select_best_move(current_state)
        ai_move = current_state.stones_remaining - best_move.stones_remaining
        message_history.append(f"🖥️ took {ai_move}🪨 → {best_move.stones_remaining}🪨 left")
        current_state = best_move

    return update_ui()


def update_ui():
    global current_state, message_history
    if current_state is None:
        return (
            "<div class='history'>Waiting for the game to start...</div>",
            "🕹️ Click the start button",
            gr.Row(visible=False),
            ""
        )

    history_html = "<div class='history'>" + "<br>".join(message_history) + "</div>"

    if current_state.is_terminal():
        winner = "🖥️ AI" if current_state.current_player == "O" else "👤 Player"
        return (
            history_html,
            f"🏆 Game over | Winner: {winner}",
            gr.Row(visible=False),
            f"<div class='winner'>{winner} 🎉 wins!</div>"
        )

    return (
        history_html,
        f"{'🖥️ AI' if current_state.current_player == 'X' else '👤 Player'} to move | Remaining: {current_state.stones_remaining}🪨",
        gr.Row(visible=current_state.current_player == "O"),
        ""
    )


css = """
.history {
    padding: 15px;
    border: 1px solid #eee;
    border-radius: 8px;
    max-height: 300px;
    overflow-y: auto;
}
.winner {
    padding: 20px;
    background: #4CAF50;
    color: white;
    border-radius: 8px;
    margin-top: 15px;
    text-align: center;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🪨 Stone-Picking Game")

    with gr.Accordion("📚 Rules"):
        gr.Markdown("""
        ### 🎯 How to play:
        1. Set the initial number of stones
        2. The AI moves first, taking 1-2 stones
        3. The player moves second, taking 1-2 stones
        4. **Whoever takes the last stone wins**
        """)

    with gr.Row():
        stones_input = gr.Number(label="🪨 Number of stones", value=10, minimum=3, maximum=20)
        start_btn = gr.Button("🚀 Start game", variant="primary")

    history = gr.HTML(label="📜 Game log")
    status = gr.HTML(label="📊 Status")

    with gr.Row(visible=False) as controls:
        move1_btn = gr.Button("🪨 Take 1", variant="secondary")
        move2_btn = gr.Button("🪨🪨 Take 2", variant="secondary")

    result = gr.HTML(visible=False)

    start_btn.click(
        start_game,
        inputs=[stones_input],
        outputs=[history, status, controls, result]
    )
    move1_btn.click(
        lambda: player_move(1),
        outputs=[history, status, controls, result]
    )
    move2_btn.click(
        lambda: player_move(2),
        outputs=[history, status, controls, result]
    )

if __name__ == "__main__":
    demo.launch()
```
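
By default, `demo.launch()` serves the app locally at http://127.0.0.1:7860; passing `share=True` to `launch()` is the standard Gradio option for generating a temporary public link if you want to let others play.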

The script runs successfully and presents a simple web interface for playing against the AI.

Summary

This article showed how to use the DeepSeek R1 reasoning model to implement a stone-picking game based on the MCTS algorithm, and the process left me deeply impressed by what the R1 model can do!

If I had implemented the code for this game myself, it might have taken one or two days. DeepSeek, after just a few rounds of prompt iteration, produced working Python code. That level of capability is genuinely astonishing!

The code in this article is open-sourced on GitHub at: https://github.com/percent4/mcts_learning

Thanks for reading, and stay tuned~

Feel free to follow my WeChat official account NLP奇幻之旅, where my original technical articles are published first.

You are also welcome to join my Knowledge Planet (知识星球) community “自然语言处理奇幻之旅”, where I am working to build my own technical community.

