| """ PettingZoo 多智能体强化学习训练示例 环境:Simple Adversary(简单对抗游戏) 算法:PPO (Proximal Policy Optimization) """
import os

import supersuit as ss
from pettingzoo.mpe import simple_adversary_v3
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

TRAIN_STEPS = 100_000
MODEL_PATH = "adversary_model"


def make_env():
    """Create and preprocess the training environment."""
    env = simple_adversary_v3.parallel_env(max_cycles=25, continuous_actions=True)
    # Pad observations and action spaces so all agents share the same shapes,
    # allowing one shared policy to control every agent.
    env = ss.pad_observations_v0(env)
    env = ss.pad_action_space_v0(env)
    # Convert the PettingZoo parallel env into an SB3-compatible vectorized env
    # and stack 8 copies for faster rollout collection.
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    env = ss.concat_vec_envs_v1(env, num_vec_envs=8, num_cpus=1, base_class="stable_baselines3")
    return env


def train():
    """Train the model."""
    print("=" * 50)
    print("🚀 Training on the Simple Adversary environment")
    print("=" * 50)

    env = make_env()
    model = PPO(
        MlpPolicy,
        env,
        verbose=1,
        learning_rate=1e-3,
        batch_size=256,
        n_steps=256,
        gamma=0.99,
        tensorboard_log="./logs/",
    )
    model.learn(total_timesteps=TRAIN_STEPS, progress_bar=True)
    model.save(MODEL_PATH)
    print(f"\n✅ Model saved to: {MODEL_PATH}.zip")
    env.close()
    return model


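# Training curves are written to ./logs/ via tensorboard_log above; to inspect
# them, run `tensorboard --logdir ./logs/` in another shell (assumes
# TensorBoard is installed alongside stable-baselines3).

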
def test(model=None, num_episodes=3):
    """Evaluate a trained model with rendering."""
    print("\n" + "=" * 50)
    print("🎮 Testing the trained AI")
    print("=" * 50)

    if model is None:
        if os.path.exists(f"{MODEL_PATH}.zip"):
            model = PPO.load(MODEL_PATH)
            print(f"📦 Loaded model: {MODEL_PATH}.zip")
        else:
            print("❌ Model file not found; train one first!")
            return

    # Apply the same padding wrappers used during training so observation
    # shapes match what the policy expects.
    env = simple_adversary_v3.parallel_env(render_mode="human", max_cycles=25, continuous_actions=True)
    env = ss.pad_observations_v0(env)
    env = ss.pad_action_space_v0(env)

    for episode in range(num_episodes):
        print(f"\n--- Episode {episode + 1}/{num_episodes} ---")
        observations, infos = env.reset()
        total_rewards = {agent: 0 for agent in env.agents}

        # env.agents empties out once the episode terminates or truncates.
        while env.agents:
            actions = {}
            for agent in env.agents:
                obs = observations[agent]
                action, _ = model.predict(obs, deterministic=True)
                actions[agent] = action
            observations, rewards, terminations, truncations, infos = env.step(actions)
            for agent, reward in rewards.items():
                total_rewards[agent] += reward

        print(f"Total rewards: {total_rewards}")

    env.close()
    print("\n🏁 Testing finished!")


def test_random(num_episodes=2):
    """Run a random policy as a baseline for comparison."""
    print("\n" + "=" * 50)
    print("🎲 Random-policy test (for comparison)")
    print("=" * 50)

    env = simple_adversary_v3.parallel_env(render_mode="human", max_cycles=25, continuous_actions=True)

    for episode in range(num_episodes):
        print(f"\n--- Episode {episode + 1}/{num_episodes} ---")
        observations, infos = env.reset()
        total_rewards = {agent: 0 for agent in env.agents}

        while env.agents:
            # Sample a random action for every live agent.
            actions = {agent: env.action_space(agent).sample() for agent in env.agents}
            observations, rewards, terminations, truncations, infos = env.step(actions)
            for agent, reward in rewards.items():
                total_rewards[agent] += reward

        print(f"Total rewards: {total_rewards}")

    env.close()


if __name__ == "__main__":
    model = train()
    test(model)
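    # Optional: uncomment to watch the untrained random baseline defined above
    # for a side-by-side comparison with the trained policy.
    # test_random()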