Back to Blog
Memory · Architecture · Engineering

Agent Memory Systems: Engineering Short-term, Long-term & Working Memory

2026-04-08 · 16 min read

记忆系统是 AI Agent 区别于无状态 API 调用的核心能力。一个优秀的记忆系统能让 Agent 在长期交互中保持上下文连贯性、学习用户偏好、积累领域知识。本文将深入探讨 Agent 记忆系统的架构设计与实现方法。

记忆系统架构总览

┌─────────────────────────────────────────────────────────────────┐

│ Memory Manager │

│ ┌──────────────────────────────────────────────────────────┐ │

│ │ Memory Controller │ │

│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │

│ │ │Consolid-│ │Retrieval│ │Forgetting│ │Conflict│ │ │

│ │ │ation │ │Strategy │ │Policy │ │Resolut.│ │ │

│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │

│ └──────────────────────────────────────────────────────────┘ │

│ │

│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │

│ │ Short-term │ │ Working │ │ Long-term │ │

│ │ Memory │ │ Memory │ │ Memory │ │

│ │ │ │ │ │ │ │

│ │ • Buffer │ │ • Scratchpad │ │ • Vector DB │ │

│ │ • Summary │ │ • Context │ │ • Knowledge │ │

│ │ • Sliding │ │ Variables │ │ Graph │ │

│ │ Window │ │ • Task State │ │ • Episodic │ │

│ └──────────────┘ └──────────────┘ └──────────────┘ │

└─────────────────────────────────────────────────────────────────┘

短期记忆(Short-term Memory)

短期记忆负责维护当前对话的上下文,是最基础的记忆形式。

对话缓冲区(Conversation Buffer)

最简单的方式是保存完整对话历史:

import asyncio
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Optional

@dataclass
class Message:
    """A single chat message as stored by the memory layers."""

    role: str # "user", "assistant", "system", "tool"
    content: str
    # Creation time; naive UTC via utcnow(), consistent with the rest of the file.
    timestamp: datetime = field(default_factory=datetime.utcnow)
    # Free-form extras (e.g. tool-call details); fresh dict per message.
    metadata: dict = field(default_factory=dict)

class ConversationBufferMemory:
    """Short-term memory keeping the full, unabridged conversation.

    Messages are held in arrival order. `max_tokens` records the intended
    budget; enforcement is left to the caller.
    """

    def __init__(self, max_tokens: int = 100000):
        self.messages: list[Message] = []
        self.max_tokens = max_tokens

    def add_message(self, role: str, content: str, metadata: dict = None):
        """Append one message to the buffer."""
        msg = Message(
            role=role,
            content=content,
            metadata=metadata or {},
        )
        self.messages.append(msg)

    def get_messages(self) -> list[dict]:
        """Return the history in chat-API shape: a list of {role, content} dicts."""
        history = []
        for msg in self.messages:
            history.append({"role": msg.role, "content": msg.content})
        return history

    def get_token_count(self) -> int:
        """Rough token estimate: ~4 characters per token."""
        total = 0
        for msg in self.messages:
            total += len(msg.content) // 4
        return total

    def clear(self):
        """Drop all stored messages."""
        self.messages = []

滑动窗口记忆(Sliding Window)

当对话变得很长时,滑动窗口可以控制上下文大小:

class SlidingWindowMemory:
    """Sliding-window memory: expose only the most recent N messages.

    System messages can optionally be pinned (`preserve_system`) so they
    survive outside the window.
    """

    def __init__(self, window_size: int = 20, preserve_system: bool = True):
        self.messages: list[Message] = []
        self.window_size = window_size        # max non-system messages returned
        self.preserve_system = preserve_system

    def add_message(self, role: str, content: str, metadata: dict = None):
        """Append a message.

        Bug fix: `metadata` was previously accepted but silently discarded;
        it is now stored on the message.
        """
        self.messages.append(
            Message(role=role, content=content, metadata=metadata or {})
        )

    def get_messages(self) -> list[dict]:
        """Return pinned system messages plus the last `window_size` others."""
        if len(self.messages) <= self.window_size:
            return [{"role": m.role, "content": m.content} for m in self.messages]

        # Partition: pinned system messages vs. everything else.
        system_msgs = []
        other_msgs = []
        for msg in self.messages:
            if self.preserve_system and msg.role == "system":
                system_msgs.append(msg)
            else:
                other_msgs.append(msg)

        recent_msgs = other_msgs[-self.window_size:]
        all_msgs = system_msgs + recent_msgs
        return [{"role": m.role, "content": m.content} for m in all_msgs]

摘要记忆(Summary Memory)

摘要记忆通过压缩早期对话来节省 token:

class SummaryBufferMemory:
    """Buffer memory that compresses older turns into a rolling summary.

    The most recent `max_recent_messages` turns are kept verbatim; once the
    buffer grows past twice that size, the older turns are summarized by the
    LLM and replaced by a single summary entry surfaced as a system message.
    """

    def __init__(self, llm_client, max_recent_messages: int = 10):
        self.messages: list[Message] = []
        self.summary: str = ""
        self.llm = llm_client  # must expose `async generate(prompt) -> str`
        self.max_recent = max_recent_messages

    def add_message(self, role: str, content: str, metadata: dict = None):
        """Append a message; triggers summarization once the buffer is large.

        Bug fix: `_summarize_old_messages` is a coroutine and was previously
        called without being awaited, so the coroutine was created and
        discarded and summarization never actually ran. It is now scheduled
        on the running event loop, or run to completion when called from
        synchronous code. (Also fixed: `metadata` is no longer dropped.)
        """
        self.messages.append(
            Message(role=role, content=content, metadata=metadata or {})
        )
        if len(self.messages) > self.max_recent * 2:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                # Called from sync code: run the summarization to completion.
                asyncio.run(self._summarize_old_messages())
            else:
                # Called from async code: fire-and-forget on the current loop.
                loop.create_task(self._summarize_old_messages())

    async def _summarize_old_messages(self):
        """Compress everything except the most recent messages into the summary."""
        to_summarize = self.messages[:-self.max_recent]
        if not to_summarize:
            return  # nothing old enough to fold away
        conversation_text = "\n".join(
            f"{m.role}: {m.content}" for m in to_summarize
        )
        prompt = f"""请将以下对话压缩为简洁的摘要,保留关键信息:

{conversation_text}

摘要:"""
        new_summary = await self.llm.generate(prompt)
        if self.summary:
            # Keep a running log of summary updates rather than overwriting.
            self.summary = f"{self.summary}\n\n[更新] {new_summary}"
        else:
            self.summary = new_summary
        # Keep only the most recent messages verbatim.
        self.messages = self.messages[-self.max_recent:]

    def get_messages(self) -> list[dict]:
        """Return [optional summary system message] + recent turns in chat format."""
        result = []
        if self.summary:
            result.append({
                "role": "system",
                "content": f"先前对话摘要:\n{self.summary}"
            })
        result.extend(
            {"role": m.role, "content": m.content}
            for m in self.messages
        )
        return result

工作记忆(Working Memory)

工作记忆是 Agent 在执行任务时的"便签本",存储中间状态和临时信息。

上下文变量管理

class WorkingMemory:
    """Agent working memory: task-scoped scratch state.

    Holds context variables (optionally with a TTL), a scratchpad of
    timestamped reasoning steps, a LIFO stack of sub-tasks, and a list of
    confidence-scored hypotheses.
    """

    def __init__(self):
        self.variables: dict = {}         # key -> {"value", "created_at", "ttl"}
        self.scratchpad: list[str] = []   # timestamped reasoning steps
        self.task_stack: list[dict] = []  # LIFO sub-task stack
        self.hypotheses: list[dict] = []  # {"hypothesis", "confidence", "timestamp"}

    def set_variable(self, key: str, value, ttl: int = None):
        """Store a context variable; `ttl` is a lifetime in seconds (None = forever)."""
        self.variables[key] = {
            "value": value,
            "created_at": datetime.utcnow(),
            "ttl": ttl
        }

    def get_variable(self, key: str, default=None):
        """Return a variable's value, expiring (and deleting) it if its TTL elapsed."""
        var = self.variables.get(key)
        if var is None:
            return default
        if var["ttl"]:
            # Bug fix: `.seconds` is only the seconds *component* of the
            # timedelta (it wraps every 24 h), so long-lived variables could
            # report elapsed==0 and never expire. Use the true elapsed time.
            elapsed = (datetime.utcnow() - var["created_at"]).total_seconds()
            if elapsed > var["ttl"]:
                del self.variables[key]
                return default
        return var["value"]

    def add_scratchpad_entry(self, entry: str):
        """Append one timestamped reasoning step."""
        self.scratchpad.append(f"[{datetime.utcnow().isoformat()}] {entry}")

    def get_scratchpad(self) -> str:
        """Return the full reasoning trace, one step per line."""
        return "\n".join(self.scratchpad)

    def push_task(self, task: dict):
        """Push a sub-task onto the stack."""
        self.task_stack.append(task)

    def pop_task(self) -> Optional[dict]:
        """Pop and return the current task, or None when the stack is empty."""
        return self.task_stack.pop() if self.task_stack else None

    def add_hypothesis(self, hypothesis: str, confidence: float):
        """Record a hypothesis with its confidence score."""
        self.hypotheses.append({
            "hypothesis": hypothesis,
            "confidence": confidence,
            "timestamp": datetime.utcnow()
        })

    def get_best_hypothesis(self) -> Optional[str]:
        """Return the hypothesis with the highest confidence, or None."""
        if not self.hypotheses:
            return None
        best = max(self.hypotheses, key=lambda h: h["confidence"])
        return best["hypothesis"]

    def get_context_for_llm(self) -> str:
        """Render the working state as a text block for prompt injection."""
        parts = []
        if self.variables:
            vars_str = "\n".join(
                f" - {k}: {v['value']}" for k, v in self.variables.items()
            )
            parts.append(f"当前上下文变量:\n{vars_str}")
        if self.scratchpad:
            parts.append(f"推理过程:\n{self.get_scratchpad()}")
        if self.task_stack:
            task_str = " → ".join(t["name"] for t in self.task_stack)
            parts.append(f"任务栈: {task_str}")
        if self.hypotheses:
            parts.append(f"当前最佳假设: {self.get_best_hypothesis()}")
        return "\n\n".join(parts)

长期记忆(Long-term Memory)

长期记忆使用向量数据库存储和检索历史信息。

向量记忆实现

from sentence_transformers import SentenceTransformer

import chromadb

from chromadb.config import Settings

class LongTermMemory:
    """Vector-database-backed long-term memory (ChromaDB + sentence embeddings)."""

    def __init__(self, collection_name: str = "agent_memory"):
        # Embedding model shared by storage and queries.
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        # Local persistent ChromaDB instance.
        self.client = chromadb.Client(Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory="./memory_db"
        ))
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )

    def store_memory(self, content: str, memory_type: str,
                     metadata: dict = None, importance: float = 1.0):
        """Embed and store one memory with type/importance/timestamp metadata."""
        embedding = self.embedder.encode(content).tolist()
        # Timestamp-based id; NOTE(review): concurrent same-type writes within
        # one timestamp resolution could collide — confirm acceptable.
        memory_id = f"{memory_type}_{datetime.utcnow().timestamp()}"
        self.collection.add(
            ids=[memory_id],
            embeddings=[embedding],
            documents=[content],
            metadatas=[{
                "type": memory_type,
                "importance": importance,
                "created_at": datetime.utcnow().isoformat(),
                **(metadata or {})
            }]
        )

    def retrieve_memories(self, query: str, top_k: int = 5,
                          memory_type: str = None,
                          min_importance: float = 0.0) -> list[dict]:
        """Semantic search over stored memories, optionally filtered by
        type and minimum importance. Returns [{content, metadata, distance}]."""
        query_embedding = self.embedder.encode(query).tolist()
        # Bug fix: Chroma requires multiple where-conditions to be combined
        # with an explicit $and operator; a flat two-key dict is rejected.
        conditions = []
        if memory_type:
            conditions.append({"type": memory_type})
        if min_importance > 0:
            conditions.append({"importance": {"$gte": min_importance}})
        if not conditions:
            where_filter = None
        elif len(conditions) == 1:
            where_filter = conditions[0]
        else:
            where_filter = {"$and": conditions}

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where=where_filter
        )
        # Guard against an empty result set (fields may be [[]] or missing).
        docs = results.get("documents") or [[]]
        metas = results.get("metadatas") or [[]]
        dists = results.get("distances") or [[]]
        memories = []
        for i, doc in enumerate(docs[0]):
            memories.append({
                "content": doc,
                "metadata": metas[0][i],
                "distance": dists[0][i]
            })
        return memories

    def update_memory_importance(self, memory_id: str, new_importance: float):
        """Overwrite the stored importance score of one memory."""
        result = self.collection.get(ids=[memory_id])
        if result["ids"]:
            metadata = result["metadatas"][0]
            metadata["importance"] = new_importance
            self.collection.update(
                ids=[memory_id],
                metadatas=[metadata]
            )

    def forget_old_memories(self, days_threshold: int = 90,
                            min_importance: float = 0.3):
        """Delete memories older than `days_threshold` days whose importance
        is below `min_importance` (lexicographic ISO-date comparison)."""
        cutoff_date = (
            datetime.utcnow() - timedelta(days=days_threshold)
        ).isoformat()
        results = self.collection.get(
            where={
                "$and": [
                    {"created_at": {"$lt": cutoff_date}},
                    {"importance": {"$lt": min_importance}}
                ]
            }
        )
        if results["ids"]:
            self.collection.delete(ids=results["ids"])

情节记忆(Episodic Memory)

情节记忆记录具体的交互经历,类似于人类的"事件回忆":

class EpisodicMemory:
    """Episodic memory: concrete interaction experiences, akin to event recall."""

    def __init__(self, long_term_memory: LongTermMemory):
        self.ltm = long_term_memory

    def store_episode(self, episode: dict):
        """Persist one episode into long-term storage as natural language."""
        self.ltm.store_memory(
            content=self._episode_to_text(episode),
            memory_type="episodic",
            metadata={
                "session_id": episode["session_id"],
                "outcome": episode["outcome"],
                "emotion": episode.get("emotion", "neutral")
            },
            importance=self._calculate_importance(episode)
        )

    def recall_similar_episodes(self, current_situation: str,
                                top_k: int = 3) -> list[dict]:
        """Fetch the stored episodes most similar to the current situation."""
        return self.ltm.retrieve_memories(
            query=current_situation,
            top_k=top_k,
            memory_type="episodic"
        )

    def _episode_to_text(self, episode: dict) -> str:
        """Render the structured episode as one natural-language sentence."""
        fragments = (
            f"在 {episode['timestamp']},用户说:{episode['user_input']}。",
            f"我采取了行动:{episode['action']}。",
            f"结果是:{episode['outcome']}。",
            f"学到了:{episode.get('lesson', '无')}",
        )
        return "".join(fragments)

    def _calculate_importance(self, episode: dict) -> float:
        """Score an episode in [0, 1]: base 0.5, plus bonuses for a successful
        outcome, a recorded lesson, and positive user feedback."""
        score = 0.5
        bonuses = (
            (episode["outcome"] == "success", 0.2),
            (bool(episode.get("lesson")), 0.3),
            (episode.get("user_feedback") == "positive", 0.2),
        )
        for earned, bonus in bonuses:
            if earned:
                score += bonus
        return min(score, 1.0)

语义记忆(Semantic Memory)

语义记忆存储抽象的知识和概念:

import json

class SemanticMemory:
    """Semantic memory: abstract knowledge and concepts."""

    def __init__(self, long_term_memory: LongTermMemory):
        self.ltm = long_term_memory
        self.knowledge_graph = {}  # concept -> {"facts": [...], "relations": [...]}

    def store_knowledge(self, concept: str, facts: list[str],
                        relations: list[dict] = None):
        """Store a concept's facts in the vector store and mirror it in the graph."""
        fact_lines = "\n".join(f"- {f}" for f in facts)
        knowledge_text = f"关于 {concept}:\n" + fact_lines
        self.ltm.store_memory(
            content=knowledge_text,
            memory_type="semantic",
            metadata={"concept": concept},
            importance=0.8
        )
        self.knowledge_graph[concept] = {
            "facts": facts,
            "relations": relations or []
        }

    def query_knowledge(self, query: str, top_k: int = 5) -> list[dict]:
        """Semantic search restricted to stored knowledge entries."""
        return self.ltm.retrieve_memories(
            query=query,
            top_k=top_k,
            memory_type="semantic",
            min_importance=0.5
        )

    def get_related_concepts(self, concept: str) -> list[str]:
        """Targets of the concept's outgoing relations ([] when unknown)."""
        node = self.knowledge_graph.get(concept)
        if node is None:
            return []
        return [rel["target"] for rel in node["relations"]]

记忆整合与检索策略

记忆整合(Memory Consolidation)

class MemoryConsolidator:
    """Memory consolidation: turn short-term conversation into long-term records."""

    def __init__(self, llm_client, long_term_memory: LongTermMemory):
        self.llm = llm_client
        self.ltm = long_term_memory

    async def consolidate_session(self, session_messages: list[dict]):
        """Extract durable information from a session and file each item."""
        key_info = await self._extract_key_information(session_messages)
        for info in key_info:
            self._store_info(info)

    def _store_info(self, info: dict):
        """Route one extracted item to the matching long-term record type."""
        kind = info["type"]
        if kind == "user_preference":
            self.ltm.store_memory(
                content=info["content"],
                memory_type="semantic",
                metadata={"subcategory": "user_preference"},
                importance=0.9
            )
        elif kind == "fact":
            self.ltm.store_memory(
                content=info["content"],
                memory_type="semantic",
                metadata={"subcategory": "learned_fact"},
                importance=0.7
            )
        elif kind == "episode":
            self.ltm.store_memory(
                content=info["content"],
                memory_type="episodic",
                importance=info.get("importance", 0.5)
            )

    async def _extract_key_information(self, messages: list[dict]) -> list[dict]:
        """Ask the LLM to pull out information worth keeping, as a JSON list."""
        conversation = "\n".join(
            f"{m['role']}: {m['content']}" for m in messages
        )
        prompt = f"""分析以下对话,提取需要长期记住的信息。返回 JSON 格式:

对话:
{conversation}

请提取:
- 用户偏好(如编程语言、代码风格)
- 学到的事实(如项目结构、技术选型)
- 重要的交互事件

返回格式:
[{{"type": "user_preference|fact|episode", "content": "...", "importance": 0.0-1.0}}]"""
        response = await self.llm.generate(prompt)
        # NOTE(review): assumes the LLM returns bare JSON with no markdown
        # fencing — confirm against the actual client.
        return json.loads(response)

    多策略检索

    class MemoryRetriever:
    

    """智能记忆检索器"""

    def __init__(self, short_term, working_memory, long_term_memory):

    self.short_term = short_term

    self.working = working_memory

    self.long_term = long_term_memory

    async def retrieve_relevant_context(self, query: str,

    strategy: str = "hybrid") -> str:

    """根据策略检索相关上下文"""

    contexts = []

    # 1. 从短期记忆获取最近对话

    if strategy in ["hybrid", "recent"]:

    recent = self.short_term.get_messages()[-5:]

    if recent:

    contexts.append("最近对话:\n" + "\n".join(

    f"{m['role']}: {m['content']}" for m in recent

    ))

    # 2. 从工作记忆获取当前上下文

    if strategy in ["hybrid", "working"]:

    working_ctx = self.working.get_context_for_llm()

    if working_ctx:

    contexts.append(f"当前工作状态:\n{working_ctx}")

    # 3. 从长期记忆检索相关历史

    if strategy in ["hybrid", "semantic"]:

    memories = self.long_term.retrieve_memories(

    query=query,

    top_k=3,

    min_importance=0.5

    )

    if memories:

    memory_texts = [m["content"] for m in memories]

    contexts.append("相关记忆:\n" + "\n---\n".join(memory_texts))

    return "\n\n==========\n\n".join(contexts)

    完整的记忆管理器

    class MemoryManager:
    

    """统一记忆管理器"""

    def __init__(self, session_id: str, llm_client, redis_client):

    self.session_id = session_id

    # 初始化各层记忆

    self.short_term = SummaryBufferMemory(llm_client)

    self.working = WorkingMemory()

    self.long_term = LongTermMemory(collection_name=f"agent_{session_id}")

    # 初始化辅助组件

    self.episodic = EpisodicMemory(self.long_term)

    self.semantic = SemanticMemory(self.long_term)

    self.consolidator = MemoryConsolidator(llm_client, self.long_term)

    self.retriever = MemoryRetriever(

    self.short_term, self.working, self.long_term

    )

    self.llm = llm_client

    async def process_interaction(self, user_input: str, agent_response: str):

    """处理一次完整交互"""

    # 1. 记录到短期记忆

    self.short_term.add_message("user", user_input)

    self.short_term.add_message("assistant", agent_response)

    # 2. 如果对话过长,触发整合

    if self.short_term.get_token_count() > 50000:

    messages = self.short_term.get_messages()

    await self.consolidator.consolidate_session(messages)

    async def build_context(self, query: str) -> str:

    """为当前查询构建完整上下文"""

    return await self.retriever.retrieve_relevant_context(query)

    def update_working_memory(self, key: str, value):

    """更新工作记忆"""

    self.working.set_variable(key, value)

    def record_reasoning_step(self, step: str):

    """记录推理步骤"""

    self.working.add_scratchpad_entry(step)

    最佳实践

  • 分层存储:不同生命周期的记忆使用不同存储方案
  • 按需检索:只检索与当前任务相关的记忆
  • 定期整合:将有价值的短期记忆转化为长期记忆
  • 遗忘机制:不重要的记忆应被清理,避免存储膨胀
  • 重要性评分:为记忆打分,优化检索结果排序
  • 冲突解决:当新记忆与旧记忆矛盾时,需要明确的解决策略
总结

    Agent 记忆系统的设计直接影响 Agent 的智能程度和用户体验。通过合理的短期、工作、长期记忆分层,配合记忆整合和智能检索策略,可以构建出具有"经验积累"能力的智能 Agent。