How to implement ChatGirl, a chatbot based on the TensorFlow Seq2Seq model, in Python

Introduction

[Still under development; it does not work well yet, but you can already train it and run it.] ChatGirl is an AI chatbot based on the TensorFlow Seq2Seq model.

TensorFlowNews

Data

Twitter dataset:

https://github.com/suriyadeepan/datasets

Train

You need to create a model folder to save the trained model, then run Train_Model.py.
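
If you would rather create the folder from code than by hand, a one-line sketch (the folder name model follows the README; this snippet is not part of Train_Model.py itself):

import os

# Make sure the folder used to save the trained model exists
os.makedirs('model', exist_ok=True)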

Run

Run_model.py

Tool

idx2w, w2idx: use these helpers to convert a word to its id, or an id back to a word. A demo can be found in hello.py.
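
A minimal sketch of what such a mapping looks like, with a made-up vocabulary (the real one in the repo is built from the Twitter dataset):

# Hypothetical vocabulary for illustration only
vocab = ['_', 'unk', 'hello', 'world']

idx2w = vocab                                # id -> word: list lookup
w2idx = {w: i for i, w in enumerate(vocab)}  # word -> id: dict lookup

print(w2idx['hello'])  # 2
print(idx2w[2])        # hello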

Result

Result.md (the full training output is too long, so only part of it is shown there.)

Blog

http://www.tensorflownews.com/

RoadMap

  • dataset
  • model

[under development]



1 Reply

To build a Seq2Seq chatbot on TensorFlow, the core task is constructing an encoder-decoder architecture. Here is a complete example using TensorFlow 2.x and the Keras API.

import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Prepare the data
def prepare_data(conversations, max_len=20):
    """Prepare training data from (question, answer) pairs."""
    # Flatten the pairs into plain texts and wrap each answer with <start>/<end>
    # so that both markers get an entry in the tokenizer's vocabulary.
    texts = []
    for conv in conversations:
        if len(conv) >= 2:
            texts.append(conv[0])
            texts.append('<start> ' + conv[1] + ' <end>')

    # Clear the default filters, otherwise '<' and '>' would be stripped
    # and the <start>/<end> markers would be lost.
    tokenizer = Tokenizer(oov_token='<OOV>', filters='')
    tokenizer.fit_on_texts(texts)

    vocab_size = len(tokenizer.word_index) + 1

    # Build encoder inputs and decoder input/target sequences
    encoder_inputs = []
    decoder_inputs = []
    decoder_targets = []

    for conv in conversations:
        if len(conv) < 2:
            continue

        # Each conversation is assumed to be a (question, answer) pair
        question = conv[0]
        answer = conv[1]

        enc_seq = tokenizer.texts_to_sequences([question])[0]
        dec_seq = tokenizer.texts_to_sequences([answer])[0]

        # Teacher forcing: the decoder input starts with <start>,
        # the decoder target is the same sequence ending with <end>
        dec_input = [tokenizer.word_index['<start>']] + dec_seq
        dec_target = dec_seq + [tokenizer.word_index['<end>']]

        encoder_inputs.append(enc_seq)
        decoder_inputs.append(dec_input)
        decoder_targets.append(dec_target)

    # Pad all sequences to the same length
    encoder_inputs = pad_sequences(encoder_inputs, maxlen=max_len, padding='post')
    decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_len, padding='post')
    decoder_targets = pad_sequences(decoder_targets, maxlen=max_len, padding='post')

    return encoder_inputs, decoder_inputs, decoder_targets, vocab_size, tokenizer

# 2. Build the model
def build_seq2seq_model(vocab_size, embedding_dim=256, lstm_units=512):
    """Build the Seq2Seq training model plus encoder/decoder models for inference."""

    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder (variable-length input, so the same layers can be reused
    # for teacher-forced training and for step-by-step inference)
    decoder_inputs = Input(shape=(None,))
    decoder_embedding_layer = Embedding(vocab_size, embedding_dim)
    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding,
                                         initial_state=encoder_states)

    # Attention: decoder outputs (query) attend over encoder outputs (value)
    attention = Attention()
    context_vector = attention([decoder_outputs, encoder_outputs])
    decoder_concat = Concatenate(axis=-1)([context_vector, decoder_outputs])

    # Output layer
    decoder_dense = Dense(vocab_size, activation='softmax')
    outputs = decoder_dense(decoder_concat)

    # Training model
    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Inference encoder: returns the full outputs (for attention) plus the states
    encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

    # Inference decoder: takes the previous token, the encoder outputs and the
    # previous LSTM states, and returns the next-token distribution and new states
    decoder_state_input_h = Input(shape=(lstm_units,))
    decoder_state_input_c = Input(shape=(lstm_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    encoder_outputs_input = Input(shape=(None, lstm_units))

    decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
        decoder_embedding_layer(decoder_inputs),
        initial_state=decoder_states_inputs)
    decoder_states = [state_h_inf, state_c_inf]

    # Reuse the same attention and output layers at inference time
    context_vector_inf = attention([decoder_outputs_inf, encoder_outputs_input])
    decoder_concat_inf = Concatenate(axis=-1)([context_vector_inf, decoder_outputs_inf])
    outputs_inf = decoder_dense(decoder_concat_inf)

    decoder_model = Model(
        [decoder_inputs, encoder_outputs_input] + decoder_states_inputs,
        [outputs_inf] + decoder_states
    )

    return model, encoder_model, decoder_model

# 3. Training routine
def train_model(model, encoder_inputs, decoder_inputs, decoder_targets, epochs=50):
    """Train the model with teacher forcing."""
    model.fit(
        [encoder_inputs, decoder_inputs],
        decoder_targets,
        batch_size=32,
        epochs=epochs,
        validation_split=0.2
    )

# 4. Inference routine
def generate_response(input_text, encoder_model, decoder_model, tokenizer, max_len=20):
    """Generate a reply with greedy decoding."""
    # Encode the input sentence
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # Run the encoder once to get its outputs and final states
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq, verbose=0)
    states_value = [state_h, state_c]

    # Start decoding from the <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_outputs] + states_value, verbose=0)

        # Greedy search: pick the most likely token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '<OOV>')

        if sampled_word == '<end>' or len(decoded_sentence) >= max_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        # Feed the sampled token and the new states back into the decoder
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

# 5. Usage example
if __name__ == "__main__":
    # Toy conversation data. Note: the Tokenizer splits on whitespace, so real
    # Chinese data should be word-segmented (or split per character) first.
    conversations = [
        ["你好", "你好!我是ChatGirl"],
        ["你叫什么名字", "我是ChatGirl,你的AI助手"],
        ["今天天气怎么样", "我是聊天机器人,不提供天气信息哦"],
        ["再见", "再见,期待下次聊天"]
    ]

    # Prepare the data
    encoder_inputs, decoder_inputs, decoder_targets, vocab_size, tokenizer = \
        prepare_data(conversations)

    # Build the models
    model, encoder_model, decoder_model = build_seq2seq_model(vocab_size)

    # Train
    train_model(model, encoder_inputs, decoder_inputs, decoder_targets, epochs=10)

    # Test
    test_input = "你好"
    response = generate_response(test_input, encoder_model, decoder_model, tokenizer)
    print(f"Input: {test_input}")
    print(f"Reply: {response}")

This implementation covers several key parts:

  1. Data preparation: a Tokenizer turns the text into ids and builds the encoder inputs and the decoder input/target sequences
  2. Model architecture: an encoder LSTM reads the input sequence, a decoder LSTM generates the output, and an attention layer improves the results
  3. Training: teacher forcing, i.e. the complete decoder input sequence is fed in during training (see the small illustration below)
  4. Inference: greedy search generates the reply one word at a time
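
As a concrete picture of point 3, the decoder input is just the target sequence shifted by one position (the token strings below are illustrative; prepare_data works with the corresponding integer ids):

# Illustrative only: the shift-by-one relationship built in prepare_data
answer         = ['你', '好']             # segmented answer tokens
decoder_input  = ['<start>', '你', '好']  # fed to the decoder while training
decoder_target = ['你', '好', '<end>']    # what the decoder learns to predict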

To get better results, you will need to:

  • Use a larger, higher-quality conversation dataset
  • Tune the hyperparameters (embedding dimension, number of LSTM units)
  • Train for more epochs
  • Consider pretrained word vectors (a sketch follows below)
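
A minimal sketch of plugging pretrained vectors into the Embedding layer, reusing vocab_size and tokenizer from the example above; the file name pretrained_vectors.txt and the one-vector-per-line format are assumptions, not part of the original project:

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding

embedding_dim = 256
embedding_index = {}

# Hypothetical word2vec/GloVe-style text file: "word v1 v2 ... vN" per line
with open('pretrained_vectors.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(' ')
        embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

# Align the pretrained vectors with the tokenizer's word ids
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector

# Initialize the Embedding layer with the matrix (frozen here; set trainable=True to fine-tune)
pretrained_embedding = Embedding(
    vocab_size, embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)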

For production use, a Transformer architecture will generally give better results than LSTM (a minimal sketch of the basic building block is shown below), but this LSTM + attention Seq2Seq implementation is already enough to get a basic chatbot running.
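
For reference only, a minimal sketch of one Transformer encoder block using the Keras layers available in TF 2.4+; a real replacement would also need positional embeddings, a decoder stack with causal masking, and far more data:

import tensorflow as tf
from tensorflow.keras import layers

def transformer_encoder_block(x, num_heads=4, key_dim=64, ff_dim=512, dropout=0.1):
    """One self-attention + feed-forward block with residual connections."""
    # Multi-head self-attention sub-layer
    attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)
    attn_out = layers.Dropout(dropout)(attn_out)
    x = layers.LayerNormalization(epsilon=1e-6)(x + attn_out)

    # Position-wise feed-forward sub-layer
    ff_out = layers.Dense(ff_dim, activation='relu')(x)
    ff_out = layers.Dense(x.shape[-1])(ff_out)
    ff_out = layers.Dropout(dropout)(ff_out)
    return layers.LayerNormalization(epsilon=1e-6)(x + ff_out)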

In short: use an LSTM + attention Seq2Seq architecture and train it with enough data.
