返回

循环神经网络(五)文本分类之tf_dataset IMDB subword

人工智能

1. 导入库

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

2. 加载数据

# Load the 'imdb_reviews' dataset through tensorflow_datasets.
# with_info=True makes the call return a (dataset, info) pair; `info` is not
# used in the code shown here. The elements are unpacked as (text, label)
# pairs by the mapping steps that follow (as_supervised=True).
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

3. 数据预处理

def _lowercase_text(text, label):
    """Lower-case the review text and pass the label through unchanged."""
    return tf.strings.lower(text), label


# Both splits get the same lower-casing; the 'test' split is the one used as
# validation data later on.
train_data = dataset['train'].map(_lowercase_text)
val_data = dataset['test'].map(_lowercase_text)

4. 使用TextVectorization进行文本编码（按空格分词的词级编码，并非SubwordTextEncoder的子词编码）

# Word-level text vectorizer: the vocabulary is capped at 10000 entries and
# every review is padded or truncated to exactly 512 token ids.
encoder = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    # `standardize` accepts None, a mode string, or a callable; the boolean
    # True is rejected with a ValueError. This string is the layer's default
    # mode (lower-case the text and strip punctuation).
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_sequence_length=512,
)

# Build the vocabulary from the training texts only (labels are dropped).
# The dataset is batched so adapt() consumes many reviews per step instead of
# one at a time; the resulting vocabulary does not depend on the batch size.
encoder.adapt(train_data.map(lambda text, label: text).batch(1024))

5. 将数据编码成数字序列

def _encode_texts(data):
    """Vectorize every review in `data` and return the ids as one NumPy array.

    A Keras layer cannot be called on a tf.data.Dataset object directly, so
    the encoder is mapped over batches of text and the per-batch results are
    concatenated. Each row has 512 ids (the encoder's output_sequence_length).

    Args:
        data: Dataset of (text, label) pairs.

    Returns:
        2-D integer array with one row of token ids per review.
    """
    id_batches = data.map(lambda text, label: text).batch(1024).map(encoder)
    return np.concatenate(list(id_batches.as_numpy_iterator()), axis=0)


def _collect_labels(data):
    """Return the labels of `data` as a 1-D NumPy array, in dataset order.

    np.asarray() on a Dataset does not materialise its elements, so the
    labels are pulled out batch by batch and concatenated instead.

    Args:
        data: Dataset of (text, label) pairs.

    Returns:
        1-D array holding one label per review.
    """
    label_batches = data.map(lambda text, label: label).batch(1024)
    return np.concatenate(list(label_batches.as_numpy_iterator()), axis=0)


# Materialise both splits as in-memory arrays for model.fit / evaluate / predict.
train_sequences = _encode_texts(train_data)
val_sequences = _encode_texts(val_data)

train_labels = _collect_labels(train_data)
val_labels = _collect_labels(val_data)

6. 定义模型

# Binary sentiment classifier, assembled layer by layer:
# token embedding -> bidirectional LSTM -> dense hidden layer -> sigmoid output.
# NOTE(review): the embedding does not mask the padding id, so the LSTM also
# reads padded positions — consider mask_zero=True; confirm before changing.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=10000, output_dim=128))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128)))
model.add(tf.keras.layers.Dense(units=64, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

7. 编译模型

# Binary cross-entropy matches the single sigmoid output unit; accuracy is the
# only metric tracked during training and evaluation.
model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

8. 训练模型

# Train for 10 epochs in mini-batches of 64, validating on the encoded
# validation split after each epoch. The returned History object is discarded.
model.fit(
    train_sequences,
    train_labels,
    batch_size=64,
    epochs=10,
    validation_data=(val_sequences, val_labels),
)

9. 评估模型

# Evaluate on the same arrays that were used as validation data during fit.
# The return value (loss and the compiled metrics) is not captured here.
model.evaluate(val_sequences, val_labels)

10. 保存模型

# Persist the trained model to a single file in the working directory.
# NOTE(review): the .h5 suffix selects the legacy HDF5 format; newer Keras
# releases recommend the native .keras format — confirm before changing, since
# the load step reads this same path.
model.save('imdb_subword_model.h5')

11. 加载模型

# Restore the model from the file written by the save step into a separate
# object, leaving `model` itself untouched.
new_model = tf.keras.models.load_model('imdb_subword_model.h5')

12. 使用模型进行预测

# Run the reloaded model on the encoded validation reviews. The final layer is
# a single sigmoid unit, so each prediction is one value between 0 and 1.
predictions = new_model.predict(val_sequences)

13. 打印预测结果

# Print the raw prediction array; no threshold is applied to turn the scores
# into class labels.
print(predictions)