返回
循环神经网络(五)文本分类之tf_dataset IMDB subword
人工智能
2023-11-19 20:38:31
1. 导入库
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
2. 加载数据
# Load the IMDB reviews dataset. as_supervised=True makes every element a
# (text, label) pair; with_info=True additionally returns the dataset metadata.
dataset, info = tfds.load(
    name='imdb_reviews',
    as_supervised=True,
    with_info=True,
)
3. 数据预处理
def _lowercase_text(text, label):
    """Lower-case the review text and pass the label through unchanged."""
    return tf.strings.lower(text), label


# The 'test' split is used as the validation set for the rest of the script.
train_data = dataset['train'].map(_lowercase_text)
val_data = dataset['test'].map(_lowercase_text)
4. 使用TextVectorization进行文本编码（按空白分词，而非SubwordTextEncoder子词编码）
# Text vectorizer: maps each review to a fixed-length sequence of integer
# token ids (vocabulary capped at 10000 tokens, sequences padded/truncated
# to 512 ids).
#
# `standardize` must be None, one of the standardization mode strings, or a
# callable; the previous value `True` is rejected with a ValueError.
# 'lower_and_strip_punctuation' is the layer's documented default mode.
#
# The layer is referenced by its stable path `tf.keras.layers.TextVectorization`
# because the `layers.experimental.preprocessing` alias is no longer available
# in recent Keras releases.
encoder = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_sequence_length=512,
)

# Build the vocabulary from the training texts only (labels are dropped).
# Batching lets adapt() consume many reviews per step instead of one scalar
# string at a time.
encoder.adapt(train_data.map(lambda text, label: text).batch(1024))
5. 将数据编码成数字序列
def _encode_dataset(data, vectorizer, batch_size=1024):
    """Materialize a (text, label) dataset as NumPy arrays.

    A Keras layer cannot be called directly on a ``tf.data.Dataset``, and
    ``np.asarray(dataset)`` only wraps the dataset object in a 0-d object
    array, so the dataset is iterated explicitly instead.  Texts and labels
    are collected in the same pass so that row ``i`` of the sequences always
    corresponds to label ``i``.

    Args:
        data: ``tf.data.Dataset`` yielding ``(text, label)`` pairs.
        vectorizer: Adapted text-vectorization layer applied to each batch
            of texts.
        batch_size: Number of examples vectorized per step.

    Returns:
        Tuple ``(sequences, labels)`` of NumPy arrays with one row/entry per
        example in ``data``.
    """
    sequence_batches = []
    label_batches = []
    for texts, labels in data.batch(batch_size):
        sequence_batches.append(vectorizer(texts).numpy())
        label_batches.append(labels.numpy())
    return np.concatenate(sequence_batches), np.concatenate(label_batches)


train_sequences, train_labels = _encode_dataset(train_data, encoder)
val_sequences, val_labels = _encode_dataset(val_data, encoder)
6. 定义模型
# Binary sentiment classifier: embedding -> bidirectional LSTM -> dense head
# with a single sigmoid output.
model = tf.keras.Sequential([
    # 10000 matches the vectorizer's max_tokens.  mask_zero=True makes the
    # downstream LSTM skip id 0, which the vectorizer uses to pad every
    # review to 512 ids; without it the recurrent layers also process the
    # padding steps of short reviews.
    tf.keras.layers.Embedding(10000, 128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
7. 编译模型
# Configure training: binary cross-entropy loss (matches the single sigmoid
# output), AdamW with a 1e-3 learning rate, and accuracy as the metric.
model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
8. 训练模型
# Train for 10 epochs in mini-batches of 64, validating on the encoded
# validation arrays.
model.fit(
    x=train_sequences,
    y=train_labels,
    batch_size=64,
    epochs=10,
    validation_data=(val_sequences, val_labels),
)
9. 评估模型
# Compute loss and accuracy on the validation arrays.
model.evaluate(
    x=val_sequences,
    y=val_labels,
)
10. 保存模型
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='imdb_subword_model.h5')
11. 加载模型
# Restore the model from the HDF5 file written by the save step.
new_model = tf.keras.models.load_model(
    filepath='imdb_subword_model.h5',
)
12. 使用模型进行预测
# Run the reloaded model on the encoded validation reviews.
predictions = new_model.predict(x=val_sequences)
13. 打印预测结果
# Print the array returned by new_model.predict() for the validation reviews.
print(predictions)