Keras 教程:BERT 文本抽取
字幕组双语原文: Keras 教程:BERT 文本抽取
英语原文: BERT (from HuggingFace Transformers) for Text Extraction
介绍
这个演示使用了SQuAD (Stanford Question Answering Dataset, 斯坦福问答数据集)。在SQuAD数据集中,输入由一个问题和一个上下文段落组成。目标是在段落中找到能够回答该问题的文本片段(span)。我们使用“精确匹配(Exact Match)”指标来评估模型在这些数据上的表现,它衡量的是预测结果与任意一个真实答案完全一致的样本所占的百分比。
我们对一个BERT模型进行微调,如下所示:
- 将上下文和问题作为输入,输入给BERT。
- 取两个向量S和T,它们的维数等于BERT中隐藏状态的维数。
- 计算每个token作为答案范围的开始和结束的概率。一个token作为答案开始的概率,由S与该token在BERT最后一层的表示做点积,再对所有token做softmax得到。token作为答案结束的概率的计算方法类似,只是使用向量T。
- 微调BERT,学习S和T。
参考:
设置:
import os
import re
import json
import string

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

# Fixed length (in tokens) of every model input; shorter inputs are padded
# and longer ones are skipped during preprocessing.
max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT
设置BERT分词器
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file.
# The vocab path is derived from save_path so the two cannot drift apart.
tokenizer = BertWordPieceTokenizer(
    os.path.join(save_path, "vocab.txt"), lowercase=True
)
载入数据
# SQuAD v1.1 training split: fetch it and keep the local file path.
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)

# SQuAD v1.1 dev split, used here as the evaluation set.
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
数据预处理
- 遍历JSON文件,把每条记录都保存为SquadExample对象。
- 遍历每个SquadExample对象来创建x_train, y_train, x_eval, y_eval。
class SquadExample:
    """One SQuAD question/answer pair and the model inputs derived from it.

    Call `preprocess()` after construction. On success it sets `input_ids`,
    `token_type_ids`, `attention_mask`, `start_token_idx`, `end_token_idx`
    and `context_token_to_char`. When the example cannot be used it sets
    `skip = True` instead and leaves those attributes unset.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        """Tokenize the example and locate the answer span in token indexes."""
        start_char_idx = self.start_char_idx

        # Clean context, answer and question (collapse whitespace runs).
        # NOTE(review): start_char_idx refers to the raw context; if cleaning
        # shortens the text before the answer the span may be misaligned.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # Find end character index (exclusive) of answer in context.
        end_char_idx = start_char_idx + len(answer)
        # An answer may legitimately end at the very last character, in which
        # case end_char_idx == len(context); only a span running past the end
        # of the context is unusable.
        if end_char_idx > len(context):
            self.skip = True
            return

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters: a token
        # qualifies when its [start, end) character range overlaps the
        # answer's [start_char_idx, end_char_idx) range.
        ans_token_idx = [
            idx
            for idx, (tok_start, tok_end) in enumerate(tokenized_context.offsets)
            if max(tok_start, start_char_idx) < min(tok_end, end_char_idx)
        ]
        if not ans_token_idx:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs: context tokens followed by the question tokens.
        # The question's first token id is dropped (presumably its leading
        # [CLS], since the context encoding already starts the sequence).
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length < 0:  # skip
            self.skip = True
            return
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


# JSON text is UTF-8; be explicit so the platform default encoding is not used.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)


def create_squad_examples(raw_data):
    """Build one preprocessed `SquadExample` per question in `raw_data`.

    `raw_data` is the parsed SQuAD JSON (`data` -> `paragraphs` -> `qas`).
    The first listed answer supplies the training span; all answer texts are
    kept for evaluation. Skipped examples are still returned, with
    `skip = True`.
    """
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                answers = qa["answers"]
                squad_eg = SquadExample(
                    qa["question"],
                    context,
                    answers[0]["answer_start"],
                    answers[0]["text"],
                    [ans["text"] for ans in answers],
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    """Stack the non-skipped examples into model inputs `x` and targets `y`.

    Returns:
        x: [input_ids, token_type_ids, attention_mask] as numpy arrays.
        y: [start_token_idx, end_token_idx] as numpy arrays.
    """
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if not item.skip:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")
87599 training points created.
10570 evaluation points created.
用BERT和函数式API来构建问答模块
def create_model():
    """Build and compile the BERT question-answering model.

    The model takes `[input_ids, token_type_ids, attention_mask]`, each of
    length `max_len`, and outputs two softmax distributions over token
    positions: one for the answer start and one for the answer end.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # The first encoder output is used as the per-token representation.
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    # A bias-free Dense(1) is a dot product of every token vector with one
    # learned vector (the "S" / "T" vectors described in the introduction).
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `lr` is a deprecated alias that newer Keras versions reject;
    # `learning_rate` is the supported argument name.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model
这段代码很适合用Google Colab TPU来跑. 用Colab TPUs, 每个epoch大概花5-6分钟即可.
# NOTE(review): the body of this `if` was missing from the source text; it is
# reconstructed here from TensorFlow's standard TPU setup so that `model`
# (used by the training step below) is actually defined. Verify against the
# original tutorial before relying on it.
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Older TF 2.x releases only expose TPUStrategy under `experimental`.
    tpu_strategy_cls = (
        getattr(tf.distribute, "TPUStrategy", None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = tpu_strategy_cls(tpu)

    # Create model inside the strategy scope so its variables live on the TPU
    with strategy.scope():
        model = create_model()
else:
    model = create_model()
构建评价回调函数
这个回调函数会在每个epoch结束后用验证集数据计算精确匹配(Exact Match)得分。
# Built once at import time instead of on every normalize_text() call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_ARTICLES_RE = re.compile(r"\b(a|an|the)\b", re.UNICODE)


def normalize_text(text):
    """Return `text` lowercased, without ASCII punctuation or the articles
    a/an/the, and with whitespace runs collapsed to single spaces."""
    text = text.lower()

    # Remove punctuations
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove articles
    text = _ARTICLES_RE.sub(" ", text)

    # Remove extra white space
    return " ".join(text.split())


class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation inputs and print the exact-match score.

        Reads the module-level `eval_squad_examples`; its non-skipped entries
        are paired, in order, with the rows of `x_eval`.
        """
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [eg for eg in eval_squad_examples if not eg.skip]
        for squad_eg, start_probs, end_probs in zip(
            eval_examples_no_skip, pred_start, pred_end
        ):
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start_probs)
            end = np.argmax(end_probs)
            # A start position beyond the context tokens (question or
            # padding) cannot be mapped back to context characters.
            if start >= len(offsets):
                continue

            # The offsets index into the whitespace-normalized context that
            # SquadExample.preprocess tokenized, not the raw context, so the
            # same normalization is applied before slicing.
            context = " ".join(str(squad_eg.context).split())
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = context[pred_char_start:pred_char_end]
            else:
                pred_ans = context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(ans) for ans in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
训练和评估
# Score exact match on the evaluation set after every epoch.
exact_match_callback = ExactMatch(x_eval, y_eval)

model.fit(
    x_train,
    y_train,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)
epoch=1, exact match score=0.78
1346/1346 - 350s - activation_7_loss: 1.3488 - loss: 2.5905 - activation_8_loss: 1.2417
<tensorflow.python.keras.callbacks.History at 0x7fc78b4458d0>
雷锋字幕组是一个由 AI 爱好者组成的翻译团队,汇聚五百多位志愿者的力量,分享最新的海外AI资讯,交流关于人工智能技术领域的行业变革与技术创新的见解。
团队成员有大数据专家、算法工程师、图像处理工程师、产品经理、产品运营、IT咨询人、在校师生;志愿者们来自IBM、AVL、Adobe、阿里、百度等知名企业,北大、清华、港大、中科院、南卡罗莱纳大学、早稻田大学等海内外高校研究所。
如果,你也是位热爱分享的AI爱好者。欢迎与雷锋字幕组一起,学习新知,分享成长。
雷锋网 (公众号:雷锋网) 雷锋网
雷锋网版权文章,未经授权禁止转载。详情见。