Training Process
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset
from accelerate import Accelerator
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    default_data_collator,
    get_scheduler,
)

# set the batch size to 32; use a larger batch size on a more powerful GPU
batch_size = 32

# use the DistilBERT checkpoint and its WordPiece tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# define a tokenize function to tokenize the dataset
def tokenize_function(data):
    result = tokenizer(data["text"])
    return result

# load the IMDB dataset and tokenize it
# batched is set to True to activate fast multithreading!
imdb_data = load_dataset("imdb")
tokenize_dataset = imdb_data.map(tokenize_function, batched=True, remove_columns=["text", "label"])

# group the tokenized texts into fixed-length chunks
processed_dataset = tokenize_dataset.map(concat_chunk_dataset, batched=True)

# data collator that randomly masks 15% of the tokens in each training batch
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# downsample and split the data into train and test sets
downsampled_dataset = processed_dataset["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)

# load the train dataset for training
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# apply random masking once on the whole test set, then use the default data collator
# to handle the test dataset in batches
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=default_data_collator)

# initialize the pretrained DistilBERT model for masked language modeling
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# set the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# initialize the accelerator and prepare everything for training
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# train for 30 epochs and compute the total number of training steps
num_train_epochs = 30
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# define the learning rate scheduler for training
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
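The code above relies on the concat_chunk_dataset and insert_random_mask helpers, as well as the train_size and test_size values, introduced earlier. If you are following along in isolation, here is a minimal sketch of what they are assumed to look like, following the standard Hugging Face masked-language-modeling recipe; the chunk size of 128 and the split sizes are illustrative assumptions, not values taken from this article.

# Sketch of the assumed helpers and split sizes (illustrative, not the article's exact code)
chunk_size = 128       # assumed maximum chunk length
train_size = 10_000    # assumed downsampled train size
test_size = 1_000      # assumed test size

def concat_chunk_dataset(examples):
    # concatenate all tokenized texts in the batch, then split into fixed-size chunks
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // chunk_size) * chunk_size  # drop the short remainder
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated.items()
    }
    # labels are a copy of the input ids; the collator masks the inputs later
    result["labels"] = result["input_ids"].copy()
    return result

def insert_random_mask(batch):
    # run the masking collator once so the evaluation set stays fixed across epochs
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}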
We set the batch size to 32 and load the training and test datasets with PyTorch's built-in DataLoader. We load the pretrained DistilBERT model and use the AdamW optimizer. We then call the Hugging Face Accelerate library, which takes the pretrained model, the optimizer, and the training and evaluation dataloaders and prepares them for training. We set the number of training epochs to 30, take the length of the training dataloader, and compute the total number of training steps. Finally, we set up the learning rate scheduler, which takes the optimizer, the number of warmup steps, and the total number of training steps. A sketch of the training loop that uses these prepared objects follows below.
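With everything prepared, training is a standard Accelerate loop. The sketch below is a minimal version (not necessarily the exact loop used in this article): it trains with the scheduler defined above and reports the mean evaluation loss after each epoch, with exp(loss) giving the masked-LM perplexity.

import math
import torch

for epoch in range(num_train_epochs):
    # training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # evaluation on the pre-masked test set
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        # gather the per-batch loss from all processes
        losses.append(accelerator.gather(outputs.loss.repeat(batch_size)))

    losses = torch.cat(losses)[: len(eval_dataset)]
    eval_loss = torch.mean(losses).item()
    print(f"epoch {epoch}: eval loss {eval_loss:.4f}, perplexity {math.exp(eval_loss):.2f}")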