필사 필요 코드
캐글의 BERT 최종 전처리
백준파이썬개발자:프로젝트골드
2024. 3. 3. 01:40
반응형
# Answer-option columns to preprocess (one column per multiple-choice option).
options = 'ABCDE'
# Numeric label for each option; derived from len(options) instead of a
# hard-coded 5 so the mapping stays correct if options ever change.
indices = list(range(len(options)))
# Bidirectional lookups between option letter ('A'..'E') and label index (0..4).
option_to_index = {option: index for index, option in enumerate(options)}
index_to_option = {index: option for index, option in enumerate(options)}
def preprocess(example):
    """Tokenize one multiple-choice example for AutoModelForMultipleChoice.

    The model expects a set of (question, candidate-answer) pairs, so the
    prompt is repeated once per option and paired with each option's text.

    Parameters
    ----------
    example : dict-like row with keys 'prompt', one key per letter in
        ``options`` ('A'..'E'), and 'answer' (the correct option letter).

    Returns
    -------
    The tokenizer output (input_ids, attention_mask, ...) with an added
    integer 'label' giving the index of the correct option.
    """
    # Repeat the prompt len(options) times (was a hard-coded 5) so each
    # candidate answer gets its own question/answer pair.
    first_sentence = [example['prompt']] * len(options)
    second_sentence = [example[option] for option in options]
    # The tokenizer turns the text pairs into token IDs BERT can understand.
    # NOTE(review): relies on module-level `tokenizer` defined elsewhere.
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example
# Tokenize the training set row-by-row and drop the raw text columns,
# keeping only the model-ready fields produced by preprocess().
_raw_text_columns = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
tokenized_train_ds = train_ds.map(
    preprocess,
    batched=False,
    remove_columns=_raw_text_columns,
)
반응형