# token_id의 숫자 범위가 0~65535 이내이므로 용량 절감을 위해 uint16 타입 적용
input_ids = np.asarray(encoding_texts["input_ids"], dtype=np.uint16)
attention_mask = np.asarray(encoding_texts["attention_mask"], dtype=np.bool_)
return input_ids, attention_mask
def _get_chunk(self):
    """Lazily yield ``(encoded_data, label_chunk)`` pairs, one per chunk.

    Steps through ``self.texts`` in windows of ``self.chunk_size`` items,
    starting at 0 and stopping before ``self.data_size``. Each window of
    texts is passed to ``self._encode``; the matching window of
    ``self.labels`` is attached only when ``self.dataset_type`` equals
    ``"TRAIN"``.

    Yields:
        tuple: ``(encoded_data, label_chunk)`` where ``encoded_data`` is
        whatever ``self._encode`` returns for the window and
        ``label_chunk`` is the corresponding slice of ``self.labels``,
        or ``None`` for any dataset type other than ``"TRAIN"``.
    """
    for start in range(0, self.data_size, self.chunk_size):
        # One slice object serves both texts and labels so they stay aligned.
        window = slice(start, start + self.chunk_size)
        encoded = self._encode(self.texts[window])
        # self.labels is only touched for training data; other dataset
        # types never read it.
        labels = self.labels[window] if self.dataset_type == "TRAIN" else None
        yield encoded, labels
def make_tfrecord(self):
with tf.io.TFRecordWriter(self.tfrecord_file_path) as writer:
for encoded_data, label_chunk in tqdm(self._get_chunk()):
input_ids, attention_mask = encoded_data
chunk_size = len(input_ids)