캐글 메달리스트가 알려주는 캐글 노하우: 7.4.5 TFRecord

        # token_id의 숫자 범위가 0~65535 이내이므로 용량 절감을 위해 uint16 타입 적용
        input_ids = np.asarray(encoding_texts["input_ids"], dtype=np.uint16)
        attention_mask = np.asarray(encoding_texts["attention_mask"], dtype=np.bool_)
        return input_ids, attention_mask

    def _get_chunk(self):
        for i in range(0, self.data_size, self.chunk_size):
            text_chunk = self.texts[i : i + self.chunk_size]
            encoded_data = self._encode(text_chunk)

            if self.dataset_type == "TRAIN":
                label_chunk = self.labels[i : i + self.chunk_size]
            else:
                label_chunk = None

            yield (encoded_data, label_chunk)

    def make_tfrecord(self):
        with tf.io.TFRecordWriter(self.tfrecord_file_path) as writer:
            for encoded_data, label_chunk in tqdm(self._get_chunk()):
                input_ids, attention_mask = encoded_data
                chunk_size = len(input_ids)

신간 소식 구독하기

뉴스레터에 가입하시고 이메일로 신간 소식을 받아 보세요.