def _check_data(self):
    """Validate the dataset configuration and print a short summary.

    ``self.dataset_type`` must be ``"TRAIN"`` or ``"TEST"``.

    * ``"TRAIN"``: ``self.labels`` must be present, non-empty, and the same
      length as ``self.texts``.
    * ``"TEST"``: ``self.labels`` is reset to ``None``.

    After validation the dataset type and ``(self.data_size, self.max_len)``
    are printed.

    Raises:
        ValueError: If the dataset type is unknown, if a TRAIN set has no
            labels (``None`` or empty), or if texts and labels differ in
            length. ``ValueError`` derives from ``Exception``, so handlers
            written as ``except Exception`` keep working.
    """
    if self.dataset_type not in ("TRAIN", "TEST"):
        raise ValueError("Dataset type is not valid: 'TRAIN' or 'TEST'")

    if self.dataset_type == "TRAIN":
        labels = self.labels
        # Guard against None explicitly: len(None) would raise an unrelated
        # TypeError instead of the intended "labels missing" error.
        # len() is used rather than truthiness because array-like containers
        # (e.g. NumPy arrays) have no unambiguous truth value.
        if labels is None or len(labels) == 0:
            raise ValueError("Label are not found for TRAIN SET")
        if len(self.texts) != len(labels):
            raise ValueError("Texts and labels are different size.")
    else:
        # Only "TEST" can reach this branch after the membership check above.
        self.labels = None

    print(f"Generate {self.dataset_type} SET")
    print(f"output text shape: ({self.data_size}, {self.max_len})")
def _encode(self, text_chunk):
encoding_texts = self.tokenizer(
text_chunk.tolist(),
return_token_type_ids=False,
return_attention_mask=True,
padding="max_length",
max_length=self.max_len,
truncation=True,
)