import os

import pandas as pd
import tensorflow as tf
from transformers import BertTokenizerFast

DATA_PATH = "../input/jigsaw-unintended-bias-in-toxicity-classification"
TFRECORD_PATH = "../input/fkms-jigsaw-tfrecord-files/"
TOKENIZER_PATH = os.path.join(TFRECORD_PATH, "bert_base_uncased")
BATCH_SIZE = 64
MAX_LEN = 220  # maximum sequence length in tokens
# Load the test split and apply the text cleaning defined earlier in the notebook.
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))
df_test = preprocess(df_test)

# Build the fast BERT tokenizer from the local vocab file, so the kernel
# does not need internet access to fetch pretrained tokenizer files.
tokenizer = BertTokenizerFast(os.path.join(TOKENIZER_PATH, "vocab.txt"))
# Tokenize all test comments in one call: pad/truncate every comment to
# MAX_LEN and return TensorFlow tensors. token_type_ids are skipped because
# single-segment classification does not need them.
encoding_texts = tokenizer(
    df_test["comment_text"].tolist(),
    return_token_type_ids=False,
    return_attention_mask=True,
    padding="max_length",
    max_length=MAX_LEN,
    truncation=True,
    return_tensors="tf",
)
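# The resulting BatchEncoding behaves like a dict with two keys,
# "input_ids" and "attention_mask", each a tf.Tensor of shape
# (num_test_examples, MAX_LEN).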
# Wrap the encoded tensors in a tf.data pipeline: batch the inputs and let
# the runtime tune prefetch buffering to overlap host work with inference.
AUTO = tf.data.experimental.AUTOTUNE
dataset = tf.data.Dataset.from_tensor_slices(dict(encoding_texts))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(AUTO)
dataset
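# Quick sanity check: pull the first batch and confirm each tensor has the
# expected shape of (BATCH_SIZE, MAX_LEN).
for batch in dataset.take(1):
    print({name: tensor.shape for name, tensor in batch.items()})

# Downstream, this dataset would typically feed a Keras model for inference,
# e.g. (hypothetical sketch; `model` is not defined in this section):
#   predictions = model.predict(dataset, verbose=1)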