# Root of the Jigsaw "Unintended Bias in Toxicity Classification" input data.
DATA_PATH = "../input/jigsaw-unintended-bias-in-toxicity-classification/"

# Load each split and run it through the shared cleaning step right away.
df_train = preprocess(pd.read_csv(f"{DATA_PATH}train.csv"))
df_test = preprocess(pd.read_csv(f"{DATA_PATH}test.csv"))
# Serialize the labeled training comments into a TFRecord file.
train_tfrec = TFRecordGenerator(
    dataset_type="TRAIN",
    out_path="/kaggle/working/",
    f_name="train.tfrecord",
    texts=df_train["comment_text"].values,
    labels=df_train["target"].values,
    tokenizer=tokenizer,
    max_len=220,        # presumably max token length per example — verify in TFRecordGenerator
    chunk_size=250000,  # NOTE(review): looks like examples per processing chunk — confirm
)
train_tfrec.make_tfrecord()
# Serialize the unlabeled test comments into a TFRecord file
# (no `labels` kwarg: the test split carries no targets here).
test_tfrec = TFRecordGenerator(
    dataset_type="TEST",
    out_path="/kaggle/working/",
    f_name="test.tfrecord",
    texts=df_test["comment_text"].values,
    tokenizer=tokenizer,
    max_len=220,        # presumably max token length per example — verify in TFRecordGenerator
    chunk_size=100000,  # NOTE(review): looks like examples per processing chunk — confirm
)
test_tfrec.make_tfrecord()