더북(TheBook)
import math, os
import json
import unidecode

import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import transformers
from transformers import BertConfig
from transformers import BertTokenizerFast

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the ``comment_text`` column of *df*.

    Three steps are applied in order:

    1. every URL (``http://...``, ``https://...`` or ``www. ...``) is
       replaced by a single space,
    2. the text is passed through ``unidecode.unidecode``,
    3. the text is lower-cased.

    The ``comment_text`` column is overwritten on the frame that was
    passed in, and that same frame is returned.

    Args:
        df: Frame containing a string column named ``comment_text``.

    Returns:
        The same ``df`` object, with ``comment_text`` normalized.

    Raises:
        KeyError: If ``df`` has no ``comment_text`` column.
    """
    url_pattern = r"https?://\S+|www\.\S+"
    # regex=True must be explicit: since pandas 2.0 `Series.str.replace`
    # defaults to regex=False, which would search for the pattern as a
    # literal string and leave every URL in place.
    df["comment_text"] = df["comment_text"].str.replace(
        url_pattern, " ", regex=True
    )

    # Run each comment through unidecode.
    df["comment_text"] = df["comment_text"].map(unidecode.unidecode)

    # Lower-case the text.
    df["comment_text"] = df["comment_text"].str.lower()

    return df