더북(TheBook)
import math, os
import json
import unidecode

import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import transformers
from transformers import BertConfig
from transformers import BertTokenizerFast

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the ``comment_text`` column of *df* in place and return *df*.

    The column is rewritten in three steps, in this order:

    1. URLs (``http://...``, ``https://...`` or ``www....``) are replaced
       with a single space.
    2. Each value is passed through ``unidecode.unidecode``.
    3. The text is lowercased.

    Args:
        df: DataFrame with a ``comment_text`` column. The ``.str`` accessor
            is used on it, so the values are expected to be strings.
            The column is overwritten on the passed-in object.

    Returns:
        The same DataFrame object that was passed in.
    """
    url_pattern = r"https?://\S+|www\.\S+"
    # regex=True must be explicit: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would leave
    # every URL in place.
    df["comment_text"] = df["comment_text"].str.replace(
        url_pattern, " ", regex=True
    )

    # apply unidecode
    df["comment_text"] = df["comment_text"].map(unidecode.unidecode)

    # lowercase
    df["comment_text"] = df["comment_text"].str.lower()

    return df

신간 소식 구독하기
뉴스레터에 가입하시고 이메일로 신간 소식을 받아 보세요.