import re
# Reload the training data for preprocessing.
df_train = pd.read_csv(DATA_PATH + "train.csv")

# Regex used for URL removal: matches "http://..." / "https://..." links and
# bare "www...." links, each running up to the next whitespace character
# (so punctuation glued to the end of a link is matched along with it).
url_pattern = r"https?://\S+|www\.\S+"

# Boolean mask of the comments that contain at least one URL.
# na=False maps missing comments to False; otherwise missing values would
# propagate into the mask as NaN, which boolean indexing with .loc rejects.
index_has_url = df_train["comment_text"].str.contains(url_pattern, na=False)
text_has_url = df_train.loc[index_has_url, "comment_text"]

# Sample test: take the comment at position 14 among the URL-bearing ones.
# Raises IndexError if fewer than 15 comments matched.
sample = text_has_url.iloc[14]
>>> print(f"Sample:\n {sample}\n")
>>> print(f"Remove URL:\n {re.sub(url_pattern, '', sample)}")
Sample:
https://en.wikipedia.org/wiki/John_Williams, born 1932. Other movie credits include Jaws, ET, Indiana Jones (several), Jurassic Park, Schindler's List. Doesn't get much better than that. As for SW release date https://en.wikipedia.org/wiki/Star_Wars_%28film%29.
Remove URL:
born 1932. Other movie credits include Jaws, ET, Indiana Jones (several), Jurassic Park, Schindler's List. Doesn't get much better than that. As for SW release date