from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Quick test on a sample sentence
sample_sentence = df_train["comment_text"].loc[3441]
tokens = tokenizer.tokenize(sample_sentence, add_special_tokens=True)
token_ids = tokenizer.encode(sample_sentence)
>>> print(f"Sample sentence: \n{sample_sentence}\n")
>>> print(f"Vocabs of samples: \n{vocabs}\n")
>>> print(f"Token_ids of samples: \n{tokens}\n")
Sample sentence:
this is typical of environmental regulation in oregon. we mistake the willful absence of data collection for evidence of no harmful conditions.
Tokens of sample:
['[CLS]', 'this', 'is', 'typical', 'of', 'environmental', 'regulation', 'in', 'oregon', '.', 'we', 'mistake', 'the', 'will', '##ful', 'absence', 'of', 'data', 'collection', 'for', 'evidence', 'of', 'no', 'harmful', 'conditions', '.', '[SEP]']
Token IDs of sample:
[101, 2023, 2003, 5171, 1997, 4483, 7816, 1999, 5392, 1012, 2057, 6707, 1996, 2097, 3993, 6438, 1997, 2951, 3074, 2005, 3350, 1997, 2053, 17631, 3785, 1012, 102]
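To confirm that the two views describe the same encoding, here is a minimal sanity check (reusing `tokenizer`, `tokens`, and `token_ids` from above): `convert_tokens_to_ids` should reproduce the ID list exactly, and `decode` maps the IDs back to the (already lowercased) text.

# Sanity check: subword tokens and token IDs are two views of one encoding
assert tokenizer.convert_tokens_to_ids(tokens) == token_ids
# Decoding merges WordPiece pieces ("will" + "##ful" -> "willful");
# skip_special_tokens drops [CLS] and [SEP]
print(tokenizer.decode(token_ids, skip_special_tokens=True))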