print(
"Found embeddings for {:.2%} of vocab".format(len(hit) / len(vocab_counter)))
print(
"Found embeddings for {:.2%} of all text".format(
sum_hits / (sum_hits + sum_oovs)
)
)
sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
return sorted_oov
# Location of the pickled GloVe vectors handed to load_embeddings() below.
GLOVE_EMBEDDING_PATH = (
    "../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl"
)
glove_embeddings = load_embeddings(GLOVE_EMBEDDING_PATH)

# Split every training comment on whitespace and pass the resulting list of
# token lists to build_vocab_counter().
vocab_counter = build_vocab_counter(
    [comment.split() for comment in df_train["comment_text"]]
)

# Compare the vocabulary against the loaded embeddings; the result is kept
# in `oov` and sliced below.
oov = check_coverage(vocab_counter, glove_embeddings)

# Top 10 words that are not included in the embeddings.
oov[:10]